var fs = require('fs');
var split = require('split');
var wuzzy = require('wuzzy');
/**
 * Build a word-frequency model from a training corpus.
 *
 * @param {Object} corrects - dictionary of accepted words; a word is accepted
 *   when it is an own property of this object with a truthy value.
 * @param {string} corpus - path of the corpus file, handed to fs.readFile.
 * @param {function(?Error, Object=)} done - node-style callback. Receives the
 *   read error, or null plus a model exposing `freqs`, `freq(w)` and `words()`.
 */
function train (corrects, corpus, done) {
  var hasOwn = Object.prototype.hasOwnProperty;

  // provide an abstraction around the corpus of words
  var Trained = function (freqs) {
    this.freqs = freqs;
  };

  // provide access to the frequency counts of words in the corpus. return a
  // minimum of 1 to provide some handling of words that do not exist in the
  // word corpus.
  Trained.prototype.freq = function (w) {
    // own-property check: names inherited from Object.prototype (such as
    // "constructor") must not be mistaken for counts
    return (hasOwn.call(this.freqs, w) && this.freqs[w]) || 1;
  };

  // get the set of recognized words from the corpus
  Trained.prototype.words = function () {
    return Object.keys(this.freqs);
  };

  fs.readFile(corpus, function (err, corpStr) {
    if (err) {
      done(err);
      return;
    }

    // match() yields null when the corpus contains no letters at all; fall
    // back to an empty list so an empty corpus produces an empty model
    // instead of throwing inside this callback
    var tokens = corpStr.toString().toLowerCase().match(/[a-z]+/g) || [];

    // build up frequency counts of words in the training corpus
    var f = {};
    tokens.forEach(function (w) {
      // limit words to those that exist in the English dictionary
      if (hasOwn.call(corrects, w) && corrects[w]) {
        f[w] = (hasOwn.call(f, w) ? f[w] + 1 : 1);
      }
    });

    done(null, new Trained(f));
  });
}
/**
 * Read a newline-separated word list into a lookup object.
 *
 * Each line is lower-cased and trimmed, then stored as a key with the value 1.
 * Lines that are empty after trimming are skipped.
 *
 * @param {string} correctsFile - path handed to fs.createReadStream.
 * @param {function(?Error, Object=)} done - node-style callback, invoked
 *   exactly once with either the stream error or null plus the word lookup.
 */
function readCorrects (correctsFile, done) {
  var words = {};
  var settled = false;

  // report the outcome once, whichever stream finishes or fails first
  function finish (err) {
    if (settled) {
      return;
    }
    settled = true;
    if (err) {
      done(err);
    } else {
      done(null, words);
    }
  }

  // read in the set of correct English words
  var source = fs.createReadStream(correctsFile);

  // pipe() does not forward errors to the destination, so the file stream
  // needs its own handler; otherwise a missing file raises an unhandled
  // 'error' event and `done` is never called
  source.on('error', finish);

  source.pipe(
    split()
  ).on('data', function (word) {
    var key = word.toLowerCase().trim();
    if (key) {
      words[key] = 1;
    }
  }).on('end', function () {
    finish(null);
  }).on('error', finish);
}
/**
 * Wrap a trained model in a spell checker.
 *
 * @param {Object} mdl - model exposing `words()` (array of known words) and
 *   `freq(word)` (numeric count for a word).
 * @returns {Object} checker whose `check(word)` returns the best-scoring
 *   known word for the lower-cased input.
 */
function getChecker (mdl) {
  var SpellChecker = function (model) {
    this.model = model;
  };

  SpellChecker.prototype.check = function (word) {
    return this._corrections(word.toLowerCase());
  };

  SpellChecker.prototype._corrections = function (w1) {
    // go over each word in the corpus and collect its levenshtein score and
    // frequency count.
    // NOTE(review): the descending sorts below treat a higher score as a
    // closer match, i.e. they assume wuzzy.levenshtein returns a normalized
    // similarity rather than a raw edit distance - confirm against wuzzy docs.
    var candidates = this.model.words().map(function (w2) {
      return {
        w: w2,
        r: wuzzy.levenshtein(w1, w2),
        f: this.model.freq(w2)
      };
    }, this);

    // with no known words there is nothing to suggest; hand the input back
    // instead of failing on `[0].w`
    if (candidates.length === 0) {
      return w1;
    }

    return candidates.sort(function (a, b) {
      // sort on levenshtein score, best first
      return (b.r - a.r);
    }).slice(0, 5).sort(function (a, b) {
      // sort the top 5 results on score weighted by frequency of appearance
      // in the corpus; each side uses its own score
      return ((b.f * b.r) - (a.f * a.r));
    // return the top result
    })[0].w;
  };

  return new SpellChecker(mdl);
}
module.exports = function (correctsFile, trainCorpus, done) {
read in the dictionary of english words
readCorrects(correctsFile, function (err, corrects) {
if (err) {
done(err);
} else {
train the spell checker using the provided corpus of text
train(corrects, trainCorpus, function (err, model) {
if (err) {
done(err);
} else {
done(null, getChecker(model));
}
});
}
});
};