• spelling.js

  • ¶
    var fs = require('fs');
    var split = require('split');
    var wuzzy = require('wuzzy');
    
    /**
     * Build a word-frequency model from a training corpus.
     *
     * The corpus file is read in full, lower-cased and tokenised into runs of
     * the letters a-z. Only tokens present in `corrects` are counted.
     *
     * @param {Object} corrects - lookup of valid dictionary words (word -> truthy value).
     * @param {string} corpus - path of the training text file.
     * @param {Function} done - node-style callback: `done(err)` on a read failure,
     *     otherwise `done(null, model)` where `model` exposes `freq(word)` and `words()`.
     */
    function train (corrects, corpus, done) {
        // Own-property checks keep inherited Object.prototype members (for
        // example the token "constructor") from being mistaken for entries.
        var hasOwn = Object.prototype.hasOwnProperty;

        fs.readFile(corpus, function (err, corpStr) {
            if (err) {
                done(err);
            } else {
                // provide an abstraction around the corpus of words
                var Trained = function (freqs) {
                    this.freqs = freqs;
                };

                // provide access to the frequency counts of words in the
                // corpus. return a minimum of 1 to provide some handling of
                // words that do not exist in the word corpus.
                Trained.prototype.freq = function (w) {
                    return (hasOwn.call(this.freqs, w) && this.freqs[w]
                        ? this.freqs[w]
                        : 1
                    );
                };

                // get the set of recognized words from the corpus
                Trained.prototype.words = function () {
                    return Object.keys(this.freqs);
                };

                // build up frequency counts of words in the training corpus
                var f = {};

                // match() yields null (not an empty array) when the corpus
                // contains no letters at all
                var tokens = corpStr.toString().toLowerCase().match(/[a-z]+/g) || [];

                tokens.forEach(function (w) {
                    // limit words to those that exist in the English dictionary
                    if (hasOwn.call(corrects, w) && corrects[w]) {
                        f[w] = (hasOwn.call(f, w)
                            ? f[w] + 1
                            : 1
                        );
                    }
                });

                done(null, new Trained(f));
            }
        });
    }
    
    /**
     * Load the dictionary of correct words from a file with one word per line.
     *
     * Each line is lower-cased and trimmed; lines that are empty after
     * trimming are ignored.
     *
     * @param {string} correctsFile - path of the dictionary file.
     * @param {Function} done - node-style callback: `done(err)` on a stream
     *     error, otherwise `done(null, words)` where `words` maps each word to 1.
     */
    function readCorrects (correctsFile, done) {
        var words = {};
        var finished = false;

        // report the outcome to the caller exactly once
        function finish (err) {
            if (finished) {
                return;
            }
            finished = true;
            if (err) {
                done(err);
            } else {
                done(null, words);
            }
        }

        // read in the set of correct English words
        var source = fs.createReadStream(correctsFile);

        // pipe() does not forward errors to the destination stream, so a
        // failure to open or read the file has to be handled on the source.
        source.on('error', finish);

        source.pipe(
            split()
        ).on('data', function (line) {
            var word = line.toLowerCase().trim();
            // skip blank lines (including the empty chunk after a trailing newline)
            if (word) {
                words[word] = 1;
            }
        }).on('end', function () {
            finish();
        }).on('error', finish);
    }
    
    /**
     * Wrap a trained model in a spell checker.
     *
     * @param {Object} mdl - model exposing `words()` (array of known words)
     *     and `freq(word)` (corpus frequency of a word).
     * @returns {Object} checker whose `check(word)` returns the best-matching
     *     known word, or the lower-cased input when the model has no words.
     */
    function getChecker (mdl) {
        var SpellChecker = function (model) {
            this.model = model;
        };

        SpellChecker.prototype.check = function (word) {
            return this._corrections(word.toLowerCase());
        };

        SpellChecker.prototype._corrections = function (w1) {
            // go over each word in the corpus and collect the levenshtein
            // score and frequency count for each word
            var candidates = this.model.words().map(function (w2) {
                return {
                    w: w2,
                    r: wuzzy.levenshtein(w1, w2),
                    f: this.model.freq(w2)
                };
            }, this);

            // nothing to suggest: hand the word back instead of failing on [0]
            if (candidates.length === 0) {
                return w1;
            }

            return candidates.sort(function (a, b) {
                // sort on the levenshtein score, highest first.
                // NOTE(review): this relies on wuzzy.levenshtein returning a
                // similarity (higher = closer match) - confirm against wuzzy.
                return (b.r - a.r);
            }).slice(0, 5).sort(function (a, b) {
                // sort the top 5 results on the score weighted by frequency
                // of appearance in the corpus; each candidate is weighted by
                // its own score
                return ((b.f * b.r) - (a.f * a.r));
            // return the top result
            })[0].w;
        };

        return new SpellChecker(mdl);
    }
    
    module.exports = function (correctsFile, trainCorpus, done) {
  • ¶

    read in the dictionary of english words

        readCorrects(correctsFile, function (err, corrects) {
            if (err) {
                done(err);
            } else {
  • ¶

    train the spell checker using the provided corpus of text

                train(corrects, trainCorpus, function (err, model) {
                    if (err) {
                        done(err);
                    } else {
                        done(null, getChecker(model));
                    }
                });
            }
        });
    };