package com.boyter.SpellingCorrector;

import java.util.*;
import java.util.stream.Stream;

/**
 * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and
 * the like. Attempts to be highly performing by never changing the first character since we can assume that the
 * user got that correct.
 */
public class SpellingCorrector implements ISpellingCorrector {

    // word to count map - how may times a word is present - or a weight attached to a word
    private Map dictionary = null;

    public SpellingCorrector(int lruCount) {
        this.dictionary = Collections.synchronizedMap(new LruCache<>(lruCount));
    }

    @Override
    public void putWord(String word) {
        word = word.toLowerCase();
        if (dictionary.containsKey(word)) {
            dictionary.put(word, (dictionary.get(word) + 1));
        }
        else {
            dictionary.put(word, 1);
        }
    }

    @Override
    public String correct(String word) {
        if (word == null || word.trim().isEmpty()) {
            return word;
        }

        word = word.toLowerCase();

        // If the word exists in our dictionary then return
        if (dictionary.containsKey(word)) {
            return word;
        }

        Map possibleMatches = new HashMap<>();

        List closeEdits = wordEdits(word);
        for (String closeEdit: closeEdits) {
            if (dictionary.containsKey(closeEdit)) {
                possibleMatches.put(closeEdit, this.dictionary.get(closeEdit));
            }
        }

        if (!possibleMatches.isEmpty()) {
            // Sorted least likely first
            Object[] matches = this.sortByValue(possibleMatches).keySet().toArray();

            // Try to match anything of the same length first
            String bestMatch = "";
            for(Object o: matches) {
                if (o.toString().length() == word.length()) {
                    bestMatch = o.toString();
                }
            }

            if (!bestMatch.trim().isEmpty()) {
                return bestMatch;
            }

            // Just return whatever is the best match
            return matches[matches.length - 1].toString();
        }

        // Ok we did"t find anything, so lets run the edits function on the previous results and use those
        // this gives us results which are 2 characters away from whatever was entered
        List furtherEdits = new ArrayList<>();
        for(String closeEdit: closeEdits) {
            furtherEdits.addAll(this.wordEdits(closeEdit));
        }

        for (String futherEdit: furtherEdits) {
            if (dictionary.containsKey(futherEdit)) {
                possibleMatches.put(futherEdit, this.dictionary.get(futherEdit));
            }
        }

        if (!possibleMatches.isEmpty()) {
            // Sorted least likely first
            Object[] matches = this.sortByValue(possibleMatches).keySet().toArray();

            // Try to match anything of the same length first
            String bestMatch = "";
            for(Object o: matches) {
                if (o.toString().length() == word.length()) {
                    bestMatch = o.toString();
                }
            }

            if (!bestMatch.trim().isEmpty()) {
                return bestMatch;
            }

            // Just return whatever is the best match
            return matches[matches.length - 1].toString();
        }


        // If unable to find something better return the same string
        return word;
    }

    @Override
    public boolean containsWord(String word) {
        if (dictionary.containsKey(word)) {
            return true;
        }

        return false;
    }


    /**
     * Return a list of strings which are words similar to our one which could potentially be misspellings
     * Abuse the fact that a char can be used as an integer
     * Assume that they got the first letter correct for all edits to cut on CPU burn time
     */
    private List wordEdits(String word) {
        List closeWords = new ArrayList();

        for (int i = 1; i < word.length() + 1; i++) {
            for (char character = "a"; character <= "z"; character++) {
                // Maybe they forgot to type a letter? Try adding one
                StringBuilder sb = new StringBuilder(word);
                sb.insert(i, character);
                closeWords.add(sb.toString());
            }
        }

        for (int i = 1; i < word.length(); i++) {
            for (char character = "a"; character <= "z"; character++) {
                // Maybe they mistyped a single letter? Try replacing them all
                StringBuilder sb = new StringBuilder(word);
                sb.setCharAt(i, character);
                closeWords.add(sb.toString());

                // Maybe they added an extra letter? Try deleting one
                sb = new StringBuilder(word);
                sb.deleteCharAt(i);
                closeWords.add(sb.toString());
            }
        }

        return closeWords;
    }


    /**
     * Sorts a map by value taken from
     * http://stackoverflow.com/questions/109383/sort-a-mapkey-value-by-values-java
     */
    public static > Map sortByValue( Map map ) {
        Map result = new LinkedHashMap<>();
        Stream> st = map.entrySet().stream();

        st.sorted( Map.Entry.comparingByValue() ).forEachOrdered( e -> result.put(e.getKey(), e.getValue()) );

        return result;
    }

    /**
     * A very simple LRU cache implementation that can be used for random data types.
     */
    public class LruCache extends LinkedHashMap {
        private final int maxEntries;

        public LruCache(final int maxEntries) {
            super(maxEntries + 1, 1.0f, true);
            this.maxEntries = maxEntries;
        }

        @Override
        protected boolean removeEldestEntry(final Map.Entry eldest) {
            return super.size() > maxEntries;
        }
    }

}

Related articles

scc 6

package com.boyter.SpellingCorrector; import java.util.*; import java.util.stream.Stream; /** * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and * the like. Attempts to be highly performing by

scc 13

package com.boyter.SpellingCorrector; import java.util.*; import java.util.stream.Stream; /** * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and * the like. Attempts to be highly performing by

scc bosque

//------------------------------------------------------------------------------------------------------- // Copyright (C) Microsoft. All rights reserved. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license inform

scc flow9

import runtime; /* If we list all the natural numbers below 10 that are multiples of 3 or 5, we get 3, 5, 6 and 9. The sum of these multiples is 23. Find the sum of all the multiples of 3 or 5 below 1000. */ euler1(limit : int) -> int { foldRange

scc app 5cddf2000f4491a89a40

(window.webpackJsonp=window.webpackJsonp||[]).push([[2],[function(e,t,n){"use strict";e.exports=n(184)},function(e,t,n){n(21),n(5),n(4),n(6),n(60),n(29),n(3),n(59),n(13);var r,o,a,i=(r=0,o={util:{encode:function(e){return e instanceof a?new a(e.type,o.ut

scc qsharp

operation BellTest (count : Int, initial: Result) : (Int, Int, Int) { mutable numOnes = 0; mutable agree = 0; using ((q0, q1) = (Qubit(), Qubit())) { for (test in 1..count) { Set (initial, q0); Set (Zer

scc bootstrap grid.min

/*! * Bootstrap Grid v4.3.1 (https://getbootstrap.com/) * Copyright 2011-2019 The Bootstrap Authors * Copyright 2011-2019 Twitter, Inc. * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) */html{box-sizing:border-box;-ms-ove

scc racket

#lang racket/base (require racket/private/norm-arity) (provide normalize-arity normalized-arity? arity=? arity-includes?) (define (normalized-arity? a) (or (null? a) (arity? a) (and (list? a) ((length a) . >= . 2) (

scc matlab

function [net,stats] = cnn_train_dag(net, imdb, getBatch, varargin) %CNN_TRAIN_DAG Demonstrates training a CNN using the DagNN wrapper % CNN_TRAIN_DAG() is similar to CNN_TRAIN(), but works with % the DagNN wrapper instead of the SimpleNN wrapper.

scc 4

package com.boyter.SpellingCorrector; import java.util.*; import java.util.stream.Stream; /** * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and * the like. Attempts to be highly performing by