/jgeocoder/src/main/java/net/sourceforge/jgeocoder/us/SpellingCorrector.java

https://github.com/alexsenxu/JGeocoder-Sen · Java · 82 lines · 67 code · 5 blank · 10 comment · 16 complexity · a0987b6a0c51ecbefc2930c202c2cdf0 MD5 · raw file

  1. package net.sourceforge.jgeocoder.us;
  2. import java.util.HashMap;
  3. import java.util.HashSet;
  4. import java.util.Map;
  5. import java.util.Set;
  6. import java.util.regex.Pattern;
  7. import org.apache.commons.lang.StringUtils;
  8. /**
  9. * Javadocs me
  10. * @author jliang
  11. *
  12. */
  13. public class SpellingCorrector{
  14. private static final Map<Integer, Set<String>> STATE_TOKENS = new HashMap<Integer, Set<String>>();
  15. private static final Pattern DIGIT = Pattern.compile("^\\d+$");
  16. static{
  17. for(String s : Data.getSTATE_CODE_MAP().keySet()){
  18. int size = s.split("\\s+").length;
  19. Set<String> set = STATE_TOKENS.get(size);
  20. if(set == null){
  21. STATE_TOKENS.put(size, new HashSet<String>());
  22. }
  23. STATE_TOKENS.get(size).add(s);
  24. }
  25. }
  26. /**
  27. * Attempts to correct possible state mis-spellings
  28. * @param rawAddress
  29. * @return rawAddress or spelling corrected address if a state mis-spelling is found
  30. */
  31. public static String correctStateSpelling(String rawAddress){
  32. String[] originalTokens = rawAddress.split("\\s+");
  33. String[] tokens = rawAddress.toUpperCase().split("\\s+");
  34. int end = tokens.length -1;
  35. for(int i = end; i>0; i--){
  36. if(DIGIT.matcher(tokens[i]).matches()){
  37. end --;
  38. }else{
  39. break; //end is the index of the last non-all-digits token
  40. }
  41. }
  42. if(tokens[end].length()<=2){ //short word
  43. return rawAddress; //this almost never works so just skip it
  44. }
  45. for(int i = 1; i<=4; i++){
  46. if(end >= i-1){
  47. for(String s : STATE_TOKENS.get(i)){
  48. StringBuilder sb = new StringBuilder();
  49. int newEnd = end - i+1;
  50. for(int j = 0; j<i; j++){
  51. sb.append(tokens[newEnd+j]).append(" ");
  52. }
  53. float metrics = getNormalizedSimilarity(s, sb.toString().trim());
  54. if(metrics == 1f){
  55. return rawAddress;
  56. }else if(metrics >= 0.75f){ //assume mis-spelling
  57. if(i != 1){
  58. for(int j=0; j<i-1; j++){
  59. originalTokens[newEnd+j] = "";
  60. }
  61. }
  62. originalTokens[end] = s;
  63. return StringUtils.join(originalTokens, " ");
  64. }
  65. }
  66. }
  67. }
  68. return rawAddress;
  69. }
  70. private static float getNormalizedSimilarity(String s, String t){
  71. return 1f - StringUtils.getLevenshteinDistance(s, t)/(float)Math.max(s.length(), t.length());
  72. }
  73. public static void main(String[] args){
  74. String fulladdress="320 vairo blvd, state college, pennsylvaniaa 16803";//changed to PENNSYLVANIA
  75. System.out.println(correctStateSpelling(fulladdress));
  76. }
  77. }