/ime/latinime/src/com/googlecode/eyesfree/inputmethod/latin/Suggest.java

http://eyes-free.googlecode.com/ · Java · 534 lines · 396 code · 59 blank · 79 comment · 143 complexity · dc13d1e1f05497ea54faa8fc929e5c6c MD5 · raw file

  1. /*
  2. * Copyright (C) 2008 The Android Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5. * use this file except in compliance with the License. You may obtain a copy of
  6. * the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  12. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  13. * License for the specific language governing permissions and limitations under
  14. * the License.
  15. */
  16. package com.googlecode.eyesfree.inputmethod.latin;
  17. import android.content.Context;
  18. import android.text.AutoText;
  19. import android.text.TextUtils;
  20. import android.util.Log;
  21. import android.view.View;
  22. import java.nio.ByteBuffer;
  23. import java.util.ArrayList;
  24. import java.util.Arrays;
  25. import java.util.List;
  26. /**
  27. * This class loads a dictionary and provides a list of suggestions for a given sequence of
  28. * characters. This includes corrections and completions.
  29. * @hide pending API Council Approval
  30. */
  31. public class Suggest implements Dictionary.WordCallback {
  /** Approximate maximum word length; returned by {@link #getApproxMaxWordLength()}. */
  public static final int APPROX_MAX_WORD_LENGTH = 32;

  // Correction modes accepted by setCorrectionMode(int). getSuggestions() only sets the
  // "have correction" flag from dictionary results in the two FULL modes, and only runs the
  // first-character bigram search in CORRECTION_BASIC and CORRECTION_FULL_BIGRAM.
  public static final int CORRECTION_NONE = 0;
  public static final int CORRECTION_BASIC = 1;
  public static final int CORRECTION_FULL = 2;
  public static final int CORRECTION_FULL_BIGRAM = 3;

  /**
   * A word that appears in both the bigram and the unigram data gets its unigram frequency
   * scaled by a multiplier ranging from BIGRAM_MULTIPLIER_MIN to BIGRAM_MULTIPLIER_MAX,
   * depending on the frequency score from the bigram data.
   */
  public static final double BIGRAM_MULTIPLIER_MIN = 1.2;
  public static final double BIGRAM_MULTIPLIER_MAX = 1.5;

  /**
   * Maximum possible bigram frequency. Depends on how many bits are used in the data
   * structure. A bigram with the maximum frequency gets BIGRAM_MULTIPLIER_MAX as its
   * multiplier.
   */
  public static final int MAXIMUM_BIGRAM_FREQUENCY = 127;

  // Dictionary type identifiers. DIC_MAIN is handed to the BinaryDictionary constructor and
  // DIC_USER_TYPED is reported to LatinImeLogger for the word the user typed.
  public static final int DIC_USER_TYPED = 0;
  public static final int DIC_MAIN = 1;
  public static final int DIC_USER = 2;
  public static final int DIC_AUTO = 3;
  public static final int DIC_CONTACTS = 4;
  // If you add a type of dictionary, increment DIC_TYPE_LAST_ID
  public static final int DIC_TYPE_LAST_ID = 4;

  // hasMainDictionary() reports true only when the main dictionary's size exceeds this value.
  static final int LARGE_DICTIONARY_THRESHOLD = 200 * 1000;
  // Main dictionary; created by both constructors.
  private BinaryDictionary mMainDict;
  // Optional dictionaries; each stays null until the corresponding setter is called.
  private Dictionary mUserDictionary;
  private Dictionary mAutoDictionary;
  private Dictionary mContactsDictionary;
  private Dictionary mUserBigramDictionary;

  // Capacity of the unigram suggestion list; changed through setMaxSuggestions(int).
  private int mPrefMaxSuggestions = 12;

  // Fixed capacity of the bigram suggestion list.
  private static final int PREF_MAX_BIGRAMS = 60;

  private boolean mAutoTextEnabled;

  // Priority arrays kept parallel to mSuggestions / mBigramSuggestions by addWord().
  // NOTE: mPriorities must be declared after mPrefMaxSuggestions, whose value sizes it.
  private int[] mPriorities = new int[mPrefMaxSuggestions];
  private int[] mBigramPriorities = new int[PREF_MAX_BIGRAMS];

  // Handle predictive correction for only the first 1280 characters for performance reasons
  // If we support scripts that need latin characters beyond that, we should probably use some
  // kind of a sparse array or language specific list with a mapping lookup table.
  // 1280 is the size of the BASE_CHARS array in ExpandableDictionary, which is a basic set of
  // latin characters.
  private int[] mNextLettersFrequencies = new int[1280];

  // Result list returned (not copied) by getSuggestions().
  private ArrayList<CharSequence> mSuggestions = new ArrayList<CharSequence>();
  // Bigram candidates collected while the first character of a word is typed.
  ArrayList<CharSequence> mBigramSuggestions = new ArrayList<CharSequence>();
  // Recycled StringBuilder instances reused for suggestion text.
  private ArrayList<CharSequence> mStringPool = new ArrayList<CharSequence>();

  // Value reported by hasMinimalCorrection(); recomputed on every getSuggestions() call.
  private boolean mHaveCorrection;
  // The word as typed, and a lower-cased copy ("" when there is no typed word).
  private CharSequence mOriginalWord;
  private String mLowerOriginalWord;

  // TODO: Remove these member variables by passing more context to addWord() callback method
  private boolean mIsFirstCharCapitalized;
  private boolean mIsAllUpperCase;

  // One of the CORRECTION_* constants.
  private int mCorrectionMode = CORRECTION_BASIC;
  /**
   * Creates a suggestion engine whose main dictionary is built from resource ids.
   *
   * @param context context handed to the {@code BinaryDictionary} constructor
   * @param dictionaryResId resource ids handed to the {@code BinaryDictionary} constructor
   */
  public Suggest(Context context, int[] dictionaryResId) {
    this.mMainDict = new BinaryDictionary(context, dictionaryResId, DIC_MAIN);
    // Pre-fill the builder pool so the first lookups do not allocate.
    initPool();
  }
  /**
   * Creates a suggestion engine whose main dictionary is built from a byte buffer.
   *
   * @param context context handed to the {@code BinaryDictionary} constructor
   * @param byteBuffer buffer handed to the {@code BinaryDictionary} constructor
   */
  public Suggest(Context context, ByteBuffer byteBuffer) {
    this.mMainDict = new BinaryDictionary(context, byteBuffer, DIC_MAIN);
    // Pre-fill the builder pool so the first lookups do not allocate.
    initPool();
  }
  91. private void initPool() {
  92. for (int i = 0; i < mPrefMaxSuggestions; i++) {
  93. StringBuilder sb = new StringBuilder(getApproxMaxWordLength());
  94. mStringPool.add(sb);
  95. }
  96. }
  97. public void setAutoTextEnabled(boolean enabled) {
  98. mAutoTextEnabled = enabled;
  99. }
  100. public int getCorrectionMode() {
  101. return mCorrectionMode;
  102. }
  103. public void setCorrectionMode(int mode) {
  104. mCorrectionMode = mode;
  105. }
  106. public boolean hasMainDictionary() {
  107. return mMainDict.getSize() > LARGE_DICTIONARY_THRESHOLD;
  108. }
  109. public int getApproxMaxWordLength() {
  110. return APPROX_MAX_WORD_LENGTH;
  111. }
  112. /**
  113. * Sets an optional user dictionary resource to be loaded. The user dictionary is consulted
  114. * before the main dictionary, if set.
  115. */
  116. public void setUserDictionary(Dictionary userDictionary) {
  117. mUserDictionary = userDictionary;
  118. }
  119. /**
  120. * Sets an optional contacts dictionary resource to be loaded.
  121. */
  122. public void setContactsDictionary(Dictionary userDictionary) {
  123. mContactsDictionary = userDictionary;
  124. }
  125. public void setAutoDictionary(Dictionary autoDictionary) {
  126. mAutoDictionary = autoDictionary;
  127. }
  128. public void setUserBigramDictionary(Dictionary userBigramDictionary) {
  129. mUserBigramDictionary = userBigramDictionary;
  130. }
  131. /**
  132. * Number of suggestions to generate from the input key sequence. This has
  133. * to be a number between 1 and 100 (inclusive).
  134. * @param maxSuggestions
  135. * @throws IllegalArgumentException if the number is out of range
  136. */
  137. public void setMaxSuggestions(int maxSuggestions) {
  138. if (maxSuggestions < 1 || maxSuggestions > 100) {
  139. throw new IllegalArgumentException("maxSuggestions must be between 1 and 100");
  140. }
  141. mPrefMaxSuggestions = maxSuggestions;
  142. mPriorities = new int[mPrefMaxSuggestions];
  143. mBigramPriorities = new int[PREF_MAX_BIGRAMS];
  144. collectGarbage(mSuggestions, mPrefMaxSuggestions);
  145. while (mStringPool.size() < mPrefMaxSuggestions) {
  146. StringBuilder sb = new StringBuilder(getApproxMaxWordLength());
  147. mStringPool.add(sb);
  148. }
  149. }
  150. private boolean haveSufficientCommonality(String original, CharSequence suggestion) {
  151. final int originalLength = original.length();
  152. final int suggestionLength = suggestion.length();
  153. final int minLength = Math.min(originalLength, suggestionLength);
  154. if (minLength <= 2) return true;
  155. int matching = 0;
  156. int lessMatching = 0; // Count matches if we skip one character
  157. int i;
  158. for (i = 0; i < minLength; i++) {
  159. final char origChar = ExpandableDictionary.toLowerCase(original.charAt(i));
  160. if (origChar == ExpandableDictionary.toLowerCase(suggestion.charAt(i))) {
  161. matching++;
  162. lessMatching++;
  163. } else if (i + 1 < suggestionLength
  164. && origChar == ExpandableDictionary.toLowerCase(suggestion.charAt(i + 1))) {
  165. lessMatching++;
  166. }
  167. }
  168. matching = Math.max(matching, lessMatching);
  169. if (minLength <= 4) {
  170. return matching >= 2;
  171. } else {
  172. return matching > minLength / 2;
  173. }
  174. }
  175. /**
  176. * Returns a list of words that match the list of character codes passed in.
  177. * This list will be overwritten the next time this function is called.
  178. * @param view a view for retrieving the context for AutoText
  179. * @param wordComposer contains what is currently being typed
  180. * @param prevWordForBigram previous word (used only for bigram)
  181. * @return list of suggestions.
  182. */
  183. public List<CharSequence> getSuggestions(View view, WordComposer wordComposer,
  184. boolean includeTypedWordIfValid, CharSequence prevWordForBigram) {
  185. LatinImeLogger.onStartSuggestion(prevWordForBigram);
  186. mHaveCorrection = false;
  187. mIsFirstCharCapitalized = wordComposer.isFirstCharCapitalized();
  188. mIsAllUpperCase = wordComposer.isAllUpperCase();
  189. collectGarbage(mSuggestions, mPrefMaxSuggestions);
  190. Arrays.fill(mPriorities, 0);
  191. Arrays.fill(mNextLettersFrequencies, 0);
  192. // Save a lowercase version of the original word
  193. mOriginalWord = wordComposer.getTypedWord();
  194. if (mOriginalWord != null) {
  195. final String mOriginalWordString = mOriginalWord.toString();
  196. mOriginalWord = mOriginalWordString;
  197. mLowerOriginalWord = mOriginalWordString.toLowerCase();
  198. // Treating USER_TYPED as UNIGRAM suggestion for logging now.
  199. LatinImeLogger.onAddSuggestedWord(mOriginalWordString, Suggest.DIC_USER_TYPED,
  200. Dictionary.DataType.UNIGRAM);
  201. } else {
  202. mLowerOriginalWord = "";
  203. }
  204. if (wordComposer.size() == 1 && (mCorrectionMode == CORRECTION_FULL_BIGRAM
  205. || mCorrectionMode == CORRECTION_BASIC)) {
  206. // At first character typed, search only the bigrams
  207. Arrays.fill(mBigramPriorities, 0);
  208. collectGarbage(mBigramSuggestions, PREF_MAX_BIGRAMS);
  209. if (!TextUtils.isEmpty(prevWordForBigram)) {
  210. CharSequence lowerPrevWord = prevWordForBigram.toString().toLowerCase();
  211. if (mMainDict.isValidWord(lowerPrevWord)) {
  212. prevWordForBigram = lowerPrevWord;
  213. }
  214. if (mUserBigramDictionary != null) {
  215. mUserBigramDictionary.getBigrams(wordComposer, prevWordForBigram, this,
  216. mNextLettersFrequencies);
  217. }
  218. if (mContactsDictionary != null) {
  219. mContactsDictionary.getBigrams(wordComposer, prevWordForBigram, this,
  220. mNextLettersFrequencies);
  221. }
  222. if (mMainDict != null) {
  223. mMainDict.getBigrams(wordComposer, prevWordForBigram, this,
  224. mNextLettersFrequencies);
  225. }
  226. char currentChar = wordComposer.getTypedWord().charAt(0);
  227. char currentCharUpper = Character.toUpperCase(currentChar);
  228. int count = 0;
  229. int bigramSuggestionSize = mBigramSuggestions.size();
  230. for (int i = 0; i < bigramSuggestionSize; i++) {
  231. if (mBigramSuggestions.get(i).charAt(0) == currentChar
  232. || mBigramSuggestions.get(i).charAt(0) == currentCharUpper) {
  233. int poolSize = mStringPool.size();
  234. StringBuilder sb = poolSize > 0 ?
  235. (StringBuilder) mStringPool.remove(poolSize - 1)
  236. : new StringBuilder(getApproxMaxWordLength());
  237. sb.setLength(0);
  238. sb.append(mBigramSuggestions.get(i));
  239. mSuggestions.add(count++, sb);
  240. if (count > mPrefMaxSuggestions) break;
  241. }
  242. }
  243. }
  244. } else if (wordComposer.size() > 1) {
  245. // At second character typed, search the unigrams (scores being affected by bigrams)
  246. if (mUserDictionary != null || mContactsDictionary != null) {
  247. if (mUserDictionary != null) {
  248. mUserDictionary.getWords(wordComposer, this, mNextLettersFrequencies);
  249. }
  250. if (mContactsDictionary != null) {
  251. mContactsDictionary.getWords(wordComposer, this, mNextLettersFrequencies);
  252. }
  253. if (mSuggestions.size() > 0 && isValidWord(mOriginalWord)
  254. && (mCorrectionMode == CORRECTION_FULL
  255. || mCorrectionMode == CORRECTION_FULL_BIGRAM)) {
  256. mHaveCorrection = true;
  257. }
  258. }
  259. mMainDict.getWords(wordComposer, this, mNextLettersFrequencies);
  260. if ((mCorrectionMode == CORRECTION_FULL || mCorrectionMode == CORRECTION_FULL_BIGRAM)
  261. && mSuggestions.size() > 0) {
  262. mHaveCorrection = true;
  263. }
  264. }
  265. if (mOriginalWord != null) {
  266. mSuggestions.add(0, mOriginalWord.toString());
  267. }
  268. // Check if the first suggestion has a minimum number of characters in common
  269. if (wordComposer.size() > 1 && mSuggestions.size() > 1
  270. && (mCorrectionMode == CORRECTION_FULL
  271. || mCorrectionMode == CORRECTION_FULL_BIGRAM)) {
  272. if (!haveSufficientCommonality(mLowerOriginalWord, mSuggestions.get(1))) {
  273. mHaveCorrection = false;
  274. }
  275. }
  276. if (mAutoTextEnabled) {
  277. int i = 0;
  278. int max = 6;
  279. // Don't autotext the suggestions from the dictionaries
  280. if (mCorrectionMode == CORRECTION_BASIC) max = 1;
  281. while (i < mSuggestions.size() && i < max) {
  282. String suggestedWord = mSuggestions.get(i).toString().toLowerCase();
  283. CharSequence autoText =
  284. AutoText.get(suggestedWord, 0, suggestedWord.length(), view);
  285. // Is there an AutoText correction?
  286. boolean canAdd = autoText != null;
  287. // Is that correction already the current prediction (or original word)?
  288. canAdd &= !TextUtils.equals(autoText, mSuggestions.get(i));
  289. // Is that correction already the next predicted word?
  290. if (canAdd && i + 1 < mSuggestions.size() && mCorrectionMode != CORRECTION_BASIC) {
  291. canAdd &= !TextUtils.equals(autoText, mSuggestions.get(i + 1));
  292. }
  293. if (canAdd) {
  294. mHaveCorrection = true;
  295. mSuggestions.add(i + 1, autoText);
  296. i++;
  297. }
  298. i++;
  299. }
  300. }
  301. removeDupes();
  302. return mSuggestions;
  303. }
  304. public int[] getNextLettersFrequencies() {
  305. return mNextLettersFrequencies;
  306. }
  307. private void removeDupes() {
  308. final ArrayList<CharSequence> suggestions = mSuggestions;
  309. if (suggestions.size() < 2) return;
  310. int i = 1;
  311. // Don't cache suggestions.size(), since we may be removing items
  312. while (i < suggestions.size()) {
  313. final CharSequence cur = suggestions.get(i);
  314. // Compare each candidate with each previous candidate
  315. for (int j = 0; j < i; j++) {
  316. CharSequence previous = suggestions.get(j);
  317. if (TextUtils.equals(cur, previous)) {
  318. removeFromSuggestions(i);
  319. i--;
  320. break;
  321. }
  322. }
  323. i++;
  324. }
  325. }
  326. private void removeFromSuggestions(int index) {
  327. CharSequence garbage = mSuggestions.remove(index);
  328. if (garbage != null && garbage instanceof StringBuilder) {
  329. mStringPool.add(garbage);
  330. }
  331. }
  332. public boolean hasMinimalCorrection() {
  333. return mHaveCorrection;
  334. }
  335. private boolean compareCaseInsensitive(final String mLowerOriginalWord,
  336. final char[] word, final int offset, final int length) {
  337. final int originalLength = mLowerOriginalWord.length();
  338. if (originalLength == length && Character.isUpperCase(word[offset])) {
  339. for (int i = 0; i < originalLength; i++) {
  340. if (mLowerOriginalWord.charAt(i) != Character.toLowerCase(word[offset+i])) {
  341. return false;
  342. }
  343. }
  344. return true;
  345. }
  346. return false;
  347. }
  348. public boolean addWord(final char[] word, final int offset, final int length, int freq,
  349. final int dicTypeId, final Dictionary.DataType dataType) {
  350. Dictionary.DataType dataTypeForLog = dataType;
  351. ArrayList<CharSequence> suggestions;
  352. int[] priorities;
  353. int prefMaxSuggestions;
  354. if(dataType == Dictionary.DataType.BIGRAM) {
  355. suggestions = mBigramSuggestions;
  356. priorities = mBigramPriorities;
  357. prefMaxSuggestions = PREF_MAX_BIGRAMS;
  358. } else {
  359. suggestions = mSuggestions;
  360. priorities = mPriorities;
  361. prefMaxSuggestions = mPrefMaxSuggestions;
  362. }
  363. int pos = 0;
  364. // Check if it's the same word, only caps are different
  365. if (compareCaseInsensitive(mLowerOriginalWord, word, offset, length)) {
  366. pos = 0;
  367. } else {
  368. if (dataType == Dictionary.DataType.UNIGRAM) {
  369. // Check if the word was already added before (by bigram data)
  370. int bigramSuggestion = searchBigramSuggestion(word,offset,length);
  371. if(bigramSuggestion >= 0) {
  372. dataTypeForLog = Dictionary.DataType.BIGRAM;
  373. // turn freq from bigram into multiplier specified above
  374. double multiplier = (((double) mBigramPriorities[bigramSuggestion])
  375. / MAXIMUM_BIGRAM_FREQUENCY)
  376. * (BIGRAM_MULTIPLIER_MAX - BIGRAM_MULTIPLIER_MIN)
  377. + BIGRAM_MULTIPLIER_MIN;
  378. /* Log.d(TAG,"bigram num: " + bigramSuggestion
  379. + " wordB: " + mBigramSuggestions.get(bigramSuggestion).toString()
  380. + " currentPriority: " + freq + " bigramPriority: "
  381. + mBigramPriorities[bigramSuggestion]
  382. + " multiplier: " + multiplier); */
  383. freq = (int)Math.round((freq * multiplier));
  384. }
  385. }
  386. // Check the last one's priority and bail
  387. if (priorities[prefMaxSuggestions - 1] >= freq) return true;
  388. while (pos < prefMaxSuggestions) {
  389. if (priorities[pos] < freq
  390. || (priorities[pos] == freq && length < suggestions.get(pos).length())) {
  391. break;
  392. }
  393. pos++;
  394. }
  395. }
  396. if (pos >= prefMaxSuggestions) {
  397. return true;
  398. }
  399. System.arraycopy(priorities, pos, priorities, pos + 1,
  400. prefMaxSuggestions - pos - 1);
  401. priorities[pos] = freq;
  402. int poolSize = mStringPool.size();
  403. StringBuilder sb = poolSize > 0 ? (StringBuilder) mStringPool.remove(poolSize - 1)
  404. : new StringBuilder(getApproxMaxWordLength());
  405. sb.setLength(0);
  406. if (mIsAllUpperCase) {
  407. sb.append(new String(word, offset, length).toUpperCase());
  408. } else if (mIsFirstCharCapitalized) {
  409. sb.append(Character.toUpperCase(word[offset]));
  410. if (length > 1) {
  411. sb.append(word, offset + 1, length - 1);
  412. }
  413. } else {
  414. sb.append(word, offset, length);
  415. }
  416. suggestions.add(pos, sb);
  417. if (suggestions.size() > prefMaxSuggestions) {
  418. CharSequence garbage = suggestions.remove(prefMaxSuggestions);
  419. if (garbage instanceof StringBuilder) {
  420. mStringPool.add(garbage);
  421. }
  422. } else {
  423. LatinImeLogger.onAddSuggestedWord(sb.toString(), dicTypeId, dataTypeForLog);
  424. }
  425. return true;
  426. }
  427. private int searchBigramSuggestion(final char[] word, final int offset, final int length) {
  428. // TODO This is almost O(n^2). Might need fix.
  429. // search whether the word appeared in bigram data
  430. int bigramSuggestSize = mBigramSuggestions.size();
  431. for(int i = 0; i < bigramSuggestSize; i++) {
  432. if(mBigramSuggestions.get(i).length() == length) {
  433. boolean chk = true;
  434. for(int j = 0; j < length; j++) {
  435. if(mBigramSuggestions.get(i).charAt(j) != word[offset+j]) {
  436. chk = false;
  437. break;
  438. }
  439. }
  440. if(chk) return i;
  441. }
  442. }
  443. return -1;
  444. }
  445. public boolean isValidWord(final CharSequence word) {
  446. if (word == null || word.length() == 0) {
  447. return false;
  448. }
  449. return mMainDict.isValidWord(word)
  450. || (mUserDictionary != null && mUserDictionary.isValidWord(word))
  451. || (mAutoDictionary != null && mAutoDictionary.isValidWord(word))
  452. || (mContactsDictionary != null && mContactsDictionary.isValidWord(word));
  453. }
  454. private void collectGarbage(ArrayList<CharSequence> suggestions, int prefMaxSuggestions) {
  455. int poolSize = mStringPool.size();
  456. int garbageSize = suggestions.size();
  457. while (poolSize < prefMaxSuggestions && garbageSize > 0) {
  458. CharSequence garbage = suggestions.get(garbageSize - 1);
  459. if (garbage != null && garbage instanceof StringBuilder) {
  460. mStringPool.add(garbage);
  461. poolSize++;
  462. }
  463. garbageSize--;
  464. }
  465. if (poolSize == prefMaxSuggestions + 1) {
  466. Log.w("Suggest", "String pool got too big: " + poolSize);
  467. }
  468. suggestions.clear();
  469. }
  470. public void close() {
  471. if (mMainDict != null) {
  472. mMainDict.close();
  473. }
  474. }
  475. }