/ime/latinime/jni/src/dictionary.cpp
C++ | 596 lines | 502 code | 52 blank | 42 comment | 148 complexity | 762c1ad4ca09994bd643f251d3714fac MD5 | raw file
1/* 2** 3** Copyright 2009, The Android Open Source Project 4** 5** Licensed under the Apache License, Version 2.0 (the "License"); 6** you may not use this file except in compliance with the License. 7** You may obtain a copy of the License at 8** 9** http://www.apache.org/licenses/LICENSE-2.0 10** 11** Unless required by applicable law or agreed to in writing, software 12** distributed under the License is distributed on an "AS IS" BASIS, 13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14** See the License for the specific language governing permissions and 15** limitations under the License. 16*/ 17 18#include <stdio.h> 19#include <fcntl.h> 20#include <sys/mman.h> 21#include <string.h> 22//#define LOG_TAG "dictionary.cpp" 23//#include <cutils/log.h> 24#define LOGI 25 26#include "dictionary.h" 27#include "basechars.h" 28#include "char_utils.h" 29 30#define DEBUG_DICT 0 31#define DICTIONARY_VERSION_MIN 200 32#define DICTIONARY_HEADER_SIZE 2 33#define NOT_VALID_WORD -99 34 35namespace latinime { 36 37Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier) 38{ 39 mDict = (unsigned char*) dict; 40 mTypedLetterMultiplier = typedLetterMultiplier; 41 mFullWordMultiplier = fullWordMultiplier; 42 getVersionNumber(); 43} 44 45Dictionary::~Dictionary() 46{ 47} 48 49int Dictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies, 50 int maxWordLength, int maxWords, int maxAlternatives, int skipPos, 51 int *nextLetters, int nextLettersSize) 52{ 53 int suggWords; 54 mFrequencies = frequencies; 55 mOutputChars = outWords; 56 mInputCodes = codes; 57 mInputLength = codesSize; 58 mMaxAlternatives = maxAlternatives; 59 mMaxWordLength = maxWordLength; 60 mMaxWords = maxWords; 61 mSkipPos = skipPos; 62 mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; 63 mNextLettersFrequencies = nextLetters; 64 mNextLettersSize = nextLettersSize; 65 66 if (checkIfDictVersionIsLatest()) { 67 getWordsRec(DICTIONARY_HEADER_SIZE, 0, mInputLength * 3, false, 1, 0, 0); 68 } else { 69 getWordsRec(0, 0, mInputLength * 3, false, 1, 0, 0); 70 } 71 72 // Get the word count 73 suggWords = 0; 74 while (suggWords < mMaxWords && mFrequencies[suggWords] > 0) suggWords++; 75 if (DEBUG_DICT) LOGI("Returning %d words", suggWords); 76 77 if (DEBUG_DICT) { 78 LOGI("Next letters: "); 79 for (int k = 0; k < nextLettersSize; k++) { 80 if (mNextLettersFrequencies[k] > 0) { 81 LOGI("%c = %d,", k, mNextLettersFrequencies[k]); 82 } 83 } 84 LOGI("\n"); 85 } 86 return suggWords; 87} 88 89void 90Dictionary::registerNextLetter(unsigned short c) 91{ 92 if (c < mNextLettersSize) { 93 mNextLettersFrequencies[c]++; 94 } 95} 96 97void 98Dictionary::getVersionNumber() 99{ 100 mVersion = (mDict[0] & 0xFF); 101 mBigram = (mDict[1] & 0xFF); 102 LOGI("IN NATIVE SUGGEST Version: %d Bigram : %d \n", mVersion, mBigram); 103} 104 105// Checks whether it has the latest dictionary or the old dictionary 106bool 107Dictionary::checkIfDictVersionIsLatest() 108{ 109 return (mVersion >= DICTIONARY_VERSION_MIN) && (mBigram == 1 || mBigram == 0); 110} 111 112unsigned short 113Dictionary::getChar(int *pos) 114{ 115 unsigned short ch = (unsigned short) (mDict[(*pos)++] & 0xFF); 116 // If the code is 255, then actual 16 bit code follows (in big endian) 117 if (ch == 0xFF) { 118 ch = ((mDict[*pos] & 0xFF) << 8) | (mDict[*pos + 1] & 0xFF); 119 (*pos) += 2; 120 } 121 return ch; 122} 123 124int 125Dictionary::getAddress(int *pos) 126{ 127 int address = 0; 128 if ((mDict[*pos] & FLAG_ADDRESS_MASK) == 0) { 129 *pos += 1; 130 } else { 131 address += (mDict[*pos] & (ADDRESS_MASK >> 16)) << 16; 132 address += (mDict[*pos + 1] & 0xFF) << 8; 133 address += (mDict[*pos + 2] & 0xFF); 134 *pos += 3; 135 } 136 return address; 137} 138 139int 140Dictionary::getFreq(int *pos) 141{ 142 int freq = mDict[(*pos)++] & 0xFF; 143 144 if (checkIfDictVersionIsLatest()) { 145 // skipping bigram 146 int bigramExist = (mDict[*pos] & FLAG_BIGRAM_READ); 147 if (bigramExist > 0) { 148 int nextBigramExist = 1; 149 while (nextBigramExist > 0) { 150 (*pos) += 3; 151 nextBigramExist = (mDict[(*pos)++] & FLAG_BIGRAM_CONTINUED); 152 } 153 } else { 154 (*pos)++; 155 } 156 } 157 158 return freq; 159} 160 161int 162Dictionary::wideStrLen(unsigned short *str) 163{ 164 if (!str) return 0; 165 unsigned short *end = str; 166 while (*end) 167 end++; 168 return end - str; 169} 170 171bool 172Dictionary::addWord(unsigned short *word, int length, int frequency) 173{ 174 word[length] = 0; 175 if (DEBUG_DICT) { 176 char s[length + 1]; 177 for (int i = 0; i <= length; i++) s[i] = word[i]; 178 LOGI("Found word = %s, freq = %d : \n", s, frequency); 179 } 180 181 // Find the right insertion point 182 int insertAt = 0; 183 while (insertAt < mMaxWords) { 184 if (frequency > mFrequencies[insertAt] 185 || (mFrequencies[insertAt] == frequency 186 && length < wideStrLen(mOutputChars + insertAt * mMaxWordLength))) { 187 break; 188 } 189 insertAt++; 190 } 191 if (insertAt < mMaxWords) { 192 memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]), 193 (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]), 194 (mMaxWords - insertAt - 1) * sizeof(mFrequencies[0])); 195 mFrequencies[insertAt] = frequency; 196 memmove((char*) mOutputChars + (insertAt + 1) * mMaxWordLength * sizeof(short), 197 (char*) mOutputChars + (insertAt ) * mMaxWordLength * sizeof(short), 198 (mMaxWords - insertAt - 1) * sizeof(short) * mMaxWordLength); 199 unsigned short *dest = mOutputChars + (insertAt ) * mMaxWordLength; 200 while (length--) { 201 *dest++ = *word++; 202 } 203 *dest = 0; // NULL terminate 204 if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt); 205 return true; 206 } 207 return false; 208} 209 210bool 211Dictionary::addWordBigram(unsigned short *word, int length, int frequency) 212{ 213 word[length] = 0; 214 if (DEBUG_DICT) { 215 char s[length + 1]; 216 for (int i = 0; i <= length; i++) s[i] = word[i]; 217 LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency); 218 } 219 220 // Find the right insertion point 221 int insertAt = 0; 222 while (insertAt < mMaxBigrams) { 223 if (frequency > mBigramFreq[insertAt] 224 || (mBigramFreq[insertAt] == frequency 225 && length < wideStrLen(mBigramChars + insertAt * mMaxWordLength))) { 226 break; 227 } 228 insertAt++; 229 } 230 LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams); 231 if (insertAt < mMaxBigrams) { 232 memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]), 233 (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]), 234 (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0])); 235 mBigramFreq[insertAt] = frequency; 236 memmove((char*) mBigramChars + (insertAt + 1) * mMaxWordLength * sizeof(short), 237 (char*) mBigramChars + (insertAt ) * mMaxWordLength * sizeof(short), 238 (mMaxBigrams - insertAt - 1) * sizeof(short) * mMaxWordLength); 239 unsigned short *dest = mBigramChars + (insertAt ) * mMaxWordLength; 240 while (length--) { 241 *dest++ = *word++; 242 } 243 *dest = 0; // NULL terminate 244 if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt); 245 return true; 246 } 247 return false; 248} 249 250unsigned short 251Dictionary::toLowerCase(unsigned short c) { 252 if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { 253 c = BASE_CHARS[c]; 254 } 255 if (c >='A' && c <= 'Z') { 256 c |= 32; 257 } else if (c > 127) { 258 c = latin_tolower(c); 259 } 260 return c; 261} 262 263bool 264Dictionary::sameAsTyped(unsigned short *word, int length) 265{ 266 if (length != mInputLength) { 267 return false; 268 } 269 int *inputCodes = mInputCodes; 270 while (length--) { 271 if ((unsigned int) *inputCodes != (unsigned int) *word) { 272 return false; 273 } 274 inputCodes += mMaxAlternatives; 275 word++; 276 } 277 return true; 278} 279 280static char QUOTE = '\''; 281 282void 283Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex, 284 int diffs) 285{ 286 // Optimization: Prune out words that are too long compared to how much was typed. 287 if (depth > maxDepth) { 288 return; 289 } 290 if (diffs > mMaxEditDistance) { 291 return; 292 } 293 int count = getCount(&pos); 294 int *currentChars = NULL; 295 if (mInputLength <= inputIndex) { 296 completion = true; 297 } else { 298 currentChars = mInputCodes + (inputIndex * mMaxAlternatives); 299 } 300 301 for (int i = 0; i < count; i++) { 302 // -- at char 303 unsigned short c = getChar(&pos); 304 // -- at flag/add 305 unsigned short lowerC = toLowerCase(c); 306 bool terminal = getTerminal(&pos); 307 int childrenAddress = getAddress(&pos); 308 // -- after address or flag 309 int freq = 1; 310 if (terminal) freq = getFreq(&pos); 311 // -- after add or freq 312 313 // If we are only doing completions, no need to look at the typed characters. 314 if (completion) { 315 mWord[depth] = c; 316 if (terminal) { 317 addWord(mWord, depth + 1, freq * snr); 318 if (depth >= mInputLength && mSkipPos < 0) { 319 registerNextLetter(mWord[mInputLength]); 320 } 321 } 322 if (childrenAddress != 0) { 323 getWordsRec(childrenAddress, depth + 1, maxDepth, 324 completion, snr, inputIndex, diffs); 325 } 326 } else if ((c == QUOTE && currentChars[0] != QUOTE) || mSkipPos == depth) { 327 // Skip the ' or other letter and continue deeper 328 mWord[depth] = c; 329 if (childrenAddress != 0) { 330 getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs); 331 } 332 } else { 333 int j = 0; 334 while (currentChars[j] > 0) { 335 if (currentChars[j] == lowerC || currentChars[j] == c) { 336 int addedWeight = j == 0 ? mTypedLetterMultiplier : 1; 337 mWord[depth] = c; 338 if (mInputLength == inputIndex + 1) { 339 if (terminal) { 340 if (//INCLUDE_TYPED_WORD_IF_VALID || 341 !sameAsTyped(mWord, depth + 1)) { 342 int finalFreq = freq * snr * addedWeight; 343 if (mSkipPos < 0) finalFreq *= mFullWordMultiplier; 344 addWord(mWord, depth + 1, finalFreq); 345 } 346 } 347 if (childrenAddress != 0) { 348 getWordsRec(childrenAddress, depth + 1, 349 maxDepth, true, snr * addedWeight, inputIndex + 1, 350 diffs + (j > 0)); 351 } 352 } else if (childrenAddress != 0) { 353 getWordsRec(childrenAddress, depth + 1, maxDepth, 354 false, snr * addedWeight, inputIndex + 1, diffs + (j > 0)); 355 } 356 } 357 j++; 358 if (mSkipPos >= 0) break; 359 } 360 } 361 } 362} 363 364int 365Dictionary::getBigramAddress(int *pos, bool advance) 366{ 367 int address = 0; 368 369 address += (mDict[*pos] & 0x3F) << 16; 370 address += (mDict[*pos + 1] & 0xFF) << 8; 371 address += (mDict[*pos + 2] & 0xFF); 372 373 if (advance) { 374 *pos += 3; 375 } 376 377 return address; 378} 379 380int 381Dictionary::getBigramFreq(int *pos) 382{ 383 int freq = mDict[(*pos)++] & FLAG_BIGRAM_FREQ; 384 385 return freq; 386} 387 388 389int 390Dictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes, int codesSize, 391 unsigned short *bigramChars, int *bigramFreq, int maxWordLength, int maxBigrams, 392 int maxAlternatives) 393{ 394 mBigramFreq = bigramFreq; 395 mBigramChars = bigramChars; 396 mInputCodes = codes; 397 mInputLength = codesSize; 398 mMaxWordLength = maxWordLength; 399 mMaxBigrams = maxBigrams; 400 mMaxAlternatives = maxAlternatives; 401 402 if (mBigram == 1 && checkIfDictVersionIsLatest()) { 403 int pos = isValidWordRec(DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength); 404 LOGI("Pos -> %d\n", pos); 405 if (pos < 0) { 406 return 0; 407 } 408 409 int bigramCount = 0; 410 int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ); 411 if (bigramExist > 0) { 412 int nextBigramExist = 1; 413 while (nextBigramExist > 0 && bigramCount < maxBigrams) { 414 int bigramAddress = getBigramAddress(&pos, true); 415 int frequency = (FLAG_BIGRAM_FREQ & mDict[pos]); 416 // search for all bigrams and store them 417 searchForTerminalNode(bigramAddress, frequency); 418 nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED); 419 bigramCount++; 420 } 421 } 422 423 return bigramCount; 424 } 425 return 0; 426} 427 428void 429Dictionary::searchForTerminalNode(int addressLookingFor, int frequency) 430{ 431 // track word with such address and store it in an array 432 unsigned short word[mMaxWordLength]; 433 434 int pos; 435 int followDownBranchAddress = DICTIONARY_HEADER_SIZE; 436 bool found = false; 437 char followingChar = ' '; 438 int depth = -1; 439 440 while(!found) { 441 bool followDownAddressSearchStop = false; 442 bool firstAddress = true; 443 bool haveToSearchAll = true; 444 445 if (depth >= 0) { 446 word[depth] = (unsigned short) followingChar; 447 } 448 pos = followDownBranchAddress; // pos start at count 449 int count = mDict[pos] & 0xFF; 450 LOGI("count - %d\n",count); 451 pos++; 452 for (int i = 0; i < count; i++) { 453 // pos at data 454 pos++; 455 // pos now at flag 456 if (!getFirstBitOfByte(&pos)) { // non-terminal 457 if (!followDownAddressSearchStop) { 458 int addr = getBigramAddress(&pos, false); 459 if (addr > addressLookingFor) { 460 followDownAddressSearchStop = true; 461 if (firstAddress) { 462 firstAddress = false; 463 haveToSearchAll = true; 464 } else if (!haveToSearchAll) { 465 break; 466 } 467 } else { 468 followDownBranchAddress = addr; 469 followingChar = (char)(0xFF & mDict[pos-1]); 470 if (firstAddress) { 471 firstAddress = false; 472 haveToSearchAll = false; 473 } 474 } 475 } 476 pos += 3; 477 } else if (getFirstBitOfByte(&pos)) { // terminal 478 if (addressLookingFor == (pos-1)) { // found !! 479 depth++; 480 word[depth] = (0xFF & mDict[pos-1]); 481 found = true; 482 break; 483 } 484 if (getSecondBitOfByte(&pos)) { // address + freq (4 byte) 485 if (!followDownAddressSearchStop) { 486 int addr = getBigramAddress(&pos, false); 487 if (addr > addressLookingFor) { 488 followDownAddressSearchStop = true; 489 if (firstAddress) { 490 firstAddress = false; 491 haveToSearchAll = true; 492 } else if (!haveToSearchAll) { 493 break; 494 } 495 } else { 496 followDownBranchAddress = addr; 497 followingChar = (char)(0xFF & mDict[pos-1]); 498 if (firstAddress) { 499 firstAddress = false; 500 haveToSearchAll = true; 501 } 502 } 503 } 504 pos += 4; 505 } else { // freq only (2 byte) 506 pos += 2; 507 } 508 509 // skipping bigram 510 int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ); 511 if (bigramExist > 0) { 512 int nextBigramExist = 1; 513 while (nextBigramExist > 0) { 514 pos += 3; 515 nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED); 516 } 517 } else { 518 pos++; 519 } 520 } 521 } 522 depth++; 523 if (followDownBranchAddress == 0) { 524 LOGI("ERROR!!! Cannot find bigram!!"); 525 break; 526 } 527 } 528 if (checkFirstCharacter(word)) { 529 addWordBigram(word, depth, frequency); 530 } 531} 532 533bool 534Dictionary::checkFirstCharacter(unsigned short *word) 535{ 536 // Checks whether this word starts with same character or neighboring characters of 537 // what user typed. 538 539 int *inputCodes = mInputCodes; 540 int maxAlt = mMaxAlternatives; 541 while (maxAlt > 0) { 542 if ((unsigned int) *inputCodes == (unsigned int) *word) { 543 return true; 544 } 545 inputCodes++; 546 maxAlt--; 547 } 548 return false; 549} 550 551bool 552Dictionary::isValidWord(unsigned short *word, int length) 553{ 554 if (checkIfDictVersionIsLatest()) { 555 return (isValidWordRec(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD); 556 } else { 557 return (isValidWordRec(0, word, 0, length) != NOT_VALID_WORD); 558 } 559} 560 561int 562Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) { 563 // returns address of bigram data of that word 564 // return -99 if not found 565 566 int count = getCount(&pos); 567 unsigned short currentChar = (unsigned short) word[offset]; 568 for (int j = 0; j < count; j++) { 569 unsigned short c = getChar(&pos); 570 int terminal = getTerminal(&pos); 571 int childPos = getAddress(&pos); 572 if (c == currentChar) { 573 if (offset == length - 1) { 574 if (terminal) { 575 return (pos+1); 576 } 577 } else { 578 if (childPos != 0) { 579 int t = isValidWordRec(childPos, word, offset + 1, length); 580 if (t > 0) { 581 return t; 582 } 583 } 584 } 585 } 586 if (terminal) { 587 getFreq(&pos); 588 } 589 // There could be two instances of each alphabet - upper and lower case. So continue 590 // looking ... 591 } 592 return NOT_VALID_WORD; 593} 594 595 596} // namespace latinime