PageRenderTime 99ms CodeModel.GetById 26ms app.highlight 67ms RepoModel.GetById 1ms app.codeStats 0ms

/ime/latinime/jni/src/dictionary.cpp

http://eyes-free.googlecode.com/
C++ | 596 lines | 502 code | 52 blank | 42 comment | 148 complexity | 762c1ad4ca09994bd643f251d3714fac MD5 | raw file
  1/*
  2**
  3** Copyright 2009, The Android Open Source Project
  4**
  5** Licensed under the Apache License, Version 2.0 (the "License");
  6** you may not use this file except in compliance with the License.
  7** You may obtain a copy of the License at
  8**
  9**     http://www.apache.org/licenses/LICENSE-2.0
 10**
 11** Unless required by applicable law or agreed to in writing, software
 12** distributed under the License is distributed on an "AS IS" BASIS,
 13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14** See the License for the specific language governing permissions and
 15** limitations under the License.
 16*/
 17
 18#include <stdio.h>
 19#include <fcntl.h>
 20#include <sys/mman.h>
 21#include <string.h>
 22//#define LOG_TAG "dictionary.cpp"
 23//#include <cutils/log.h>
 24#define LOGI
 25
 26#include "dictionary.h"
 27#include "basechars.h"
 28#include "char_utils.h"
 29
 30#define DEBUG_DICT 0
 31#define DICTIONARY_VERSION_MIN 200
 32#define DICTIONARY_HEADER_SIZE 2
 33#define NOT_VALID_WORD -99
 34
 35namespace latinime {
 36
 37Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier)
 38{
 39    mDict = (unsigned char*) dict;
 40    mTypedLetterMultiplier = typedLetterMultiplier;
 41    mFullWordMultiplier = fullWordMultiplier;
 42    getVersionNumber();
 43}
 44
 45Dictionary::~Dictionary()
 46{
 47}
 48
 49int Dictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
 50        int maxWordLength, int maxWords, int maxAlternatives, int skipPos,
 51        int *nextLetters, int nextLettersSize)
 52{
 53    int suggWords;
 54    mFrequencies = frequencies;
 55    mOutputChars = outWords;
 56    mInputCodes = codes;
 57    mInputLength = codesSize;
 58    mMaxAlternatives = maxAlternatives;
 59    mMaxWordLength = maxWordLength;
 60    mMaxWords = maxWords;
 61    mSkipPos = skipPos;
 62    mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
 63    mNextLettersFrequencies = nextLetters;
 64    mNextLettersSize = nextLettersSize;
 65
 66    if (checkIfDictVersionIsLatest()) {
 67        getWordsRec(DICTIONARY_HEADER_SIZE, 0, mInputLength * 3, false, 1, 0, 0);
 68    } else {
 69        getWordsRec(0, 0, mInputLength * 3, false, 1, 0, 0);
 70    }
 71
 72    // Get the word count
 73    suggWords = 0;
 74    while (suggWords < mMaxWords && mFrequencies[suggWords] > 0) suggWords++;
 75    if (DEBUG_DICT) LOGI("Returning %d words", suggWords);
 76
 77    if (DEBUG_DICT) {
 78        LOGI("Next letters: ");
 79        for (int k = 0; k < nextLettersSize; k++) {
 80            if (mNextLettersFrequencies[k] > 0) {
 81                LOGI("%c = %d,", k, mNextLettersFrequencies[k]);
 82            }
 83        }
 84        LOGI("\n");
 85    }
 86    return suggWords;
 87}
 88
 89void
 90Dictionary::registerNextLetter(unsigned short c)
 91{
 92    if (c < mNextLettersSize) {
 93        mNextLettersFrequencies[c]++;
 94    }
 95}
 96
 97void
 98Dictionary::getVersionNumber()
 99{
100    mVersion = (mDict[0] & 0xFF);
101    mBigram = (mDict[1] & 0xFF);
102    LOGI("IN NATIVE SUGGEST Version: %d Bigram : %d \n", mVersion, mBigram);
103}
104
105// Checks whether it has the latest dictionary or the old dictionary
106bool
107Dictionary::checkIfDictVersionIsLatest()
108{
109    return (mVersion >= DICTIONARY_VERSION_MIN) && (mBigram == 1 || mBigram == 0);
110}
111
112unsigned short
113Dictionary::getChar(int *pos)
114{
115    unsigned short ch = (unsigned short) (mDict[(*pos)++] & 0xFF);
116    // If the code is 255, then actual 16 bit code follows (in big endian)
117    if (ch == 0xFF) {
118        ch = ((mDict[*pos] & 0xFF) << 8) | (mDict[*pos + 1] & 0xFF);
119        (*pos) += 2;
120    }
121    return ch;
122}
123
124int
125Dictionary::getAddress(int *pos)
126{
127    int address = 0;
128    if ((mDict[*pos] & FLAG_ADDRESS_MASK) == 0) {
129        *pos += 1;
130    } else {
131        address += (mDict[*pos] & (ADDRESS_MASK >> 16)) << 16;
132        address += (mDict[*pos + 1] & 0xFF) << 8;
133        address += (mDict[*pos + 2] & 0xFF);
134        *pos += 3;
135    }
136    return address;
137}
138
139int
140Dictionary::getFreq(int *pos)
141{
142    int freq = mDict[(*pos)++] & 0xFF;
143
144    if (checkIfDictVersionIsLatest()) {
145        // skipping bigram
146        int bigramExist = (mDict[*pos] & FLAG_BIGRAM_READ);
147        if (bigramExist > 0) {
148            int nextBigramExist = 1;
149            while (nextBigramExist > 0) {
150                (*pos) += 3;
151                nextBigramExist = (mDict[(*pos)++] & FLAG_BIGRAM_CONTINUED);
152            }
153        } else {
154            (*pos)++;
155        }
156    }
157
158    return freq;
159}
160
161int
162Dictionary::wideStrLen(unsigned short *str)
163{
164    if (!str) return 0;
165    unsigned short *end = str;
166    while (*end)
167        end++;
168    return end - str;
169}
170
171bool
172Dictionary::addWord(unsigned short *word, int length, int frequency)
173{
174    word[length] = 0;
175    if (DEBUG_DICT) {
176        char s[length + 1];
177        for (int i = 0; i <= length; i++) s[i] = word[i];
178        LOGI("Found word = %s, freq = %d : \n", s, frequency);
179    }
180
181    // Find the right insertion point
182    int insertAt = 0;
183    while (insertAt < mMaxWords) {
184        if (frequency > mFrequencies[insertAt]
185                 || (mFrequencies[insertAt] == frequency
186                     && length < wideStrLen(mOutputChars + insertAt * mMaxWordLength))) {
187            break;
188        }
189        insertAt++;
190    }
191    if (insertAt < mMaxWords) {
192        memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
193               (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
194               (mMaxWords - insertAt - 1) * sizeof(mFrequencies[0]));
195        mFrequencies[insertAt] = frequency;
196        memmove((char*) mOutputChars + (insertAt + 1) * mMaxWordLength * sizeof(short),
197               (char*) mOutputChars + (insertAt    ) * mMaxWordLength * sizeof(short),
198               (mMaxWords - insertAt - 1) * sizeof(short) * mMaxWordLength);
199        unsigned short *dest = mOutputChars + (insertAt    ) * mMaxWordLength;
200        while (length--) {
201            *dest++ = *word++;
202        }
203        *dest = 0; // NULL terminate
204        if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt);
205        return true;
206    }
207    return false;
208}
209
210bool
211Dictionary::addWordBigram(unsigned short *word, int length, int frequency)
212{
213    word[length] = 0;
214    if (DEBUG_DICT) {
215        char s[length + 1];
216        for (int i = 0; i <= length; i++) s[i] = word[i];
217        LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency);
218    }
219
220    // Find the right insertion point
221    int insertAt = 0;
222    while (insertAt < mMaxBigrams) {
223        if (frequency > mBigramFreq[insertAt]
224                 || (mBigramFreq[insertAt] == frequency
225                     && length < wideStrLen(mBigramChars + insertAt * mMaxWordLength))) {
226            break;
227        }
228        insertAt++;
229    }
230    LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
231    if (insertAt < mMaxBigrams) {
232        memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
233               (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
234               (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0]));
235        mBigramFreq[insertAt] = frequency;
236        memmove((char*) mBigramChars + (insertAt + 1) * mMaxWordLength * sizeof(short),
237               (char*) mBigramChars + (insertAt    ) * mMaxWordLength * sizeof(short),
238               (mMaxBigrams - insertAt - 1) * sizeof(short) * mMaxWordLength);
239        unsigned short *dest = mBigramChars + (insertAt    ) * mMaxWordLength;
240        while (length--) {
241            *dest++ = *word++;
242        }
243        *dest = 0; // NULL terminate
244        if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt);
245        return true;
246    }
247    return false;
248}
249
250unsigned short
251Dictionary::toLowerCase(unsigned short c) {
252    if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
253        c = BASE_CHARS[c];
254    }
255    if (c >='A' && c <= 'Z') {
256        c |= 32;
257    } else if (c > 127) {
258        c = latin_tolower(c);
259    }
260    return c;
261}
262
263bool
264Dictionary::sameAsTyped(unsigned short *word, int length)
265{
266    if (length != mInputLength) {
267        return false;
268    }
269    int *inputCodes = mInputCodes;
270    while (length--) {
271        if ((unsigned int) *inputCodes != (unsigned int) *word) {
272            return false;
273        }
274        inputCodes += mMaxAlternatives;
275        word++;
276    }
277    return true;
278}
279
// Apostrophe character; getWordsRec() skips it in dictionary words when the
// user did not type one. Declared const because it is never modified.
static const char QUOTE = '\'';
281
282void
283Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr, int inputIndex,
284                        int diffs)
285{
286    // Optimization: Prune out words that are too long compared to how much was typed.
287    if (depth > maxDepth) {
288        return;
289    }
290    if (diffs > mMaxEditDistance) {
291        return;
292    }
293    int count = getCount(&pos);
294    int *currentChars = NULL;
295    if (mInputLength <= inputIndex) {
296        completion = true;
297    } else {
298        currentChars = mInputCodes + (inputIndex * mMaxAlternatives);
299    }
300
301    for (int i = 0; i < count; i++) {
302        // -- at char
303        unsigned short c = getChar(&pos);
304        // -- at flag/add
305        unsigned short lowerC = toLowerCase(c);
306        bool terminal = getTerminal(&pos);
307        int childrenAddress = getAddress(&pos);
308        // -- after address or flag
309        int freq = 1;
310        if (terminal) freq = getFreq(&pos);
311        // -- after add or freq
312
313        // If we are only doing completions, no need to look at the typed characters.
314        if (completion) {
315            mWord[depth] = c;
316            if (terminal) {
317                addWord(mWord, depth + 1, freq * snr);
318                if (depth >= mInputLength && mSkipPos < 0) {
319                    registerNextLetter(mWord[mInputLength]);
320                }
321            }
322            if (childrenAddress != 0) {
323                getWordsRec(childrenAddress, depth + 1, maxDepth,
324                            completion, snr, inputIndex, diffs);
325            }
326        } else if ((c == QUOTE && currentChars[0] != QUOTE) || mSkipPos == depth) {
327            // Skip the ' or other letter and continue deeper
328            mWord[depth] = c;
329            if (childrenAddress != 0) {
330                getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs);
331            }
332        } else {
333            int j = 0;
334            while (currentChars[j] > 0) {
335                if (currentChars[j] == lowerC || currentChars[j] == c) {
336                    int addedWeight = j == 0 ? mTypedLetterMultiplier : 1;
337                    mWord[depth] = c;
338                    if (mInputLength == inputIndex + 1) {
339                        if (terminal) {
340                            if (//INCLUDE_TYPED_WORD_IF_VALID ||
341                                !sameAsTyped(mWord, depth + 1)) {
342                                int finalFreq = freq * snr * addedWeight;
343                                if (mSkipPos < 0) finalFreq *= mFullWordMultiplier;
344                                addWord(mWord, depth + 1, finalFreq);
345                            }
346                        }
347                        if (childrenAddress != 0) {
348                            getWordsRec(childrenAddress, depth + 1,
349                                    maxDepth, true, snr * addedWeight, inputIndex + 1,
350                                    diffs + (j > 0));
351                        }
352                    } else if (childrenAddress != 0) {
353                        getWordsRec(childrenAddress, depth + 1, maxDepth,
354                                false, snr * addedWeight, inputIndex + 1, diffs + (j > 0));
355                    }
356                }
357                j++;
358                if (mSkipPos >= 0) break;
359            }
360        }
361    }
362}
363
364int
365Dictionary::getBigramAddress(int *pos, bool advance)
366{
367    int address = 0;
368
369    address += (mDict[*pos] & 0x3F) << 16;
370    address += (mDict[*pos + 1] & 0xFF) << 8;
371    address += (mDict[*pos + 2] & 0xFF);
372
373    if (advance) {
374        *pos += 3;
375    }
376
377    return address;
378}
379
380int
381Dictionary::getBigramFreq(int *pos)
382{
383    int freq = mDict[(*pos)++] & FLAG_BIGRAM_FREQ;
384
385    return freq;
386}
387
388
389int
390Dictionary::getBigrams(unsigned short *prevWord, int prevWordLength, int *codes, int codesSize,
391        unsigned short *bigramChars, int *bigramFreq, int maxWordLength, int maxBigrams,
392        int maxAlternatives)
393{
394    mBigramFreq = bigramFreq;
395    mBigramChars = bigramChars;
396    mInputCodes = codes;
397    mInputLength = codesSize;
398    mMaxWordLength = maxWordLength;
399    mMaxBigrams = maxBigrams;
400    mMaxAlternatives = maxAlternatives;
401
402    if (mBigram == 1 && checkIfDictVersionIsLatest()) {
403        int pos = isValidWordRec(DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
404        LOGI("Pos -> %d\n", pos);
405        if (pos < 0) {
406            return 0;
407        }
408
409        int bigramCount = 0;
410        int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
411        if (bigramExist > 0) {
412            int nextBigramExist = 1;
413            while (nextBigramExist > 0 && bigramCount < maxBigrams) {
414                int bigramAddress = getBigramAddress(&pos, true);
415                int frequency = (FLAG_BIGRAM_FREQ & mDict[pos]);
416                // search for all bigrams and store them
417                searchForTerminalNode(bigramAddress, frequency);
418                nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
419                bigramCount++;
420            }
421        }
422
423        return bigramCount;
424    }
425    return 0;
426}
427
428void
429Dictionary::searchForTerminalNode(int addressLookingFor, int frequency)
430{
431    // track word with such address and store it in an array
432    unsigned short word[mMaxWordLength];
433
434    int pos;
435    int followDownBranchAddress = DICTIONARY_HEADER_SIZE;
436    bool found = false;
437    char followingChar = ' ';
438    int depth = -1;
439
440    while(!found) {
441        bool followDownAddressSearchStop = false;
442        bool firstAddress = true;
443        bool haveToSearchAll = true;
444
445        if (depth >= 0) {
446            word[depth] = (unsigned short) followingChar;
447        }
448        pos = followDownBranchAddress; // pos start at count
449        int count = mDict[pos] & 0xFF;
450        LOGI("count - %d\n",count);
451        pos++;
452        for (int i = 0; i < count; i++) {
453            // pos at data
454            pos++;
455            // pos now at flag
456            if (!getFirstBitOfByte(&pos)) { // non-terminal
457                if (!followDownAddressSearchStop) {
458                    int addr = getBigramAddress(&pos, false);
459                    if (addr > addressLookingFor) {
460                        followDownAddressSearchStop = true;
461                        if (firstAddress) {
462                            firstAddress = false;
463                            haveToSearchAll = true;
464                        } else if (!haveToSearchAll) {
465                            break;
466                        }
467                    } else {
468                        followDownBranchAddress = addr;
469                        followingChar = (char)(0xFF & mDict[pos-1]);
470                        if (firstAddress) {
471                            firstAddress = false;
472                            haveToSearchAll = false;
473                        }
474                    }
475                }
476                pos += 3;
477            } else if (getFirstBitOfByte(&pos)) { // terminal
478                if (addressLookingFor == (pos-1)) { // found !!
479                    depth++;
480                    word[depth] = (0xFF & mDict[pos-1]);
481                    found = true;
482                    break;
483                }
484                if (getSecondBitOfByte(&pos)) { // address + freq (4 byte)
485                    if (!followDownAddressSearchStop) {
486                        int addr = getBigramAddress(&pos, false);
487                        if (addr > addressLookingFor) {
488                            followDownAddressSearchStop = true;
489                            if (firstAddress) {
490                                firstAddress = false;
491                                haveToSearchAll = true;
492                            } else if (!haveToSearchAll) {
493                                break;
494                            }
495                        } else {
496                            followDownBranchAddress = addr;
497                            followingChar = (char)(0xFF & mDict[pos-1]);
498                            if (firstAddress) {
499                                firstAddress = false;
500                                haveToSearchAll = true;
501                            }
502                        }
503                    }
504                    pos += 4;
505                } else { // freq only (2 byte)
506                    pos += 2;
507                }
508
509                // skipping bigram
510                int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ);
511                if (bigramExist > 0) {
512                    int nextBigramExist = 1;
513                    while (nextBigramExist > 0) {
514                        pos += 3;
515                        nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED);
516                    }
517                } else {
518                    pos++;
519                }
520            }
521        }
522        depth++;
523        if (followDownBranchAddress == 0) {
524            LOGI("ERROR!!! Cannot find bigram!!");
525            break;
526        }
527    }
528    if (checkFirstCharacter(word)) {
529        addWordBigram(word, depth, frequency);
530    }
531}
532
533bool
534Dictionary::checkFirstCharacter(unsigned short *word)
535{
536    // Checks whether this word starts with same character or neighboring characters of
537    // what user typed.
538
539    int *inputCodes = mInputCodes;
540    int maxAlt = mMaxAlternatives;
541    while (maxAlt > 0) {
542        if ((unsigned int) *inputCodes == (unsigned int) *word) {
543            return true;
544        }
545        inputCodes++;
546        maxAlt--;
547    }
548    return false;
549}
550
551bool
552Dictionary::isValidWord(unsigned short *word, int length)
553{
554    if (checkIfDictVersionIsLatest()) {
555        return (isValidWordRec(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
556    } else {
557        return (isValidWordRec(0, word, 0, length) != NOT_VALID_WORD);
558    }
559}
560
561int
562Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) {
563    // returns address of bigram data of that word
564    // return -99 if not found
565
566    int count = getCount(&pos);
567    unsigned short currentChar = (unsigned short) word[offset];
568    for (int j = 0; j < count; j++) {
569        unsigned short c = getChar(&pos);
570        int terminal = getTerminal(&pos);
571        int childPos = getAddress(&pos);
572        if (c == currentChar) {
573            if (offset == length - 1) {
574                if (terminal) {
575                    return (pos+1);
576                }
577            } else {
578                if (childPos != 0) {
579                    int t = isValidWordRec(childPos, word, offset + 1, length);
580                    if (t > 0) {
581                        return t;
582                    }
583                }
584            }
585        }
586        if (terminal) {
587            getFreq(&pos);
588        }
589        // There could be two instances of each alphabet - upper and lower case. So continue
590        // looking ...
591    }
592    return NOT_VALID_WORD;
593}
594
595
596} // namespace latinime