affixmgr.cpp - This C++ code is part of a spell-checking li…

/extensions/spellcheck/hunspell/src/affixmgr.cpp

http://github.com/zpao/v8monkey · C++ · 4575 lines · 3604 code · 489 blank · 482 comment · 1567 complexity · 240bb4bb862d7846dbcab5744698c2eb MD5 · raw file
Large files are truncated click here to view the full file

/******* BEGIN LICENSE BLOCK *******
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 * 
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 * 
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 * 
 * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
 * and L�szl� N�meth (Hunspell). Portions created by the Initial Developers
 * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
 * 
 * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
 *                 David Einstein (deinst@world.std.com)
 *                 L�szl� N�meth (nemethl@gyorsposta.hu)
 *                 Caolan McNamara (caolanm@redhat.com)
 *                 Davide Prina
 *                 Giuseppe Modugno
 *                 Gianluca Turconi
 *                 Simon Brouwer
 *                 Noll Janos
 *                 Biro Arpad
 *                 Goldman Eleonora
 *                 Sarlos Tamas
 *                 Bencsath Boldizsar
 *                 Halacsy Peter
 *                 Dvornik Laszlo
 *                 Gefferth Andras
 *                 Nagy Viktor
 *                 Varga Daniel
 *                 Chris Halls
 *                 Rene Engelhard
 *                 Bram Moolenaar
 *                 Dafydd Jones
 *                 Harri Pitkanen
 *                 Andras Timar
 *                 Tor Lillqvist
 * 
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 ******* END LICENSE BLOCK *******/

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>

#include <vector>

#include "affixmgr.hxx"
#include "affentry.hxx"
#include "langnum.hxx"

#include "csutil.hxx"

AffixMgr::AffixMgr(const char * affpath, HashMgr** ptr, int * md, const char * key) 
{
  // register hash manager and load affix data from aff file
  pHMgr = ptr[0];
  alldic = ptr;
  maxdic = md;
  keystring = NULL;
  trystring = NULL;
  encoding=NULL;
  csconv=NULL;
  utf8 = 0;
  complexprefixes = 0;
  maptable = NULL;
  nummap = 0;
  breaktable = NULL;
  numbreak = -1;
  reptable = NULL;
  numrep = 0;
  iconvtable = NULL;
  oconvtable = NULL;
  checkcpdtable = NULL;
  // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
  simplifiedcpd = 0;
  numcheckcpd = 0;
  defcpdtable = NULL;
  numdefcpd = 0;
  phone = NULL;
  compoundflag = FLAG_NULL; // permits word in compound forms
  compoundbegin = FLAG_NULL; // may be first word in compound forms
  compoundmiddle = FLAG_NULL; // may be middle word in compound forms
  compoundend = FLAG_NULL; // may be last word in compound forms
  compoundroot = FLAG_NULL; // compound word signing flag
  compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
  compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word
  checkcompounddup = 0; // forbid double words in compounds
  checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)
  checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds
  checkcompoundtriple = 0; // forbid compounds with triple letters
  simplifiedtriple = 0; // allow simplified triple letters in compounds (Schiff+fahrt -> Schiffahrt)
  forbiddenword = FORBIDDENWORD; // forbidden word signing flag
  nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
  nongramsuggest = FLAG_NULL;
  lang = NULL; // language
  langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
  needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes
  cpdwordmax = -1; // default: unlimited wordcount in compound words
  cpdmin = -1;  // undefined
  cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
  cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
  cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
  cpdvowels_utf16_len=0; // vowels
  pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG
  sfxappnd=NULL; // previous suffix for counting a special syllables BUG
  cpdsyllablenum=NULL; // syllable count incrementing flag
  checknum=0; // checking numbers, and word with numbers
  wordchars=NULL; // letters + spec. word characters
  wordchars_utf16=NULL; // letters + spec. word characters
  wordchars_utf16_len=0; // letters + spec. word characters
  ignorechars=NULL; // letters + spec. word characters
  ignorechars_utf16=NULL; // letters + spec. word characters
  ignorechars_utf16_len=0; // letters + spec. word characters
  version=NULL; // affix and dictionary file version string
  havecontclass=0; // flags of possible continuing classes (double affix)
  // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
  // in morhological description in dictionary file. It's often combined with PSEUDOROOT.
  lemma_present = FLAG_NULL; 
  circumfix = FLAG_NULL; 
  onlyincompound = FLAG_NULL; 
  maxngramsugs = -1; // undefined
  maxdiff = -1; // undefined
  onlymaxdiff = 0;
  maxcpdsugs = -1; // undefined
  nosplitsugs = 0;
  sugswithdots = 0;
  keepcase = 0;
  forceucase = 0;
  warn = 0;
  forbidwarn = 0;
  checksharps = 0;
  substandard = FLAG_NULL;
  fullstrip = 0;

  sfx = NULL;
  pfx = NULL;

  for (int i=0; i < SETSIZE; i++) {
     pStart[i] = NULL;
     sStart[i] = NULL;
     pFlag[i] = NULL;
     sFlag[i] = NULL;
  }

  for (int j=0; j < CONTSIZE; j++) {
    contclasses[j] = 0;
  }

  if (parse_file(affpath, key)) {
     HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
  }
  
  if (cpdmin == -1) cpdmin = MINCPDLEN;

}


AffixMgr::~AffixMgr() 
{
  // pass through linked prefix entries and clean up
  for (int i=0; i < SETSIZE ;i++) {
       pFlag[i] = NULL;
       PfxEntry * ptr = pStart[i];
       PfxEntry * nptr = NULL;
       while (ptr) {
            nptr = ptr->getNext();
            delete(ptr);
            ptr = nptr;
            nptr = NULL;
       }  
  }

  // pass through linked suffix entries and clean up
  for (int j=0; j < SETSIZE ; j++) {
       sFlag[j] = NULL;
       SfxEntry * ptr = sStart[j];
       SfxEntry * nptr = NULL;
       while (ptr) {
            nptr = ptr->getNext();
            delete(ptr);
            ptr = nptr;
            nptr = NULL;
       }
       sStart[j] = NULL;
  }

  if (keystring) free(keystring);
  keystring=NULL;
  if (trystring) free(trystring);
  trystring=NULL;
  if (encoding) free(encoding);
  encoding=NULL;
  if (maptable) {  
     for (int j=0; j < nummap; j++) {
        for (int k=0; k < maptable[j].len; k++) {
           if (maptable[j].set[k]) free(maptable[j].set[k]);
        }
        free(maptable[j].set);
        maptable[j].set = NULL;
        maptable[j].len = 0;
     }
     free(maptable);  
     maptable = NULL;
  }
  nummap = 0;
  if (breaktable) {
     for (int j=0; j < numbreak; j++) {
        if (breaktable[j]) free(breaktable[j]);
        breaktable[j] = NULL;
     }
     free(breaktable);  
     breaktable = NULL;
  }
  numbreak = 0;
  if (reptable) {
     for (int j=0; j < numrep; j++) {
        free(reptable[j].pattern);
        free(reptable[j].pattern2);
     }
     free(reptable);  
     reptable = NULL;
  }
  if (iconvtable) delete iconvtable;
  if (oconvtable) delete oconvtable;
  if (phone && phone->rules) {
     for (int j=0; j < phone->num + 1; j++) {
        free(phone->rules[j * 2]);
        free(phone->rules[j * 2 + 1]);
     }
     free(phone->rules);
     free(phone);  
     phone = NULL;
  }

  if (defcpdtable) {  
     for (int j=0; j < numdefcpd; j++) {
        free(defcpdtable[j].def);
        defcpdtable[j].def = NULL;
     }
     free(defcpdtable);  
     defcpdtable = NULL;
  }
  numrep = 0;
  if (checkcpdtable) {  
     for (int j=0; j < numcheckcpd; j++) {
        free(checkcpdtable[j].pattern);
        free(checkcpdtable[j].pattern2);
        free(checkcpdtable[j].pattern3);
        checkcpdtable[j].pattern = NULL;
        checkcpdtable[j].pattern2 = NULL;
        checkcpdtable[j].pattern3 = NULL;
     }
     free(checkcpdtable);  
     checkcpdtable = NULL;
  }
  numcheckcpd = 0;
  FREE_FLAG(compoundflag);
  FREE_FLAG(compoundbegin);
  FREE_FLAG(compoundmiddle);
  FREE_FLAG(compoundend);
  FREE_FLAG(compoundpermitflag);
  FREE_FLAG(compoundforbidflag);
  FREE_FLAG(compoundroot);
  FREE_FLAG(forbiddenword);
  FREE_FLAG(nosuggest);
  FREE_FLAG(nongramsuggest);
  FREE_FLAG(needaffix);
  FREE_FLAG(lemma_present);
  FREE_FLAG(circumfix);
  FREE_FLAG(onlyincompound);
  
  cpdwordmax = 0;
  pHMgr = NULL;
  cpdmin = 0;
  cpdmaxsyllable = 0;
  if (cpdvowels) free(cpdvowels);
  if (cpdvowels_utf16) free(cpdvowels_utf16);
  if (cpdsyllablenum) free(cpdsyllablenum);
  free_utf_tbl();
  if (lang) free(lang);
  if (wordchars) free(wordchars);
  if (wordchars_utf16) free(wordchars_utf16);
  if (ignorechars) free(ignorechars);
  if (ignorechars_utf16) free(ignorechars_utf16);
  if (version) free(version);
  checknum=0;
#ifdef MOZILLA_CLIENT
  delete [] csconv;
#endif
}


// read in aff file and build up prefix and suffix entry objects 
int  AffixMgr::parse_file(const char * affpath, const char * key)
{
  char * line; // io buffers
  char ft;     // affix type
  
  // checking flag duplication
  char dupflags[CONTSIZE];
  char dupflags_ini = 1;

  // first line indicator for removing byte order mark
  int firstline = 1;
  
  // open the affix file
  FileMgr * afflst = new FileMgr(affpath, key);
  if (!afflst) {
    HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
    return 1;
  }

  // step one is to parse the affix file building up the internal
  // affix data structures

    // read in each line ignoring any that do not
    // start with a known line type indicator
    while ((line = afflst->getline())) {
       mychomp(line);

       /* remove byte order mark */
       if (firstline) {
         firstline = 0;
         // Affix file begins with byte order mark: possible incompatibility with old Hunspell versions
         if (strncmp(line,"\xEF\xBB\xBF",3) == 0) {
            memmove(line, line+3, strlen(line+3)+1);
         }
       }

       /* parse in the keyboard string */
       if (strncmp(line,"KEY",3) == 0) {
          if (parse_string(line, &keystring, afflst->getlinenum())) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the try string */
       if (strncmp(line,"TRY",3) == 0) {
          if (parse_string(line, &trystring, afflst->getlinenum())) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the name of the character set used by the .dict and .aff */
       if (strncmp(line,"SET",3) == 0) {
          if (parse_string(line, &encoding, afflst->getlinenum())) {
             delete afflst;
             return 1;
          }
          if (strcmp(encoding, "UTF-8") == 0) {
             utf8 = 1;
#ifndef OPENOFFICEORG
#ifndef MOZILLA_CLIENT
             if (initialize_utf_tbl()) return 1;
#endif
#endif
          }
       }

       /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
       if (strncmp(line,"COMPLEXPREFIXES",15) == 0)
                   complexprefixes = 1;

       /* parse in the flag used by the controlled compound words */
       if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
          if (parse_flag(line, &compoundflag, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the flag used by compound words */
       if (strncmp(line,"COMPOUNDBEGIN",13) == 0) {
          if (complexprefixes) {
            if (parse_flag(line, &compoundend, afflst)) {
              delete afflst;
              return 1;
            }
          } else {
            if (parse_flag(line, &compoundbegin, afflst)) {
              delete afflst;
              return 1;
            }
          }
       }

       /* parse in the flag used by compound words */
       if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) {
          if (parse_flag(line, &compoundmiddle, afflst)) {
             delete afflst;
             return 1;
          }
       }
       /* parse in the flag used by compound words */
       if (strncmp(line,"COMPOUNDEND",11) == 0) {
          if (complexprefixes) {
            if (parse_flag(line, &compoundbegin, afflst)) {
              delete afflst;
              return 1;
            }
          } else {
            if (parse_flag(line, &compoundend, afflst)) {
              delete afflst;
              return 1;
            }
          }
       }

       /* parse in the data used by compound_check() method */
       if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) {
          if (parse_num(line, &cpdwordmax, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the flag sign compounds in dictionary */
       if (strncmp(line,"COMPOUNDROOT",12) == 0) {
          if (parse_flag(line, &compoundroot, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the flag used by compound_check() method */
       if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) {
          if (parse_flag(line, &compoundpermitflag, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the flag used by compound_check() method */
       if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) {
          if (parse_flag(line, &compoundforbidflag, afflst)) {
             delete afflst;
             return 1;
          }
       }

       if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) {
                   checkcompounddup = 1;
       }

       if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) {
                   checkcompoundrep = 1;
       }

       if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) {
                   checkcompoundtriple = 1;
       }

       if (strncmp(line,"SIMPLIFIEDTRIPLE",16) == 0) {
                   simplifiedtriple = 1;
       }

       if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) {
                   checkcompoundcase = 1;
       }

       if (strncmp(line,"NOSUGGEST",9) == 0) {
          if (parse_flag(line, &nosuggest, afflst)) {
             delete afflst;
             return 1;
          }
       }

       if (strncmp(line,"NONGRAMSUGGEST",14) == 0) {
          if (parse_flag(line, &nongramsuggest, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the flag used by forbidden words */
       if (strncmp(line,"FORBIDDENWORD",13) == 0) {
          if (parse_flag(line, &forbiddenword, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the flag used by forbidden words */
       if (strncmp(line,"LEMMA_PRESENT",13) == 0) {
          if (parse_flag(line, &lemma_present, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the flag used by circumfixes */
       if (strncmp(line,"CIRCUMFIX",9) == 0) {
          if (parse_flag(line, &circumfix, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the flag used by fogemorphemes */
       if (strncmp(line,"ONLYINCOMPOUND",14) == 0) {
          if (parse_flag(line, &onlyincompound, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the flag used by `needaffixs' */
       if (strncmp(line,"PSEUDOROOT",10) == 0) {
          if (parse_flag(line, &needaffix, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the flag used by `needaffixs' */
       if (strncmp(line,"NEEDAFFIX",9) == 0) {
          if (parse_flag(line, &needaffix, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the minimal length for words in compounds */
       if (strncmp(line,"COMPOUNDMIN",11) == 0) {
          if (parse_num(line, &cpdmin, afflst)) {
             delete afflst;
             return 1;
          }
          if (cpdmin < 1) cpdmin = 1;
       }

       /* parse in the max. words and syllables in compounds */
       if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) {
          if (parse_cpdsyllable(line, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the flag used by compound_check() method */
       if (strncmp(line,"SYLLABLENUM",11) == 0) {
          if (parse_string(line, &cpdsyllablenum, afflst->getlinenum())) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the flag used by the controlled compound words */
       if (strncmp(line,"CHECKNUM",8) == 0) {
           checknum=1;
       }

       /* parse in the extra word characters */
       if (strncmp(line,"WORDCHARS",9) == 0) {
          if (parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, utf8, afflst->getlinenum())) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the ignored characters (for example, Arabic optional diacretics charachters */
       if (strncmp(line,"IGNORE",6) == 0) {
          if (parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, utf8, afflst->getlinenum())) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the typical fault correcting table */
       if (strncmp(line,"REP",3) == 0) {
          if (parse_reptable(line, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the input conversion table */
       if (strncmp(line,"ICONV",5) == 0) {
          if (parse_convtable(line, afflst, &iconvtable, "ICONV")) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the input conversion table */
       if (strncmp(line,"OCONV",5) == 0) {
          if (parse_convtable(line, afflst, &oconvtable, "OCONV")) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the phonetic translation table */
       if (strncmp(line,"PHONE",5) == 0) {
          if (parse_phonetable(line, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the checkcompoundpattern table */
       if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) {
          if (parse_checkcpdtable(line, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the defcompound table */
       if (strncmp(line,"COMPOUNDRULE",12) == 0) {
          if (parse_defcpdtable(line, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the related character map table */
       if (strncmp(line,"MAP",3) == 0) {
          if (parse_maptable(line, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the word breakpoints table */
       if (strncmp(line,"BREAK",5) == 0) {
          if (parse_breaktable(line, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the language for language specific codes */
       if (strncmp(line,"LANG",4) == 0) {
          if (parse_string(line, &lang, afflst->getlinenum())) {
             delete afflst;
             return 1;
          }
          langnum = get_lang_num(lang);
       }

       if (strncmp(line,"VERSION",7) == 0) {
          for(line = line + 7; *line == ' ' || *line == '\t'; line++);
          version = mystrdup(line);
       }

       if (strncmp(line,"MAXNGRAMSUGS",12) == 0) {
          if (parse_num(line, &maxngramsugs, afflst)) {
             delete afflst;
             return 1;
          }
       }

       if (strncmp(line,"ONLYMAXDIFF", 11) == 0)
                   onlymaxdiff = 1;

       if (strncmp(line,"MAXDIFF",7) == 0) {
          if (parse_num(line, &maxdiff, afflst)) {
             delete afflst;
             return 1;
          }
       }

       if (strncmp(line,"MAXCPDSUGS",10) == 0) {
          if (parse_num(line, &maxcpdsugs, afflst)) {
             delete afflst;
             return 1;
          }
       }

       if (strncmp(line,"NOSPLITSUGS",11) == 0) {
                   nosplitsugs=1;
       }

       if (strncmp(line,"FULLSTRIP",9) == 0) {
                   fullstrip=1;
       }

       if (strncmp(line,"SUGSWITHDOTS",12) == 0) {
                   sugswithdots=1;
       }

       /* parse in the flag used by forbidden words */
       if (strncmp(line,"KEEPCASE",8) == 0) {
          if (parse_flag(line, &keepcase, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the flag used by `forceucase' */
       if (strncmp(line,"FORCEUCASE",10) == 0) {
          if (parse_flag(line, &forceucase, afflst)) {
             delete afflst;
             return 1;
          }
       }

       /* parse in the flag used by `warn' */
       if (strncmp(line,"WARN",4) == 0) {
          if (parse_flag(line, &warn, afflst)) {
             delete afflst;
             return 1;
          }
       }

       if (strncmp(line,"FORBIDWARN",10) == 0) {
                   forbidwarn=1;
       }

       /* parse in the flag used by the affix generator */
       if (strncmp(line,"SUBSTANDARD",11) == 0) {
          if (parse_flag(line, &substandard, afflst)) {
             delete afflst;
             return 1;
          }
       }

       if (strncmp(line,"CHECKSHARPS",11) == 0) {
                   checksharps=1;
       }

       /* parse this affix: P - prefix, S - suffix */
       ft = ' ';
       if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
       if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
       if (ft != ' ') {
          if (dupflags_ini) {
            memset(dupflags, 0, sizeof(dupflags));
            dupflags_ini = 0;
          }
          if (parse_affix(line, ft, afflst, dupflags)) {
             delete afflst;
             process_pfx_tree_to_list();
             process_sfx_tree_to_list();
             return 1;
          }
       }

    }
    delete afflst;

    // convert affix trees to sorted list
    process_pfx_tree_to_list();
    process_sfx_tree_to_list();

    // now we can speed up performance greatly taking advantage of the 
    // relationship between the affixes and the idea of "subsets".

    // View each prefix as a potential leading subset of another and view
    // each suffix (reversed) as a potential trailing subset of another.

    // To illustrate this relationship if we know the prefix "ab" is found in the
    // word to examine, only prefixes that "ab" is a leading subset of need be examined.
    // Furthermore is "ab" is not present then none of the prefixes that "ab" is
    // is a subset need be examined.
    // The same argument goes for suffix string that are reversed.

    // Then to top this off why not examine the first char of the word to quickly
    // limit the set of prefixes to examine (i.e. the prefixes to examine must 
    // be leading supersets of the first character of the word (if they exist)
 
    // To take advantage of this "subset" relationship, we need to add two links
    // from entry.  One to take next if the current prefix is found (call it nexteq)
    // and one to take next if the current prefix is not found (call it nextne).

    // Since we have built ordered lists, all that remains is to properly initialize 
    // the nextne and nexteq pointers that relate them

    process_pfx_order();
    process_sfx_order();

    /* get encoding for CHECKCOMPOUNDCASE */
    if (!utf8) {
    char * enc = get_encoding();
    csconv = get_current_cs(enc);
    free(enc);
    enc = NULL;

    char expw[MAXLNLEN];
    if (wordchars) {
        strcpy(expw, wordchars);
        free(wordchars);
    } else *expw = '\0';

    for (int i = 0; i <= 255; i++) {
        if ( (csconv[i].cupper != csconv[i].clower) &&
            (! strchr(expw, (char) i))) {
                *(expw + strlen(expw) + 1) = '\0';
                *(expw + strlen(expw)) = (char) i;
        }
    }

    wordchars = mystrdup(expw);
    }

    // default BREAK definition
    if (numbreak == -1) {
        breaktable = (char **) malloc(sizeof(char *) * 3);
        if (!breaktable) return 1;
        breaktable[0] = mystrdup("-");
        breaktable[1] = mystrdup("^-");
        breaktable[2] = mystrdup("-$");
        if (breaktable[0] && breaktable[1] && breaktable[2]) numbreak = 3;
    }
    return 0;
}


// we want to be able to quickly access prefix information
// both by prefix flag, and sorted by prefix string itself 
// so we need to set up two indexes

int AffixMgr::build_pfxtree(PfxEntry* pfxptr)
{
  PfxEntry * ptr;
  PfxEntry * pptr;
  PfxEntry * ep = pfxptr;

  // get the right starting points
  const char * key = ep->getKey();
  const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);

  // first index by flag which must exist
  ptr = pFlag[flg];
  ep->setFlgNxt(ptr);
  pFlag[flg] = ep;


  // handle the special case of null affix string
  if (strlen(key) == 0) {
    // always inset them at head of list at element 0
     ptr = pStart[0];
     ep->setNext(ptr);
     pStart[0] = ep;
     return 0;
  }

  // now handle the normal case
  ep->setNextEQ(NULL);
  ep->setNextNE(NULL);

  unsigned char sp = *((const unsigned char *)key);
  ptr = pStart[sp];
  
  // handle the first insert 
  if (!ptr) {
     pStart[sp] = ep;
     return 0;
  }


  // otherwise use binary tree insertion so that a sorted
  // list can easily be generated later
  pptr = NULL;
  for (;;) {
    pptr = ptr;
    if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
       ptr = ptr->getNextEQ();
       if (!ptr) {
          pptr->setNextEQ(ep);
          break;
       }
    } else {
       ptr = ptr->getNextNE();
       if (!ptr) {
          pptr->setNextNE(ep);
          break;
       }
    }
  }
  return 0;
}

// we want to be able to quickly access suffix information
// both by suffix flag, and sorted by the reverse of the
// suffix string itself; so we need to set up two indexes
int AffixMgr::build_sfxtree(SfxEntry* sfxptr)
{
  SfxEntry * ptr;
  SfxEntry * pptr;
  SfxEntry * ep = sfxptr;

  /* get the right starting point */
  const char * key = ep->getKey();
  const unsigned char flg = (unsigned char) (ep->getFlag() & 0x00FF);

  // first index by flag which must exist
  ptr = sFlag[flg];
  ep->setFlgNxt(ptr);
  sFlag[flg] = ep;

  // next index by affix string

  // handle the special case of null affix string
  if (strlen(key) == 0) {
    // always inset them at head of list at element 0
     ptr = sStart[0];
     ep->setNext(ptr);
     sStart[0] = ep;
     return 0;
  }

  // now handle the normal case
  ep->setNextEQ(NULL);
  ep->setNextNE(NULL);

  unsigned char sp = *((const unsigned char *)key);
  ptr = sStart[sp];
  
  // handle the first insert 
  if (!ptr) {
     sStart[sp] = ep;
     return 0;
  }

  // otherwise use binary tree insertion so that a sorted
  // list can easily be generated later
  pptr = NULL;
  for (;;) {
    pptr = ptr;
    if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
       ptr = ptr->getNextEQ();
       if (!ptr) {
          pptr->setNextEQ(ep);
          break;
       }
    } else {
       ptr = ptr->getNextNE();
       if (!ptr) {
          pptr->setNextNE(ep);
          break;
       }
    }
  }
  return 0;
}

// convert from binary tree to sorted list
int AffixMgr::process_pfx_tree_to_list()
{
  for (int i=1; i< SETSIZE; i++) {
    pStart[i] = process_pfx_in_order(pStart[i],NULL);
  }
  return 0;
}


PfxEntry* AffixMgr::process_pfx_in_order(PfxEntry* ptr, PfxEntry* nptr)
{
  if (ptr) {
    nptr = process_pfx_in_order(ptr->getNextNE(), nptr);
    ptr->setNext(nptr);
    nptr = process_pfx_in_order(ptr->getNextEQ(), ptr);
  }
  return nptr;
}


// convert from binary tree to sorted list
int AffixMgr:: process_sfx_tree_to_list()
{
  for (int i=1; i< SETSIZE; i++) {
    sStart[i] = process_sfx_in_order(sStart[i],NULL);
  }
  return 0;
}

SfxEntry* AffixMgr::process_sfx_in_order(SfxEntry* ptr, SfxEntry* nptr)
{
  if (ptr) {
    nptr = process_sfx_in_order(ptr->getNextNE(), nptr);
    ptr->setNext(nptr);
    nptr = process_sfx_in_order(ptr->getNextEQ(), ptr);
  }
  return nptr;
}


// reinitialize the PfxEntry links NextEQ and NextNE to speed searching
// using the idea of leading subsets this time
int AffixMgr::process_pfx_order()
{
    PfxEntry* ptr;

    // loop through each prefix list starting point
    for (int i=1; i < SETSIZE; i++) {

         ptr = pStart[i];

         // look through the remainder of the list
         //  and find next entry with affix that 
         // the current one is not a subset of
         // mark that as destination for NextNE
         // use next in list that you are a subset
         // of as NextEQ

         for (; ptr != NULL; ptr = ptr->getNext()) {

             PfxEntry * nptr = ptr->getNext();
             for (; nptr != NULL; nptr = nptr->getNext()) {
                 if (! isSubset( ptr->getKey() , nptr->getKey() )) break;
             }
             ptr->setNextNE(nptr);
             ptr->setNextEQ(NULL);
             if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey())) 
                 ptr->setNextEQ(ptr->getNext());
         }

         // now clean up by adding smart search termination strings:
         // if you are already a superset of the previous prefix
         // but not a subset of the next, search can end here
         // so set NextNE properly

         ptr = pStart[i];
         for (; ptr != NULL; ptr = ptr->getNext()) {
             PfxEntry * nptr = ptr->getNext();
             PfxEntry * mptr = NULL;
             for (; nptr != NULL; nptr = nptr->getNext()) {
                 if (! isSubset(ptr->getKey(),nptr->getKey())) break;
                 mptr = nptr;
             }
             if (mptr) mptr->setNextNE(NULL);
         }
    }
    return 0;
}

// initialize the SfxEntry links NextEQ and NextNE to speed searching
// using the idea of leading subsets this time
int AffixMgr::process_sfx_order()
{
    SfxEntry* ptr;

    // loop through each prefix list starting point
    for (int i=1; i < SETSIZE; i++) {

         ptr = sStart[i];

         // look through the remainder of the list
         //  and find next entry with affix that 
         // the current one is not a subset of
         // mark that as destination for NextNE
         // use next in list that you are a subset
         // of as NextEQ

         for (; ptr != NULL; ptr = ptr->getNext()) {
             SfxEntry * nptr = ptr->getNext();
             for (; nptr != NULL; nptr = nptr->getNext()) {
                 if (! isSubset(ptr->getKey(),nptr->getKey())) break;
             }
             ptr->setNextNE(nptr);
             ptr->setNextEQ(NULL);
             if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey())) 
                 ptr->setNextEQ(ptr->getNext());
         }


         // now clean up by adding smart search termination strings:
         // if you are already a superset of the previous suffix
         // but not a subset of the next, search can end here
         // so set NextNE properly

         ptr = sStart[i];
         for (; ptr != NULL; ptr = ptr->getNext()) {
             SfxEntry * nptr = ptr->getNext();
             SfxEntry * mptr = NULL;
             for (; nptr != NULL; nptr = nptr->getNext()) {
                 if (! isSubset(ptr->getKey(),nptr->getKey())) break;
                 mptr = nptr;
             }
             if (mptr) mptr->setNextNE(NULL);
         }
    }
    return 0;
}

// add flags to the result for dictionary debugging
void AffixMgr::debugflag(char * result, unsigned short flag) {
    char * st = encode_flag(flag);
    mystrcat(result, " ", MAXLNLEN);
    mystrcat(result, MORPH_FLAG, MAXLNLEN);
    if (st) {
        mystrcat(result, st, MAXLNLEN);
        free(st);
    }
}

// calculate the character length of the condition
int AffixMgr::condlen(char * st)
{
  int l = 0;
  bool group = false;
  for(; *st; st++) {
    if (*st == '[') {
        group = true;
        l++;
    } else if (*st == ']') group = false;
    else if (!group && (!utf8 ||
        (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++;
  }
  return l;
}

int AffixMgr::encodeit(affentry &entry, char * cs)
{
  if (strcmp(cs,".") != 0) {
    entry.numconds = (char) condlen(cs);
    strncpy(entry.c.conds, cs, MAXCONDLEN);
    // long condition (end of conds padded by strncpy)
    if (entry.c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) {
      entry.opts += aeLONGCOND;
      entry.c.l.conds2 = mystrdup(cs + MAXCONDLEN_1);
      if (!entry.c.l.conds2) return 1;
    }
  } else {
    entry.numconds = 0;
    entry.c.conds[0] = '\0';
  }
  return 0;
}

// return 1 if s1 is a leading subset of s2 (dots are for infixes)
inline int AffixMgr::isSubset(const char * s1, const char * s2)
 {
    while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) {
        s1++;
        s2++;
    }
    return (*s1 == '\0');
 }


// check word for prefixes
struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,
    const FLAG needflag)
{
    struct hentry * rv= NULL;

    pfx = NULL;
    pfxappnd = NULL;
    sfxappnd = NULL;
    
    // first handle the special case of 0 length prefixes
    PfxEntry * pe = pStart[0];
    while (pe) {
        if (
            // fogemorpheme
              ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&
                  (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
            // permit prefixes in compounds
              ((in_compound != IN_CPD_END) || (pe->getCont() &&
                  (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))
              ) {
                    // check prefix
                    rv = pe->checkword(word, len, in_compound, needflag);
                    if (rv) {
                        pfx=pe; // BUG: pfx not stateless
                        return rv;
                    }
             }
       pe = pe->getNext();
    }
  
    // now handle the general case
    unsigned char sp = *((const unsigned char *)word);
    PfxEntry * pptr = pStart[sp];

    while (pptr) {
        if (isSubset(pptr->getKey(),word)) {
             if (
            // fogemorpheme
              ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&
                  (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
            // permit prefixes in compounds
              ((in_compound != IN_CPD_END) || (pptr->getCont() &&
                  (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen()))))
              ) {
            // check prefix
                  rv = pptr->checkword(word, len, in_compound, needflag);
                  if (rv) {
                    pfx=pptr; // BUG: pfx not stateless
                    return rv;
                  }
             }
             pptr = pptr->getNextEQ();
        } else {
             pptr = pptr->getNextNE();
        }
    }
    
    return NULL;
}

// check word for prefixes
struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,
    char in_compound, const FLAG needflag)
{
    struct hentry * rv= NULL;

    pfx = NULL;
    sfxappnd = NULL;
    
    // first handle the special case of 0 length prefixes
    PfxEntry * pe = pStart[0];
    
    while (pe) {
        rv = pe->check_twosfx(word, len, in_compound, needflag);
        if (rv) return rv;
        pe = pe->getNext();
    }
  
    // now handle the general case
    unsigned char sp = *((const unsigned char *)word);
    PfxEntry * pptr = pStart[sp];

    while (pptr) {
        if (isSubset(pptr->getKey(),word)) {
            rv = pptr->check_twosfx(word, len, in_compound, needflag);
            if (rv) {
                pfx = pptr;
                return rv;
            }
            pptr = pptr->getNextEQ();
        } else {
             pptr = pptr->getNextNE();
        }
    }
    
    return NULL;
}

// check word for prefixes
char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,
    const FLAG needflag)
{
    char * st;

    char result[MAXLNLEN];
    result[0] = '\0';

    pfx = NULL;
    sfxappnd = NULL;
    
    // first handle the special case of 0 length prefixes
    PfxEntry * pe = pStart[0];
    while (pe) {
       st = pe->check_morph(word,len,in_compound, needflag);
       if (st) {
            mystrcat(result, st, MAXLNLEN);
            free(st);
       }
       // if (rv) return rv;
       pe = pe->getNext();
    }
  
    // now handle the general case
    unsigned char sp = *((const unsigned char *)word);
    PfxEntry * pptr = pStart[sp];

    while (pptr) {
        if (isSubset(pptr->getKey(),word)) {
            st = pptr->check_morph(word,len,in_compound, needflag);
            if (st) {
              // fogemorpheme
              if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() && 
                        (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {
                    mystrcat(result, st, MAXLNLEN);
                    pfx = pptr;
                }
                free(st);
            }
            pptr = pptr->getNextEQ();
        } else {
            pptr = pptr->getNextNE();
        }
    }
    
    if (*result) return mystrdup(result);
    return NULL;
}


// check word for prefixes
char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,
    char in_compound, const FLAG needflag)
{
    char * st;

    char result[MAXLNLEN];
    result[0] = '\0';

    pfx = NULL;
    sfxappnd = NULL;
    
    // first handle the special case of 0 length prefixes
    PfxEntry * pe = pStart[0];
    while (pe) {
        st = pe->check_twosfx_morph(word,len,in_compound, needflag);
        if (st) {
            mystrcat(result, st, MAXLNLEN);
            free(st);
        }
        pe = pe->getNext();
    }
  
    // now handle the general case
    unsigned char sp = *((const unsigned char *)word);
    PfxEntry * pptr = pStart[sp];

    while (pptr) {
        if (isSubset(pptr->getKey(),word)) {
            st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
            if (st) {
                mystrcat(result, st, MAXLNLEN);
                free(st);
                pfx = pptr;
            }
            pptr = pptr->getNextEQ();
        } else {
            pptr = pptr->getNextNE();
        }
    }
    
    if (*result) return mystrdup(result);
    return NULL;
}

// Is word a non compound with a REP substitution (see checkcompoundrep)?
int AffixMgr::cpdrep_check(const char * word, int wl)
{
  char candidate[MAXLNLEN];
  const char * r;
  int lenr, lenp;

  if ((wl < 2) || !numrep) return 0;

  for (int i=0; i < numrep; i++ ) {
      r = word;
      lenr = strlen(reptable[i].pattern2);
      lenp = strlen(reptable[i].pattern);
      // search every occurence of the pattern in the word
      while ((r=strstr(r, reptable[i].pattern)) != NULL) {
          strcpy(candidate, word);
          if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
          strcpy(candidate+(r-word),reptable[i].pattern2);
          strcpy(candidate+(r-word)+lenr, r+lenp);
          if (candidate_check(candidate,strlen(candidate))) return 1;
          r++; // search for the next letter
      }
   }
   return 0;
}

// forbid compoundings when there are special patterns at word bound
int AffixMgr::cpdpat_check(const char * word, int pos, hentry * r1, hentry * r2, const char affixed)
{
  int len;
  for (int i = 0; i < numcheckcpd; i++) {
      if (isSubset(checkcpdtable[i].pattern2, word + pos) &&
        (!r1 || !checkcpdtable[i].cond ||
          (r1->astr && TESTAFF(r1->astr, checkcpdtable[i].cond, r1->alen))) &&
        (!r2 || !checkcpdtable[i].cond2 ||
          (r2->astr && TESTAFF(r2->astr, checkcpdtable[i].cond2, r2->alen))) &&
        // zero length pattern => only TESTAFF
        // zero pattern (0/flag) => unmodified stem (zero affixes allowed)
        (!*(checkcpdtable[i].pattern) || (
            (*(checkcpdtable[i].pattern)=='0' && r1->blen <= pos && strncmp(word + pos - r1->blen, r1->word, r1->blen) == 0) ||
            (*(checkcpdtable[i].pattern)!='0' && (len = strlen(checkcpdtable[i].pattern)) &&
                strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)))) {
            return 1;
        }
  }
  return 0;
}

// forbid compounding with neighbouring upper and lower case characters at word bounds
int AffixMgr::cpdcase_check(const char * word, int pos)
{
  if (utf8) {
      w_char u, w;
      const char * p;
      u8_u16(&u, 1, word + pos);
      for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--);
      u8_u16(&w, 1, p);
      unsigned short a = (u.h << 8) + u.l;
      unsigned short b = (w.h << 8) + w.l;
      if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) &&
          (a != '-') && (b != '-')) return 1;
  } else {
      unsigned char a = *(word + pos - 1);
      unsigned char b = *(word + pos);
      if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;
  }
  return 0;
}

// check compound patterns
int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)
{
  signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
  signed short btwp[MAXWORDLEN]; // word positions for metacharacters
  int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions
  short bt = 0;  
  int i, j;
  int ok;
  int w = 0;

  if (!*words) {
    w = 1;
    *words = def;
  }

  if (!*words) {
    return 0;
  }

  (*words)[wnum] = rv;

  // has the last word COMPOUNDRULE flag?
  if (rv->alen == 0) {
    (*words)[wnum] = NULL;
    if (w) *words = NULL;
    return 0;
  }
  ok = 0;
  for (i = 0; i < numdefcpd; i++) {
    for (j = 0; j < defcpdtable[i].len; j++) {
       if (defcpdtable[i].def[j] != '*' && defcpdtable[i].def[j] != '?' &&
          TESTAFF(rv->astr, defcpdtable[i].def[j], rv->alen)) ok = 1;
    }
  }
  if (ok == 0) {
    (*words)[wnum] = NULL;
    if (w) *words = NULL;
    return 0;
  }

  for (i = 0; i < numdefcpd; i++) {
    signed short pp = 0; // pattern position
    signed short wp = 0; // "words" position
    int ok2;
    ok = 1;
    ok2 = 1;
    do {
      while ((pp < defcpdtable[i].len) && (wp <= wnum)) {
        if (((pp+1) < defcpdtable[i].len) &&
          ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {
            int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum;
            ok2 = 1;
            pp+=2;
            btpp[bt] = pp;
            btwp[bt] = wp;
            while (wp <= wend) {
                if (!(*words)[wp]->alen || 
                  !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {
                    ok2 = 0;
                    break;
                }
                wp++;
            }
            if (wp <= wnum) ok2 = 0;
            btnum[bt] = wp - btwp[bt];
            if (btnum[bt] > 0) bt++;
            if (ok2) break;
        } else {
            ok2 = 1;
            if (!(*words)[wp] || !(*words)[wp]->alen || 
              !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {
                ok = 0;
                break;
            }
            pp++;
            wp++;
            if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0;
        }
      }
    if (ok && ok2) { 
        int r = pp;
        while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&
            ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;
        if (defcpdtable[i].len <= r) return 1;
    }
    // backtrack
    if (bt) do {
        ok = 1;
        btnum[bt - 1]--;
        pp = btpp[bt - 1];
        wp = btwp[bt - 1] + (signed short) btnum[bt - 1];
    } while ((btnum[bt - 1] < 0) && --bt);
  } while (bt);

  if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1;

  // check zero ending
  while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&
    ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;
  if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;
  }
  (*words)[wnum] = NULL;
  if (w) *words = NULL;
  return 0;
}

inline int AffixMgr::candidate_check(const char * word, int len)
{
  struct hentry * rv=NULL;
  
  rv = lookup(word);
  if (rv) return 1;

//  rv = prefix_check(word,len,1);
//  if (rv) return 1;
  
  rv = affix_check(word,len);
  if (rv) return 1;
  return 0;
}

// calculate number of syllable for compound-checking
short AffixMgr::get_syllable(const char * word, int wlen)
{
    if (cpdmaxsyllable==0) return 0;
    
    short num=0;

    if (!utf8) {
        for (int i=0; i<wlen; i++) {
            if (strchr(cpdvowels, word[i])) num++;
        }
    } else if (cpdvowels_utf16) {
        w_char w[MAXWORDUTF8LEN];
        int i = u8_u16(w, MAXWORDUTF8LEN, word);
        for (; i > 0; i--) {
            if (flag_bsearch((unsigned short *) cpdvowels_utf16,
                ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;
        }
    }
    return num;
}

void AffixMgr::setcminmax(int * cmin, int * cmax, const char * word, int len) {
    if (utf8) {
        int i;
        for (*cmin = 0, i = 0; (i < cpdmin) && word[*cmin]; i++) {
          for ((*cmin)++; (word[*cmin] & 0xc0) == 0x80; (*cmin)++);
        }
        for (*cmax = len, i = 0; (i < (cpdmin - 1)) && *cmax; i++) {
          for ((*cmax)--; (word[*cmax] & 0xc0) == 0x80; (*cmax)--);
        }
    } else {
        *cmin = cpdmin;
        *cmax = len - cpdmin + 1;
    }
}


// check if compound word is correctly spelled
// hu_mov_rule = spec. Hungarian rule (XXX)
struct hentry * AffixMgr::compound_check(const char * word, int len, 
    short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,
    char hu_mov_rule = 0, char is_sug = 0, int * info = NULL)
{
    int i; 
    short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
    struct hentry * rv = NULL;
    struct hentry * rv_first;
    struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
    char st [MAXWORDUTF8LEN + 4];
    char ch = '\0';
    int cmin;
    int cmax;
    int striple = 0;
    int scpd = 0;
    int soldi = 0;
    int oldcmin = 0;
    int oldcmax = 0;
    int oldlen = 0;
    int checkedstriple = 0;
    int onlycpdrule;
    int affixed = 0;
    hentry ** oldwords = words;

    int checked_prefix;

    setcminmax(&cmin, &cmax, word, len);

    strcpy(st, word);

    for (i = cmin; i < cmax; i++) {
        // go to end of the UTF-8 character
        if (utf8) {
            for (; (st[i] & 0xc0) == 0x80; i++);
            if (i >= cmax) return NULL;
        }

        words = oldwords;
        onlycpdrule = (words) ? 1 : 0;

        do { // onlycpdrule loop

        oldnumsyllable = numsyllable;
        oldwordnum = wordnum;
        checked_prefix = 0;


        do { // simplified checkcompoundpattern loop

        if (scpd > 0) {
          for (; scpd <= numcheckcpd && (!checkcpdtable[scpd-1].pattern3 ||
            strncmp(word + i, checkcpdtable[scpd-1].pattern3, strlen(checkcpdtable[scpd-1].pattern3)) != 0); scpd++);

          if (scpd > numcheckcpd) break; // break simplified checkcompoundpattern loop
          strcpy(st + i, checkcpdtable[scpd-1].pattern);
          soldi = i;
          i += strlen(checkcpdtable[scpd-1].pattern);
          strcpy(st + i, checkcpdtable[scpd-1].pattern2);
          strcpy(st + i + strlen(checkcpdtable[scpd-1].pattern2), word + soldi + strlen(checkcpdtable[scpd-1].pattern3));

          oldlen = len;
          len += strlen(checkcpdtable[scpd-1].pattern) + strlen(checkcpdtable[scpd-1].pattern2) - strlen(checkcpdtable[scpd-1].pattern3);
          oldcmin = cmin;
          oldcmax = cmax;
          setcminmax(&cmin, &cmax, st, len);

          cmax = len - cpdmin + 1;
        }

        ch = st[i];
        st[i] = '\0';

        sfx = NULL;
        pfx = NULL;

        // FIRST WORD

        affixed = 1;
        rv = lookup(st); // perhaps without prefix

        // search homonym with compound flag
        while ((rv) && !hu_mov_rule &&
            ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
                !((compoundflag && !words && !onlycpdrule && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
                  (compoundbegin && !wordnum && !onlycpdrule && 
                        TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
                  (compoundmiddle && wordnum && !words && !onlycpdrule &&
                    TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
                  (numdefcpd && onlycpdrul…
Summary ✨

This C++ code is part of a spell-checking library, specifically managing affixes (prefixes and suffixes) in a dictionary. It processes a line of text, extracts affix information, and checks for redundant conditions that would affect stripping characters from the text. The code creates objects to store this information and builds trees based on these affixes, allowing for efficient lookup and sorting of related entries.
Alerts (3)

Complexity hotspot; lines 1516 to 1518 (total complexity: 10)
1516 1517 1518