affixmgr.cxx | searchcode

/ext/hunspell/affixmgr.cxx

Large files are truncated, but you can click here to view the full file

#include "license.hunspell"

#include "license.myspell"



#ifndef MOZILLA_CLIENT

#include <cstdlib>

#include <cstring>

#include <cctype>

#include <cstdio>

#else

#include <stdlib.h> 

#include <string.h>

#include <stdio.h> 

#include <ctype.h>

#endif



#include "affixmgr.hxx"

#include "affentry.hxx"

#include "langnum.hxx"



#include "csutil.hxx"



#ifndef MOZILLA_CLIENT

#ifndef W32

using namespace std;

#endif

#endif



/*
 * Construct an affix manager.
 *
 *   affpath - path of the affix description (.aff) file to load
 *   ptr     - hash manager to register; stored in pHMgr, not owned here
 *
 * Every option is first reset to its "not configured" default, then
 * parse_file() is run on affpath.  When parsing fails a warning is emitted
 * and wordchars falls back to a plain ASCII letter set.
 */
AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr) 
{
  // register hash manager and load affix data from aff file
  pHMgr = ptr;
  trystring = NULL;
  encoding=NULL;
  utf8 = 0;
  complexprefixes = 0;
  maptable = NULL;
  nummap = 0;
  breaktable = NULL;
  numbreak = 0;
  reptable = NULL;
  numrep = 0;
  checkcpdtable = NULL;
  numcheckcpd = 0;
  defcpdtable = NULL;
  numdefcpd = 0;
  compoundflag = FLAG_NULL; // permits word in compound forms
  compoundbegin = FLAG_NULL; // may be first word in compound forms
  compoundmiddle = FLAG_NULL; // may be middle word in compound forms
  compoundend = FLAG_NULL; // may be last word in compound forms
  compoundroot = FLAG_NULL; // compound word signing flag
  compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
  compoundforbidflag = FLAG_NULL; // compound forbidden flag for suffixed word
  checkcompounddup = 0; // forbid double words in compounds
  checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)
  checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds
  checkcompoundtriple = 0; // forbid compounds with triple letters
  forbiddenword = FLAG_NULL; // forbidden word signing flag
  nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
  lang = NULL; // language
  langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
  pseudoroot = FLAG_NULL; // forbidden root, allowed only with suffixes
  cpdwordmax = -1; // default: unlimited wordcount in compound words
  cpdmin = -1;  // undefined; replaced by MINCPDLEN below if the file sets nothing
  cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
  cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
  cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
  cpdvowels_utf16_len=0; // number of entries in cpdvowels_utf16
  pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG
  sfxappnd=NULL; // previous suffix for counting a special syllables BUG
  cpdsyllablenum=NULL; // syllable count incrementing flag
  checknum=0; // checking numbers, and word with numbers
  wordchars=NULL; // letters + spec. word characters (WORDCHARS)
  wordchars_utf16=NULL; // UTF-16 form of wordchars
  wordchars_utf16_len=0; // number of entries in wordchars_utf16
  ignorechars=NULL; // characters given by the IGNORE line
  ignorechars_utf16=NULL; // UTF-16 form of ignorechars
  ignorechars_utf16_len=0; // number of entries in ignorechars_utf16
  version=NULL; // affix and dictionary file version string
  havecontclass=0; // flags of possible continuing classes (double affix)
  // LEMMA_PRESENT: not put root into the morphological output. Lemma presents
  // in morphological description in dictionary file. It's often combined with PSEUDOROOT.
  lemma_present = FLAG_NULL; 
  circumfix = FLAG_NULL; 
  onlyincompound = FLAG_NULL; 
  flag_mode = FLAG_CHAR; // default one-character flags in affix and dic file
  maxngramsugs = -1; // undefined
  nosplitsugs = 0;
  sugswithdots = 0;
  keepcase = 0;
  checksharps = 0;

  derived = NULL; // XXX not threadsafe variable for experimental stemming
  sfx = NULL;
  pfx = NULL;

  // empty affix indexes: by first character (pStart/sStart) and by flag (pFlag/sFlag)
  for (int i=0; i < SETSIZE; i++) {
     pStart[i] = NULL;
     sStart[i] = NULL;
     pFlag[i] = NULL;
     sFlag[i] = NULL;
  }

  for (int j=0; j < CONTSIZE; j++) {
    contclasses[j] = 0;
  }

  if (parse_file(affpath)) {
     HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n",affpath);
     // parse_file() can fail after a WORDCHARS line was already stored;
     // release that value first so the fallback does not leak it
     if (wordchars) free(wordchars);
     wordchars = mystrdup("qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM");
  }
  
  // COMPOUNDMIN was not given (or the file could not be read): use the default
  if (cpdmin == -1) cpdmin = MINCPDLEN;

}





/*
 * Destroy the affix manager: delete every prefix and suffix entry, free
 * all tables and strings loaded from the affix file and reset the
 * associated counters.  The hash manager pointer is only cleared, it is
 * not deleted here.
 */
AffixMgr::~AffixMgr() 
{
  // walk each prefix chain and delete its entries
  for (int p = 0; p < SETSIZE; ++p) {
       pFlag[p] = NULL;
       PfxEntry * cur = (PfxEntry *)pStart[p];
       while (cur) {
            PfxEntry * following = cur->getNext();
            delete cur;
            cur = following;
       }
  }

  // walk each suffix chain and delete its entries
  for (int s = 0; s < SETSIZE; ++s) {
       sFlag[s] = NULL;
       SfxEntry * cur = (SfxEntry *)sStart[s];
       while (cur) {
            SfxEntry * following = cur->getNext();
            delete cur;
            cur = following;
       }
       sStart[s] = NULL;
  }

  // free(NULL) is a no-op, so the plain strings need no guard
  free(trystring);
  trystring = NULL;
  free(encoding);
  encoding = NULL;

  // MAP table
  if (maptable) {
     for (int m = 0; m < nummap; ++m) {
        free(maptable[m].set);
        free(maptable[m].set_utf16);
        maptable[m].set = NULL;
        maptable[m].len = 0;
     }
     free(maptable);
     maptable = NULL;
  }
  nummap = 0;

  // BREAK table
  if (breaktable) {
     for (int b = 0; b < numbreak; ++b) {
        free(breaktable[b]);
        breaktable[b] = NULL;
     }
     free(breaktable);
     breaktable = NULL;
  }
  numbreak = 0;

  // REP table
  if (reptable) {
     for (int r = 0; r < numrep; ++r) {
        free(reptable[r].pattern);
        free(reptable[r].pattern2);
        reptable[r].pattern = NULL;
        reptable[r].pattern2 = NULL;
     }
     free(reptable);
     reptable = NULL;
  }

  // COMPOUNDRULE table
  if (defcpdtable) {
     for (int d = 0; d < numdefcpd; ++d) {
        free(defcpdtable[d].def);
        defcpdtable[d].def = NULL;
     }
     free(defcpdtable);
     defcpdtable = NULL;
  }
  numrep = 0;

  // CHECKCOMPOUNDPATTERN table
  if (checkcpdtable) {
     for (int c = 0; c < numcheckcpd; ++c) {
        free(checkcpdtable[c].pattern);
        free(checkcpdtable[c].pattern2);
        checkcpdtable[c].pattern = NULL;
        checkcpdtable[c].pattern2 = NULL;
     }
     free(checkcpdtable);
     checkcpdtable = NULL;
  }
  numcheckcpd = 0;

  FREE_FLAG(compoundflag);
  FREE_FLAG(compoundbegin);
  FREE_FLAG(compoundmiddle);
  FREE_FLAG(compoundend);
  FREE_FLAG(compoundpermitflag);
  FREE_FLAG(compoundforbidflag);
  FREE_FLAG(compoundroot);
  FREE_FLAG(forbiddenword);
  FREE_FLAG(nosuggest);
  FREE_FLAG(pseudoroot);
  FREE_FLAG(lemma_present);
  FREE_FLAG(circumfix);
  FREE_FLAG(onlyincompound);

  cpdwordmax = 0;
  pHMgr = NULL;
  cpdmin = 0;
  cpdmaxsyllable = 0;

  free(cpdvowels);
  free(cpdvowels_utf16);
  free(cpdsyllablenum);
  free_utf_tbl();
  free(lang);
  free(wordchars);
  free(wordchars_utf16);
  free(ignorechars);
  free(ignorechars_utf16);
  free(version);
  free(derived);
  checknum = 0;
}





// read in aff file and build up prefix and suffix entry objects 

// helper: non-zero when the affix-file line starts with keyword kw
static inline int aff_line_is(const char * line, const char * kw)
{
  return strncmp(line, kw, strlen(kw)) == 0;
}

// helper: close the affix file and hand back parse_file()'s failure code
static inline int aff_fail(FILE * afflst)
{
  fclose(afflst);
  return 1;
}

/*
 * Parse the affix description file.
 *
 *   affpath - path of the .aff file
 *
 * Returns 0 on success and 1 when the file cannot be opened, when one of
 * the option/table parsers reports an error, or when the German default
 * BREAK table cannot be allocated.
 *
 * Each line is tested against every known keyword in turn (the tests are
 * independent, not an else-if chain).  After the last line the affix trees
 * are flattened to sorted lists, the NextEQ/NextNE search links are built
 * and wordchars is extended with every byte that has distinct upper and
 * lower case forms in the current character set.
 */
int  AffixMgr::parse_file(const char * affpath)
{
  // io buffer
  char line[MAXLNLEN+1];

  // affix type
  char ft;

  // checking flag duplication (cleared lazily before the first PFX/SFX line)
  char dupflags[CONTSIZE];
  char dupflags_ini = 1;

  // first line indicator for removing byte order mark
  int firstline = 1;

  // open the affix file
  FILE * afflst = fopen(affpath,"r");
  if (!afflst) {
    HUNSPELL_WARNING(stderr, "error: could not open affix description file %s\n",affpath);
    return 1;
  }

  // step one is to parse the affix file building up the internal
  // affix data structures

  // read in each line ignoring any that do not
  // start with a known line type indicator
  while (fgets(line,MAXLNLEN,afflst)) {
     mychomp(line);

     /* remove the UTF-8 byte order mark (EF BB BF) */
     if (firstline) {
       firstline = 0;
       if (strncmp(line,"\xEF\xBB\xBF",3) == 0) {
          memmove(line, line+3, strlen(line+3)+1);
          HUNSPELL_WARNING(stderr, "warning: affix file begins with byte order mark: possible incompatibility with old Hunspell versions\n");
       }
     }

     /* parse in the try string */
     if (aff_line_is(line, "TRY") &&
         parse_string(line, &trystring, "TRY")) return aff_fail(afflst);

     /* parse in the name of the character set used by the .dict and .aff */
     if (aff_line_is(line, "SET")) {
        if (parse_string(line, &encoding, "SET")) return aff_fail(afflst);
        if (encoding && (strcmp(encoding, "UTF-8") == 0)) {
           utf8 = 1;
#ifndef OPENOFFICEORG
#ifndef MOZILLA_CLIENT
           if (initialize_utf_tbl()) return aff_fail(afflst);
#endif
#endif
        }
     }

     /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
     if (aff_line_is(line, "COMPLEXPREFIXES")) complexprefixes = 1;

     /* parse in the flag used by the controlled compound words */
     if (aff_line_is(line, "COMPOUNDFLAG") &&
         parse_flag(line, &compoundflag, "COMPOUNDFLAG")) return aff_fail(afflst);

     /* compound begin flag; stored as the end flag when COMPLEXPREFIXES
        has already been seen */
     if (aff_line_is(line, "COMPOUNDBEGIN") &&
         parse_flag(line, complexprefixes ? &compoundend : &compoundbegin,
                    "COMPOUNDBEGIN")) return aff_fail(afflst);

     /* parse in the flag used by compound words */
     if (aff_line_is(line, "COMPOUNDMIDDLE") &&
         parse_flag(line, &compoundmiddle, "COMPOUNDMIDDLE")) return aff_fail(afflst);

     /* compound end flag; stored as the begin flag when COMPLEXPREFIXES
        has already been seen */
     if (aff_line_is(line, "COMPOUNDEND") &&
         parse_flag(line, complexprefixes ? &compoundbegin : &compoundend,
                    "COMPOUNDEND")) return aff_fail(afflst);

     /* parse in the data used by compound_check() method */
     if (aff_line_is(line, "COMPOUNDWORDMAX") &&
         parse_num(line, &cpdwordmax, "COMPOUNDWORDMAX")) return aff_fail(afflst);

     /* parse in the flag sign compounds in dictionary */
     if (aff_line_is(line, "COMPOUNDROOT") &&
         parse_flag(line, &compoundroot, "COMPOUNDROOT")) return aff_fail(afflst);

     /* parse in the flag used by compound_check() method */
     if (aff_line_is(line, "COMPOUNDPERMITFLAG") &&
         parse_flag(line, &compoundpermitflag, "COMPOUNDPERMITFLAG")) return aff_fail(afflst);

     /* parse in the flag used by compound_check() method */
     if (aff_line_is(line, "COMPOUNDFORBIDFLAG") &&
         parse_flag(line, &compoundforbidflag, "COMPOUNDFORBIDFLAG")) return aff_fail(afflst);

     /* simple on/off compound checks */
     if (aff_line_is(line, "CHECKCOMPOUNDDUP")) checkcompounddup = 1;
     if (aff_line_is(line, "CHECKCOMPOUNDREP")) checkcompoundrep = 1;
     if (aff_line_is(line, "CHECKCOMPOUNDTRIPLE")) checkcompoundtriple = 1;
     if (aff_line_is(line, "CHECKCOMPOUNDCASE")) checkcompoundcase = 1;

     /* parse in the flag of words that must not be suggested */
     if (aff_line_is(line, "NOSUGGEST") &&
         parse_flag(line, &nosuggest, "NOSUGGEST")) return aff_fail(afflst);

     /* parse in the flag used by forbidden words */
     if (aff_line_is(line, "FORBIDDENWORD") &&
         parse_flag(line, &forbiddenword, "FORBIDDENWORD")) return aff_fail(afflst);

     /* parse in the LEMMA_PRESENT flag */
     if (aff_line_is(line, "LEMMA_PRESENT") &&
         parse_flag(line, &lemma_present, "LEMMA_PRESENT")) return aff_fail(afflst);

     /* parse in the flag used by circumfixes */
     if (aff_line_is(line, "CIRCUMFIX") &&
         parse_flag(line, &circumfix, "CIRCUMFIX")) return aff_fail(afflst);

     /* parse in the flag used by fogemorphemes */
     if (aff_line_is(line, "ONLYINCOMPOUND") &&
         parse_flag(line, &onlyincompound, "ONLYINCOMPOUND")) return aff_fail(afflst);

     /* parse in the flag used by `pseudoroots' */
     if (aff_line_is(line, "PSEUDOROOT") &&
         parse_flag(line, &pseudoroot, "PSEUDOROOT")) return aff_fail(afflst);

     /* NEEDAFFIX is stored in the same member as PSEUDOROOT */
     if (aff_line_is(line, "NEEDAFFIX") &&
         parse_flag(line, &pseudoroot, "NEEDAFFIX")) return aff_fail(afflst);

     /* parse in the minimal length for words in compounds (at least 1) */
     if (aff_line_is(line, "COMPOUNDMIN")) {
        if (parse_num(line, &cpdmin, "COMPOUNDMIN")) return aff_fail(afflst);
        if (cpdmin < 1) cpdmin = 1;
     }

     /* parse in the max. words and syllables in compounds */
     if (aff_line_is(line, "COMPOUNDSYLLABLE") &&
         parse_cpdsyllable(line)) return aff_fail(afflst);

     /* parse in the flag used by compound_check() method */
     if (aff_line_is(line, "SYLLABLENUM") &&
         parse_string(line, &cpdsyllablenum, "SYLLABLENUM")) return aff_fail(afflst);

     /* enable checking of numbers and words with numbers */
     if (aff_line_is(line, "CHECKNUM")) checknum=1;

     /* parse in the extra word characters */
     if (aff_line_is(line, "WORDCHARS") &&
         parse_array(line, &wordchars, &wordchars_utf16, &wordchars_utf16_len, "WORDCHARS", utf8))
        return aff_fail(afflst);

     /* parse in the ignored characters (for example, Arabic optional diacritic characters) */
     if (aff_line_is(line, "IGNORE") &&
         parse_array(line, &ignorechars, &ignorechars_utf16, &ignorechars_utf16_len, "IGNORE", utf8))
        return aff_fail(afflst);

     /* parse in the typical fault correcting table */
     if (aff_line_is(line, "REP") &&
         parse_reptable(line, afflst)) return aff_fail(afflst);

     /* parse in the checkcompoundpattern table */
     if (aff_line_is(line, "CHECKCOMPOUNDPATTERN") &&
         parse_checkcpdtable(line, afflst)) return aff_fail(afflst);

     /* parse in the defcompound table */
     if (aff_line_is(line, "COMPOUNDRULE") &&
         parse_defcpdtable(line, afflst)) return aff_fail(afflst);

     /* parse in the related character map table */
     if (aff_line_is(line, "MAP") &&
         parse_maptable(line, afflst)) return aff_fail(afflst);

     /* parse in the word breakpoints table */
     if (aff_line_is(line, "BREAK") &&
         parse_breaktable(line, afflst)) return aff_fail(afflst);

     /* parse in the language for language specific codes */
     if (aff_line_is(line, "LANG")) {
        if (parse_string(line, &lang, "LANG")) return aff_fail(afflst);
        langnum = get_lang_num(lang);
     }

     if (aff_line_is(line, "VERSION") &&
         parse_string(line, &version, "VERSION")) return aff_fail(afflst);

     if (aff_line_is(line, "MAXNGRAMSUGS") &&
         parse_num(line, &maxngramsugs, "MAXNGRAMSUGS")) return aff_fail(afflst);

     if (aff_line_is(line, "NOSPLITSUGS")) nosplitsugs=1;

     if (aff_line_is(line, "SUGSWITHDOTS")) sugswithdots=1;

     /* parse in the KEEPCASE flag */
     if (aff_line_is(line, "KEEPCASE") &&
         parse_flag(line, &keepcase, "KEEPCASE")) return aff_fail(afflst);

     if (aff_line_is(line, "CHECKSHARPS")) checksharps=1;

     /* parse this affix: P - prefix, S - suffix
        (the two are exchanged when COMPLEXPREFIXES is active) */
     ft = ' ';
     if (aff_line_is(line, "PFX")) ft = complexprefixes ? 'S' : 'P';
     if (aff_line_is(line, "SFX")) ft = complexprefixes ? 'P' : 'S';
     if (ft != ' ') {
        if (dupflags_ini) {
          memset(dupflags, 0, sizeof(dupflags));
          dupflags_ini = 0;
        }
        if (parse_affix(line, ft, afflst, dupflags)) {
           fclose(afflst);
           // still flatten what was read so far before reporting the failure
           process_pfx_tree_to_list();
           process_sfx_tree_to_list();
           return 1;
        }
     }
  }
  fclose(afflst);

  // convert affix trees to sorted list
  process_pfx_tree_to_list();
  process_sfx_tree_to_list();

  // now we can speed up performance greatly taking advantage of the 
  // relationship between the affixes and the idea of "subsets".

  // View each prefix as a potential leading subset of another and view
  // each suffix (reversed) as a potential trailing subset of another.

  // To illustrate this relationship if we know the prefix "ab" is found in the
  // word to examine, only prefixes that "ab" is a leading subset of need be examined.
  // Furthermore if "ab" is not present then none of the prefixes that "ab"
  // is a subset of need be examined.
  // The same argument goes for suffix strings that are reversed.

  // Then to top this off why not examine the first char of the word to quickly
  // limit the set of prefixes to examine (i.e. the prefixes to examine must 
  // be leading supersets of the first character of the word (if they exist)

  // To take advantage of this "subset" relationship, we need to add two links
  // from entry.  One to take next if the current prefix is found (call it nexteq)
  // and one to take next if the current prefix is not found (call it nextne).

  // Since we have built ordered lists, all that remains is to properly initialize 
  // the nextne and nexteq pointers that relate them

  process_pfx_order();
  process_sfx_order();

  // expand wordchars string, based on csutil (for external tokenization)

  char * enc = get_encoding();
  csconv = get_current_cs(enc);
  free(enc);
  enc = NULL;

  // expw is filled with explicit length tracking so that neither the copy
  // of wordchars nor the appended letters can run past the buffer
  char expw[MAXLNLEN];
  size_t expwlen = 0;
  if (wordchars) {
      expwlen = strlen(wordchars);
      if (expwlen >= sizeof(expw)) expwlen = sizeof(expw) - 1;
      memcpy(expw, wordchars, expwlen);
      free(wordchars);
      wordchars = NULL;
  }
  expw[expwlen] = '\0';

  // append every byte that has distinct upper/lower forms and is not listed yet
  for (int i = 0; i <= 255; i++) {
      if ( (csconv[i].cupper != csconv[i].clower) &&
          (! strchr(expw, (char) i))) {
              if (expwlen + 1 >= sizeof(expw)) break;
              expw[expwlen++] = (char) i;
              expw[expwlen] = '\0';
      }
  }

  wordchars = mystrdup(expw);

  // temporary BREAK definition for German dash handling (OOo issue 64400)
  if ((langnum == LANG_de) && (!breaktable)) {
      breaktable = (char **) malloc(sizeof(char *));
      if (!breaktable) return 1;
      breaktable[0] = mystrdup("-");
      numbreak = 1;
  }
  return 0;
}





// Register one prefix entry in both prefix indexes:
//   pFlag[]  - chain of entries sharing the low byte of the flag
//   pStart[] - per-first-character binary tree ordered by affix string
//              (slot 0 is a plain list of the zero-length prefixes)
// Always returns 0.
int AffixMgr::build_pfxtree(AffEntry* pfxptr)
{
  PfxEntry * entry = (PfxEntry*) pfxptr;

  const char * key = entry->getKey();
  const unsigned char flg = (unsigned char) (entry->getFlag() & 0x00FF);

  // flag index: push the entry onto the front of its flag chain
  entry->setFlgNxt((PfxEntry*) pFlag[flg]);
  pFlag[flg] = (AffEntry *) entry;

  // zero-length affix string: prepend to the list kept in slot 0
  if (*key == '\0') {
     entry->setNext((PfxEntry*) pStart[0]);
     pStart[0] = (AffEntry*) entry;
     return 0;
  }

  // ordinary affix string: the entry starts out as a leaf
  entry->setNextEQ(NULL);
  entry->setNextNE(NULL);

  const unsigned char first = *((const unsigned char *) key);
  PfxEntry * node = (PfxEntry*) pStart[first];

  // empty tree: the entry becomes the root
  if (!node) {
     pStart[first] = (AffEntry*) entry;
     return 0;
  }

  // descend: keys comparing <= go down the NextEQ branch, greater keys
  // down the NextNE branch; attach at the first missing child
  while (true) {
     const bool take_eq = (strcmp(key, node->getKey()) <= 0);
     PfxEntry * child = take_eq ? node->getNextEQ() : node->getNextNE();
     if (child) {
        node = child;
        continue;
     }
     if (take_eq) node->setNextEQ(entry);
     else node->setNextNE(entry);
     break;
  }
  return 0;
}



// Register one suffix entry in both suffix indexes:
//   sFlag[]  - chain of entries sharing the low byte of the flag
//   sStart[] - per-first-character binary tree ordered by the entry's key
//              (slot 0 is a plain list of the zero-length suffixes)
// Always returns 0.
int AffixMgr::build_sfxtree(AffEntry* sfxptr)
{
  SfxEntry * entry = (SfxEntry *) sfxptr;

  const char * key = entry->getKey();
  const unsigned char flg = (unsigned char) (entry->getFlag() & 0x00FF);

  // flag index: push the entry onto the front of its flag chain
  entry->setFlgNxt((SfxEntry*) sFlag[flg]);
  sFlag[flg] = (AffEntry *) entry;

  // zero-length affix string: prepend to the list kept in slot 0
  if (*key == '\0') {
     entry->setNext((SfxEntry*) sStart[0]);
     sStart[0] = (AffEntry*) entry;
     return 0;
  }

  // ordinary affix string: the entry starts out as a leaf
  entry->setNextEQ(NULL);
  entry->setNextNE(NULL);

  const unsigned char first = *((const unsigned char *) key);
  SfxEntry * node = (SfxEntry*) sStart[first];

  // empty tree: the entry becomes the root
  if (!node) {
     sStart[first] = (AffEntry*) entry;
     return 0;
  }

  // descend: keys comparing <= go down the NextEQ branch, greater keys
  // down the NextNE branch; attach at the first missing child
  while (true) {
     const bool take_eq = (strcmp(key, node->getKey()) <= 0);
     SfxEntry * child = take_eq ? node->getNextEQ() : node->getNextNE();
     if (child) {
        node = child;
        continue;
     }
     if (take_eq) node->setNextEQ(entry);
     else node->setNextNE(entry);
     break;
  }
  return 0;
}



// flatten each per-character prefix tree (slots 1..SETSIZE-1) into a
// list linked through setNext(); slot 0 is left untouched
int AffixMgr::process_pfx_tree_to_list()
{
  int slot = 1;
  while (slot < SETSIZE) {
    pStart[slot] = process_pfx_in_order(pStart[slot], NULL);
    ++slot;
  }
  return 0;
}





// Recursive helper that threads the prefix subtree rooted at ptr into a
// list: the NextNE subtree is linked first (ending in nptr), ptr is placed
// in front of it, then the NextEQ subtree is placed in front of ptr.
// Returns the head of the resulting list (nptr when ptr is NULL).
AffEntry* AffixMgr::process_pfx_in_order(AffEntry* ptr, AffEntry* nptr)
{
  if (!ptr) return nptr;

  PfxEntry * node = (PfxEntry*) ptr;
  AffEntry * rest = process_pfx_in_order(node->getNextNE(), nptr);
  node->setNext((PfxEntry*) rest);
  return process_pfx_in_order(node->getNextEQ(), ptr);
}





// flatten each per-character suffix tree (slots 1..SETSIZE-1) into a
// list linked through setNext(); slot 0 is left untouched
int AffixMgr:: process_sfx_tree_to_list()
{
  int slot = 1;
  while (slot < SETSIZE) {
    sStart[slot] = process_sfx_in_order(sStart[slot], NULL);
    ++slot;
  }
  return 0;
}



// Recursive helper that threads the suffix subtree rooted at ptr into a
// list: the NextNE subtree is linked first (ending in nptr), ptr is placed
// in front of it, then the NextEQ subtree is placed in front of ptr.
// Returns the head of the resulting list (nptr when ptr is NULL).
AffEntry* AffixMgr::process_sfx_in_order(AffEntry* ptr, AffEntry* nptr)
{
  if (!ptr) return nptr;

  SfxEntry * node = (SfxEntry*) ptr;
  AffEntry * rest = process_sfx_in_order(node->getNextNE(), nptr);
  node->setNext((SfxEntry*) rest);
  return process_sfx_in_order(node->getNextEQ(), ptr);
}





// reinitialize the PfxEntry links NextEQ and NextNE to speed searching

// using the idea of leading subsets this time

int AffixMgr::process_pfx_order()

{

    PfxEntry* ptr;



    // loop through each prefix list starting point

    for (int i=1; i < SETSIZE; i++) {



         ptr = (PfxEntry*)pStart[i];



         // look through the remainder of the list

         //  and find next entry with affix that 

         // the current one is not a subset of

         // mark that as destination for NextNE

         // use next in list that you are a subset

         // of as NextEQ



         for (; ptr != NULL; ptr = ptr->getNext()) {



             PfxEntry * nptr = ptr->getNext();

             for (; nptr != NULL; nptr = nptr->getNext()) {

                 if (! isSubset( ptr->getKey() , nptr->getKey() )) break;

             }

             ptr->setNextNE(nptr);

             ptr->setNextEQ(NULL);

             if ((ptr->getNext()) && isSubset(ptr->getKey() , (ptr->getNext())->getKey())) 

                 ptr->setNextEQ(ptr->getNext());

         }



         // now clean up by adding smart search termination strings:

         // if you are already a superset of the previous prefix

         // but not a subset of the next, search can end here

         // so set NextNE properly



         ptr = (PfxEntry *) pStart[i];

         for (; ptr != NULL; ptr = ptr->getNext()) {

             PfxEntry * nptr = ptr->getNext();

             PfxEntry * mptr = NULL;

             for (; nptr != NULL; nptr = nptr->getNext()) {

                 if (! isSubset(ptr->getKey(),nptr->getKey())) break;

                 mptr = nptr;

             }

             if (mptr) mptr->setNextNE(NULL);

         }

    }

    return 0;

}



// initialize the SfxEntry links NextEQ and NextNE to speed searching

// using the idea of leading subsets this time

int AffixMgr::process_sfx_order()

{

    SfxEntry* ptr;



    // loop through each prefix list starting point

    for (int i=1; i < SETSIZE; i++) {



         ptr = (SfxEntry *) sStart[i];



         // look through the remainder of the list

         //  and find next entry with affix that 

         // the current one is not a subset of

         // mark that as destination for NextNE

         // use next in list that you are a subset

         // of as NextEQ



         for (; ptr != NULL; ptr = ptr->getNext()) {

             SfxEntry * nptr = ptr->getNext();

             for (; nptr != NULL; nptr = nptr->getNext()) {

                 if (! isSubset(ptr->getKey(),nptr->getKey())) break;

             }

             ptr->setNextNE(nptr);

             ptr->setNextEQ(NULL);

             if ((ptr->getNext()) && isSubset(ptr->getKey(),(ptr->getNext())->getKey())) 

                 ptr->setNextEQ(ptr->getNext());

         }





         // now clean up by adding smart search termination strings:

         // if you are already a superset of the previous suffix

         // but not a subset of the next, search can end here

         // so set NextNE properly



         ptr = (SfxEntry *) sStart[i];

         for (; ptr != NULL; ptr = ptr->getNext()) {

             SfxEntry * nptr = ptr->getNext();

             SfxEntry * mptr = NULL;

             for (; nptr != NULL; nptr = nptr->getNext()) {

                 if (! isSubset(ptr->getKey(),nptr->getKey())) break;

                 mptr = nptr;

             }

             if (mptr) mptr->setNextNE(NULL);

         }

    }

    return 0;

}







// takes aff file condition string and creates the

// conds array - please see the appendix at the end of the

// file affentry.cxx which describes what is going on here

// in much more detail

//
// Parameters:
//   ptr - affix entry whose conds tables and numconds are filled in
//   cs  - condition string taken from the aff file
//
// Returns 0 on success, 1 when a malloc() for the UTF-8 character
// lists fails.
//
// The string is scanned one byte at a time.  Each condition is either a
// single character, a '.' wildcard or a bracketed group ("[abc]", or
// "[^abc]" for a complemented group).  Condition number n is recorded as
// bit n of the table entry of every character it accepts.
//
// NOTE(review): nothing here bounds n.  The masks are built as
// ((unsigned char)1 << n), so this presumably relies on a condition
// string never holding more than 8 conditions - confirm against the
// declaration of the conds tables.



int AffixMgr::encodeit(struct affentry * ptr, char * cs)

{

  unsigned char c;

  int i, j, k;

  // raw bytes of the group currently being collected
  unsigned char mbr[MAXLNLEN];

  // non-ASCII members of the current group, converted by u8_u16()
  w_char wmbr[MAXLNLEN];

  w_char * wpos = wmbr;



  // now clear the conditions array

  for (i=0;i<SETSIZE;i++) ptr->conds.base[i] = (unsigned char) 0;



  // now parse the string to create the conds array

  int nc = strlen(cs);

  unsigned char neg = 0;   // complement indicator

  int grp = 0;   // group indicator

  unsigned char n = 0;     // number of conditions

  int ec = 0;    // end condition indicator

  int nm = 0;    // number of member in group



  // if no condition just return

  if (strcmp(cs,".")==0) {

    ptr->numconds = 0;

    return 0;

  }



  i = 0;

  while (i < nc) {

    c = *((unsigned char *)(cs + i));

    // from here on c is reset to 0 as soon as one of the steps below has
    // consumed the byte; a non-zero c at "end of condition" is therefore
    // a stand-alone (non-group) character



    // start group indicator

    if (c == '[') {

       grp = 1;

       c = 0;

    }



    // complement flag (any '^' seen while inside a group sets it)

    if ((grp == 1) && (c == '^')) {

       neg = 1;

       c = 0;

    }



    // end group indicator

    if (c == ']') {

       ec = 1;

       c = 0;

    }



    // add character of group to list

    if ((grp == 1) && (c != 0)) {

      *(mbr + nm) = c;

      nm++;

      c = 0;

    }



    // end of condition 

    if (c != 0) {

       ec = 1;

    }



  // a complete condition has been read: store it as condition number n
  if (ec) {    

    if (!utf8) {

      // single-byte character set: one table entry per byte value
      if (grp == 1) {

        if (neg == 0) {

          // set the proper bits in the condition array vals for those chars

          for (j=0;j<nm;j++) {

             k = (unsigned int) mbr[j];

             ptr->conds.base[k] = ptr->conds.base[k] | ((unsigned char)1 << n);

          }

        } else {

          // complement so set all of them and then unset indicated ones

           for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n);

           for (j=0;j<nm;j++) {

             k = (unsigned int) mbr[j];

             ptr->conds.base[k] = ptr->conds.base[k] & ~((unsigned char)1 << n);

           }

        }

        // group finished: reset the group state for the next condition
        neg = 0;

        grp = 0;   

        nm = 0;

      } else {

         // not a group so just set the proper bit for this char

         // but first handle special case of . inside condition

         if (c == '.') {

            // wild card character so set them all

            for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n);

         } else {  

            ptr->conds.base[(unsigned int) c] = ptr->conds.base[(unsigned int)c] | ((unsigned char)1 << n);

         }

      }

      n++;

      ec = 0;

    } else { // UTF-8 character set

      // ASCII bytes are recorded in utf8.ascii[]; bytes with the high bit
      // set start a multi-byte character, which is converted with u8_u16()
      // and kept in utf8.wchars[n]
      if (grp == 1) {

        ptr->conds.utf8.neg[n] = neg;

        if (neg == 0) {

          // set the proper bits in the condition array vals for those chars

          for (j=0;j<nm;j++) {

             k = (unsigned int) mbr[j];

             if (k >> 7) {

                u8_u16(wpos, 1, (char *) mbr + j);

                wpos++;

                // skip the continuation bytes: a lead byte with the top
                // three bits set is treated as a 3-byte character, any
                // other high-bit byte as a 2-byte character
                if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character

             } else {

                ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] | ((unsigned char)1 << n);

             }

          }

        } else { // neg == 1

          // complement so set all of them and then unset indicated ones

           for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n);

           for (j=0;j<nm;j++) {

             k = (unsigned int) mbr[j];

             if (k >> 7) {

                u8_u16(wpos, 1, (char *) mbr + j);

                wpos++;

                if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character

             } else {

                ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] & ~((unsigned char)1 << n);

             }

           }

        }

        neg = 0;

        grp = 0;   

        nm = 0;

        // number of non-ASCII members collected for this group
        ptr->conds.utf8.wlen[n] = wpos - wmbr;

        if ((wpos - wmbr) != 0) {

            // keep a private copy of the collected characters and sort it
            // with flag_qsort(); wchars[n] is not assigned when the group
            // has no non-ASCII member
            ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char) * (wpos - wmbr));

            if (!ptr->conds.utf8.wchars[n]) return 1;

            memcpy(ptr->conds.utf8.wchars[n], wmbr, sizeof(w_char) * (wpos - wmbr));

            flag_qsort((unsigned short *) ptr->conds.utf8.wchars[n], 0, ptr->conds.utf8.wlen[n]);

            wpos = wmbr;

        }

      } else { // grp == 0

         // is UTF-8 character?

         if (c >> 7) {

            ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char));

            if (!ptr->conds.utf8.wchars[n]) return 1;

            ptr->conds.utf8.wlen[n] = 1;

            u8_u16(ptr->conds.utf8.wchars[n], 1, cs + i);

            // step over the continuation bytes of this character; the
            // i++ at the bottom of the loop consumes the lead byte
            if ((c & 0xe0) == 0xe0) i+=2; else i++; // 3-byte UTF-8 character

         } else {

            ptr->conds.utf8.wchars[n] = NULL;

            // not a group so just set the proper bit for this char

            // but first handle special case of . inside condition

            if (c == '.') {

                ptr->conds.utf8.all[n] = 1;

                // wild card character so set them all

                for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n);

            } else {

                ptr->conds.utf8.all[n] = 0;

                ptr->conds.utf8.ascii[(unsigned int) c] = ptr->conds.utf8.ascii[(unsigned int)c] | ((unsigned char)1 << n);

            }

         }

         neg = 0;

      }

      n++;

      ec = 0;

      neg = 0;

    }  

  }



    i++;

  }

  // total number of conditions encoded
  ptr->numconds = n;

  return 0;

}



 // return 1 if s1 is a leading subset of s2

/* inline int AffixMgr::isSubset(const char * s1, const char * s2)

 {

    while ((*s1 == *s2) && *s1) {

        s1++;

        s2++;

    }

    return (*s1 == '\0');

 }

*/



 // return 1 if s1 is a leading subset of s2 (dots are for infixes)

// Walk s1 and s2 in step while s1 still has characters and each one
// either equals the character of s2 at the same position or is a '.'
// (which is accepted against whatever s2 holds there).  Returns 1 when
// all of s1 was consumed this way, 0 otherwise.
inline int AffixMgr::isSubset(const char * s1, const char * s2)
 {
    for (; *s1 != '\0'; ++s1, ++s2) {
        if ((*s1 != *s2) && (*s1 != '.')) break;
    }
    return (*s1 == '\0');
 }





// check word for prefixes

struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,

    const FLAG needflag)

{

    struct hentry * rv= NULL;



    pfx = NULL;

    pfxappnd = NULL;

    sfxappnd = NULL;

    

    // first handle the special case of 0 length prefixes

    PfxEntry * pe = (PfxEntry *) pStart[0];

    while (pe) {

        if (

            // fogemorpheme

              ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&

                  (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&

            // permit prefixes in compounds

              ((in_compound != IN_CPD_END) || (pe->getCont() &&

                  (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen()))))

              ) {

                    // check prefix

                    rv = pe->checkword(word, len, in_compound, needflag);

                    if (rv) {

                        pfx=(AffEntry *)pe; // BUG: pfx not stateless

                        return rv;

                    }

             }

       pe = pe->getNext();

    }

  

    // now handle the general case

    unsigned char sp = *((const unsigned char *)word);

    PfxEntry * pptr = (PfxEntry *)pStart[sp];



    while (pptr) {

        if (isSubset(pptr->getKey(),word)) {

             if (

            // fogemorpheme

              ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&

                  (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&

            // permit prefixes in compounds

              ((in_compound != IN_CPD_END) || (pptr->getCont() &&

                  (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen()))))

              ) {

            // check prefix

                  rv = pptr->checkword(word, len, in_compound, needflag);

                  if (rv) {

                    pfx=(AffEntry *)pptr; // BUG: pfx not stateless

                    return rv;

                  }

             }

             pptr = pptr->getNextEQ();

        } else {

             pptr = pptr->getNextNE();

        }

    }

    

    return NULL;

}



// check word for prefixes

struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,

    char in_compound, const FLAG needflag)

{

    struct hentry * rv= NULL;



    pfx = NULL;

    sfxappnd = NULL;

    

    // first handle the special case of 0 length prefixes

    PfxEntry * pe = (PfxEntry *) pStart[0];

    

    while (pe) {

        rv = pe->check_twosfx(word, len, in_compound, needflag);

        if (rv) return rv;

        pe = pe->getNext();

    }

  

    // now handle the general case

    unsigned char sp = *((const unsigned char *)word);

    PfxEntry * pptr = (PfxEntry *)pStart[sp];



    while (pptr) {

        if (isSubset(pptr->getKey(),word)) {

            rv = pptr->check_twosfx(word, len, in_compound, needflag);

            if (rv) {

                pfx = (AffEntry *)pptr;

                return rv;

            }

            pptr = pptr->getNextEQ();

        } else {

             pptr = pptr->getNextNE();

        }

    }

    

    return NULL;

}



#ifdef HUNSPELL_EXPERIMENTAL

// Collect the morphological descriptions of every prefix analysis of word.
//
// Resets pfx and sfxappnd, calls check_morph() on the zero-length prefix
// entries (pStart[0]) and on the entries indexed by the first byte of word
// whose key is a leading subset of word, and concatenates the returned
// strings. In the general case a result is dropped when in_compound is
// IN_CPD_NOT and the entry's continuation flags contain onlyincompound;
// otherwise the entry is also recorded in pfx.
//
// Returns a newly allocated copy (mystrdup) of the concatenation, or NULL
// when nothing was collected. The concatenation is limited to MAXLNLEN - 1
// characters; anything that would not fit is cut off instead of overflowing
// the local buffer.
char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,
    const FLAG needflag)
{
    char * st;

    char result[MAXLNLEN];
    result[0] = '\0';

    pfx = NULL;
    sfxappnd = NULL;

    // first handle the special case of 0 length prefixes
    PfxEntry * pe = (PfxEntry *) pStart[0];
    while (pe) {
       st = pe->check_morph(word,len,in_compound, needflag);
       if (st) {
            // bounded append: never write past result[MAXLNLEN - 1]
            strncat(result, st, MAXLNLEN - 1 - strlen(result));
            free(st);
       }
       pe = pe->getNext();
    }

    // now handle the general case
    unsigned char sp = *((const unsigned char *)word);
    PfxEntry * pptr = (PfxEntry *)pStart[sp];

    while (pptr) {
        if (isSubset(pptr->getKey(),word)) {
            st = pptr->check_morph(word,len,in_compound, needflag);
            if (st) {
              // fogemorpheme
              if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() &&
                        (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {
                    // bounded append: never write past result[MAXLNLEN - 1]
                    strncat(result, st, MAXLNLEN - 1 - strlen(result));
                    pfx = (AffEntry *)pptr;
                }
                free(st);
            }
            pptr = pptr->getNextEQ();
        } else {
            pptr = pptr->getNextNE();
        }
    }

    if (*result) return mystrdup(result);
    return NULL;
}





// Collect the morphological descriptions produced by check_twosfx_morph()
// for every candidate prefix entry of word.
//
// Resets pfx and sfxappnd, queries the zero-length prefix entries
// (pStart[0]) and then the entries indexed by the first byte of word whose
// key is a leading subset of word, concatenating the returned strings. In
// the general case each contributing entry is recorded in pfx.
//
// Returns a newly allocated copy (mystrdup) of the concatenation, or NULL
// when nothing was collected. The concatenation is limited to MAXLNLEN - 1
// characters; anything that would not fit is cut off instead of overflowing
// the local buffer.
char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,
    char in_compound, const FLAG needflag)
{
    char * st;

    char result[MAXLNLEN];
    result[0] = '\0';

    pfx = NULL;
    sfxappnd = NULL;

    // first handle the special case of 0 length prefixes
    PfxEntry * pe = (PfxEntry *) pStart[0];
    while (pe) {
        st = pe->check_twosfx_morph(word,len,in_compound, needflag);
        if (st) {
            // bounded append: never write past result[MAXLNLEN - 1]
            strncat(result, st, MAXLNLEN - 1 - strlen(result));
            free(st);
        }
        pe = pe->getNext();
    }

    // now handle the general case
    unsigned char sp = *((const unsigned char *)word);
    PfxEntry * pptr = (PfxEntry *)pStart[sp];

    while (pptr) {
        if (isSubset(pptr->getKey(),word)) {
            st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
            if (st) {
                // bounded append: never write past result[MAXLNLEN - 1]
                strncat(result, st, MAXLNLEN - 1 - strlen(result));
                free(st);
                pfx = (AffEntry *)pptr;
            }
            pptr = pptr->getNextEQ();
        } else {
            pptr = pptr->getNextNE();
        }
    }

    if (*result) return mystrdup(result);
    return NULL;
}

#endif // END OF HUNSPELL_EXPERIMENTAL CODE





// Is word a non compound with a REP substitution (see checkcompoundrep)?
//
// For every entry of reptable and every occurrence of its pattern in word,
// builds the candidate obtained by replacing that occurrence with pattern2
// and passes it to candidate_check(). Returns 1 as soon as a candidate is
// accepted, 0 otherwise (also when wl < 2 or there are no REP entries).
//
// The candidate is assembled only after its length has been verified to fit
// into the local MAXLNLEN buffer, so an over-long word can no longer
// overflow it. Entries with an empty pattern are skipped: strstr() would
// match them at the terminating NUL and the scan would step past the end of
// word.
int AffixMgr::cpdrep_check(const char * word, int wl)
{
  char candidate[MAXLNLEN];

  if ((wl < 2) || !numrep) return 0;

  for (int i = 0; i < numrep; i++) {
      const char * pattern = reptable[i].pattern;
      const char * replacement = reptable[i].pattern2;
      const size_t lenp = strlen(pattern);
      const size_t lenr = strlen(replacement);
      if (lenp == 0) continue;

      const char * r = word;
      // search every occurrence of the pattern in the word
      while ((r = strstr(r, pattern)) != NULL) {
          const size_t head = (size_t) (r - word); // bytes before the match
          const char * tail = r + lenp;            // text after the match
          const size_t lent = strlen(tail);

          // the candidate length is the same for every occurrence of this
          // pattern, so give up on the pattern if it does not fit
          if (head + lenr + lent >= (size_t) MAXLNLEN) break;

          memcpy(candidate, word, head);
          memcpy(candidate + head, replacement, lenr);
          memcpy(candidate + head + lenr, tail, lent + 1); // includes the NUL

          if (candidate_check(candidate, (int) (head + lenr + lent))) return 1;
          r++; // search for the next letter
      }
  }
  return 0;
}



// Forbid compounding when a special pattern pair sits at the word bound.
//
// pos is the offset in word at which the second compound part starts.
// Returns 1 when some checkcpdtable entry has pattern2 as a leading subset
// of word + pos and a non-empty pattern, shorter than pos, that ends exactly
// at pos; returns 0 otherwise.
// NOTE(review): a pattern whose length equals pos is not considered
// (pos > len) - confirm that this is intended.
int AffixMgr::cpdpat_check(const char * word, int pos)
{
  for (int k = 0; k < numcheckcpd; k++) {
      // the second part must start with pattern2
      if (!isSubset(checkcpdtable[k].pattern2, word + pos)) continue;

      int plen = strlen(checkcpdtable[k].pattern);
      if (!plen || !(pos > plen)) continue;

      // the first part must end with pattern
      if (strncmp(word + pos - plen, checkcpdtable[k].pattern, plen) == 0) return 1;
  }
  return 0;
}



// Forbid compounding with neighbouring upper and lower case characters at
// word bounds.
//
// pos is the byte offset of the word bound inside word; the characters
// compared are the one starting at pos and the one ending just before it
// (callers are expected to pass pos >= 1). Returns 1 when the bound is
// rejected, 0 otherwise.
//
// In UTF-8 mode a character counts as upper case when unicodetoupper()
// leaves it unchanged; in 8-bit mode the ccase field of csconv is used.
// In both modes a hyphen on either side of the bound never triggers the
// rejection: the hyphen is left unchanged by upper-casing, so without the
// explicit exemption every hyphenated bound would be rejected in UTF-8 mode
// while being accepted in 8-bit mode.
int AffixMgr::cpdcase_check(const char * word, int pos)
{
  if (utf8) {
      w_char u, w;
      const char * p;
      u8_u16(&u, 1, word + pos);
      // step back over UTF-8 continuation bytes (10xxxxxx) to the lead byte
      // of the previous character, never moving before the start of word
      for (p = word + pos - 1; (p > word) && ((*p & 0xc0) == 0x80); p--);
      u8_u16(&w, 1, p);
      unsigned short a = (u.h << 8) + u.l;
      unsigned short b = (w.h << 8) + w.l;
      if (((unicodetoupper(a, langnum) == a) || (unicodetoupper(b, langnum) == b)) &&
          (a != '-') && (b != '-')) return 1;
  } else {
      unsigned char a = *(word + pos - 1);
      unsigned char b = *(word + pos);
      if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;
  }
  return 0;
}



// check compound patterns
//
// Stores rv at (*words)[wnum] and tests the word entries (*words)[0..wnum]
// against each pattern in defcpdtable. A pattern is a sequence of flags in
// def[]; a flag may be followed by the metacharacter '*' or '?'. A word
// matches a flag when TESTAFF finds that flag in the word's astr.
//
// Parameters:
//   words - address of the word-entry array; when *words is NULL, def is
//           used as the array for this call
//   wnum  - index at which rv is stored (the last word to match)
//   rv    - dictionary entry to be checked in position wnum
//   def   - fallback array used when *words is NULL
//   all   - when non-zero, a match is accepted only if the pattern has been
//           consumed completely (trailing '*' / '?' elements may be empty)
//
// Returns 1 when a pattern matches (rv stays stored in *words), 0 otherwise;
// on 0 the slot (*words)[wnum] is cleared and *words is reset to NULL if it
// was NULL on entry.

int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)

{

  signed short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking

  signed short btwp[MAXWORDLEN]; // word positions for metacharacters

  int btnum[MAXWORDLEN]; // number of words matched at each metacharacter position

  short bt = 0;  

  int i;

  int ok;

  int w = 0;

  // no word array yet: borrow the caller-supplied buffer for this call
  if (!*words) {

    w = 1;

    *words = def;

  }

  (*words)[wnum] = rv;



  for (i = 0; i < numdefcpd; i++) {

    signed short pp = 0; // pattern position

    signed short wp = 0; // "words" position

    int ok2;

    ok = 1;

    ok2 = 1;

    do {

      while ((pp < defcpdtable[i].len) && (wp <= wnum)) {

        // pattern element followed by a metacharacter
        if (((pp+1) < defcpdtable[i].len) &&

          ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {

            // '?' may consume at most the current word, '*' any number up to wnum
            int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum;

            ok2 = 1;

            pp+=2;

            // remember where this metacharacter started, for backtracking
            btpp[bt] = pp;

            btwp[bt] = wp;

            // consume as many words carrying the flag def[pp-2] as allowed
            while (wp <= wend) {

                if (!(*words)[wp]->alen || 

                  !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {

                    ok2 = 0;

                    break;

                }

                wp++;

            }

            // words are left over, so the rest of the pattern must match them
            if (wp <= wnum) ok2 = 0;

            btnum[bt] = wp - btwp[bt];

            // keep the backtracking point only if at least one word was consumed
            if (btnum[bt] > 0) bt++;

            if (ok2) break;

        } else {

            // plain pattern element: the current word must carry this flag
            ok2 = 1;

            if (!(*words)[wp] || !(*words)[wp]->alen || 

              !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {

                ok = 0;

                break;

            }

            pp++;

            wp++;

            // pattern exhausted while words remain: no match on this path
            if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0;

        }

      }

    // matched so far: accept if only optional ('*' / '?') elements remain
    if (ok && ok2) { 

        int r = pp;

        while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&

            ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;

        if (defcpdtable[i].len <= r) return 1;

    }    

    // backtrack
    // give one word back at the most recent metacharacter and retry from
    // there; exhausted backtracking points (count below zero) are popped

    if (bt) do {

        ok = 1;

        btnum[bt - 1]--;

        pp = btpp[bt - 1];

        wp = btwp[bt - 1] + btnum[bt - 1];

    } while ((btnum[bt - 1] < 0) && --bt);

  } while (bt);



  // without "all" a match of the words seen so far is enough; with "all"
  // the pattern must have been consumed completely
  if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1; 

  // check zero ending
  // (skip trailing elements that carry '*' or '?', since they may match nothing)

  while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&

    ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;

  if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;

  }

  // no pattern matched: undo the changes made on entry
  (*words)[wnum] = NULL;

  if (w) *words = NULL;

  return 0;

}



// Return 1 if word is accepted either directly by lookup() or, failing
// that, by affix_check(word, len); return 0 otherwise.
inline int AffixMgr::candidate_check(const char * word, int len)
{
  // plain dictionary hit
  if (lookup(word) != NULL) return 1;

  // otherwise the word must be an affixed form
  return (affix_check(word, len) != NULL) ? 1 : 0;
}



// calculate number of syllable for compound-checking

short AffixMgr::get_syllable(const char * word, int wlen)

{

    if (cpdmaxsyllable==0) return 0;

    

    short num=0;



    if (!utf8) {

        for (int i=0; i<wlen; i++) {

            if (strchr(cpdvowels, word[i])) num++;

        }

    } else if (cpdvowels_utf16) {

        w_char w[MAXWORDUTF8LEN];

        int i = u8_u16(w, MAXWORDUTF8LEN, word);

        for (; i; i--) {

            if (flag_bsearch((unsigned short *) cpdvowels_utf16,

                ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;

        }

    }

    return num;

}



// check if compound word is correctly spelled

// hu_mov_rule = spec. Hungarian rule (XXX)

struct hentry * AffixMgr::compound_check(const char * word, int len, 

    short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,

    char hu_mov_rule = 0, int * cmpdstemnum = NULL, int * cmpdstem = NULL, char is_sug = 0)

{

    int i; 

    short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;

    int oldcmpdstemnum = 0;

    struct hentry * rv = NULL;

    struct hentry * rv_first;

    struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking

    char st [MAXWORDUTF8LEN + 4];

    char ch;

    int cmin;

    int cmax;

    

    int checked_prefix;



#ifdef HUNSTEM

    if (cmpdstemnum) {

        if (wordnum == 0) {

            *cmpdstemnum = 1;

        } else {

            (*cmpdstemnum)++;

        }

    }

#endif

    if (utf8) {

        for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) {

          cmin++;

          for (; (word[cmin] & 0xc0) == 0x80; cmin++);

        }

        for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) {

          cmax--;

          for (; (word[cmax] & 0xc0) == 0x80; cmax--);

        }

    } else {

        cmin = cpdmin;

        cmax = len - cpdmin + 1;

    }



    strcpy(st, word);



    for (i = cmin; i < cmax; i++) {



        oldnumsyllable = numsyllable;

        oldwordnum = wordnum;

        checked_prefix = 0;



        // go to end of the UTF-8 character

        if (utf8) {

            for (; (st[i] & 0xc0) == 0x80; i++);

            if (i >= cmax) return NULL;

        }



        

        ch = st[i];

        st[i] = '\0';



        sfx = NULL;

        pfx = NULL;

        

        // FIRST WORD

        

        rv = lookup(st); // perhaps without prefix



        // search homonym with compound flag

        while ((rv) && !hu_mov_rule &&

            ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) ||

                !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||

                  (compoundbegin && !wordnum &&

                        TESTAFF(rv->astr, compoundbegin, rv->alen)) ||

                  (compoundmiddle && wordnum && !words &&

                    TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||

                  (numdefcpd &&

                    ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||

                    (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))

                  ))) {

            rv = rv->next_homonym;

        }



        if (!rv) {

            if (compoundflag && 

             !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {

                if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,

                        FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&

                    ((SfxEntry*)sfx)->getCont() &&

                        ((compoundforbidflag && TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag, 

                            ((SfxEntry*)sfx)->getContLen())) || (compoundend &&

                        TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend, 

                            ((SfxEntry*)sfx)->getContLen())))) {

                        rv = NULL;

                }

            }

            if (rv ||

              (((wordnum == 0) && compoundbegin &&

                ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_…
Large files files are truncated, but you can click here to view the full file