hunspell.cpp - This is a C++ implementation of the Hunspell…

/extensions/spellcheck/hunspell/src/hunspell.cpp

http://github.com/zpao/v8monkey · C++ · 2060 lines · 1753 code · 146 blank · 161 comment · 611 complexity · 29929269e48533727bddc53f90b211c9 MD5 · raw file

/******* BEGIN LICENSE BLOCK *******
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 * 
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 * 
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 * 
 * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
 * and László Németh (Hunspell). Portions created by the Initial Developers
 * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
 * 
 * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
 *                 David Einstein (deinst@world.std.com)
 *                 László Németh (nemethl@gyorsposta.hu)
 *                 Caolan McNamara (caolanm@redhat.com)
 *                 Davide Prina
 *                 Giuseppe Modugno
 *                 Gianluca Turconi
 *                 Simon Brouwer
 *                 Noll Janos
 *                 Biro Arpad
 *                 Goldman Eleonora
 *                 Sarlos Tamas
 *                 Bencsath Boldizsar
 *                 Halacsy Peter
 *                 Dvornik Laszlo
 *                 Gefferth Andras
 *                 Nagy Viktor
 *                 Varga Daniel
 *                 Chris Halls
 *                 Rene Engelhard
 *                 Bram Moolenaar
 *                 Dafydd Jones
 *                 Harri Pitkanen
 *                 Andras Timar
 *                 Tor Lillqvist
 * 
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 ******* END LICENSE BLOCK *******/

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#include "hunspell.hxx"
#include "hunspell.h"
#ifndef MOZILLA_CLIENT
#    include "config.h"
#endif
#include "csutil.hxx"

// Build a spell checker from an affix file and a primary dictionary file.
//   affpath - path of the affix file; a private copy is kept in affixpath
//             so that add_dic() can reuse it
//   dpath   - path of the primary dictionary file
//   key     - forwarded unchanged to HashMgr and AffixMgr
Hunspell::Hunspell(const char * affpath, const char * dpath, const char * key)
{
    // start from a known state before any manager exists
    csconv = NULL;
    encoding = NULL;
    complexprefixes = 0;
    utf8 = 0;
    maxdic = 0;
    affixpath = mystrdup(affpath);

    // 1) hash manager for the primary dictionary
    pHMgr[0] = new HashMgr(dpath, affpath, key);
    if (pHMgr[0] != NULL) {
        maxdic = 1;
    }

    // 2) affix manager; it is handed the hash manager table and a pointer
    //    to the live dictionary count
    pAMgr = new AffixMgr(affpath, pHMgr, &maxdic, key);

    // 3) cache the per-dictionary settings held by the affix manager
    char * trychars = pAMgr->get_try_string();
    encoding = pAMgr->get_encoding();
    langnum = pAMgr->get_langnum();
    utf8 = pAMgr->get_utf8();
    if (!utf8) {
        // the 8-bit conversion table is only fetched for non-UTF-8 dictionaries
        csconv = get_current_cs(encoding);
    }
    complexprefixes = pAMgr->get_complexprefixes();
    wordbreak = pAMgr->get_breaktable();

    // 4) suggestion manager; the try string is released once it has been
    //    passed to the SuggestMgr constructor
    pSMgr = new SuggestMgr(trychars, MAXSUGGESTION, pAMgr);
    if (trychars != NULL) {
        free(trychars);
    }
}

// Destroy the managers and release the strings acquired by the constructor.
Hunspell::~Hunspell()
{
    // deleting a null pointer is a no-op, so no explicit guards are needed
    delete pSMgr;
    delete pAMgr;
    for (int i = 0; i < maxdic; ++i) {
        delete pHMgr[i];
    }
    maxdic = 0;
    pSMgr = NULL;
    pAMgr = NULL;

#ifdef MOZILLA_CLIENT
    // only the Mozilla build releases the conversion table here
    delete [] csconv;
#endif
    csconv = NULL;

    if (encoding != NULL) {
        free(encoding);
    }
    encoding = NULL;

    if (affixpath != NULL) {
        free(affixpath);
    }
    affixpath = NULL;
}

// load extra dictionaries
int Hunspell::add_dic(const char * dpath, const char * key) {
    if (maxdic == MAXDIC || !affixpath) return 1;
    pHMgr[maxdic] = new HashMgr(dpath, affixpath, key);
    if (pHMgr[maxdic]) maxdic++; else return 1;
    return 0;
}

// Copy src into dest without its leading blanks and trailing periods, and
// classify the capitalization of what is left.
//   dest     - receives the cleaned, NUL-terminated word
//   dest_utf - in UTF-8 mode receives the u8_u16() conversion of dest
//   nc       - receives the u8_u16() result in UTF-8 mode, otherwise the
//              byte length of dest
//   pcaptype - receives the capitalization type
//   pabbrev  - receives the number of trailing periods that were removed
// Returns the byte length of dest, or 0 when nothing is left after cleaning
// or when u8_u16() reports MAXWORDLEN or more (too long a word).
int Hunspell::cleanword2(char * dest, const char * src,
    w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev)
{
   // step past the leading blanks
   const char * start = src;
   while (*start == ' ') ++start;

   // peel the trailing periods off, counting them
   int len = strlen(start);
   int dots = 0;
   for (; (len > 0) && (start[len - 1] == '.'); --len) ++dots;
   *pabbrev = dots;

   // an empty word cannot carry any capitalization
   if (len <= 0) {
       *pcaptype = NOCAP;
       dest[0] = '\0';
       return 0;
   }

   // the kept range contains no NUL, so a plain byte copy is enough
   memcpy(dest, start, len);
   dest[len] = '\0';

   if (!utf8) {
      *pcaptype = get_captype(dest, len, csconv);
      *nc = len;
      return len;
   }

   *nc = u8_u16(dest_utf, MAXWORDLEN, dest);
   // don't check too long words
   if (*nc >= MAXWORDLEN) return 0;
   // big Unicode character (non BMP area): no capitalization analysis
   if (*nc == -1) {
      *pcaptype = NOCAP;
      return len;
   }
   *pcaptype = get_captype_utf8(dest_utf, *nc, langnum);
   return len;
}

int Hunspell::cleanword(char * dest, const char * src,
    int * pcaptype, int * pabbrev)
{
   unsigned char * p = (unsigned char *) dest;
   const unsigned char * q = (const unsigned char * ) src;
   int firstcap = 0;

   // first skip over any leading blanks
   while ((*q != '\0') && (*q == ' ')) q++;

   // now strip off any trailing periods (recording their presence)
   *pabbrev = 0;
   int nl = strlen((const char *)q);
   while ((nl > 0) && (*(q+nl-1)=='.')) {
       nl--;
       (*pabbrev)++;
   }

   // if no characters are left it can't be capitalized
   if (nl <= 0) {
       *pcaptype = NOCAP;
       *p = '\0';
       return 0;
   }

   // now determine the capitalization type of the first nl letters
   int ncap = 0;
   int nneutral = 0;
   int nc = 0;

   if (!utf8) {
      while (nl > 0) {
         nc++;
         if (csconv[(*q)].ccase) ncap++;
         if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++;
         *p++ = *q++;
         nl--;
      }
      // remember to terminate the destination string
      *p = '\0';
      firstcap = csconv[(unsigned char)(*dest)].ccase;
   } else {
      unsigned short idx;
      w_char t[MAXWORDLEN];
      nc = u8_u16(t, MAXWORDLEN, src);
      for (int i = 0; i < nc; i++) {
         idx = (t[i].h << 8) + t[i].l;
         unsigned short low = unicodetolower(idx, langnum);
         if (idx != low) ncap++;
         if (unicodetoupper(idx, langnum) == low) nneutral++;
      }
      u16_u8(dest, MAXWORDUTF8LEN, t, nc);
      if (ncap) {
         idx = (t[0].h << 8) + t[0].l;
         firstcap = (idx != unicodetolower(idx, langnum));
      }
   }

   // now finally set the captype
   if (ncap == 0) {
        *pcaptype = NOCAP;
   } else if ((ncap == 1) && firstcap) {
        *pcaptype = INITCAP;
   } else if ((ncap == nc) || ((ncap + nneutral) == nc)){
        *pcaptype = ALLCAP;
   } else if ((ncap > 1) && firstcap) {
        *pcaptype = HUHINITCAP;
   } else {
        *pcaptype = HUHCAP;
   }
   return strlen(dest);
}

void Hunspell::mkallcap(char * p)
{
  if (utf8) {
      w_char u[MAXWORDLEN];
      int nc = u8_u16(u, MAXWORDLEN, p);
      unsigned short idx;
      for (int i = 0; i < nc; i++) {
         idx = (u[i].h << 8) + u[i].l;
         if (idx != unicodetoupper(idx, langnum)) {
            u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8);
            u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF);
         }
      }
      u16_u8(p, MAXWORDUTF8LEN, u, nc);
  } else {
    while (*p != '\0') {
        *p = csconv[((unsigned char) *p)].cupper;
        p++;
    }
  }
}

// Upper-case a word whose UTF-16 form is already available.
//   p  - the byte string; rewritten in place
//   u  - UTF-16 form of the word (used and updated in UTF-8 mode only)
//   nc - number of entries in u
// Returns strlen(p) after the rewrite in UTF-8 mode, otherwise nc unchanged.
int Hunspell::mkallcap2(char * p, w_char * u, int nc)
{
  if (!utf8) {
    for (char * c = p; *c != '\0'; ++c) {
        *c = csconv[(unsigned char) *c].cupper;
    }
    return nc;
  }

  for (int i = 0; i < nc; ++i) {
     const unsigned short code = (u[i].h << 8) + u[i].l;
     const unsigned short raised = unicodetoupper(code, langnum);
     if (raised == code) continue;
     u[i].h = (unsigned char) (raised >> 8);
     u[i].l = (unsigned char) (raised & 0x00FF);
  }
  u16_u8(p, MAXWORDUTF8LEN, u, nc);
  return strlen(p);
}


void Hunspell::mkallsmall(char * p)
{
    while (*p != '\0') {
        *p = csconv[((unsigned char) *p)].clower;
        p++;
    }
}

// Lower-case a word whose UTF-16 form is already available.
//   p  - the byte string; rewritten in place
//   u  - UTF-16 form of the word (used and updated in UTF-8 mode only)
//   nc - number of entries in u
// Returns strlen(p) after the rewrite in UTF-8 mode, otherwise nc unchanged.
int Hunspell::mkallsmall2(char * p, w_char * u, int nc)
{
  if (!utf8) {
    for (char * c = p; *c != '\0'; ++c) {
        *c = csconv[(unsigned char) *c].clower;
    }
    return nc;
  }

  for (int i = 0; i < nc; ++i) {
     const unsigned short code = (u[i].h << 8) + u[i].l;
     const unsigned short lowered = unicodetolower(code, langnum);
     if (lowered == code) continue;
     u[i].h = (unsigned char) (lowered >> 8);
     u[i].l = (unsigned char) (lowered & 0x00FF);
  }
  u16_u8(p, MAXWORDUTF8LEN, u, nc);
  return strlen(p);
}

// Convert UTF-8 sharp s sequences to their Latin-1 form.
// Copies source to dest, replacing every two-byte sequence C3 9F (UTF-8
// sharp s) with the single byte DF (Latin-1 sharp s). All other bytes are
// copied unchanged; in particular a 9F byte that does not follow C3 is kept
// as it is instead of overwriting the byte before it.
// dest must have room for strlen(source) + 1 bytes. Returns dest.
char * Hunspell::sharps_u8_l1(char * dest, char * source) {
    char * out = dest;
    for (const char * in = source; *in != '\0'; ++in) {
        if ((in[0] == '\xC3') && (in[1] == '\x9F')) {
            *out++ = '\xDF';
            ++in;               // the continuation byte is consumed as well
        } else {
            *out++ = *in;
        }
    }
    *out = '\0';
    return dest;
}

// Recursive search for the right "ss" / sharp s permutation.
//   base   - start of the word buffer; modified in place during the search
//   pos    - where the search for the next "ss" starts
//   n      - number of "ss" occurrences handled so far (limited by MAXSHARPS)
//   repnum - number of replacements currently present in base
//   tmp    - scratch buffer for the Latin-1 form in non-UTF-8 mode
//   info, root - passed through to checkword()
// Returns the entry accepted by checkword(), or NULL when no permutation
// with at least one replacement is accepted.
hentry * Hunspell::spellsharps(char * base, char * pos, int n,
        int repnum, char * tmp, int * info, char **root) {
    char * hit = strstr(pos, "ss");

    // leaf: nothing more to replace (or the limit was reached)
    if (!hit || (n >= MAXSHARPS)) {
        if (repnum <= 0) return NULL;   // unchanged word: not our job
        if (utf8) return checkword(base, info, root);
        return checkword(sharps_u8_l1(tmp, base), info, root);
    }

    // branch 1: this "ss" becomes a (UTF-8 encoded) sharp s
    hit[0] = '\xC3';
    hit[1] = '\x9F';
    hentry * found = spellsharps(base, hit + 2, n + 1, repnum + 1, tmp, info, root);
    if (found) return found;

    // branch 2: restore the "ss" and continue behind it
    hit[0] = 's';
    hit[1] = 's';
    return spellsharps(base, hit + 2, n + 1, repnum, tmp, info, root);
}

// Return 1 when the entry carries the keepcase flag of the affix manager,
// 0 otherwise (also when there is no affix manager, the entry has no flags,
// or no keepcase flag is configured).
int Hunspell::is_keepcase(const hentry * rv) {
    if (!pAMgr || !rv->astr || !pAMgr->get_keepcase()) return 0;
    return TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen) ? 1 : 0;
}

/* Insert a copy of word at the beginning of the suggestion array.
 *   slst - address of the suggestion array
 *   word - word to copy (with mystrdup) into slot 0
 *   ns   - number of suggestions currently stored
 * When the array already holds MAXSUGGESTION entries the last one is freed
 * to make room. Returns the new number of suggestions; ns is returned
 * unchanged when there is no array to insert into, when ns is negative
 * (suggest() uses -1 for an allocation failure), or when the copy of word
 * cannot be allocated. */
int Hunspell::insert_sug(char ***slst, char * word, int ns) {
    // nothing to insert into: do not touch a missing list
    if (!slst || !*slst || ns < 0) return ns;
    char * dup = mystrdup(word);
    if (!dup) return ns;
    if (ns == MAXSUGGESTION) {
        ns--;
        free((*slst)[ns]);
    }
    // shift the existing entries one slot up
    for (int k = ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1];
    (*slst)[0] = dup;
    return ns + 1;
}

// Check the spelling of one word.
//   word - NUL-terminated input word
//   info - optional; reset to 0 on entry, then receives SPELL_* flags
//          (SPELL_ORIGCAP, SPELL_WARN here, plus the flags checkword() sets)
//   root - optional; reset to NULL on entry and passed to checkword(), which
//          stores a mystrdup() copy of the root for affixed/compound matches
// Returns 0 when the word is rejected (also for over-long input or when the
// WARN flag is combined with forbidwarn), HUNSPELL_OK or HUNSPELL_OK_WARN
// when an entry was found, and 1 for the SPELL_XML marker, empty input, a
// missing dictionary, plain numbers and words accepted via break patterns.
//
// The SPELL_* values are tested with '&' and are therefore combined with
// '|=' / '&= ~' here: arithmetic '+=' on a flag that is already set (the
// ALLCAP case falls through into INITCAP, which sets SPELL_ORIGCAP again)
// would corrupt the mask.
int Hunspell::spell(const char * word, int * info, char ** root)
{
  struct hentry * rv=NULL;
  // need larger vector. For example, Turkish capital letter I converted a
  // 2-byte UTF-8 character (dotless i) by mkallsmall.
  char cw[MAXWORDUTF8LEN];
  char wspace[MAXWORDUTF8LEN];
  w_char unicw[MAXWORDLEN];

  // reset the outputs first so that every early return leaves them defined
  int info2 = 0;
  if (!info) info = &info2; else *info = 0;
  if (root) *root = NULL;

  // Hunspell supports XML input of the simplified API (see manual)
  if (strcmp(word, SPELL_XML) == 0) return 1;
  int nc = strlen(word);
  int wl2 = 0;
  if (utf8) {
    if (nc >= MAXWORDUTF8LEN) return 0;
  } else {
    if (nc >= MAXWORDLEN) return 0;
  }
  int captype = 0;
  int abbv = 0;
  int wl = 0;

  // input conversion
  RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
  if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
  else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);

  if (wl == 0 || maxdic == 0) return 1;

  // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.)
  enum { NBEGIN, NNUM, NSEP };
  int nstate = NBEGIN;
  int i;

  for (i = 0; (i < wl); i++) {
    if ((cw[i] <= '9') && (cw[i] >= '0')) {
        nstate = NNUM;
    } else if ((cw[i] == ',') || (cw[i] == '.') || (cw[i] == '-')) {
        if ((nstate == NSEP) || (i == 0)) break;
        nstate = NSEP;
    } else break;
  }
  if ((i == wl) && (nstate == NNUM)) return 1;

  switch(captype) {
     case HUHCAP:
     case HUHINITCAP:
            *info |= SPELL_ORIGCAP;
            // fall through
     case NOCAP: {
            rv = checkword(cw, info, root);
            if ((abbv) && !(rv)) {
                // retry with one trailing period (abbreviation form)
                memcpy(wspace,cw,wl);
                *(wspace+wl) = '.';
                *(wspace+wl+1) = '\0';
                rv = checkword(wspace, info, root);
            }
            break;
         }
     case ALLCAP: {
            *info |= SPELL_ORIGCAP;
            rv = checkword(cw, info, root);
            if (rv) break;
            if (abbv) {
                memcpy(wspace,cw,wl);
                *(wspace+wl) = '.';
                *(wspace+wl+1) = '\0';
                rv = checkword(wspace, info, root);
                if (rv) break;
            }
            // Spec. prefix handling for Catalan, French, Italian:
            // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
            if (pAMgr && strchr(cw, '\'')) {
                wl = mkallsmall2(cw, unicw, nc);
                // There are no really sane circumstances where this could fail,
                // but anyway...
                if (char * apostrophe = strchr(cw, '\'')) {
                    if (utf8) {
                        w_char tmpword[MAXWORDLEN];
                        // count the characters in front of the apostrophe
                        *apostrophe = '\0';
                        wl2 = u8_u16(tmpword, MAXWORDLEN, cw);
                        *apostrophe = '\'';
                        if (wl2 < nc) {
                            mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1);
                            rv = checkword(cw, info, root);
                            if (rv) break;
                        }
                    } else {
                        mkinitcap2(apostrophe + 1, unicw, nc);
                        rv = checkword(cw, info, root);
                        if (rv) break;
                    }
                }
                mkinitcap2(cw, unicw, nc);
                rv = checkword(cw, info, root);
                if (rv) break;
            }
            if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) {
                char tmpword[MAXWORDUTF8LEN];
                wl = mkallsmall2(cw, unicw, nc);
                memcpy(wspace,cw,(wl+1));
                rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
                if (!rv) {
                    wl2 = mkinitcap2(cw, unicw, nc);
                    rv = spellsharps(cw, cw, 0, 0, tmpword, info, root);
                }
                if ((abbv) && !(rv)) {
                    *(wspace+wl) = '.';
                    *(wspace+wl+1) = '\0';
                    rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
                    if (!rv) {
                        memcpy(wspace, cw, wl2);
                        *(wspace+wl2) = '.';
                        *(wspace+wl2+1) = '\0';
                        rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
                    }
                }
                if (rv) break;
            }
        }
        // fall through: an ALLCAP word is also tried in its INITCAP and
        // lower case forms
     case INITCAP: {
             *info |= SPELL_ORIGCAP;
             wl = mkallsmall2(cw, unicw, nc);
             memcpy(wspace,cw,(wl+1));          // wspace: lower case form
             wl2 = mkinitcap2(cw, unicw, nc);   // cw: initial capital form
             // SPELL_INITCAP is only set while checkword() runs
             if (captype == INITCAP) *info |= SPELL_INITCAP;
             rv = checkword(cw, info, root);
             if (captype == INITCAP) *info &= ~SPELL_INITCAP;
             // forbid bad capitalization
             // (for example, ijs -> Ijs instead of IJs in Dutch)
             // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag)
             if (*info & SPELL_FORBIDDEN) {
                rv = NULL;
                break;
             }
             if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL;
             if (rv) break;

             rv = checkword(wspace, info, root);
             if (abbv && !rv) {

                 *(wspace+wl) = '.';
                 *(wspace+wl+1) = '\0';
                 rv = checkword(wspace, info, root);
                 if (!rv) {
                    memcpy(wspace, cw, wl2);
                    *(wspace+wl2) = '.';
                    *(wspace+wl2+1) = '\0';
                    if (captype == INITCAP) *info |= SPELL_INITCAP;
                    rv = checkword(wspace, info, root);
                    if (captype == INITCAP) *info &= ~SPELL_INITCAP;
                    if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL;
                    break;
                 }
             }
             if (rv && is_keepcase(rv) &&
                ((captype == ALLCAP) ||
                   // if CHECKSHARPS: KEEPCASE words with \xDF  are allowed
                   // in INITCAP form, too.
                   !(pAMgr->get_checksharps() &&
                      ((utf8 && strstr(wspace, "\xC3\x9F")) ||
                      (!utf8 && strchr(wspace, '\xDF')))))) rv = NULL;
             break;
           }
  }

  if (rv) {
      if (pAMgr && pAMgr->get_warn() && rv->astr &&
          TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) {
              *info |= SPELL_WARN;
              if (pAMgr->get_forbidwarn()) return 0;
              return HUNSPELL_OK_WARN;
      }
      return HUNSPELL_OK;
  }

  // recursive breaking at break points
  if (wordbreak) {
    char * s;
    char r;
    int nbr = 0;
    wl = strlen(cw);
    int numbreak = pAMgr ? pAMgr->get_numbreak() : 0;

    // calculate break points for recursion limit
    for (int j = 0; j < numbreak; j++) {
      s = cw;
      do {
        s = (char *) strstr(s, wordbreak[j]);
        if (s) {
            nbr++;
            s++;
        }
      } while (s);
    }
    if (nbr >= 10) return 0;

    // check boundary patterns (^begin and end$)
    for (int j = 0; j < numbreak; j++) {
      int plen = strlen(wordbreak[j]);
      if (plen == 1 || plen > wl) continue;
      if (wordbreak[j][0] == '^' && strncmp(cw, wordbreak[j] + 1, plen - 1) == 0
        && spell(cw + plen - 1)) return 1;
      if (wordbreak[j][plen - 1] == '$' &&
        strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) {
          // cut the pattern off temporarily and check the remainder
          r = cw[wl - plen + 1];
          cw[wl - plen + 1] = '\0';
          if (spell(cw)) return 1;
          cw[wl - plen + 1] = r;
      }
    }

    // other patterns
    for (int j = 0; j < numbreak; j++) {
      int plen = strlen(wordbreak[j]);
      s=(char *) strstr(cw, wordbreak[j]);
      if (s && (s > cw) && (s < cw + wl - plen)) {
        if (!spell(s + plen)) continue;
        r = *s;
        *s = '\0';
        // examine 2 sides of the break point
        if (spell(cw)) return 1;
        *s = r;

        // LANG_hu: spec. dash rule
        if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) {
          r = s[1];
          s[1] = '\0';
          if (spell(cw)) return 1; // check the first part with dash
          s[1] = r;
        }
        // end of LANG specific region

      }
    }
  }

  return 0;
}

// Look one (already cleaned) word form up in the dictionaries.
//   w    - word form to check
//   info - optional flag mask; SPELL_FORBIDDEN and SPELL_COMPOUND are set
//          here, SPELL_INITCAP is read to reject ONLYUPCASEFLAG entries
//   root - optional; receives a mystrdup() copy of the root word when the
//          match came from affix stripping or compound analysis
// Returns the matching entry or NULL. The lookup order is: plain hash table
// lookup in every dictionary, then affix_check(), then compound_check()
// when the affix manager reports compounding.
//
// The flags are set with '|=' because spell() calls this function several
// times with the same mask; adding an already-set flag arithmetically would
// corrupt the mask.
struct hentry * Hunspell::checkword(const char * w, int * info, char ** root)
{
  struct hentry * he = NULL;
  int len, i;
  char w2[MAXWORDUTF8LEN];
  const char * word;

  // work on a private copy when ignored characters have to be removed
  char * ignoredchars = pAMgr->get_ignore();
  if (ignoredchars != NULL) {
     strcpy(w2, w);
     if (utf8) {
        int ignoredchars_utf16_len;
        unsigned short * ignoredchars_utf16 = pAMgr->get_ignore_utf16(&ignoredchars_utf16_len);
        remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len);
     } else {
        remove_ignored_chars(w2,ignoredchars);
     }
     word = w2;
  } else word = w;

  len = strlen(word);

  if (!len)
      return NULL;

  // word reversing wrapper for complex prefixes
  if (complexprefixes) {
    if (word != w2) {
      strcpy(w2, word);
      word = w2;
    }
    if (utf8) reverseword_utf(w2); else reverseword(w2);
  }

  // look word in hash table
  for (i = 0; (i < maxdic) && !he; i ++) {
    he = (pHMgr[i])->lookup(word);

    // check forbidden and onlyincompound words
    if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
      if (info) *info |= SPELL_FORBIDDEN;
      // LANG_hu section: set dash information for suggestions
      if (langnum == LANG_hu) {
          if (pAMgr->get_compoundflag() &&
              TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) {
                  if (info) *info |= SPELL_COMPOUND;
          }
      }
      return NULL;
    }

    // he = next not needaffix, onlyincompound homonym or onlyupcase word
    while (he && (he->astr) &&
      ((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) ||
         (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
         (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen))
      )) he = he->next_homonym;
  }

  // check with affixes
  if (!he && pAMgr) {
     // try stripping off affixes
     he = pAMgr->affix_check(word, len, 0);

     // check compound restriction and onlyupcase
     if (he && he->astr && (
        (pAMgr->get_onlyincompound() &&
            TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
        (info && (*info & SPELL_INITCAP) &&
            TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) {
            he = NULL;
     }

     if (he) {
        if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
            if (info) *info |= SPELL_FORBIDDEN;
            return NULL;
        }
        if (root) {
            *root = mystrdup(he->word);
            // the stored word is reversed for complex prefixes; undo that
            if (*root && complexprefixes) {
                if (utf8) reverseword_utf(*root); else reverseword(*root);
            }
        }
     // try check compound word
     } else if (pAMgr->get_compound()) {
          he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0, info);
          // LANG_hu section: `moving rule' with last dash
          if ((!he) && (langnum == LANG_hu) && (word[len-1] == '-')) {
             char * dup = mystrdup(word);
             if (!dup) return NULL;
             dup[len-1] = '\0';
             he = pAMgr->compound_check(dup, len-1, -5, 0, 100, 0, NULL, 1, 0, info);
             free(dup);
          }
          // end of LANG specific region
          if (he) {
                if (root) {
                    *root = mystrdup(he->word);
                    if (*root && complexprefixes) {
                        if (utf8) reverseword_utf(*root); else reverseword(*root);
                    }
                }
                if (info) *info |= SPELL_COMPOUND;
          }
     }

  }

  return he;
}

// Collect spelling suggestions for a word.
//   slst - receives the suggestion array (set to NULL first; it is also NULL
//          on return when no suggestion is left)
//   word - NUL-terminated input word; input starting with the SPELL_XML
//          marker is handed to spellml()
// Returns the number of suggestions stored in *slst, 0 when there is none
// (or the word is too long / no suggestion manager or dictionary exists),
// and -1 when the list for the FORCEUCASE shortcut cannot be allocated.
// NOTE(review): the caller presumably releases the result with free_list()
// -- confirm against the public API documentation.
//
// Allocation results are checked before use: a failed realloc() keeps the
// existing suggestion instead of losing (and then dereferencing) it, and a
// failed mystrdup() never leaves a NULL entry in the list.
int Hunspell::suggest(char*** slst, const char * word)
{
  int onlycmpdsug = 0;
  char cw[MAXWORDUTF8LEN];
  char wspace[MAXWORDUTF8LEN];
  if (!pSMgr || maxdic == 0) return 0;
  w_char unicw[MAXWORDLEN];
  *slst = NULL;
  // process XML input of the simplified API (see manual)
  if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) {
     return spellml(slst, word);
  }
  int nc = strlen(word);
  if (utf8) {
    if (nc >= MAXWORDUTF8LEN) return 0;
  } else {
    if (nc >= MAXWORDLEN) return 0;
  }
  int captype = 0;
  int abbv = 0;
  int wl = 0;

  // input conversion
  RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
  if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
  else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);

  if (wl == 0) return 0;
  int ns = 0;
  int capwords = 0;

  // check capitalized form for FORCEUCASE
  if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) {
    int info = SPELL_ORIGCAP;
    char ** wlst;
    if (checkword(cw, &info, NULL)) {
        if (*slst) {
            wlst = *slst;
        } else {
            wlst = (char **) malloc(MAXSUGGESTION * sizeof(char *));
            if (wlst == NULL) return -1;
            *slst = wlst;
            for (int i = 0; i < MAXSUGGESTION; i++) {
                wlst[i] = NULL;
            }
        }
        wlst[0] = mystrdup(cw);
        if (wlst[0] == NULL) {
            // out of memory: hand back no list at all
            free(wlst);
            *slst = NULL;
            return -1;
        }
        mkinitcap(wlst[0]);
        return 1;
    }
  }

  switch(captype) {
     case NOCAP:   {
                     ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
                     break;
                   }

     case INITCAP: {
                     capwords = 1;
                     ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
                     if (ns == -1) break;
                     memcpy(wspace,cw,(wl+1));
                     mkallsmall2(wspace, unicw, nc);
                     ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
                     break;
                   }
     case HUHINITCAP:
                    capwords = 1;
                    // fall through
     case HUHCAP: {
                     ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
                     if (ns != -1) {
                        int prevns;
                        // something.The -> something. The
                        char * dot = strchr(cw, '.');
                        if (dot && (dot > cw)) {
                            int captype_;
                            if (utf8) {
                               w_char w_[MAXWORDLEN];
                               int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1);
                               captype_ = get_captype_utf8(w_, wl_, langnum);
                            } else captype_ = get_captype(dot+1, strlen(dot+1), csconv);
                            if (captype_ == INITCAP) {
                                // one byte more than cw: room for the blank
                                char * st = (char *) malloc(strlen(cw) + 2);
                                if (st) {
                                    strcpy(st, cw);
                                    st[(dot - cw) + 1] = ' ';
                                    strcpy(st + (dot - cw) + 2, dot + 1);
                                    ns = insert_sug(slst, st, ns);
                                    free(st);
                                }
                            }
                        }
                        if (captype == HUHINITCAP) {
                            // TheOpenOffice.org -> The OpenOffice.org
                            memcpy(wspace,cw,(wl+1));
                            mkinitsmall2(wspace, unicw, nc);
                            ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
                        }
                        memcpy(wspace,cw,(wl+1));
                        mkallsmall2(wspace, unicw, nc);
                        if (spell(wspace)) ns = insert_sug(slst, wspace, ns);
                        prevns = ns;
                        ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
                        if (captype == HUHINITCAP) {
                            mkinitcap2(wspace, unicw, nc);
                            if (spell(wspace)) ns = insert_sug(slst, wspace, ns);
                            ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
                        }
                        // aNew -> "a New" (instead of "a new")
                        for (int j = prevns; j < ns; j++) {
                           char * space = strchr((*slst)[j],' ');
                           if (space) {
                                int slen = strlen(space + 1);
                                // different case after space (need capitalisation)
                                if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) {
                                    w_char w[MAXWORDLEN];
                                    int wc = 0;
                                    char * r = (*slst)[j];
                                    if (utf8) wc = u8_u16(w, MAXWORDLEN, space + 1);
                                    mkinitcap2(space + 1, w, wc);
                                    // set as first suggestion
                                    for (int k = j; k > 0; k--) (*slst)[k] = (*slst)[k - 1];
                                    (*slst)[0] = r;
                                }
                           }
                        }
                     }
                     break;
                   }

     case ALLCAP: {
                     memcpy(wspace, cw, (wl+1));
                     mkallsmall2(wspace, unicw, nc);
                     ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
                     if (ns == -1) break;
                     if (pAMgr && pAMgr->get_keepcase() && spell(wspace))
                        ns = insert_sug(slst, wspace, ns);
                     mkinitcap2(wspace, unicw, nc);
                     ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
                     for (int j=0; j < ns; j++) {
                        mkallcap((*slst)[j]);
                        if (pAMgr && pAMgr->get_checksharps()) {
                            char * pos;
                            if (utf8) {
                                // same byte length: overwrite in place
                                pos = strstr((*slst)[j], "\xC3\x9F");
                                while (pos) {
                                    *pos = 'S';
                                    *(pos+1) = 'S';
                                    pos = strstr(pos+2, "\xC3\x9F");
                                }
                            } else {
                                // one byte becomes two: grow the string first
                                pos = strchr((*slst)[j], '\xDF');
                                while (pos) {
                                    char * grown = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2);
                                    if (!grown) break;  // keep the unexpanded form
                                    (*slst)[j] = grown;
                                    mystrrep((*slst)[j], "\xDF", "SS");
                                    pos = strchr((*slst)[j], '\xDF');
                                }
                            }
                        }
                     }
                     break;
                   }
  }

 // LANG_hu section: replace '-' with ' ' in Hungarian
  if (langnum == LANG_hu) {
      for (int j=0; j < ns; j++) {
          char * pos = strchr((*slst)[j],'-');
          if (pos) {
              // initialised: spell() has return paths that may leave it alone
              int info = 0;
              char w[MAXWORDUTF8LEN];
              *pos = '\0';
              strcpy(w, (*slst)[j]);
              strcat(w, pos + 1);
              spell(w, &info, NULL);
              if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
                  *pos = ' ';
              } else *pos = '-';
          }
      }
  }
  // END OF LANG_hu section

  // try ngram approach since found nothing or only compound words
  if (pAMgr && (ns == 0 || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0) && (*slst)) {
      switch(captype) {
          case NOCAP: {
              ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic);
              break;
          }
          case HUHINITCAP:
              capwords = 1;
              // fall through
          case HUHCAP: {
              memcpy(wspace,cw,(wl+1));
              mkallsmall2(wspace, unicw, nc);
              ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
              break;
          }
         case INITCAP: {
              capwords = 1;
              memcpy(wspace,cw,(wl+1));
              mkallsmall2(wspace, unicw, nc);
              ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
              break;
          }
          case ALLCAP: {
              memcpy(wspace,cw,(wl+1));
              mkallsmall2(wspace, unicw, nc);
              int oldns = ns;
              ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
              for (int j = oldns; j < ns; j++)
                  mkallcap((*slst)[j]);
              break;
         }
      }
  }

  // try dash suggestion (Afo-American -> Afro-American)
  if (char * pos = strchr(cw, '-')) {
     char * ppos = cw;
     int nodashsug = 1;
     char ** nlst = NULL;
     int nn = 0;
     int last = 0;
     if (*slst) {
        for (int j = 0; j < ns && nodashsug == 1; j++) {
           if (strchr((*slst)[j], '-')) nodashsug = 0;
        }
     }
     // walk the dash separated parts until one of them is misspelled
     while (nodashsug && !last) {
        if (*pos == '\0') last = 1; else *pos = '\0';
        if (!spell(ppos)) {
          nn = suggest(&nlst, ppos);
          for (int j = nn - 1; j >= 0; j--) {
            // prefix + suggestion for this part + rest of the word
            strncpy(wspace, cw, ppos - cw);
            strcpy(wspace + (ppos - cw), nlst[j]);
            if (!last) {
                strcat(wspace, "-");
                strcat(wspace, pos + 1);
            }
            ns = insert_sug(slst, wspace, ns);
            free(nlst[j]);
          }
          if (nlst != NULL) free(nlst);
          nodashsug = 0;
        }
        if (!last) {
          *pos = '-';
          ppos = pos + 1;
          pos = strchr(ppos, '-');
        }
        if (!pos) pos = cw + strlen(cw);
     }
  }

  // word reversing wrapper for complex prefixes
  if (complexprefixes) {
    for (int j = 0; j < ns; j++) {
      if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]);
    }
  }

  // capitalize
  if (capwords) for (int j=0; j < ns; j++) {
      mkinitcap((*slst)[j]);
  }

  // expand suggestions with dot(s)
  if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
    for (int j = 0; j < ns; j++) {
      char * grown = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
      if (!grown) continue;   // keep the suggestion without the dots
      (*slst)[j] = grown;
      strcat(grown, word + strlen(word) - abbv);
    }
  }

  // remove bad capitalized and forbidden forms
  if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) {
  switch (captype) {
    case INITCAP:
    case ALLCAP: {
      int l = 0;
      for (int j=0; j < ns; j++) {
        if (!strchr((*slst)[j],' ') && !spell((*slst)[j])) {
          // rejected as it stands: retry in lower case, then with an
          // initial capital; drop the suggestion when both fail
          char s[MAXSWUTF8L];
          w_char w[MAXSWL];
          int len;
          if (utf8) {
            len = u8_u16(w, MAXSWL, (*slst)[j]);
          } else {
            strcpy(s, (*slst)[j]);
            len = strlen(s);
          }
          mkallsmall2(s, w, len);
          free((*slst)[j]);
          if (spell(s)) {
            (*slst)[l] = mystrdup(s);
            if ((*slst)[l]) l++;
          } else {
            mkinitcap2(s, w, len);
            if (spell(s)) {
              (*slst)[l] = mystrdup(s);
              if ((*slst)[l]) l++;
            }
          }
        } else {
          (*slst)[l] = (*slst)[j];
          l++;
        }
      }
      ns = l;
    }
  }
  }

  // remove duplications
  int l = 0;
  for (int j = 0; j < ns; j++) {
    (*slst)[l] = (*slst)[j];
    for (int k = 0; k < l; k++) {
      if (strcmp((*slst)[k], (*slst)[j]) == 0) {
        free((*slst)[j]);
        l--;
        break;
      }
    }
    l++;
  }
  ns = l;

  // output conversion
  rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
  for (int j = 0; rl && j < ns; j++) {
    if (rl->conv((*slst)[j], wspace)) {
      char * converted = mystrdup(wspace);
      if (converted) {
        free((*slst)[j]);
        (*slst)[j] = converted;
      }
    }
  }

  // if suggestions removed by nosuggest, onlyincompound parameters
  if (l == 0 && *slst) {
    free(*slst);
    *slst = NULL;
  }
  return l;
}

// Public wrapper: passes the list pointer and its element count
// straight through to freelist().
void Hunspell::free_list(char *** slst, int n)
{
  freelist(slst, n);
}

// Accessor for the `encoding` member.
char * Hunspell::get_dic_encoding()
{
  char * enc = encoding;
  return enc;
}

#ifdef HUNSPELL_EXPERIMENTAL
// XXX need UTF-8 support
// Experimental: collect suggestions for `word` through
// pSMgr->suggest_auto(), trying different capitalizations of the cleaned
// word depending on the capitalization type reported by cleanword().
//
//   slst - out: receives the suggestion list (set to NULL before use)
//   word - input word
//
// Returns the number of suggestions, or 0 when there is no suggestion
// manager / dictionary, the word is too long, or cleanword() yields an
// empty word.
int Hunspell::suggest_auto(char*** slst, const char * word)
{
  char cw[MAXWORDUTF8LEN];
  char wspace[MAXWORDUTF8LEN];
  if (!pSMgr || maxdic == 0) return 0;
  int wl = strlen(word);
  if (utf8) {
    if (wl >= MAXWORDUTF8LEN) return 0;
  } else {
    if (wl >= MAXWORDLEN) return 0;
  }
  int captype = 0;
  int abbv = 0;
  wl = cleanword(cw, word, &captype, &abbv);
  if (wl == 0) return 0;
  int ns = 0;
  *slst = NULL; // HU, nsug in pSMgr->suggest

  switch(captype) {
     case NOCAP:   {
                     ns = pSMgr->suggest_auto(slst, cw, ns);
                     break;
                   }

     case INITCAP: {
                     // lowercase form first; its suggestions are
                     // re-capitalized, then the original form is tried
                     memcpy(wspace,cw,(wl+1));
                     mkallsmall(wspace);
                     ns = pSMgr->suggest_auto(slst, wspace, ns);
                     for (int j=0; j < ns; j++)
                       mkinitcap((*slst)[j]);
                     ns = pSMgr->suggest_auto(slst, cw, ns);
                     break;
                   }

     case HUHINITCAP:
     case HUHCAP: {
                     // fall back to the lowercase form only when the
                     // original form produced nothing
                     ns = pSMgr->suggest_auto(slst, cw, ns);
                     if (ns == 0) {
                        memcpy(wspace,cw,(wl+1));
                        mkallsmall(wspace);
                        ns = pSMgr->suggest_auto(slst, wspace, ns);
                     }
                     break;
                   }

     case ALLCAP: {
                     memcpy(wspace,cw,(wl+1));
                     mkallsmall(wspace);
                     ns = pSMgr->suggest_auto(slst, wspace, ns);

                     mkinitcap(wspace);
                     ns = pSMgr->suggest_auto(slst, wspace, ns);

                     // input was all caps: upper-case every suggestion
                     for (int j=0; j < ns; j++)
                       mkallcap((*slst)[j]);
                     break;
                   }
  }

  // word reversing wrapper for complex prefixes
  if (complexprefixes) {
    for (int j = 0; j < ns; j++) {
      if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]);
    }
  }

  // expand suggestions with dot(s): append the last `abbv` characters
  // of the original word to every suggestion
  if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
    for (int j = 0; j < ns; j++) {
      char * grown = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
      // on allocation failure realloc leaves the old block intact:
      // keep the suggestion without its dots rather than losing it
      if (!grown) continue;
      (*slst)[j] = grown;
      strcat(grown, word + strlen(word) - abbv);
    }
  }

  // LANG_hu section: replace '-' with ' ' in Hungarian
  if (langnum == LANG_hu) {
      for (int j=0; j < ns; j++) {
          char * pos = strchr((*slst)[j],'-');
          if (pos) {
              int info;
              char w[MAXWORDUTF8LEN];
              // spell-check the suggestion with its first dash removed
              *pos = '\0';
              strcpy(w, (*slst)[j]);
              strcat(w, pos + 1);
              spell(w, &info, NULL);
              if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
                  *pos = ' ';
              } else *pos = '-';
          }
      }
  }
  // END OF LANG_hu section
  return ns;
}
#endif

int Hunspell::stem(char*** slst, char ** desc, int n)
{
  char result[MAXLNLEN];
  char result2[MAXLNLEN];
  *slst = NULL;
  if (n == 0) return 0;
  *result2 = '\0';
  for (int i = 0; i < n; i++) {
    *result = '\0';
    // add compound word parts (except the last one)
    char * s = (char *) desc[i];
    char * part = strstr(s, MORPH_PART);
    if (part) {
        char * nextpart = strstr(part + 1, MORPH_PART);
        while (nextpart) {
            copy_field(result + strlen(result), part, MORPH_PART);
            part = nextpart;
            nextpart = strstr(part + 1, MORPH_PART);
        }
        s = part;
    }

    char **pl;
    char tok[MAXLNLEN];
    strcpy(tok, s);
    char * alt = strstr(tok, " | ");
    while (alt) {
        alt[1] = MSEP_ALT;
        alt = strstr(alt, " | ");
    }
    int pln = line_tok(tok, &pl, MSEP_ALT);
    for (int k = 0; k < pln; k++) {
        // add derivational suffixes
        if (strstr(pl[k], MORPH_DERI_SFX)) {
            // remove inflectional suffixes
            char * is = strstr(pl[k], MORPH_INFL_SFX);
            if (is) *is = '\0';
            char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]);
            if (sg) {
                char ** gen;
                int genl = line_tok(sg, &gen, MSEP_REC);
                free(sg);
                for (int j = 0; j < genl; j++) {
                    sprintf(result2 + strlen(result2), "%c%s%s",
                            MSEP_REC, result, gen[j]);
                }
                freelist(&gen, genl);
            }
        } else {
            sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result);
            if (strstr(pl[k], MORPH_SURF_PFX)) {
                copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX);
            }
            copy_field(result2 + strlen(result2), pl[k], MORPH_STEM);
        }
    }
    freelist(&pl, pln);
  }
  int sln = line_tok(result2, slst, MSEP_REC);
  return uniqlist(*slst, sln);

}

int Hunspell::stem(char*** slst, const char * word)
{
  char ** pl;
  int pln = analyze(&pl, word);
  int pln2 = stem(slst, pl, pln);
  freelist(&pl, pln);
  return pln2;
}

#ifdef HUNSPELL_EXPERIMENTAL
// Experimental: query pSMgr->suggest_pos_stems() for `word`, retrying
// with other capitalizations (and, for abbreviations, with a trailing
// dot) according to the capitalization type found by cleanword().
// Returns the count reported by the last suggest_pos_stems() call, or 0
// when the input is rejected up front.
int Hunspell::suggest_pos_stems(char*** slst, const char * word)
{
  char cw[MAXWORDUTF8LEN];
  char wspace[MAXWORDUTF8LEN];
  if (! pSMgr || maxdic == 0) return 0;

  int wl = strlen(word);
  const int limit = utf8 ? MAXWORDUTF8LEN : MAXWORDLEN;
  if (wl >= limit) return 0;

  int captype = 0;
  int abbv = 0;
  wl = cleanword(cw, word, &captype, &abbv);
  if (wl == 0) return 0;

  int ns = 0; // ns=0 = normalized input

  *slst = NULL; // HU, nsug in pSMgr->suggest

  if (captype == HUHCAP || captype == NOCAP) {
    ns = pSMgr->suggest_pos_stems(slst, cw, ns);

    // abbreviation without result: retry with a dot appended
    if (abbv && ns == 0) {
      memcpy(wspace, cw, wl);
      wspace[wl] = '.';
      wspace[wl + 1] = '\0';
      ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
    }
  } else if (captype == INITCAP) {
    ns = pSMgr->suggest_pos_stems(slst, cw, ns);

    // nothing found, or first entry starts with '#': try lowercase
    if (ns == 0 || (*slst)[0][0] == '#') {
      memcpy(wspace, cw, wl + 1);
      mkallsmall(wspace);
      ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
    }
  } else if (captype == ALLCAP) {
    ns = pSMgr->suggest_pos_stems(slst, cw, ns);
    if (ns == 0) {
      // lowercase first, then initial capital
      memcpy(wspace, cw, wl + 1);
      mkallsmall(wspace);
      ns = pSMgr->suggest_pos_stems(slst, wspace, ns);

      if (ns == 0) {
        mkinitcap(wspace);
        ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
      }
    }
  }

  return ns;
}
#endif // END OF HUNSPELL_EXPERIMENTAL CODE

// Forwards to the affix manager's get_wordchars().
const char * Hunspell::get_wordchars()
{
  const char * chars = pAMgr->get_wordchars();
  return chars;
}

// Forwards to the affix manager's get_wordchars_utf16(); `len` is
// passed through unchanged.
unsigned short * Hunspell::get_wordchars_utf16(int * len)
{
  unsigned short * chars = pAMgr->get_wordchars_utf16(len);
  return chars;
}

// Upper-case the first character of the NUL-terminated string `p`
// in place.
//
// 8-bit encodings use the csconv table; in UTF-8 mode the string is
// converted to UTF-16, its first unit is mapped with unicodetoupper()
// and the result is written back over `p`.
// NOTE(review): u16_u8() is told that `p` can hold MAXWORDUTF8LEN bytes;
// confirm that every caller's buffer is really that large.
void Hunspell::mkinitcap(char * p)
{
  if (!utf8) {
    if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper;
  } else {
      w_char u[MAXWORDLEN];
      int len = u8_u16(u, MAXWORDLEN, p);
      // Nothing was converted (e.g. empty string): u[0] is not
      // initialized, so there is no first character to capitalize.
      // mkinitcap2() applies the same guard via nc > 0.
      if (len <= 0) return;
      unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
      u[0].h = (unsigned char) (i >> 8);
      u[0].l = (unsigned char) (i & 0x00FF);
      u16_u8(p, MAXWORDUTF8LEN, u, len);
  }
}

// Upper-case the first character of a word held both as bytes (`p`) and
// as UTF-16 units (`u`, `nc` units).
// In UTF-8 mode with nc > 0 the first unit of `u` is upper-cased, `p`
// is regenerated from `u` and strlen(p) is returned.  Otherwise `nc` is
// returned; in 8-bit mode the first byte of `p` is mapped through csconv.
int Hunspell::mkinitcap2(char * p, w_char * u, int nc)
{
  if (utf8) {
      if (nc <= 0) return nc;
      const unsigned short upper = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
      u[0].h = (unsigned char) (upper >> 8);
      u[0].l = (unsigned char) (upper & 0x00FF);
      u16_u8(p, MAXWORDUTF8LEN, u, nc);
      return strlen(p);
  }
  if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper;
  return nc;
}

// Lower-case the first character of a word held both as bytes (`p`) and
// as UTF-16 units (`u`, `nc` units).
// In UTF-8 mode with nc > 0 the first unit of `u` is lower-cased, `p`
// is regenerated from `u` and strlen(p) is returned.  Otherwise `nc` is
// returned; in 8-bit mode the first byte of `p` is mapped through csconv.
int Hunspell::mkinitsmall2(char * p, w_char * u, int nc)
{
  if (utf8) {
      if (nc <= 0) return nc;
      const unsigned short lower = unicodetolower((u[0].h << 8) + u[0].l, langnum);
      u[0].h = (unsigned char) (lower >> 8);
      u[0].l = (unsigned char) (lower & 0x00FF);
      u16_u8(p, MAXWORDUTF8LEN, u, nc);
      return strlen(p);
  }
  if (*p != '\0') *p = csconv[((unsigned char)*p)].clower;
  return nc;
}

// Forward `word` to add() of the first hash manager; returns 0 when
// pHMgr[0] is not set.
int Hunspell::add(const char * word)
{
    if (!pHMgr[0]) return 0;
    return (pHMgr[0])->add(word);
}

// Forward `word` and `example` to add_with_affix() of the first hash
// manager; returns 0 when pHMgr[0] is not set.
int Hunspell::add_with_affix(const char * word, const char * example)
{
    if (!pHMgr[0]) return 0;
    return (pHMgr[0])->add_with_affix(word, example);
}

// Forward `word` to remove() of the first hash manager; returns 0 when
// pHMgr[0] is not set.
int Hunspell::remove(const char * word)
{
    if (!pHMgr[0]) return 0;
    return (pHMgr[0])->remove(word);
}

// Forwards to the affix manager's get_version().
const char * Hunspell::get_version()
{
  const char * version = pAMgr->get_version();
  return version;
}

// Accessor for the `csconv` character conversion table.
struct cs_info * Hunspell::get_csconv()
{
  struct cs_info * table = csconv;
  return table;
}

// Append `st` to `result` (limit MAXLNLEN), inserting "\n" first when
// `result` is not empty, then free `st`.  A NULL `st` is ignored.
void Hunspell::cat_result(char * result, char * st)
{
    if (!st) return;
    if (*result) mystrcat(result, "\n", MAXLNLEN);
    mystrcat(result, st, MAXLNLEN);
    free(st);
}

// Morphological analysis of `word`.
//
//   slst - out: receives the list of analyses (set to NULL first)
//   word - input word; the affix manager's input conversion table is
//          applied when one exists
//
// The cleaned word is analysed with pSMgr->suggest_morph() in the
// capitalization variants that fit its capitalization type (plus dotted
// variants for abbreviations).  The collected text is split into *slst
// with line_tok(); its return value is the result.  Returns 0 when there
// is no suggestion manager / dictionary, the word is too long or empty,
// or nothing could be analysed.  Hungarian (LANG_hu) gets extra handling
// for numbers and for words containing a dash.
int Hunspell::analyze(char*** slst, const char * word)
{
  char cw[MAXWORDUTF8LEN];
  char wspace[MAXWORDUTF8LEN];
  w_char unicw[MAXWORDLEN];
  int wl2 = 0;
  *slst = NULL;
  if (! pSMgr || maxdic == 0) return 0;
  int nc = strlen(word);
  if (utf8) {
    if (nc >= MAXWORDUTF8LEN) return 0;
  } else {
    if (nc >= MAXWORDLEN) return 0;
  }
  int captype = 0;
  int abbv = 0;
  int wl = 0;

  // input conversion
  RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
  if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
  else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);

  if (wl == 0) {
      // nothing left but dots: analyse the dots themselves
      if (abbv) {
          for (wl = 0; wl < abbv; wl++) cw[wl] = '.';
          cw[wl] = '\0';
          abbv = 0;
      } else return 0;
  }

  char result[MAXLNLEN];
  char * st = NULL;

  *result = '\0';

  int n = 0;
  int n2 = 0;
  int n3 = 0;

  // test numbers
  // LANG_hu section: set dash information for suggestions
  if (langnum == LANG_hu) {
  // scan a leading run of digits with '.' / ',' separators
  while ((n < wl) &&
        (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) {
        n++;
        if ((cw[n] == '.') || (cw[n] == ',')) {
                if (((n2 == 0) && (n > 3)) ||
                        ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break;
                n2++;
                n3 = n;
        }
  }

  if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0;
  if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n, NULL, NULL))) {
        mystrcat(result, cw, MAXLNLEN);
        result[n - 1] = '\0';
        if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1));
        else {
                char sign = cw[n];
                cw[n] = '\0';
                cat_result(result, pSMgr->suggest_morph(cw + n - 1));
                mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE
                cw[n] = sign;
                cat_result(result, pSMgr->suggest_morph(cw + n));
        }
        return line_tok(result, slst, MSEP_REC);
  }
  }
  // END OF LANG_hu section

  switch(captype) {
     case HUHCAP:
     case HUHINITCAP:
     case NOCAP:  {
                    cat_result(result, pSMgr->suggest_morph(cw));
                    if (abbv) {
                        memcpy(wspace,cw,wl);
                        *(wspace+wl) = '.';
                        *(wspace+wl+1) = '\0';
                        cat_result(result, pSMgr->suggest_morph(wspace));
                    }
                    break;
                }
     case INITCAP: {
                     // wspace: lowercase form, cw: initial-capital form
                     wl = mkallsmall2(cw, unicw, nc);
                     memcpy(wspace,cw,(wl+1));
                     wl2 = mkinitcap2(cw, unicw, nc);
                     cat_result(result, pSMgr->suggest_morph(wspace));
                     cat_result(result, pSMgr->suggest_morph(cw));
                     if (abbv) {
                         *(wspace+wl) = '.';
                         *(wspace+wl+1) = '\0';
                         cat_result(result, pSMgr->suggest_morph(wspace));

                         memcpy(wspace, cw, wl2);
                         *(wspace+wl2) = '.';
                         *(wspace+wl2+1) = '\0';

                         cat_result(result, pSMgr->suggest_morph(wspace));
                     }
                     break;
                   }
     case ALLCAP: {
                     cat_result(result, pSMgr->suggest_morph(cw));
                     if (abbv) {
                         memcpy(wspace,cw,wl);
                         *(wspace+wl) = '.';
                         *(wspace+wl+1) = '\0';
                         // analyse the dotted form just built in wspace
                         // (cw itself was already analysed above)
                         cat_result(result, pSMgr->suggest_morph(wspace));
                     }
                     // wspace: lowercase form, cw: initial-capital form
                     wl = mkallsmall2(cw, unicw, nc);
                     memcpy(wspace,cw,(wl+1));
                     wl2 = mkinitcap2(cw, unicw, nc);

                     cat_result(result, pSMgr->suggest_morph(wspace));
                     cat_result(result, pSMgr->suggest_morph(cw));
                     if (abbv) {
                         *(wspace+wl) = '.';
                         *(wspace+wl+1) = '\0';
                         cat_result(result, pSMgr->suggest_morph(wspace));

                         memcpy(wspace, cw, wl2);
                         *(wspace+wl2) = '.';
                         *(wspace+wl2+1) = '\0';

                         cat_result(result, pSMgr->suggest_morph(wspace));
                     }
                     break;
                   }
  }

  if (*result) {
    // word reversing wrapper for complex prefixes
    if (complexprefixes) {
      if (utf8) reverseword_utf(result); else reverseword(result);
    }
    return line_tok(result, slst, MSEP_REC);
  }

  // compound word with dash (HU) I18n
  char * dash = NULL;
  int nresult = 0;
  // LANG_hu section: set dash information for suggestions
  if (langnum == LANG_hu) dash = (char *) strchr(cw,'-');
  if ((langnum == LANG_hu) && dash) {
      *dash='\0';
      // examine 2 sides of the dash
      if (dash[1] == '\0') { // base word ending with dash
        if (spell(cw)) {
            char * p = pSMgr->suggest_morph(cw);
            if (p) {
                int ret = line_tok(p, slst, MSEP_REC);
                free(p);
                return ret;
            }
        }
      } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat.
        if (spell(cw) && (spell("-e"))) {
                        st = pSMgr->suggest_morph(cw);
                        if (st) {
                                mystrcat(result, st, MAXLNLEN);
                                free(st);
                        }
                        mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE
                        st = pSMgr->suggest_morph("-e");
                        if (st) {
                                mystrcat(result, st, MAXLNLEN);
                                free(st);
                        }
                        return line_tok(result, slst, MSEP_REC);
                }
      } else {
      // first word ending with dash: word- XXX ???
        // temporarily turn cw into "first-" for the spell check, then
        // restore the split at the dash
        char r2 = *(dash + 1);
        dash[0]='-';
        dash[1]='\0';
        nresult = spell(cw);
        dash[1] = r2;
        dash[0]='\0';
        // NOTE(review): the digit test below excludes '0' and '9';
        // kept as in the original - confirm whether that is intended.
        if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) ||
                ((dash[1] > '0') && (dash[1] < '9')))) {
                            st = pSMgr->suggest_morph(cw);
                            if (st) {
                                mystrcat(result, st, MAXLNLEN);
                                    free(st);
                                mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE
                            }
                            st = pSMgr->suggest_morph(dash+1);
                            if (st) {
                                    mystrcat(result, st, MAXLNLEN);
                                    free(st);
                            }
                            return line_tok(result, slst, MSEP_REC);
                        }
      }
      // affixed number in correct word
     if (nresult && (dash > cw) && (((*(dash-1)<='9') &&
                        (*(dash-1)>='0')) || (*(dash-1)=='.'))) {
         *dash='-';
         n = 1;
         if (*(dash - n) == '.') n++;
         // search first not a number character to left from dash
         while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) {
            n++;
         }
         if ((dash - n) < cw) n--;
         // numbers: valami1000000-hoz
         // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz,
         // 56-hoz, 6-hoz
         for(; n >= 1; n--) {
            if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && checkword(dash - n, NULL, NULL)) {
                    // keep the part of cw before the number, then append
                    // the analysis of "<number>-<suffix>"
                    mystrcat(result, cw, MAXLNLEN);
                    result[dash - cw - n] = '\0';
                        st = pSMgr->suggest_morph(dash - n);
                        if (st) {
                        mystrcat(result, st, MAXLNLEN);
                                free(st);
                        }
                        return line_tok(result, slst, MSEP_REC);
            }
         }
     }
  }
  return 0;
}

// Generate forms of `word` according to the `pln` morphological
// descriptions in `pl`.
// `word` is analysed, pSMgr->suggest_gen() is called once per
// description, and the generated lines are re-capitalized to match the
// capitalization type of `word`.  Candidates rejected by spell() are
// freed and the survivors are compacted to the front of *slst.
// Returns the number of survivors; on 0 *slst is NULL.
int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln)
{
  *slst = NULL;
  if (!pSMgr || !pln) return 0;

  char ** wordana;
  int nwordana = analyze(&wordana, word);

  // only the capitalization type of the word is used below
  int captype = 0;
  int abbv = 0;
  char cw[MAXWORDUTF8LEN];
  cleanword(cw, word, &captype, &abbv);

  char result[MAXLNLEN];
  *result = '\0';
  for (int i = 0; i < pln; i++) {
    cat_result(result, pSMgr->suggest_gen(wordana, nwordana, pl[i]));
  }
  freelist(&wordana, nwordana);

  if (!*result) return 0;

  // allcap
  if (captype == ALLCAP) mkallcap(result);

  // line split
  int linenum = line_tok(result, slst, MSEP_REC);

  // capitalize
  if (captype == INITCAP || captype == HUHINITCAP) {
      for (int j = 0; j < linenum; j++) mkinitcap((*slst)[j]);
  }

  // temporary filtering of prefix related errors (eg.
  // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks")
  int kept = 0;
  for (int j = 0; j < linenum; j++) {
      char * cand = (*slst)[j];
      if (spell(cand)) {
          if (kept < j) (*slst)[kept] = cand;
          kept++;
      } else {
          free(cand);
          (*slst)[j] = NULL;
      }
  }
  if (kept > 0) return kept;

  free(*slst);
  *slst = NULL;
  return 0;
}

// Generate forms of `word` following the analyses of `pattern`:
// analyze() the pattern, run the list-based generate() overload, free
// the pattern analyses and de-duplicate the output with uniqlist().
int Hunspell::generate(char*** slst, const char * word, const char * pattern)
{
  char ** patana;
  const int npat = analyze(&patana, pattern);
  const int ngen = generate(slst, word, patana, npat);
  freelist(&patana, npat);
  return uniqlist(*slst, ngen);
}

// minimal XML parser functions
// Copy an XML parameter value into `dest`.
//   par - points at the opening delimiter: '>' (element text, ends at
//         '<') or a single/double quote (ends at the same quote)
//   max - maximum number of characters copied; `dest` also receives a
//         terminating NUL after them
// "&lt;" and "&amp;" are decoded in `dest` afterwards.
// Returns the number of characters copied before decoding; 0 for a NULL
// `par` or an unexpected delimiter.
int Hunspell::get_xml_par(char * dest, const char * par, int max)
{
   if (!par) return 0;

   char delim;
   switch (*par) {
     case '>':
       delim = '<';
       break;
     case '\'':
     case '"':
       delim = *par;
       break;
     default:
       return 0; // bad XML
   }

   const char * src = par + 1;
   int copied = 0;
   while (copied < max && src[copied] != '\0' && src[copied] != delim) {
     dest[copied] = src[copied];
     copied++;
   }
   dest[copied] = '\0';

   mystrrep(dest, "&lt;", "<");
   mystrrep(dest, "&amp;", "&");
   return copied;
}

// Accessor for the `langnum` member.
int Hunspell::get_langnum() const
{
   const int num = langnum;
   return num;
}

// return the beginning of the element (attr == NULL) or the attribute
// Locate an attribute inside the XML tag starting at `s`.
//   attr == NULL: returns the position of the tag's closing '>' (NULL
//                 if there is none)
//   otherwise:    returns the position right after the first occurrence
//                 of `attr` that lies before the closing '>' and is
//                 preceded by a space or newline; 0 when there is none
const char * Hunspell::get_xml_pos(const char * s, const char * attr)
{
  const char * end = strchr(s, '>');
  if (attr == NULL) return end;
  if (!end) return 0; // unterminated tag: nothing to search in

  const char * p = s;
  for (;;) {
    p = strstr(p, attr);
    if (!p || p >= end) return 0;
    // a match at the very start of `s` has no preceding character
    if (p > s && (*(p-1) == ' ' || *(p-1) == '\n')) break;
    // step past this rejected match; searching from the same position
    // again would find it forever
    p++;
  }
  return p + strlen(attr);
}

// Returns 1 when attribute `attr` of the tag at `q` has a non-empty
// value equal to `value`, otherwise 0.
int Hunspell::check_xml_par(const char * q, const char * attr, const char * value) {
  char cw[MAXWORDUTF8LEN];
  if (!get_xml_par(cw, get_xml_pos(q, attr), MAXWORDUTF8LEN - 1)) return 0;
  return (strcmp(cw, value) == 0) ? 1 : 0;
}

// Extract the text of every `tag` element found in `list`.
//   slst - out: newly allocated array of newly allocated strings; set
//          to NULL whenever 0 is returned
//   list - XML text to scan (may be NULL)
//   tag  - opening tag including its closing '>', e.g. "<a>"
// Returns the number of strings stored.  Extraction stops at the first
// element that cannot be allocated or parsed; the elements stored before
// it are still returned.
int Hunspell::get_xml_list(char ***slst, char * list, const char * tag) {
    int n = 0;
    char * p;
    if (!slst) return 0;
    // give the caller a defined value on every path, so that a later
    // freelist() never sees an uninitialized pointer
    *slst = NULL;
    if (!list) return 0;
    for (p = list; (p = strstr(p, tag)); p++) n++;
    if (n == 0) return 0;
    *slst = (char **) malloc(sizeof(char *) * n);
    if (!*slst) return 0;
    const int taglen = strlen(tag);
    for (p = list, n = 0; (p = strstr(p, tag)); p++, n++) {
        // the rest of the input is an upper bound for the element text
        int l = strlen(p);
        (*slst)[n] = (char *) malloc(l + 1);
        if (!(*slst)[n]) break;
        // get_xml_par() expects to start on the tag's closing '>'
        if (!get_xml_par((*slst)[n], p + taglen - 1, l)) {
            free((*slst)[n]);
            (*slst)[n] = NULL;
            break;
        }
    }
    if (n == 0) {
        // nothing extracted: do not leak the empty array
        free(*slst);
        *slst = NULL;
    }
    return n;
}

// Minimal XML front end.  `word` must contain a <query type="..."> tag
// followed by a <word> element:
//   type="analyze"  - analyze() the word; the analyses are returned as
//                     ONE string "<code><a>..</a>..</code>" in (*slst)[0]
//   type="stem"     - stem() the word
//   type="generate" - generate() from the word and either a second
//                     <word> (pattern) or a <code> list of <a> elements
// Returns the number of entries in *slst, 0 on bad input or no result.
int Hunspell::spellml(char*** slst, const char * word)
{
  char *q, *q2;
  char cw[MAXWORDUTF8LEN], cw2[MAXWORDUTF8LEN];
  q = (char *) strstr(word, "<query");
  if (!q) return 0; // bad XML input
  q2 = strchr(q, '>');
  if (!q2) return 0; // bad XML input
  q2 = strstr(q2, "<word");
  if (!q2) return 0; // bad XML input
  if (check_xml_par(q, "type=", "analyze")) {
      int n = 0, s = 0;
      if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 10)) n = analyze(slst, cw);
      if (n == 0) return 0;
      // convert the result to <code><a>ana1</a><a>ana2</a></code> format
      for (int i = 0; i < n; i++) s+= strlen((*slst)[i]);
      char * r = (char *) malloc(6 + 5 * s + 7 * n + 7 + 1); // XXX 5*s->&->&amp;
      if (!r) {
        // do not leak the analyses when the output buffer is unavailable
        freelist(slst, n);
        return 0;
      }
      strcpy(r, "<code>");
      for (int i = 0; i < n; i++) {
        int l = strlen(r);
        strcpy(r + l, "<a>");
        strcpy(r + l + 3, (*slst)[i]);
        mystrrep(r + l + 3, "\t", " ");
        // escape '&' BEFORE '<': otherwise the '&' of a freshly written
        // "&lt;" is escaped again ("&amp;lt;"), which is wrong and
        // exceeds the 5x growth the buffer was sized for
        mystrrep(r + l + 3, "&", "&amp;");
        mystrrep(r + l + 3, "<", "&lt;");
        strcat(r, "</a>");
        free((*slst)[i]);
      }
      strcat(r, "</code>");
      // the list now holds the single combined string
      (*slst)[0] = r;
      return 1;
  } else if (check_xml_par(q, "type=", "stem")) {
      if (get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1)) return stem(slst, cw);
  } else if (check_xml_par(q, "type=", "generate")) {
      int n = get_xml_par(cw, strchr(q2, '>'), MAXWORDUTF8LEN - 1);
      if (n == 0) return 0;
      char * q3 = strstr(q2 + 1, "<word");
      if (q3) {
        // second <word>: use it as the pattern
        if (get_xml_par(cw2, strchr(q3, '>'), MAXWORDUTF8LEN - 1)) {
            return generate(slst, cw, cw2);
        }
      } else {
        if ((q2 = strstr(q2 + 1, "<code"))) {
          // NULL so the freelist() below is safe even if
          // get_xml_list() returns without storing a list
          char ** slst2 = NULL;
          if ((n = get_xml_list(&slst2, strchr(q2, '>'), "<a>"))) {
            int n2 = generate(slst, cw, slst2, n);
            freelist(&slst2, n);
            return uniqlist(*slst, n2);
          }
          freelist(&slst2, n);
        }
      }
  }
  return 0;
}


#ifdef HUNSPELL_EXPERIMENTAL
// XXX need UTF-8 support
char * Hunspell::morph_with_correction(const char * word)
{
  char cw[MAXWORDUTF8LEN];
  char wspace[MAXWORDUTF8LEN];
  if (! pSMgr || maxdic == 0) return NULL;
  int wl = strlen(word);
  if (utf8) {
    if (wl >= MAXWORDUTF8LEN) return NULL;
  } else {
    if (wl >= MAXWORDLEN) return NULL;
  }
  int captype = 0;
  int abbv = 0;
  wl = cleanword(cw, word, &captype, &abbv);
  if (wl == 0) return NULL;

  char result[MAXLNLEN];
  char * st = NULL;

  *result = '\0';


  switch(captype) {
     case NOCAP:   {
                     st = pSMgr->suggest_morph_for_spelling_error(cw);
                     if (st) {
                        mystrcat(result, st, MAXLNLEN);
                        free(st);
                     }
                     if (abbv) {
                         memcpy(wspace,cw,wl);
                         *(wspace+wl) = '.';
                         *(wspace+wl+1) = '\0';
                         st = pSMgr->suggest_morph_for_spelling_error(wspace);
                         if (st) {
                            if (*result) mystrcat(result, "\n", MAXLNLEN);
                            mystrcat(result, st, MAXLNLEN);
                            free(st);
                                                 }
                     }
                                         break;
                   }
     case INITCAP: {
                     memcpy(wspace,cw,(wl+1));
                     mkallsmall(wspace);
                     st = pSMgr->suggest_morph_for_spelling_error(wspace);
                     if (st) {
                        mystrcat(result, st, MAXLNLEN);
                        free(st);
                     }
                     st = pSMgr->suggest_morph_for_spelling_error(cw);
                     if (st) {
                        if (*result) mystrcat(result, "\n", MAXLNLEN);
                        mystrcat(result, st, MAXLNLEN);
                        free(st);
                     }
                     if (abbv) {
                         memcpy(wspace,cw,wl);
                         *(wspace+wl) = '.';
                         *(wspace+wl+1) = '\0';
                         mkallsmall(wspace);
                         st = pSMgr->suggest_morph_for_spelling_error(wspace);
                         if (st) {
                            if (*result) mystrcat(result, "\n", MAXLNLEN);
                            mystrcat(result, st, MAXLNLEN);
                            free(st);
                         }
                         mkinitcap(wspace);
                         st = pSMgr->suggest_morph_for_spelling_error(wspace);
                         if (st) {
                            if (*result) mystrcat(result, "\n", MAXLNLEN);
                            mystrcat(result, st, MAXLNLEN);
                            free(st);
                         }
                     }
                     break;
                   }
     case HUHCAP: {
                     st = pSMgr->suggest_morph_for_spelling_error(cw);
                     if (st) {
                        mystrcat(result, st, MAXLNLEN);
                        free(st);
                     }
                     memcpy(wspace,cw,(wl+1));
                     mkallsmall(wspace);
                     st = pSMgr->suggest_morph_for_spelling_error(wspace);
                     if (st) {
                        if (*result) mystrcat(result, "\n", MAXLNLEN);
                        mystrcat(result, st, MAXLNLEN);
                        free(st);
                     }
                     break;
                 }
     case ALLCAP: {
                     memcpy(wspace,cw,(wl+1));
                     st = pSMgr->suggest_morph_for_spelling_error(wspace);
                     if (st) {
                        mystrcat(result, st, MAXLNLEN);
                        free(st);
                     }
                     mkallsmall(wspace);
                     st = pSMgr->suggest_morph_for_spelling_error(wspace);
                     if (st) {
                        if (*result) mystrcat(result, "\n", MAXLNLEN);
                        mystrcat(result, st, MAXLNLEN);
                        free(st);
                     }
                     mkinitcap(wspace);
                     st = pSMgr->suggest_morph_for_spelling_error(wspace);
                     if (st) {
                        if (*result) mystrcat(result, "\n", MAXLNLEN);
                        mystrcat(result, st, MAXLNLEN);
                        free(st);
                     }
                     if (abbv) {
                        memcpy(wspace,cw,(wl+1));
                        *(wspace+wl) = '.';
                        *(wspace+wl+1) = '\0';
                        if (*result) mystrcat(result, "\n", MAXLNLEN);
                        st = pSMgr->suggest_morph_for_spelling_error(wspace);
                        if (st) {
                            mystrcat(result, st, MAXLNLEN);
                            free(st);
                        }
                        mkallsmall(wspace);
                        st = pSMgr->suggest_morph_for_spelling_error(wspace);
                        if (st) {
                          if (*result) mystrcat(result, "\n", MAXLNLEN);
                          mystrcat(result, st, MAXLNLEN);
                          free(st);
                        }
                        mkinitcap(wspace);
                        st = pSMgr->suggest_morph_for_spelling_error(wspace);
                        if (st) {
                          if (*result) mystrcat(result, "\n", MAXLNLEN);
                          mystrcat(result, st, MAXLNLEN);
                          free(st);
                        }
                     }
                     break;
                   }
  }

  if (*result) return mystrdup(result);
  return NULL;
}

#endif // END OF HUNSPELL_EXPERIMENTAL CODE

// C API: construct a Hunspell object from the two paths and return it
// as an opaque handle.
Hunhandle *Hunspell_create(const char * affpath, const char * dpath)
{
        Hunspell * obj = new Hunspell(affpath, dpath);
        return reinterpret_cast<Hunhandle*>(obj);
}

// C API: like Hunspell_create(), additionally passing `key` to the
// Hunspell constructor.
Hunhandle *Hunspell_create_key(const char * affpath, const char * dpath,
    const char * key)
{
        Hunspell * obj = new Hunspell(affpath, dpath, key);
        return reinterpret_cast<Hunhandle*>(obj);
}

// C API: delete the Hunspell object behind the handle.
void Hunspell_destroy(Hunhandle *pHunspell)
{
        Hunspell * obj = reinterpret_cast<Hunspell*>(pHunspell);
        delete obj;
}

// C API: forwards to Hunspell::spell().
int Hunspell_spell(Hunhandle *pHunspell, const char *word)
{
        Hunspell * obj = reinterpret_cast<Hunspell*>(pHunspell);
        return obj->spell(word);
}

// C API: forwards to Hunspell::get_dic_encoding().
char *Hunspell_get_dic_encoding(Hunhandle *pHunspell)
{
        Hunspell * obj = reinterpret_cast<Hunspell*>(pHunspell);
        return obj->get_dic_encoding();
}

// C API: forwards to Hunspell::suggest().
int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word)
{
        Hunspell * obj = reinterpret_cast<Hunspell*>(pHunspell);
        return obj->suggest(slst, word);
}

// C API: forwards to Hunspell::analyze().
int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word)
{
        Hunspell * obj = reinterpret_cast<Hunspell*>(pHunspell);
        return obj->analyze(slst, word);
}

// C API: forwards to the word-based Hunspell::stem() overload.
int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word)
{
        Hunspell * obj = reinterpret_cast<Hunspell*>(pHunspell);
        return obj->stem(slst, word);
}

// C API: forwards to the list-based Hunspell::stem() overload
// (`n` descriptions in `desc`).
int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n)
{
        Hunspell * obj = reinterpret_cast<Hunspell*>(pHunspell);
        return obj->stem(slst, desc, n);
}

// C API: forwards to the pattern-word Hunspell::generate() overload;
// `word2` is passed as the pattern.
int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word,
    const char * word2)
{
        Hunspell * obj = reinterpret_cast<Hunspell*>(pHunspell);
        return obj->generate(slst, word, word2);
}

// C API: forwards to the list-based Hunspell::generate() overload
// (`n` descriptions in `desc`).
int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word,
    char** desc, int n)
{
        Hunspell * obj = reinterpret_cast<Hunspell*>(pHunspell);
        return obj->generate(slst, word, desc, n);
}

  /* functions for run-time modification of the dictionary */

  /* add word to the run-time dictionary */

// C API: forwards to Hunspell::add().
int Hunspell_add(Hunhandle *pHunspell, const char * word)
{
        Hunspell * obj = reinterpret_cast<Hunspell*>(pHunspell);
        return obj->add(word);
}

  /* add word to the run-time dictionary with affix flags of
   * the example (a dictionary word): Hunspell will recognize
   * affixed forms of the new word, too.
   */

// C API: forwards to Hunspell::add_with_affix().
int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word,
        const char * example)
{
        Hunspell * obj = reinterpret_cast<Hunspell*>(pHunspell);
        return obj->add_with_affix(word, example);
}

  /* remove word from the run-time dictionary */

// C API: forwards to Hunspell::remove().
int Hunspell_remove(Hunhandle *pHunspell, const char * word)
{
        Hunspell * obj = reinterpret_cast<Hunspell*>(pHunspell);
        return obj->remove(word);
}

// C API: release a string list with freelist(); the handle argument is
// not used.
void Hunspell_free_list(Hunhandle *, char *** slst, int n)
{
        freelist(slst, n);
}
Summary ✨

This is a C++ implementation of the Hunspell spell-checking library. It provides functions for creating and managing dictionaries, suggesting corrections, analyzing words, stemming words, generating affixed forms, and modifying dictionaries at runtime. The code defines classes and functions that interact with these dictionary operations, allowing users to create and manipulate Hunspell instances.
Alerts (10)

Complexity hotspot; lines 415 to 416 (total complexity: 11)
415 416
Complexity hotspot; lines 578 to 582 (total complexity: 11)
578 579 580 581 582
Complexity hotspot; lines 1464 to 1466 (total complexity: 11)
1464 1465 1466