PageRenderTime 157ms CodeModel.GetById 13ms app.highlight 130ms RepoModel.GetById 1ms app.codeStats 1ms

/extensions/spellcheck/hunspell/src/hunspell.cpp

http://github.com/zpao/v8monkey
C++ | 2060 lines | 1753 code | 146 blank | 161 comment | 605 complexity | 29929269e48533727bddc53f90b211c9 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/******* BEGIN LICENSE BLOCK *******
   2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   3 * 
   4 * The contents of this file are subject to the Mozilla Public License Version
   5 * 1.1 (the "License"); you may not use this file except in compliance with
   6 * the License. You may obtain a copy of the License at
   7 * http://www.mozilla.org/MPL/
   8 * 
   9 * Software distributed under the License is distributed on an "AS IS" basis,
  10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  11 * for the specific language governing rights and limitations under the
  12 * License.
  13 * 
  14 * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
  15 * and László Németh (Hunspell). Portions created by the Initial Developers
  16 * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
  17 * 
  18 * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
  19 *                 David Einstein (deinst@world.std.com)
  20 *                 László Németh (nemethl@gyorsposta.hu)
  21 *                 Caolan McNamara (caolanm@redhat.com)
  22 *                 Davide Prina
  23 *                 Giuseppe Modugno
  24 *                 Gianluca Turconi
  25 *                 Simon Brouwer
  26 *                 Noll Janos
  27 *                 Biro Arpad
  28 *                 Goldman Eleonora
  29 *                 Sarlos Tamas
  30 *                 Bencsath Boldizsar
  31 *                 Halacsy Peter
  32 *                 Dvornik Laszlo
  33 *                 Gefferth Andras
  34 *                 Nagy Viktor
  35 *                 Varga Daniel
  36 *                 Chris Halls
  37 *                 Rene Engelhard
  38 *                 Bram Moolenaar
  39 *                 Dafydd Jones
  40 *                 Harri Pitkanen
  41 *                 Andras Timar
  42 *                 Tor Lillqvist
  43 * 
  44 * Alternatively, the contents of this file may be used under the terms of
  45 * either the GNU General Public License Version 2 or later (the "GPL"), or
  46 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  47 * in which case the provisions of the GPL or the LGPL are applicable instead
  48 * of those above. If you wish to allow use of your version of this file only
  49 * under the terms of either the GPL or the LGPL, and not to allow others to
  50 * use your version of this file under the terms of the MPL, indicate your
  51 * decision by deleting the provisions above and replace them with the notice
  52 * and other provisions required by the GPL or the LGPL. If you do not delete
  53 * the provisions above, a recipient may use your version of this file under
  54 * the terms of any one of the MPL, the GPL or the LGPL.
  55 *
  56 ******* END LICENSE BLOCK *******/
  57
  58#include <stdlib.h>
  59#include <string.h>
  60#include <stdio.h>
  61
  62#include "hunspell.hxx"
  63#include "hunspell.h"
  64#ifndef MOZILLA_CLIENT
  65#    include "config.h"
  66#endif
  67#include "csutil.hxx"
  68
  69Hunspell::Hunspell(const char * affpath, const char * dpath, const char * key)
  70{
  71    encoding = NULL;
  72    csconv = NULL;
  73    utf8 = 0;
  74    complexprefixes = 0;
  75    affixpath = mystrdup(affpath);
  76    maxdic = 0;
  77
  78    /* first set up the hash manager */
  79    pHMgr[0] = new HashMgr(dpath, affpath, key);
  80    if (pHMgr[0]) maxdic = 1;
  81
  82    /* next set up the affix manager */
  83    /* it needs access to the hash manager lookup methods */
  84    pAMgr = new AffixMgr(affpath, pHMgr, &maxdic, key);
  85
  86    /* get the preferred try string and the dictionary */
  87    /* encoding from the Affix Manager for that dictionary */
  88    char * try_string = pAMgr->get_try_string();
  89    encoding = pAMgr->get_encoding();
  90    langnum = pAMgr->get_langnum();
  91    utf8 = pAMgr->get_utf8();
  92    if (!utf8)
  93        csconv = get_current_cs(encoding);
  94    complexprefixes = pAMgr->get_complexprefixes();
  95    wordbreak = pAMgr->get_breaktable();
  96
  97    /* and finally set up the suggestion manager */
  98    pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr);
  99    if (try_string) free(try_string);
 100}
 101
 102Hunspell::~Hunspell()
 103{
 104    if (pSMgr) delete pSMgr;
 105    if (pAMgr) delete pAMgr;
 106    for (int i = 0; i < maxdic; i++) delete pHMgr[i];
 107    maxdic = 0;
 108    pSMgr = NULL;
 109    pAMgr = NULL;
 110#ifdef MOZILLA_CLIENT
 111    delete [] csconv;
 112#endif
 113    csconv= NULL;
 114    if (encoding) free(encoding);
 115    encoding = NULL;
 116    if (affixpath) free(affixpath);
 117    affixpath = NULL;
 118}
 119
 120// load extra dictionaries
 121int Hunspell::add_dic(const char * dpath, const char * key) {
 122    if (maxdic == MAXDIC || !affixpath) return 1;
 123    pHMgr[maxdic] = new HashMgr(dpath, affixpath, key);
 124    if (pHMgr[maxdic]) maxdic++; else return 1;
 125    return 0;
 126}
 127
 128// make a copy of src at destination while removing all leading
 129// blanks and removing any trailing periods after recording
 130// their presence with the abbreviation flag
 131// also since already going through character by character,
 132// set the capitalization type
 133// return the length of the "cleaned" (and UTF-8 encoded) word
 134
 135int Hunspell::cleanword2(char * dest, const char * src,
 136    w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev)
 137{
 138   unsigned char * p = (unsigned char *) dest;
 139   const unsigned char * q = (const unsigned char * ) src;
 140
 141   // first skip over any leading blanks
 142   while ((*q != '\0') && (*q == ' ')) q++;
 143
 144   // now strip off any trailing periods (recording their presence)
 145   *pabbrev = 0;
 146   int nl = strlen((const char *)q);
 147   while ((nl > 0) && (*(q+nl-1)=='.')) {
 148       nl--;
 149       (*pabbrev)++;
 150   }
 151
 152   // if no characters are left it can't be capitalized
 153   if (nl <= 0) {
 154       *pcaptype = NOCAP;
 155       *p = '\0';
 156       return 0;
 157   }
 158
 159   strncpy(dest, (char *) q, nl);
 160   *(dest + nl) = '\0';
 161   nl = strlen(dest);
 162   if (utf8) {
 163      *nc = u8_u16(dest_utf, MAXWORDLEN, dest);
 164      // don't check too long words
 165      if (*nc >= MAXWORDLEN) return 0;
 166      if (*nc == -1) { // big Unicode character (non BMP area)
 167         *pcaptype = NOCAP;
 168         return nl;
 169      }
 170     *pcaptype = get_captype_utf8(dest_utf, *nc, langnum);
 171   } else {
 172     *pcaptype = get_captype(dest, nl, csconv);
 173     *nc = nl;
 174   }
 175   return nl;
 176}
 177
 178int Hunspell::cleanword(char * dest, const char * src,
 179    int * pcaptype, int * pabbrev)
 180{
 181   unsigned char * p = (unsigned char *) dest;
 182   const unsigned char * q = (const unsigned char * ) src;
 183   int firstcap = 0;
 184
 185   // first skip over any leading blanks
 186   while ((*q != '\0') && (*q == ' ')) q++;
 187
 188   // now strip off any trailing periods (recording their presence)
 189   *pabbrev = 0;
 190   int nl = strlen((const char *)q);
 191   while ((nl > 0) && (*(q+nl-1)=='.')) {
 192       nl--;
 193       (*pabbrev)++;
 194   }
 195
 196   // if no characters are left it can't be capitalized
 197   if (nl <= 0) {
 198       *pcaptype = NOCAP;
 199       *p = '\0';
 200       return 0;
 201   }
 202
 203   // now determine the capitalization type of the first nl letters
 204   int ncap = 0;
 205   int nneutral = 0;
 206   int nc = 0;
 207
 208   if (!utf8) {
 209      while (nl > 0) {
 210         nc++;
 211         if (csconv[(*q)].ccase) ncap++;
 212         if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++;
 213         *p++ = *q++;
 214         nl--;
 215      }
 216      // remember to terminate the destination string
 217      *p = '\0';
 218      firstcap = csconv[(unsigned char)(*dest)].ccase;
 219   } else {
 220      unsigned short idx;
 221      w_char t[MAXWORDLEN];
 222      nc = u8_u16(t, MAXWORDLEN, src);
 223      for (int i = 0; i < nc; i++) {
 224         idx = (t[i].h << 8) + t[i].l;
 225         unsigned short low = unicodetolower(idx, langnum);
 226         if (idx != low) ncap++;
 227         if (unicodetoupper(idx, langnum) == low) nneutral++;
 228      }
 229      u16_u8(dest, MAXWORDUTF8LEN, t, nc);
 230      if (ncap) {
 231         idx = (t[0].h << 8) + t[0].l;
 232         firstcap = (idx != unicodetolower(idx, langnum));
 233      }
 234   }
 235
 236   // now finally set the captype
 237   if (ncap == 0) {
 238        *pcaptype = NOCAP;
 239   } else if ((ncap == 1) && firstcap) {
 240        *pcaptype = INITCAP;
 241   } else if ((ncap == nc) || ((ncap + nneutral) == nc)){
 242        *pcaptype = ALLCAP;
 243   } else if ((ncap > 1) && firstcap) {
 244        *pcaptype = HUHINITCAP;
 245   } else {
 246        *pcaptype = HUHCAP;
 247   }
 248   return strlen(dest);
 249}
 250
 251void Hunspell::mkallcap(char * p)
 252{
 253  if (utf8) {
 254      w_char u[MAXWORDLEN];
 255      int nc = u8_u16(u, MAXWORDLEN, p);
 256      unsigned short idx;
 257      for (int i = 0; i < nc; i++) {
 258         idx = (u[i].h << 8) + u[i].l;
 259         if (idx != unicodetoupper(idx, langnum)) {
 260            u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8);
 261            u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF);
 262         }
 263      }
 264      u16_u8(p, MAXWORDUTF8LEN, u, nc);
 265  } else {
 266    while (*p != '\0') {
 267        *p = csconv[((unsigned char) *p)].cupper;
 268        p++;
 269    }
 270  }
 271}
 272
 273int Hunspell::mkallcap2(char * p, w_char * u, int nc)
 274{
 275  if (utf8) {
 276      unsigned short idx;
 277      for (int i = 0; i < nc; i++) {
 278         idx = (u[i].h << 8) + u[i].l;
 279         unsigned short up = unicodetoupper(idx, langnum);
 280         if (idx != up) {
 281            u[i].h = (unsigned char) (up >> 8);
 282            u[i].l = (unsigned char) (up & 0x00FF);
 283         }
 284      }
 285      u16_u8(p, MAXWORDUTF8LEN, u, nc);
 286      return strlen(p);
 287  } else {
 288    while (*p != '\0') {
 289        *p = csconv[((unsigned char) *p)].cupper;
 290        p++;
 291    }
 292  }
 293  return nc;
 294}
 295
 296
 297void Hunspell::mkallsmall(char * p)
 298{
 299    while (*p != '\0') {
 300        *p = csconv[((unsigned char) *p)].clower;
 301        p++;
 302    }
 303}
 304
 305int Hunspell::mkallsmall2(char * p, w_char * u, int nc)
 306{
 307  if (utf8) {
 308      unsigned short idx;
 309      for (int i = 0; i < nc; i++) {
 310         idx = (u[i].h << 8) + u[i].l;
 311         unsigned short low = unicodetolower(idx, langnum);
 312         if (idx != low) {
 313            u[i].h = (unsigned char) (low >> 8);
 314            u[i].l = (unsigned char) (low & 0x00FF);
 315         }
 316      }
 317      u16_u8(p, MAXWORDUTF8LEN, u, nc);
 318      return strlen(p);
 319  } else {
 320    while (*p != '\0') {
 321        *p = csconv[((unsigned char) *p)].clower;
 322        p++;
 323    }
 324  }
 325  return nc;
 326}
 327
 328// convert UTF-8 sharp S codes to latin 1
 329char * Hunspell::sharps_u8_l1(char * dest, char * source) {
 330    char * p = dest;
 331    *p = *source;
 332    for (p++, source++; *(source - 1); p++, source++) {
 333        *p = *source;
 334        if (*source == '\x9F') *--p = '\xDF';
 335    }
 336    return dest;
 337}
 338
 339// recursive search for right ss - sharp s permutations
 340hentry * Hunspell::spellsharps(char * base, char * pos, int n,
 341        int repnum, char * tmp, int * info, char **root) {
 342    pos = strstr(pos, "ss");
 343    if (pos && (n < MAXSHARPS)) {
 344        *pos = '\xC3';
 345        *(pos + 1) = '\x9F';
 346        hentry * h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp, info, root);
 347        if (h) return h;
 348        *pos = 's';
 349        *(pos + 1) = 's';
 350        h = spellsharps(base, pos + 2, n + 1, repnum, tmp, info, root);
 351        if (h) return h;
 352    } else if (repnum > 0) {
 353        if (utf8) return checkword(base, info, root);
 354        return checkword(sharps_u8_l1(tmp, base), info, root);
 355    }
 356    return NULL;
 357}
 358
 359int Hunspell::is_keepcase(const hentry * rv) {
 360    return pAMgr && rv->astr && pAMgr->get_keepcase() &&
 361        TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen);
 362}
 363
 364/* insert a word to the beginning of the suggestion array and return ns */
 365int Hunspell::insert_sug(char ***slst, char * word, int ns) {
 366    char * dup = mystrdup(word);
 367    if (!dup) return ns;
 368    if (ns == MAXSUGGESTION) {
 369        ns--;
 370        free((*slst)[ns]);
 371    }
 372    for (int k = ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1];
 373    (*slst)[0] = dup;
 374    return ns + 1;
 375}
 376
 377int Hunspell::spell(const char * word, int * info, char ** root)
 378{
 379  struct hentry * rv=NULL;
 380  // need larger vector. For example, Turkish capital letter I converted a
 381  // 2-byte UTF-8 character (dotless i) by mkallsmall.
 382  char cw[MAXWORDUTF8LEN];
 383  char wspace[MAXWORDUTF8LEN];
 384  w_char unicw[MAXWORDLEN];
 385  // Hunspell supports XML input of the simplified API (see manual)
 386  if (strcmp(word, SPELL_XML) == 0) return 1;
 387  int nc = strlen(word);
 388  int wl2 = 0;
 389  if (utf8) {
 390    if (nc >= MAXWORDUTF8LEN) return 0;
 391  } else {
 392    if (nc >= MAXWORDLEN) return 0;
 393  }
 394  int captype = 0;
 395  int abbv = 0;
 396  int wl = 0;
 397
 398  // input conversion
 399  RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
 400  if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
 401  else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
 402
 403  int info2 = 0;
 404  if (wl == 0 || maxdic == 0) return 1;
 405  if (root) *root = NULL;
 406
 407  // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.)
 408  enum { NBEGIN, NNUM, NSEP };
 409  int nstate = NBEGIN;
 410  int i;
 411
 412  for (i = 0; (i < wl); i++) {
 413    if ((cw[i] <= '9') && (cw[i] >= '0')) {
 414        nstate = NNUM;
 415    } else if ((cw[i] == ',') || (cw[i] == '.') || (cw[i] == '-')) {
 416        if ((nstate == NSEP) || (i == 0)) break;
 417        nstate = NSEP;
 418    } else break;
 419  }
 420  if ((i == wl) && (nstate == NNUM)) return 1;
 421  if (!info) info = &info2; else *info = 0;
 422
 423  switch(captype) {
 424     case HUHCAP:
 425     case HUHINITCAP:
 426            *info += SPELL_ORIGCAP;
 427     case NOCAP: {
 428            rv = checkword(cw, info, root);
 429            if ((abbv) && !(rv)) {
 430                memcpy(wspace,cw,wl);
 431                *(wspace+wl) = '.';
 432                *(wspace+wl+1) = '\0';
 433                rv = checkword(wspace, info, root);
 434            }
 435            break;
 436         }
 437     case ALLCAP: {
 438            *info += SPELL_ORIGCAP;
 439            rv = checkword(cw, info, root);
 440            if (rv) break;
 441            if (abbv) {
 442                memcpy(wspace,cw,wl);
 443                *(wspace+wl) = '.';
 444                *(wspace+wl+1) = '\0';
 445                rv = checkword(wspace, info, root);
 446                if (rv) break;
 447            }
 448            // Spec. prefix handling for Catalan, French, Italian:
 449	    // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
 450            if (pAMgr && strchr(cw, '\'')) {
 451                wl = mkallsmall2(cw, unicw, nc);
 452        	//There are no really sane circumstances where this could fail,
 453        	//but anyway...
 454        	if (char * apostrophe = strchr(cw, '\'')) {
 455                    if (utf8) {
 456            	        w_char tmpword[MAXWORDLEN];
 457            	        *apostrophe = '\0';
 458            	        wl2 = u8_u16(tmpword, MAXWORDLEN, cw);
 459            	        *apostrophe = '\'';
 460		        if (wl2 < nc) {
 461		            mkinitcap2(apostrophe + 1, unicw + wl2 + 1, nc - wl2 - 1);
 462			    rv = checkword(cw, info, root);
 463			    if (rv) break;
 464		        }
 465                    } else {
 466		        mkinitcap2(apostrophe + 1, unicw, nc);
 467		        rv = checkword(cw, info, root);
 468		        if (rv) break;
 469		    }
 470		}
 471		mkinitcap2(cw, unicw, nc);
 472		rv = checkword(cw, info, root);
 473		if (rv) break;
 474            }
 475            if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) {
 476                char tmpword[MAXWORDUTF8LEN];
 477                wl = mkallsmall2(cw, unicw, nc);
 478                memcpy(wspace,cw,(wl+1));
 479                rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
 480                if (!rv) {
 481                    wl2 = mkinitcap2(cw, unicw, nc);
 482                    rv = spellsharps(cw, cw, 0, 0, tmpword, info, root);
 483                }
 484                if ((abbv) && !(rv)) {
 485                    *(wspace+wl) = '.';
 486                    *(wspace+wl+1) = '\0';
 487                    rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
 488                    if (!rv) {
 489                        memcpy(wspace, cw, wl2);
 490                        *(wspace+wl2) = '.';
 491                        *(wspace+wl2+1) = '\0';
 492                        rv = spellsharps(wspace, wspace, 0, 0, tmpword, info, root);
 493                    }
 494                }
 495                if (rv) break;
 496            }
 497        }
 498     case INITCAP: {
 499             *info += SPELL_ORIGCAP;
 500             wl = mkallsmall2(cw, unicw, nc);
 501             memcpy(wspace,cw,(wl+1));
 502             wl2 = mkinitcap2(cw, unicw, nc);
 503             if (captype == INITCAP) *info += SPELL_INITCAP;
 504             rv = checkword(cw, info, root);
 505             if (captype == INITCAP) *info -= SPELL_INITCAP;
 506             // forbid bad capitalization
 507             // (for example, ijs -> Ijs instead of IJs in Dutch)
 508             // use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag)
 509             if (*info & SPELL_FORBIDDEN) {
 510                rv = NULL;
 511                break;
 512             }
 513             if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL;
 514             if (rv) break;
 515
 516             rv = checkword(wspace, info, root);
 517             if (abbv && !rv) {
 518
 519                 *(wspace+wl) = '.';
 520                 *(wspace+wl+1) = '\0';
 521                 rv = checkword(wspace, info, root);
 522                 if (!rv) {
 523                    memcpy(wspace, cw, wl2);
 524                    *(wspace+wl2) = '.';
 525                    *(wspace+wl2+1) = '\0';
 526    	    	    if (captype == INITCAP) *info += SPELL_INITCAP;
 527                    rv = checkword(wspace, info, root);
 528    	    	    if (captype == INITCAP) *info -= SPELL_INITCAP;
 529                    if (rv && is_keepcase(rv) && (captype == ALLCAP)) rv = NULL;
 530                    break;
 531                 }
 532             }
 533             if (rv && is_keepcase(rv) &&
 534                ((captype == ALLCAP) ||
 535                   // if CHECKSHARPS: KEEPCASE words with \xDF  are allowed
 536                   // in INITCAP form, too.
 537                   !(pAMgr->get_checksharps() &&
 538                      ((utf8 && strstr(wspace, "\xC3\x9F")) ||
 539                      (!utf8 && strchr(wspace, '\xDF')))))) rv = NULL;
 540             break;
 541           }
 542  }
 543
 544  if (rv) {
 545      if (pAMgr && pAMgr->get_warn() && rv->astr &&
 546          TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) {
 547              *info += SPELL_WARN;
 548	      if (pAMgr->get_forbidwarn()) return 0;
 549              return HUNSPELL_OK_WARN;
 550      }
 551      return HUNSPELL_OK;
 552  }
 553
 554  // recursive breaking at break points
 555  if (wordbreak) {
 556    char * s;
 557    char r;
 558    int nbr = 0;
 559    wl = strlen(cw);
 560    int numbreak = pAMgr ? pAMgr->get_numbreak() : 0;
 561
 562    // calculate break points for recursion limit
 563    for (int j = 0; j < numbreak; j++) {
 564      s = cw;
 565      do {
 566      	s = (char *) strstr(s, wordbreak[j]);
 567      	if (s) { 
 568		nbr++;
 569		s++;
 570	}
 571      } while (s);
 572    } 
 573    if (nbr >= 10) return 0;
 574
 575    // check boundary patterns (^begin and end$)
 576    for (int j = 0; j < numbreak; j++) {
 577      int plen = strlen(wordbreak[j]);
 578      if (plen == 1 || plen > wl) continue;
 579      if (wordbreak[j][0] == '^' && strncmp(cw, wordbreak[j] + 1, plen - 1) == 0
 580        && spell(cw + plen - 1)) return 1;
 581      if (wordbreak[j][plen - 1] == '$' &&
 582        strncmp(cw + wl - plen + 1, wordbreak[j], plen - 1) == 0) {
 583	    r = cw[wl - plen + 1];
 584	    cw[wl - plen + 1] = '\0';
 585    	    if (spell(cw)) return 1;
 586	    cw[wl - plen + 1] = r;
 587	}
 588    }
 589
 590    // other patterns
 591    for (int j = 0; j < numbreak; j++) {
 592      int plen = strlen(wordbreak[j]);
 593      s=(char *) strstr(cw, wordbreak[j]);
 594      if (s && (s > cw) && (s < cw + wl - plen)) {
 595	if (!spell(s + plen)) continue;
 596        r = *s;
 597        *s = '\0';
 598        // examine 2 sides of the break point
 599        if (spell(cw)) return 1;
 600        *s = r;
 601
 602        // LANG_hu: spec. dash rule
 603	if (langnum == LANG_hu && strcmp(wordbreak[j], "-") == 0) {
 604	  r = s[1];
 605	  s[1] = '\0';
 606          if (spell(cw)) return 1; // check the first part with dash
 607          s[1] = r;
 608	}
 609        // end of LANG speficic region
 610
 611      }
 612    }
 613  }
 614
 615  return 0;
 616}
 617
 618struct hentry * Hunspell::checkword(const char * w, int * info, char ** root)
 619{
 620  struct hentry * he = NULL;
 621  int len, i;
 622  char w2[MAXWORDUTF8LEN];
 623  const char * word;
 624
 625  char * ignoredchars = pAMgr->get_ignore();
 626  if (ignoredchars != NULL) {
 627     strcpy(w2, w);
 628     if (utf8) {
 629        int ignoredchars_utf16_len;
 630        unsigned short * ignoredchars_utf16 = pAMgr->get_ignore_utf16(&ignoredchars_utf16_len);
 631        remove_ignored_chars_utf(w2, ignoredchars_utf16, ignoredchars_utf16_len);
 632     } else {
 633        remove_ignored_chars(w2,ignoredchars);
 634     }
 635     word = w2;
 636  } else word = w;
 637
 638  len = strlen(word);
 639
 640  if (!len)
 641      return NULL;
 642
 643  // word reversing wrapper for complex prefixes
 644  if (complexprefixes) {
 645    if (word != w2) {
 646      strcpy(w2, word);
 647      word = w2;
 648    }
 649    if (utf8) reverseword_utf(w2); else reverseword(w2);
 650  }
 651
 652  // look word in hash table
 653  for (i = 0; (i < maxdic) && !he; i ++) {
 654  he = (pHMgr[i])->lookup(word);
 655
 656  // check forbidden and onlyincompound words
 657  if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
 658    if (info) *info += SPELL_FORBIDDEN;
 659    // LANG_hu section: set dash information for suggestions
 660    if (langnum == LANG_hu) {
 661        if (pAMgr->get_compoundflag() &&
 662            TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) {
 663                if (info) *info += SPELL_COMPOUND;
 664        }
 665    }
 666    return NULL;
 667  }
 668
 669  // he = next not needaffix, onlyincompound homonym or onlyupcase word
 670  while (he && (he->astr) &&
 671    ((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) ||
 672       (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
 673       (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen))
 674    )) he = he->next_homonym;
 675  }
 676
 677  // check with affixes
 678  if (!he && pAMgr) {
 679     // try stripping off affixes */
 680     he = pAMgr->affix_check(word, len, 0);
 681
 682     // check compound restriction and onlyupcase
 683     if (he && he->astr && (
 684        (pAMgr->get_onlyincompound() &&
 685    	    TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
 686        (info && (*info & SPELL_INITCAP) &&
 687    	    TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) {
 688    	    he = NULL;
 689     }
 690
 691     if (he) {
 692        if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
 693            if (info) *info += SPELL_FORBIDDEN;
 694            return NULL;
 695        }
 696        if (root) {
 697            *root = mystrdup(he->word);
 698            if (*root && complexprefixes) {
 699                if (utf8) reverseword_utf(*root); else reverseword(*root);
 700            }
 701        }
 702     // try check compound word
 703     } else if (pAMgr->get_compound()) {
 704          he = pAMgr->compound_check(word, len, 0, 0, 100, 0, NULL, 0, 0, info);
 705          // LANG_hu section: `moving rule' with last dash
 706          if ((!he) && (langnum == LANG_hu) && (word[len-1] == '-')) {
 707             char * dup = mystrdup(word);
 708             if (!dup) return NULL;
 709             dup[len-1] = '\0';
 710             he = pAMgr->compound_check(dup, len-1, -5, 0, 100, 0, NULL, 1, 0, info);
 711             free(dup);
 712          }
 713          // end of LANG speficic region
 714          if (he) {
 715                if (root) {
 716                    *root = mystrdup(he->word);
 717                    if (*root && complexprefixes) {
 718                        if (utf8) reverseword_utf(*root); else reverseword(*root);
 719                    }
 720                }
 721                if (info) *info += SPELL_COMPOUND;
 722          }
 723     }
 724
 725  }
 726
 727  return he;
 728}
 729
 730int Hunspell::suggest(char*** slst, const char * word)
 731{
 732  int onlycmpdsug = 0;
 733  char cw[MAXWORDUTF8LEN];
 734  char wspace[MAXWORDUTF8LEN];
 735  if (!pSMgr || maxdic == 0) return 0;
 736  w_char unicw[MAXWORDLEN];
 737  *slst = NULL;
 738  // process XML input of the simplified API (see manual)
 739  if (strncmp(word, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) {
 740     return spellml(slst, word);
 741  }
 742  int nc = strlen(word);
 743  if (utf8) {
 744    if (nc >= MAXWORDUTF8LEN) return 0;
 745  } else {
 746    if (nc >= MAXWORDLEN) return 0;
 747  }
 748  int captype = 0;
 749  int abbv = 0;
 750  int wl = 0;
 751
 752  // input conversion
 753  RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
 754  if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
 755  else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
 756
 757  if (wl == 0) return 0;
 758  int ns = 0;
 759  int capwords = 0;
 760
 761  // check capitalized form for FORCEUCASE
 762  if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) {
 763    int info = SPELL_ORIGCAP;
 764    char ** wlst;
 765    if (checkword(cw, &info, NULL)) {
 766        if (*slst) {
 767            wlst = *slst;
 768        } else {
 769            wlst = (char **) malloc(MAXSUGGESTION * sizeof(char *));
 770            if (wlst == NULL) return -1;
 771            *slst = wlst;
 772            for (int i = 0; i < MAXSUGGESTION; i++) {
 773                wlst[i] = NULL;
 774            }
 775        }
 776        wlst[0] = mystrdup(cw);
 777        mkinitcap(wlst[0]);
 778        return 1;
 779    }
 780  }
 781 
 782  switch(captype) {
 783     case NOCAP:   {
 784                     ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
 785                     break;
 786                   }
 787
 788     case INITCAP: {
 789                     capwords = 1;
 790                     ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
 791                     if (ns == -1) break;
 792                     memcpy(wspace,cw,(wl+1));
 793                     mkallsmall2(wspace, unicw, nc);
 794                     ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
 795                     break;
 796                   }
 797     case HUHINITCAP:
 798                    capwords = 1;
 799     case HUHCAP: {
 800                     ns = pSMgr->suggest(slst, cw, ns, &onlycmpdsug);
 801                     if (ns != -1) {
 802                        int prevns;
 803    		        // something.The -> something. The
 804                        char * dot = strchr(cw, '.');
 805		        if (dot && (dot > cw)) {
 806		            int captype_;
 807		            if (utf8) {
 808		               w_char w_[MAXWORDLEN];
 809			       int wl_ = u8_u16(w_, MAXWORDLEN, dot + 1);
 810		               captype_ = get_captype_utf8(w_, wl_, langnum);
 811		            } else captype_ = get_captype(dot+1, strlen(dot+1), csconv);
 812		    	    if (captype_ == INITCAP) {
 813                        	char * st = mystrdup(cw);
 814                        	if (st) st = (char *) realloc(st, wl + 2);
 815				if (st) {
 816                        		st[(dot - cw) + 1] = ' ';
 817                        		strcpy(st + (dot - cw) + 2, dot + 1);
 818                    			ns = insert_sug(slst, st, ns);
 819					free(st);
 820				}
 821		    	    }
 822		        }
 823                        if (captype == HUHINITCAP) {
 824                            // TheOpenOffice.org -> The OpenOffice.org
 825                            memcpy(wspace,cw,(wl+1));
 826                            mkinitsmall2(wspace, unicw, nc);
 827                            ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
 828                        }
 829                        memcpy(wspace,cw,(wl+1));
 830                        mkallsmall2(wspace, unicw, nc);
 831                        if (spell(wspace)) ns = insert_sug(slst, wspace, ns);
 832                        prevns = ns;
 833                        ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
 834                        if (captype == HUHINITCAP) {
 835                            mkinitcap2(wspace, unicw, nc);
 836                            if (spell(wspace)) ns = insert_sug(slst, wspace, ns);
 837                            ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
 838                        }
 839                        // aNew -> "a New" (instead of "a new")
 840                        for (int j = prevns; j < ns; j++) {
 841                           char * space = strchr((*slst)[j],' ');
 842                           if (space) {
 843                                int slen = strlen(space + 1);
 844                                // different case after space (need capitalisation)
 845                                if ((slen < wl) && strcmp(cw + wl - slen, space + 1)) {
 846                                    w_char w[MAXWORDLEN];
 847                                    int wc = 0;
 848                                    char * r = (*slst)[j];
 849                                    if (utf8) wc = u8_u16(w, MAXWORDLEN, space + 1);
 850                                    mkinitcap2(space + 1, w, wc);
 851                                    // set as first suggestion
 852                                    for (int k = j; k > 0; k--) (*slst)[k] = (*slst)[k - 1];
 853                                    (*slst)[0] = r;
 854                                }
 855                           }
 856                        }
 857                     }
 858                     break;
 859                   }
 860
 861     case ALLCAP: {
 862                     memcpy(wspace, cw, (wl+1));
 863                     mkallsmall2(wspace, unicw, nc);
 864                     ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
 865                     if (ns == -1) break;
 866                     if (pAMgr && pAMgr->get_keepcase() && spell(wspace))
 867                        ns = insert_sug(slst, wspace, ns);
 868                     mkinitcap2(wspace, unicw, nc);
 869                     ns = pSMgr->suggest(slst, wspace, ns, &onlycmpdsug);
 870                     for (int j=0; j < ns; j++) {
 871                        mkallcap((*slst)[j]);
 872                        if (pAMgr && pAMgr->get_checksharps()) {
 873                            char * pos;
 874                            if (utf8) {
 875                                pos = strstr((*slst)[j], "\xC3\x9F");
 876                                while (pos) {
 877                                    *pos = 'S';
 878                                    *(pos+1) = 'S';
 879                                    pos = strstr(pos+2, "\xC3\x9F");
 880                                }
 881                            } else {
 882                                pos = strchr((*slst)[j], '\xDF');
 883                                while (pos) {
 884                                    (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2);
 885                                    mystrrep((*slst)[j], "\xDF", "SS");
 886                                    pos = strchr((*slst)[j], '\xDF');
 887                                }
 888                            }
 889                        }
 890                     }
 891                     break;
 892                   }
 893  }
 894
 895 // LANG_hu section: replace '-' with ' ' in Hungarian
 896  if (langnum == LANG_hu) {
 897      for (int j=0; j < ns; j++) {
 898          char * pos = strchr((*slst)[j],'-');
 899          if (pos) {
 900              int info;
 901              char w[MAXWORDUTF8LEN];
 902              *pos = '\0';
 903              strcpy(w, (*slst)[j]);
 904              strcat(w, pos + 1);
 905              spell(w, &info, NULL);
 906              if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
 907                  *pos = ' ';
 908              } else *pos = '-';
 909          }
 910      }
 911  }
 912  // END OF LANG_hu section
 913
 914  // try ngram approach since found nothing or only compound words
 915  if (pAMgr && (ns == 0 || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0) && (*slst)) {
 916      switch(captype) {
 917          case NOCAP: {
 918              ns = pSMgr->ngsuggest(*slst, cw, ns, pHMgr, maxdic);
 919              break;
 920          }
 921	  case HUHINITCAP:
 922              capwords = 1;
 923          case HUHCAP: {
 924              memcpy(wspace,cw,(wl+1));
 925              mkallsmall2(wspace, unicw, nc);
 926              ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
 927	      break;
 928          }
 929         case INITCAP: {
 930              capwords = 1;
 931              memcpy(wspace,cw,(wl+1));
 932              mkallsmall2(wspace, unicw, nc);
 933              ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
 934              break;
 935          }
 936          case ALLCAP: {
 937              memcpy(wspace,cw,(wl+1));
 938              mkallsmall2(wspace, unicw, nc);
 939	      int oldns = ns;
 940              ns = pSMgr->ngsuggest(*slst, wspace, ns, pHMgr, maxdic);
 941              for (int j = oldns; j < ns; j++)
 942                  mkallcap((*slst)[j]);
 943              break;
 944         }
 945      }
 946  }
 947
 948  // try dash suggestion (Afo-American -> Afro-American)
 949  if (char * pos = strchr(cw, '-')) {
 950     char * ppos = cw;
 951     int nodashsug = 1;
 952     char ** nlst = NULL;
 953     int nn = 0;
 954     int last = 0;
 955     if (*slst) {
 956        for (int j = 0; j < ns && nodashsug == 1; j++) {
 957           if (strchr((*slst)[j], '-')) nodashsug = 0;
 958        }
 959     }
 960     while (nodashsug && !last) {
 961	if (*pos == '\0') last = 1; else *pos = '\0';
 962        if (!spell(ppos)) {
 963          nn = suggest(&nlst, ppos);
 964          for (int j = nn - 1; j >= 0; j--) {
 965            strncpy(wspace, cw, ppos - cw);
 966            strcpy(wspace + (ppos - cw), nlst[j]);
 967            if (!last) {
 968            	strcat(wspace, "-");
 969		strcat(wspace, pos + 1);
 970	    }
 971            ns = insert_sug(slst, wspace, ns);
 972            free(nlst[j]);
 973          }
 974          if (nlst != NULL) free(nlst);
 975          nodashsug = 0;
 976        }
 977	if (!last) {
 978          *pos = '-';
 979          ppos = pos + 1;
 980          pos = strchr(ppos, '-');
 981        }
 982	if (!pos) pos = cw + strlen(cw);
 983     }
 984  }
 985
 986  // word reversing wrapper for complex prefixes
 987  if (complexprefixes) {
 988    for (int j = 0; j < ns; j++) {
 989      if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]);
 990    }
 991  }
 992
 993  // capitalize
 994  if (capwords) for (int j=0; j < ns; j++) {
 995      mkinitcap((*slst)[j]);
 996  }
 997
 998  // expand suggestions with dot(s)
 999  if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
1000    for (int j = 0; j < ns; j++) {
1001      (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
1002      strcat((*slst)[j], word + strlen(word) - abbv);
1003    }
1004  }
1005
1006  // remove bad capitalized and forbidden forms
1007  if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) {
1008  switch (captype) {
1009    case INITCAP:
1010    case ALLCAP: {
1011      int l = 0;
1012      for (int j=0; j < ns; j++) {
1013        if (!strchr((*slst)[j],' ') && !spell((*slst)[j])) {
1014          char s[MAXSWUTF8L];
1015          w_char w[MAXSWL];
1016          int len;
1017          if (utf8) {
1018            len = u8_u16(w, MAXSWL, (*slst)[j]);
1019          } else {
1020            strcpy(s, (*slst)[j]);
1021            len = strlen(s);
1022          }
1023          mkallsmall2(s, w, len);
1024          free((*slst)[j]);
1025          if (spell(s)) {
1026            (*slst)[l] = mystrdup(s);
1027            if ((*slst)[l]) l++;
1028          } else {
1029            mkinitcap2(s, w, len);
1030            if (spell(s)) {
1031              (*slst)[l] = mystrdup(s);
1032              if ((*slst)[l]) l++;
1033            }
1034          }
1035        } else {
1036          (*slst)[l] = (*slst)[j];
1037          l++;
1038        }
1039      }
1040      ns = l;
1041    }
1042  }
1043  }
1044
1045  // remove duplications
1046  int l = 0;
1047  for (int j = 0; j < ns; j++) {
1048    (*slst)[l] = (*slst)[j];
1049    for (int k = 0; k < l; k++) {
1050      if (strcmp((*slst)[k], (*slst)[j]) == 0) {
1051        free((*slst)[j]);
1052        l--;
1053        break;
1054      }
1055    }
1056    l++;
1057  }
1058  ns = l;
1059
1060  // output conversion
1061  rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
1062  for (int j = 0; rl && j < ns; j++) {
1063    if (rl->conv((*slst)[j], wspace)) {
1064      free((*slst)[j]);
1065      (*slst)[j] = mystrdup(wspace);
1066    }
1067  }
1068
1069  // if suggestions removed by nosuggest, onlyincompound parameters
1070  if (l == 0 && *slst) {
1071    free(*slst);
1072    *slst = NULL;
1073  }
1074  return l;
1075}
1076
1077void Hunspell::free_list(char *** slst, int n) {
1078        freelist(slst, n);
1079}
1080
1081char * Hunspell::get_dic_encoding()
1082{
1083  return encoding;
1084}
1085
1086#ifdef HUNSPELL_EXPERIMENTAL
1087// XXX need UTF-8 support
1088int Hunspell::suggest_auto(char*** slst, const char * word)
1089{
1090  char cw[MAXWORDUTF8LEN];
1091  char wspace[MAXWORDUTF8LEN];
1092  if (!pSMgr || maxdic == 0) return 0;
1093  int wl = strlen(word);
1094  if (utf8) {
1095    if (wl >= MAXWORDUTF8LEN) return 0;
1096  } else {
1097    if (wl >= MAXWORDLEN) return 0;
1098  }
1099  int captype = 0;
1100  int abbv = 0;
1101  wl = cleanword(cw, word, &captype, &abbv);
1102  if (wl == 0) return 0;
1103  int ns = 0;
1104  *slst = NULL; // HU, nsug in pSMgr->suggest
1105
1106  switch(captype) {
1107     case NOCAP:   {
1108                     ns = pSMgr->suggest_auto(slst, cw, ns);
1109                     if (ns>0) break;
1110                     break;
1111                   }
1112
1113     case INITCAP: {
1114                     memcpy(wspace,cw,(wl+1));
1115                     mkallsmall(wspace);
1116                     ns = pSMgr->suggest_auto(slst, wspace, ns);
1117                     for (int j=0; j < ns; j++)
1118                       mkinitcap((*slst)[j]);
1119                     ns = pSMgr->suggest_auto(slst, cw, ns);
1120                     break;
1121
1122                   }
1123
1124     case HUHINITCAP:
1125     case HUHCAP: {
1126                     ns = pSMgr->suggest_auto(slst, cw, ns);
1127                     if (ns == 0) {
1128                        memcpy(wspace,cw,(wl+1));
1129                        mkallsmall(wspace);
1130                        ns = pSMgr->suggest_auto(slst, wspace, ns);
1131                     }
1132                     break;
1133                   }
1134
1135     case ALLCAP: {
1136                     memcpy(wspace,cw,(wl+1));
1137                     mkallsmall(wspace);
1138                     ns = pSMgr->suggest_auto(slst, wspace, ns);
1139
1140                     mkinitcap(wspace);
1141                     ns = pSMgr->suggest_auto(slst, wspace, ns);
1142
1143                     for (int j=0; j < ns; j++)
1144                       mkallcap((*slst)[j]);
1145                     break;
1146                   }
1147  }
1148
1149  // word reversing wrapper for complex prefixes
1150  if (complexprefixes) {
1151    for (int j = 0; j < ns; j++) {
1152      if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]);
1153    }
1154  }
1155
1156  // expand suggestions with dot(s)
1157  if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
1158    for (int j = 0; j < ns; j++) {
1159      (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
1160      strcat((*slst)[j], word + strlen(word) - abbv);
1161    }
1162  }
1163
1164  // LANG_hu section: replace '-' with ' ' in Hungarian
1165  if (langnum == LANG_hu) {
1166      for (int j=0; j < ns; j++) {
1167          char * pos = strchr((*slst)[j],'-');
1168          if (pos) {
1169              int info;
1170              char w[MAXWORDUTF8LEN];
1171              *pos = '\0';
1172              strcpy(w, (*slst)[j]);
1173              strcat(w, pos + 1);
1174              spell(w, &info, NULL);
1175              if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
1176                  *pos = ' ';
1177              } else *pos = '-';
1178          }
1179      }
1180  }
1181  // END OF LANG_hu section
1182  return ns;
1183}
1184#endif
1185
1186int Hunspell::stem(char*** slst, char ** desc, int n)
1187{
1188  char result[MAXLNLEN];
1189  char result2[MAXLNLEN];
1190  *slst = NULL;
1191  if (n == 0) return 0;
1192  *result2 = '\0';
1193  for (int i = 0; i < n; i++) {
1194    *result = '\0';
1195    // add compound word parts (except the last one)
1196    char * s = (char *) desc[i];
1197    char * part = strstr(s, MORPH_PART);
1198    if (part) {
1199        char * nextpart = strstr(part + 1, MORPH_PART);
1200        while (nextpart) {
1201            copy_field(result + strlen(result), part, MORPH_PART);
1202            part = nextpart;
1203            nextpart = strstr(part + 1, MORPH_PART);
1204        }
1205        s = part;
1206    }
1207
1208    char **pl;
1209    char tok[MAXLNLEN];
1210    strcpy(tok, s);
1211    char * alt = strstr(tok, " | ");
1212    while (alt) {
1213        alt[1] = MSEP_ALT;
1214        alt = strstr(alt, " | ");
1215    }
1216    int pln = line_tok(tok, &pl, MSEP_ALT);
1217    for (int k = 0; k < pln; k++) {
1218        // add derivational suffixes
1219        if (strstr(pl[k], MORPH_DERI_SFX)) {
1220            // remove inflectional suffixes
1221            char * is = strstr(pl[k], MORPH_INFL_SFX);
1222            if (is) *is = '\0';
1223            char * sg = pSMgr->suggest_gen(&(pl[k]), 1, pl[k]);
1224            if (sg) {
1225                char ** gen;
1226                int genl = line_tok(sg, &gen, MSEP_REC);
1227                free(sg);
1228                for (int j = 0; j < genl; j++) {
1229                    sprintf(result2 + strlen(result2), "%c%s%s",
1230                            MSEP_REC, result, gen[j]);
1231                }
1232                freelist(&gen, genl);
1233            }
1234        } else {
1235            sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result);
1236            if (strstr(pl[k], MORPH_SURF_PFX)) {
1237                copy_field(result2 + strlen(result2), pl[k], MORPH_SURF_PFX);
1238            }
1239            copy_field(result2 + strlen(result2), pl[k], MORPH_STEM);
1240        }
1241    }
1242    freelist(&pl, pln);
1243  }
1244  int sln = line_tok(result2, slst, MSEP_REC);
1245  return uniqlist(*slst, sln);
1246
1247}
1248
1249int Hunspell::stem(char*** slst, const char * word)
1250{
1251  char ** pl;
1252  int pln = analyze(&pl, word);
1253  int pln2 = stem(slst, pl, pln);
1254  freelist(&pl, pln);
1255  return pln2;
1256}
1257
1258#ifdef HUNSPELL_EXPERIMENTAL
1259int Hunspell::suggest_pos_stems(char*** slst, const char * word)
1260{
1261  char cw[MAXWORDUTF8LEN];
1262  char wspace[MAXWORDUTF8LEN];
1263  if (! pSMgr || maxdic == 0) return 0;
1264  int wl = strlen(word);
1265  if (utf8) {
1266    if (wl >= MAXWORDUTF8LEN) return 0;
1267  } else {
1268    if (wl >= MAXWORDLEN) return 0;
1269  }
1270  int captype = 0;
1271  int abbv = 0;
1272  wl = cleanword(cw, word, &captype, &abbv);
1273  if (wl == 0) return 0;
1274
1275  int ns = 0; // ns=0 = normalized input
1276
1277  *slst = NULL; // HU, nsug in pSMgr->suggest
1278
1279  switch(captype) {
1280     case HUHCAP:
1281     case NOCAP:   {
1282                     ns = pSMgr->suggest_pos_stems(slst, cw, ns);
1283
1284                     if ((abbv) && (ns == 0)) {
1285                         memcpy(wspace,cw,wl);
1286                         *(wspace+wl) = '.';
1287                         *(wspace+wl+1) = '\0';
1288                         ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
1289                     }
1290
1291                     break;
1292                   }
1293
1294     case INITCAP: {
1295
1296                     ns = pSMgr->suggest_pos_stems(slst, cw, ns);
1297
1298                     if (ns == 0 || ((*slst)[0][0] == '#')) {
1299                        memcpy(wspace,cw,(wl+1));
1300                        mkallsmall(wspace);
1301                        ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
1302                     }
1303
1304                     break;
1305
1306                   }
1307
1308     case ALLCAP: {
1309                     ns = pSMgr->suggest_pos_stems(slst, cw, ns);
1310                     if (ns != 0) break;
1311
1312                     memcpy(wspace,cw,(wl+1));
1313                     mkallsmall(wspace);
1314                     ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
1315
1316                     if (ns == 0) {
1317                         mkinitcap(wspace);
1318                         ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
1319                     }
1320                     break;
1321                   }
1322  }
1323
1324  return ns;
1325}
1326#endif // END OF HUNSPELL_EXPERIMENTAL CODE
1327
1328const char * Hunspell::get_wordchars()
1329{
1330  return pAMgr->get_wordchars();
1331}
1332
1333unsigned short * Hunspell::get_wordchars_utf16(int * len)
1334{
1335  return pAMgr->get_wordchars_utf16(len);
1336}
1337
1338void Hunspell::mkinitcap(char * p)
1339{
1340  if (!utf8) {
1341    if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper;
1342  } else {
1343      int len;
1344      w_char u[MAXWORDLEN];
1345      len = u8_u16(u, MAXWORDLEN, p);
1346      unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
1347      u[0].h = (unsigned char) (i >> 8);
1348      u[0].l = (unsigned char) (i & 0x00FF);
1349      u16_u8(p, MAXWORDUTF8LEN, u, len);
1350  }
1351}
1352
1353int Hunspell::mkinitcap2(char * p, w_char * u, int nc)
1354{
1355  if (!utf8) {
1356    if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper;
1357  } else if (nc > 0) {
1358      unsigned short i = unicodetoupper((u[0].h << 8) + u[0].l, langnum);
1359      u[0].h = (unsigned char) (i >> 8);
1360      u[0].l = (unsigned char) (i & 0x00FF);
1361      u16_u8(p, MAXWORDUTF8LEN, u, nc);
1362      return strlen(p);
1363  }
1364  return nc;
1365}
1366
1367int Hunspell::mkinitsmall2(char * p, w_char * u, int nc)
1368{
1369  if (!utf8) {
1370    if (*p != '\0') *p = csconv[((unsigned char)*p)].clower;
1371  } else if (nc > 0) {
1372      unsigned short i = unicodetolower((u[0].h << 8) + u[0].l, langnum);
1373      u[0].h = (unsigned char) (i >> 8);
1374      u[0].l = (unsigned char) (i & 0x00FF);
1375      u16_u8(p, MAXWORDUTF8LEN, u, nc);
1376      return strlen(p);
1377  }
1378  return nc;
1379}
1380
1381int Hunspell::add(const char * word)
1382{
1383    if (pHMgr[0]) return (pHMgr[0])->add(word);
1384    return 0;
1385}
1386
1387int Hunspell::add_with_affix(const char * word, const char * example)
1388{
1389    if (pHMgr[0]) return (pHMgr[0])->add_with_affix(word, example);
1390    return 0;
1391}
1392
1393int Hunspell::remove(const char * word)
1394{
1395    if (pHMgr[0]) return (pHMgr[0])->remove(word);
1396    return 0;
1397}
1398
1399const char * Hunspell::get_version()
1400{
1401  return pAMgr->get_version();
1402}
1403
1404struct cs_info * Hunspell::get_csconv()
1405{
1406  return csconv;
1407}
1408
1409void Hunspell::cat_result(char * result, char * st)
1410{
1411    if (st) {
1412        if (*result) mystrcat(result, "\n", MAXLNLEN);
1413        mystrcat(result, st, MAXLNLEN);
1414        free(st);
1415    }
1416}
1417
1418int Hunspell::analyze(char*** slst, const char * word)
1419{
1420  char cw[MAXWORDUTF8LEN];
1421  char wspace[MAXWORDUTF8LEN];
1422  w_char unicw[MAXWORDLEN];
1423  int wl2 = 0;
1424  *slst = NULL;
1425  if (! pSMgr || maxdic == 0) return 0;
1426  int nc = strlen(word);
1427  if (utf8) {
1428    if (nc >= MAXWORDUTF8LEN) return 0;
1429  } else {
1430    if (nc >= MAXWORDLEN) return 0;
1431  }
1432  int captype = 0;
1433  int abbv = 0;
1434  int wl = 0;
1435
1436  // input conversion
1437  RepList * rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
1438  if (rl && rl->conv(word, wspace)) wl = cleanword2(cw, wspace, unicw, &nc, &captype, &abbv);
1439  else wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
1440
1441  if (wl == 0) {
1442      if (abbv) {
1443          for (wl = 0; wl < abbv; wl++) cw[wl] = '.';
1444          cw[wl] = '\0';
1445          abbv = 0;
1446      } else return 0;
1447  }
1448
1449  char result[MAXLNLEN];
1450  char * st = NULL;
1451
1452  *result = '\0';
1453
1454  int n = 0;
1455  int n2 = 0;
1456  int n3 = 0;
1457
1458  // test numbers
1459  // LANG_hu section: set dash information for suggestions
1460  if (langnum == LANG_hu) {
1461  while ((n < wl) &&
1462        (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) {
1463        n++;
1464        if ((cw[n] == '.') || (cw[n] == ',')) {
1465                if (((n2 == 0) && (n > 3)) ||
1466                        ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break;
1467                n2++;
1468                n3 = n;
1469        }
1470  }
1471
1472  if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0;
1473  if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n, NULL, NULL))) {
1474        mystrcat(result, cw, MAXLNLEN);
1475        result[n - 1] = '\0';
1476        if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1));
1477        else {
1478                char sign = cw[n];
1479                cw[n] = '\0';
1480                cat_result(result, pSMgr->suggest_morph(cw + n - 1));
1481                mystrcat(result, "+", MAXLNLEN); // XXX SPEC. MORPHCODE
1482                cw[n] = sign;
1483                cat_result(result, pSMgr->suggest_morph(cw + n));
1484        }
1485        return line_tok(result, slst, MSEP_REC);
1486  }
1487  }
1488  // END OF LANG_hu section
1489
1490  switch(captype) {
1491     case HUHCAP:
1492     case HUHINITCAP:
1493     case NOCAP:  {
1494                    cat_result(result, pSMgr->suggest_morph(cw));
1495                    if (abbv) {
1496                        memcpy(wspace,cw,wl);
1497                        *(wspace+wl) = '.';
1498                        *(wspace+wl+1) = '\0';
1499                        cat_result(result, pSMgr->suggest_morph(wspace));
1500                    }
1501                    break;
1502                }
1503     case INITCAP: {
1504                     wl = mkallsmall2(cw, unicw, nc);
1505                     memcpy(wspace,cw,(wl+1));
1506                     wl2 = mkinitcap2(cw, unicw, nc);
1507                     cat_result(result, pSMgr->suggest_morph(wspace));
1508                     cat_result(result, pSMgr->suggest_morph(cw));
1509                     if (abbv) {
1510                         *(wspace+wl) = '.';
1511                         *(wspace+wl+1) = '\0';
1512                         cat_result(result, pSMgr->suggest_morph(wspace));
1513
1514                         memcpy(wspace, cw, wl2);
1515                         *(wspace+wl2) = '.';
1516                         *(wspace+wl2+1) = '\0';
1517
1518                         cat_result(result, pSMgr->suggest_morph(wspace));
1519                     }
1520                     break;
1521                   }
1522     case ALLCAP: {
1523                     cat_result(result, pSMgr->suggest_morph(cw));
1524                     if (abbv) {
1525                         memcpy(wspace,cw,wl);
1526                         *(wspace+wl) = '.';
1527                         *(wspace+wl+1) = '\0';
1528                         cat_result(result, pSMgr->suggest_morph(cw));
1529                     }
1530                     wl = mkallsmall2(cw, unicw, nc);
1531                     memcpy(wspace,cw,(wl+1));
1532                     wl2 = mkinitcap2(cw, unicw, nc);
1533
1534                     cat_result(result, pSMgr->suggest_morph(wspace));
1535                     cat_result(result, pSMgr->suggest_morph(cw));
1536                     if (abbv) {
1537                         *(wspace+wl) = '.';
1538                         *(wspace+wl+1) = '\0';
1539                         cat_result(result, pSMgr->suggest_morph(wspace));
1540
1541                         memcpy(wspace, cw, wl2);
1542                         *(wspace+wl2) = '.';
1543                         *(wspace+wl2+1) = '\0';
1544
1545                         cat_result(result, pSMgr->suggest_morph(wspace));
1546                     }
1547                     break;
1548                   }
1549  }
1550
1551  if (*result) {
1552    // word reversing wrapper for complex prefixes
1553    if (complexprefixes) {
1554      if (utf8) reverseword_utf(result); else reverseword(result);
1555    }
1556    return line_tok(result, slst, MSEP_REC);
1557  }
1558
1559  // compound word with dash (HU) I18n
1560  char * dash = NULL;
1561  int nresult = 0;
1562  // LANG_hu section: set dash information for suggestions
1563  if (langnum == LANG_hu) dash = (char *) strchr(cw,'-');
1564  if ((langnum == LANG_hu) && dash) {
1565      *dash='\0';
1566      // examine 2 sides of the dash
1567      if (dash[1] == '\0') { // base word ending with dash
1568        if (spell(cw)) {
1569		char * p = pSMgr->suggest_morph(cw);
1570		if (p) {
1571		    int ret = line_tok(p, slst, MSEP_REC);
1572		    free(p);
1573		    return ret;
1574		}
1575		
1576	}
1577      } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat.
1578        if (spell(cw) && (spell("-e"))) {
1579                        st = pSMgr->suggest_morph(cw);
1580                        if (st) {
1581                                mystrcat(result, st, MAXLNLEN);
1582                                free(st);
1583                        }
1584                        mystrcat(result,"+", MAXLNLEN); // XXX spec. separator in MORPHCODE
1585                        st = pSMgr->suggest_morph("-e");
1586                        if (st) {
1587                                mystrcat(result, st, MAXLNLEN);
1588                                free(st);
1589                        }
1590                        return line_tok(result, slst, MSEP_REC);
1591                }
1592      } else {
1593      // fi…

Large files are truncated, but you can click here to view the full file