/extensions/spellcheck/hunspell/src/phonet.cpp

http://github.com/zpao/v8monkey · C++ · 309 lines · 203 code · 26 blank · 80 comment · 142 complexity · 6aee07d0a5ab1c2336ae52bdb5ce7ec1 MD5 · raw file

  1. /******* BEGIN LICENSE BLOCK *******
  2. * Version: MPL 1.1/GPL 2.0/LGPL 2.1
  3. *
  4. * The contents of this file are subject to the Mozilla Public License Version
  5. * 1.1 (the "License"); you may not use this file except in compliance with
  6. * the License. You may obtain a copy of the License at
  7. * http://www.mozilla.org/MPL/
  8. *
  9. * Software distributed under the License is distributed on an "AS IS" basis,
  10. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  11. * for the specific language governing rights and limitations under the
  12. * License.
  13. *
  14. * The Initial Developer of the Original Code is Björn Jacke. Portions created
  15. * by the Initial Developers are Copyright (C) 2000-2007 the Initial
  16. * Developers. All Rights Reserved.
  17. *
  18. * Contributor(s): Björn Jacke (bjoern.jacke@gmx.de)
  19. * László Németh (nemethl@gyorsposta.hu)
  20. * Caolan McNamara (caolanm@redhat.com)
  21. *
  22. * Alternatively, the contents of this file may be used under the terms of
  23. * either the GNU General Public License Version 2 or later (the "GPL"), or
  24. * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  25. * in which case the provisions of the GPL or the LGPL are applicable instead
  26. * of those above. If you wish to allow use of your version of this file only
  27. * under the terms of either the GPL or the LGPL, and not to allow others to
  28. * use your version of this file under the terms of the MPL, indicate your
  29. * decision by deleting the provisions above and replace them with the notice
  30. * and other provisions required by the GPL or the LGPL. If you do not delete
  31. * the provisions above, a recipient may use your version of this file under
  32. * the terms of any one of the MPL, the GPL or the LGPL.
  33. *
  34. * Changelog:
  35. * 2000-01-05 Björn Jacke <bjoern.jacke AT gmx.de>
  36. * Initial Release inspired by the article about phonetic
  37. * transformations out of c't 25/1999
  38. *
  39. * 2007-07-26 Björn Jacke <bjoern.jacke AT gmx.de>
  40. * Released under MPL/GPL/LGPL tri-license for Hunspell
  41. *
  42. * 2007-08-23 László Németh <nemeth at OOo>
  43. * Porting from Aspell to Hunspell using C-like structs
  44. *
  45. ******* END LICENSE BLOCK *******/
  46. #include <stdlib.h>
  47. #include <string.h>
  48. #include <stdio.h>
  49. #include <ctype.h>
  50. #include "csutil.hxx"
  51. #include "phonet.hxx"
  52. void init_phonet_hash(phonetable & parms)
  53. {
  54. int i, k;
  55. for (i = 0; i < HASHSIZE; i++) {
  56. parms.hash[i] = -1;
  57. }
  58. for (i = 0; parms.rules[i][0] != '\0'; i += 2) {
  59. /** set hash value **/
  60. k = (unsigned char) parms.rules[i][0];
  61. if (parms.hash[k] < 0) {
  62. parms.hash[k] = i;
  63. }
  64. }
  65. }
  // Copy the NUL-terminated string at src (terminator included) to dest.
  // Unlike strcpy, the two ranges may overlap in either direction: the copy
  // is delegated to memmove, so it is correct both when dest < src (the way
  // this file shifts the tail of a buffer to the left) and when dest > src.
  // dest must have room for strlen(src) + 1 bytes.
  static inline void strmove(char * dest, const char * src) {
    memmove(dest, src, strlen(src) + 1);
  }
  /*
   * Letter test used by the rule matcher.
   * Bytes with the high bit set (value >= 128 when viewed as unsigned char)
   * always count as letters and yield 1; every other byte is classified by
   * the C library's isalpha(), whose (zero / non-zero) result is returned
   * unchanged.
   */
  static int myisalpha(char ch) {
    const unsigned char byte = (unsigned char) ch;
    return (byte >= 128) ? 1 : isalpha(ch);
  }
  77. /* phonetic transcription algorithm */
  78. /* see: http://aspell.net/man-html/Phonetic-Code.html */
  79. /* convert string to uppercase before this call */
  80. int phonet (const char * inword, char * target,
  81. int len,
  82. phonetable & parms)
  83. {
  84. /** Do phonetic transformation. **/
  85. /** "len" = length of "inword" incl. '\0'. **/
  86. /** result: >= 0: length of "target" **/
  87. /** otherwise: error **/
  88. int i,j,k=0,n,p,z;
  89. int k0,n0,p0=-333,z0;
  90. char c, c0;
  91. const char * s;
  92. typedef unsigned char uchar;
  93. char word[MAXPHONETUTF8LEN + 1];
  94. if (len == -1) len = strlen(inword);
  95. if (len > MAXPHONETUTF8LEN) return 0;
  96. strcpy(word, inword);
  97. /** check word **/
  98. i = j = z = 0;
  99. while ((c = word[i]) != '\0') {
  100. n = parms.hash[(uchar) c];
  101. z0 = 0;
  102. if (n >= 0) {
  103. /** check all rules for the same letter **/
  104. while (parms.rules[n][0] == c) {
  105. /** check whole string **/
  106. k = 1; /** number of found letters **/
  107. p = 5; /** default priority **/
  108. s = parms.rules[n];
  109. s++; /** important for (see below) "*(s-1)" **/
  110. while (*s != '\0' && word[i+k] == *s
  111. && !isdigit ((unsigned char) *s) && strchr ("(-<^$", *s) == NULL) {
  112. k++;
  113. s++;
  114. }
  115. if (*s == '(') {
  116. /** check letters in "(..)" **/
  117. if (myisalpha(word[i+k]) // ...could be implied?
  118. && strchr(s+1, word[i+k]) != NULL) {
  119. k++;
  120. while (*s != ')')
  121. s++;
  122. s++;
  123. }
  124. }
  125. p0 = (int) *s;
  126. k0 = k;
  127. while (*s == '-' && k > 1) {
  128. k--;
  129. s++;
  130. }
  131. if (*s == '<')
  132. s++;
  133. if (isdigit ((unsigned char) *s)) {
  134. /** determine priority **/
  135. p = *s - '0';
  136. s++;
  137. }
  138. if (*s == '^' && *(s+1) == '^')
  139. s++;
  140. if (*s == '\0'
  141. || (*s == '^'
  142. && (i == 0 || ! myisalpha(word[i-1]))
  143. && (*(s+1) != '$'
  144. || (! myisalpha(word[i+k0]) )))
  145. || (*s == '$' && i > 0
  146. && myisalpha(word[i-1])
  147. && (! myisalpha(word[i+k0]) )))
  148. {
  149. /** search for followup rules, if: **/
  150. /** parms.followup and k > 1 and NO '-' in searchstring **/
  151. c0 = word[i+k-1];
  152. n0 = parms.hash[(uchar) c0];
  153. // if (parms.followup && k > 1 && n0 >= 0
  154. if (k > 1 && n0 >= 0
  155. && p0 != (int) '-' && word[i+k] != '\0') {
  156. /** test follow-up rule for "word[i+k]" **/
  157. while (parms.rules[n0][0] == c0) {
  158. /** check whole string **/
  159. k0 = k;
  160. p0 = 5;
  161. s = parms.rules[n0];
  162. s++;
  163. while (*s != '\0' && word[i+k0] == *s
  164. && ! isdigit((unsigned char) *s) && strchr("(-<^$",*s) == NULL) {
  165. k0++;
  166. s++;
  167. }
  168. if (*s == '(') {
  169. /** check letters **/
  170. if (myisalpha(word[i+k0])
  171. && strchr (s+1, word[i+k0]) != NULL) {
  172. k0++;
  173. while (*s != ')' && *s != '\0')
  174. s++;
  175. if (*s == ')')
  176. s++;
  177. }
  178. }
  179. while (*s == '-') {
  180. /** "k0" gets NOT reduced **/
  181. /** because "if (k0 == k)" **/
  182. s++;
  183. }
  184. if (*s == '<')
  185. s++;
  186. if (isdigit ((unsigned char) *s)) {
  187. p0 = *s - '0';
  188. s++;
  189. }
  190. if (*s == '\0'
  191. /** *s == '^' cuts **/
  192. || (*s == '$' && ! myisalpha(word[i+k0])))
  193. {
  194. if (k0 == k) {
  195. /** this is just a piece of the string **/
  196. n0 += 2;
  197. continue;
  198. }
  199. if (p0 < p) {
  200. /** priority too low **/
  201. n0 += 2;
  202. continue;
  203. }
  204. /** rule fits; stop search **/
  205. break;
  206. }
  207. n0 += 2;
  208. } /** End of "while (parms.rules[n0][0] == c0)" **/
  209. if (p0 >= p && parms.rules[n0][0] == c0) {
  210. n += 2;
  211. continue;
  212. }
  213. } /** end of follow-up stuff **/
  214. /** replace string **/
  215. s = parms.rules[n+1];
  216. p0 = (parms.rules[n][0] != '\0'
  217. && strchr (parms.rules[n]+1,'<') != NULL) ? 1:0;
  218. if (p0 == 1 && z == 0) {
  219. /** rule with '<' is used **/
  220. if (j > 0 && *s != '\0'
  221. && (target[j-1] == c || target[j-1] == *s)) {
  222. j--;
  223. }
  224. z0 = 1;
  225. z = 1;
  226. k0 = 0;
  227. while (*s != '\0' && word[i+k0] != '\0') {
  228. word[i+k0] = *s;
  229. k0++;
  230. s++;
  231. }
  232. if (k > k0)
  233. strmove (&word[0]+i+k0, &word[0]+i+k);
  234. /** new "actual letter" **/
  235. c = word[i];
  236. }
  237. else { /** no '<' rule used **/
  238. i += k - 1;
  239. z = 0;
  240. while (*s != '\0'
  241. && *(s+1) != '\0' && j < len) {
  242. if (j == 0 || target[j-1] != *s) {
  243. target[j] = *s;
  244. j++;
  245. }
  246. s++;
  247. }
  248. /** new "actual letter" **/
  249. c = *s;
  250. if (parms.rules[n][0] != '\0'
  251. && strstr (parms.rules[n]+1, "^^") != NULL) {
  252. if (c != '\0') {
  253. target[j] = c;
  254. j++;
  255. }
  256. strmove (&word[0], &word[0]+i+1);
  257. i = 0;
  258. z0 = 1;
  259. }
  260. }
  261. break;
  262. } /** end of follow-up stuff **/
  263. n += 2;
  264. } /** end of while (parms.rules[n][0] == c) **/
  265. } /** end of if (n >= 0) **/
  266. if (z0 == 0) {
  267. // if (k && (assert(p0!=-333),!p0) && j < len && c != '\0'
  268. // && (!parms.collapse_result || j == 0 || target[j-1] != c)){
  269. if (k && !p0 && j < len && c != '\0'
  270. && (1 || j == 0 || target[j-1] != c)){
  271. /** condense only double letters **/
  272. target[j] = c;
  273. ///printf("\n setting \n");
  274. j++;
  275. }
  276. i++;
  277. z = 0;
  278. k=0;
  279. }
  280. } /** end of while ((c = word[i]) != '\0') **/
  281. target[j] = '\0';
  282. return (j);
  283. } /** end of function "phonet" **/