PageRenderTime 48ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/demo_version_0.1/Mobwiz.SpellChecker.Core/Phonets/AspellPhonet.cs

http://spellchecker.codeplex.com
C# | 624 lines | 410 code | 80 blank | 134 comment | 74 complexity | 192db82b1f1f2ed77ab50c2db7c5113a MD5 | raw file
  1. /********************** File Header *****************\
  2. File Name : <Aspell Phonet>
  3. Project Name : <SpellChecker>
  4. Author : <Mobwiz>
  5. Description : Phonet based on Aspell's phonet.
  6. Change log :
  7. ------------------------------------------------
  8. #1 2012-6-21
  9. #2
  10. **/
  11. namespace Mobwiz.SpellChecker.Core.Phonets
  12. {
  13. using System;
  14. using System.Collections.Generic;
  15. using System.Diagnostics;
  16. using System.IO;
  17. using System.Linq;
  18. using System.Text;
  19. public class AspellPhonet : IPhonet
  20. {
  21. #region Fields
  /// <summary>Value of the "version" entry found in the rule file.</summary>
  private string version;

  /// <summary>Collapse flag filled in by BuildPhonetRule while the rule file header is parsed.</summary>
  private bool collapse = false;

  /// <summary>When true, Transform looks for a follow-up rule before applying a multi-letter match.</summary>
  private bool followup = true;

  /// <summary>Alphabet declared by the rule file through the "alphabet" entry, if any.</summary>
  private char[] alphabets;

  /// <summary>All rules loaded from the rule file, kept in file order.</summary>
  private List<PhonetRule> rules;

  /// <summary>Header keywords of the rule file that do not describe a replacement rule.</summary>
  private static readonly string[] IGNORED_KEYWORDS = { "version", "followup", "collapse_result" };

  /// <summary>Keyword that introduces the alphabet definition in the rule file.</summary>
  private static readonly string KEYWORD_ALPHBET = "alphabet";

  /// <summary>Plain upper-case English alphabet.</summary>
  private static readonly char[] defaultEnglishAlphabet = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' };

  /// <summary>Marks the start of the alphabet value (see KEYWORD_ALPHBET).</summary>
  private static readonly char ALPHABET_START = '[';

  /// <summary>Marks the end of the alphabet value (see KEYWORD_ALPHBET).</summary>
  private static readonly char ALPHABET_END = ']';

  /// <summary>
  /// Starts a group of characters, each of which can be appended to the match
  /// expression of a rule.
  /// </summary>
  private static readonly char STARTMULTI = '(';

  /// <summary>
  /// Ends a group of characters, each of which can be appended to the match
  /// expression of a rule.
  /// </summary>
  private static readonly char ENDMULTI = ')';

  /// <summary>
  /// Characters that end the literal part of a match expression while a rule
  /// line is being parsed.
  /// </summary>
  private static readonly string RESERVEDSTRING = @"-()^$[]0123456789";
  55. #endregion
  56. #region Constructors
  /// <summary>
  /// Creates the phonet with an empty rule list and fills it from the given rule file.
  /// </summary>
  /// <param name="phonetFile">Path of the phonet rule file to load.</param>
  public AspellPhonet(string phonetFile)
  {
      this.rules = new List<PhonetRule>();
      this.loadPhonetFile(phonetFile);
  }
  66. #endregion
  67. #region IPhonet Members
  /// <summary>
  /// Collects every distinct character that occurs in the replacement text of
  /// any loaded rule (IPhonet member).
  /// </summary>
  /// <returns>
  /// The distinct replacement characters, or an empty array when no rule is loaded.
  /// </returns>
  public char[] GetReplaceList()
  {
      if (rules.Count == 0)
      {
          return new char[0];
      }

      HashSet<char> distinct = new HashSet<char>();
      foreach (PhonetRule rule in rules)
      {
          foreach (char c in rule.ReplaceResult)
          {
              distinct.Add(c);
          }
      }

      char[] replaceChars = new char[distinct.Count];
      distinct.CopyTo(replaceChars);
      return replaceChars;
  }
  /// <summary>
  /// Converts a word into its phonetic code by applying the loaded rules from
  /// left to right.
  /// </summary>
  /// <param name="OriginalWord">The word to encode; it is upper-cased before matching.</param>
  /// <returns>The phonetic code; empty when no rule contributes any output.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="OriginalWord"/> is null.</exception>
  public string Transform(string OriginalWord)
  {
      if (OriginalWord == null)
      {
          throw new ArgumentNullException("OriginalWord");
      }

      // Invariant casing keeps the result independent of the current culture.
      string wordStr = OriginalWord.ToUpperInvariant();
      StringBuilder target = new StringBuilder();
      int i = 0;

      // True right after a '<' rule rewrote the word at position i. The next rule
      // chosen at that same position is then applied as an ordinary rule, so a
      // rewrite cannot immediately trigger another rewrite at the same spot.
      bool rewroteAtThisPosition = false;

      while (i < wordStr.Length)
      {
          char lead = wordStr[i];
          bool applied = false;
          int advance = 1; // an unmatched character is simply skipped

          foreach (PhonetRule rule in rules)
          {
              if (!RuleStartsWith(rule, lead))
              {
                  continue;
              }

              MatchResult result = rule.IsMatching(wordStr, i);
              if (!result.Match)
              {
                  continue;
              }

              // A match that consumes at least two letters, has no '-' and does not
              // reach the end of the word yields to a follow-up rule that starts on
              // its last letter and has at least the same priority.
              if (followup
                  && result.K > 1
                  && rule.Reduce == 0
                  && i + result.K < wordStr.Length
                  && FollowupRuleWins(wordStr, i + result.K - 1, result.P))
              {
                  continue;
              }

              if (rule.Unknown && !rewroteAtThisPosition)
              {
                  // '<' rule: substitute the replacement into the word itself and
                  // match again at the same position.
                  int tailStart = Math.Min(i + result.K, wordStr.Length);
                  wordStr = wordStr.Substring(0, i) + rule.ReplaceResult + wordStr.Substring(tailStart);
                  rewroteAtThisPosition = true;
                  advance = 0;
              }
              else
              {
                  if (rule.ReplaceResult != "_") // "_" means: replace with nothing
                  {
                      target.Append(rule.ReplaceResult);
                  }

                  // Always move forward, even if the rule consumes no characters.
                  advance = Math.Max(result.K, 1);
                  rewroteAtThisPosition = false;
              }

              applied = true;
              break;
          }

          if (!applied)
          {
              rewroteAtThisPosition = false;
              advance = 1;
          }

          i += advance;
      }

      return target.ToString();
  }

  /// <summary>
  /// Tells whether the first match expression of a rule begins with the given character.
  /// </summary>
  private static bool RuleStartsWith(PhonetRule rule, char lead)
  {
      string[] expressions = rule.MatchExpression;
      return expressions != null
          && expressions.Length > 0
          && expressions[0].Length > 0
          && expressions[0][0] == lead;
  }

  /// <summary>
  /// Looks for a rule that matches at <paramref name="pos"/>, consumes a length
  /// other than one and has a priority of at least <paramref name="priority"/>.
  /// </summary>
  private bool FollowupRuleWins(string word, int pos, int priority)
  {
      char lead = word[pos];
      foreach (PhonetRule candidate in rules)
      {
          if (!RuleStartsWith(candidate, lead))
          {
              continue;
          }

          MatchResult follow = candidate.IsMatching(word, pos);
          if (!follow.Match)
          {
              continue;
          }

          if (follow.K == 1)
          {
              continue; // too short to override the current match
          }

          if (follow.P < priority)
          {
              continue; // priority too low
          }

          return true;
      }

      return false;
  }
  168. #endregion
  169. #region Structs for internal use
  /// <summary>
  /// Outcome of testing one rule against a position in a word.
  /// </summary>
  internal struct MatchResult
  {
      /// <summary>
      /// Creates a result with all four values set.
      /// </summary>
      /// <param name="k">Number of characters to be replaced.</param>
      /// <param name="p">Priority of the rule.</param>
      /// <param name="mindex">Index of the matched item in the rule's match expression array.</param>
      /// <param name="match">True when the rule matched.</param>
      public MatchResult(int k, int p, int mindex, bool match)
          : this()
      {
          K = k;
          P = p;
          MatchIndex = mindex;
          Match = match;
      }

      /// <summary>
      /// Index of the matched item in the rule's match expression array.
      /// </summary>
      public int MatchIndex { get; set; }

      /// <summary>
      /// Number of characters to be replaced ("k" in Aspell's naming).
      /// </summary>
      public int K { get; set; }

      /// <summary>
      /// Priority of the rule ("p" in Aspell's naming).
      /// </summary>
      public int P { get; set; }

      /// <summary>
      /// True when the rule matched, false otherwise.
      /// </summary>
      public bool Match { get; set; }
  }
  /// <summary>
  /// One phonetic rule: the alternative texts it matches, the part of each
  /// alternative that is consumed, the replacement text and the rule flags.
  /// </summary>
  internal struct PhonetRule
  {
      private readonly string[] matchExps;
      private readonly string[] toBeReplacedExp;
      private readonly string result;
      private readonly bool start;
      private readonly bool end;
      private readonly bool unknown;
      private readonly int priority;
      private readonly int reduce;

      /// <summary>
      /// Creates a rule from its already parsed parts.
      /// </summary>
      /// <param name="matchExps">Alternative texts the rule matches.</param>
      /// <param name="replaceExp">For each alternative, the part that is consumed on a match.</param>
      /// <param name="result">Replacement text.</param>
      /// <param name="start">True when the rule carries '^'.</param>
      /// <param name="end">True when the rule carries '$'.</param>
      /// <param name="unknown">True when the rule carries '&lt;'.</param>
      /// <param name="priority">Priority of the rule.</param>
      /// <param name="reduce">Number of '-' characters in the rule.</param>
      public PhonetRule(string[] matchExps, string[] replaceExp, string result,
          bool start, bool end, bool unknown, int priority, int reduce)
      {
          this.matchExps = matchExps;
          this.toBeReplacedExp = replaceExp;
          this.result = result;
          this.start = start;
          this.end = end;
          this.unknown = unknown;
          this.priority = priority;
          this.reduce = reduce;
      }

      /// <summary>
      /// Number of '-' characters in the rule: 1 for one '-', 2 for two, and so on.
      /// </summary>
      public int Reduce
      {
          get { return reduce; }
      }

      /// <summary>
      /// The alternative texts the rule matches.
      /// </summary>
      public string[] MatchExpression
      {
          get { return matchExps; }
      }

      /// <summary>
      /// For each alternative, the part that is consumed on a match.
      /// </summary>
      public string[] ToBeReplacedExpression
      {
          get { return toBeReplacedExp; }
      }

      /// <summary>
      /// The replacement text.
      /// </summary>
      public string ReplaceResult
      {
          get { return result; }
      }

      /// <summary>
      /// True when the rule carries '^'.
      /// </summary>
      public bool Start
      {
          get { return start; }
      }

      /// <summary>
      /// True when the rule carries '$'.
      /// </summary>
      public bool End
      {
          get { return end; }
      }

      /// <summary>
      /// True when the rule carries '&lt;'.
      /// </summary>
      public bool Unknown
      {
          get { return unknown; }
      }

      /// <summary>
      /// Priority of the rule.
      /// </summary>
      public int Priority
      {
          get { return priority; }
      }

      /// <summary>
      /// Builds a text representation of the rule for diagnostics.
      /// </summary>
      /// <returns>
      /// "[match -> consumed,...]priority To replacement".
      /// </returns>
      public override string ToString()
      {
          StringBuilder text = new StringBuilder("[");
          for (int i = 0; i < matchExps.Length; i++)
          {
              text.Append(matchExps[i]).Append(" -> ").Append(toBeReplacedExp[i]).Append(",");
          }

          text.Append("]").Append(priority.ToString());
          text.Append(" To ");
          text.Append(result);
          return text.ToString();
      }

      /// <summary>
      /// Tests whether the rule matches <paramref name="word"/> at <paramref name="startPos"/>.
      /// </summary>
      /// <param name="word">The word being encoded.</param>
      /// <param name="startPos">Position in the word at which the match must begin.</param>
      /// <returns>
      /// On a match: Match is true, K is the length of the consumed part, P the
      /// rule priority and MatchIndex the index of the matching alternative.
      /// Otherwise Match is false.
      /// </returns>
      public MatchResult IsMatching(string word, int startPos)
      {
          MatchResult outcome = new MatchResult(-1, -333, -1, false);
          string remainder = word.Length > startPos ? word.Substring(startPos) : "";

          // A '^' rule can only match at the very beginning of the word.
          if (start && startPos > 0)
          {
              return outcome;
          }

          for (int index = 0; index < matchExps.Length; index++)
          {
              string candidate = matchExps[index];

              // Ordinal comparison: rule text is matched character by character.
              if (!remainder.StartsWith(candidate, StringComparison.Ordinal))
              {
                  continue;
              }

              // A '$' rule must reach the end of the word, i.e. the alternative has
              // to be the whole remainder. Checking EndsWith alone would accept
              // "E" against "EVE".
              if (end && remainder.Length != candidate.Length)
              {
                  continue;
              }

              outcome.K = toBeReplacedExp[index].Length;
              outcome.P = priority;
              outcome.MatchIndex = index;
              outcome.Match = true;
              break;
          }

          return outcome;
      }
  }
  357. #endregion
  358. #region Private functions
  /// <summary>
  /// Reads the phonet rule file line by line and hands every line that is
  /// neither blank nor a '#' comment to BuildPhonetRule.
  /// </summary>
  /// <param name="phonetFile">Path of the phonet rule file.</param>
  /// <exception cref="FileNotFoundException">The file does not exist.</exception>
  void loadPhonetFile(string phonetFile)
  {
      if (!File.Exists(phonetFile))
      {
          throw new FileNotFoundException("Can't find the file " + phonetFile + "!", phonetFile);
      }

      // The using block closes the reader; I/O errors propagate to the caller
      // with their original stack trace.
      using (TextReader reader = new StreamReader(phonetFile))
      {
          string line;
          while ((line = reader.ReadLine()) != null)
          {
              line = line.Trim();

              // Skip blank lines and comment lines starting with '#'.
              if (line.Length == 0 || line[0] == '#')
              {
                  continue;
              }

              BuildPhonetRule(line);
          }
      }
  }
  /// <summary>
  /// Parses one line of the phonet rule file. Header lines (version, followup,
  /// collapse_result, alphabet) update the corresponding field; any other line
  /// is turned into a PhonetRule and appended to the rule list.
  /// </summary>
  /// <param name="line">
  /// A trimmed, non-comment line. The first whitespace-separated token is the
  /// key, the last one is the value; lines with fewer than two tokens are ignored.
  /// </param>
  /// <exception cref="FormatException">
  /// A followup or collapse_result value is not 1, 0, true or false.
  /// </exception>
  private void BuildPhonetRule(string line)
  {
      if (string.IsNullOrEmpty(line))
      {
          return;
      }

      string[] tokens = line.ToUpperInvariant()
          .Split(new char[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);
      if (tokens.Length < 2)
      {
          return;
      }

      string key = tokens[0];
      string value = tokens[tokens.Length - 1];

      if (key.Equals(IGNORED_KEYWORDS[0], StringComparison.OrdinalIgnoreCase)) // version
      {
          version = value;
          return;
      }

      if (key.Equals(IGNORED_KEYWORDS[1], StringComparison.OrdinalIgnoreCase)) // followup
      {
          followup = ParseFlag(value);
          return;
      }

      if (key.Equals(IGNORED_KEYWORDS[2], StringComparison.OrdinalIgnoreCase)) // collapse_result
      {
          collapse = ParseFlag(value);
          return;
      }

      // A different alphabet is used for this language: keep the characters
      // found between the alphabet markers.
      if (key.Equals(KEYWORD_ALPHBET, StringComparison.OrdinalIgnoreCase))
      {
          int open = value.IndexOf(ALPHABET_START);
          int close = value.LastIndexOf(ALPHABET_END);
          if (open >= 0 && close > open)
          {
              alphabets = value.Substring(open + 1, close - open - 1).ToCharArray();
          }

          return;
      }

      // Everything else is a replacement rule.
      char[] matchChars = key.ToCharArray();
      int pos = 0;

      // 1. Literal part: everything up to the first control character.
      StringBuilder stem = new StringBuilder();
      while (pos < matchChars.Length && !IsRuleControlChar(matchChars[pos]))
      {
          stem.Append(matchChars[pos]);
          pos++;
      }

      // 2. Optional "(...)" group: every character inside it forms one
      //    alternative together with the literal part.
      List<string> matchExps = new List<string>();
      List<string> replaceExps = new List<string>();
      bool withMulti = false;
      if (pos < matchChars.Length && matchChars[pos] == STARTMULTI)
      {
          withMulti = true;
          pos++;
          while (pos < matchChars.Length && matchChars[pos] != ENDMULTI)
          {
              string alternative = stem.ToString() + matchChars[pos];
              matchExps.Add(alternative);
              replaceExps.Add(alternative);
              pos++;
          }

          if (pos < matchChars.Length)
          {
              pos++; // skip the closing ')'
          }
      }

      if (!withMulti)
      {
          matchExps.Add(stem.ToString());
          replaceExps.Add(stem.ToString());
      }

      // 3. Flags, accepted in any order. Every character is consumed exactly
      //    once, so an unexpected character cannot stall the parser.
      int reduce = 0;
      int priority = 5; // default priority
      bool start = false, end = false, unknown = false;
      for (; pos < matchChars.Length; pos++)
      {
          char flag = matchChars[pos];
          if (flag == '-')
          {
              // Each '-' removes one trailing character from the consumed part.
              reduce++;
              for (int k = 0; k < replaceExps.Count; k++)
              {
                  if (replaceExps[k].Length > 0)
                  {
                      replaceExps[k] = replaceExps[k].Substring(0, replaceExps[k].Length - 1);
                  }
              }
          }
          else if (flag == '^')
          {
              start = true;
          }
          else if (flag == '$')
          {
              end = true;
          }
          else if (flag == '<')
          {
              unknown = true;
          }
          else if (flag >= '0' && flag <= '9')
          {
              priority = flag - '0';
          }
      }

      if (matchExps.Count > 0)
      {
          rules.Add(new PhonetRule(matchExps.ToArray(), replaceExps.ToArray(), value,
              start, end, unknown, priority, reduce));
      }
  }

  /// <summary>
  /// Tells whether a character ends the literal part of a match expression.
  /// '&lt;' is a rule flag as well, so it is treated like the reserved characters.
  /// </summary>
  private static bool IsRuleControlChar(char c)
  {
      return c == '<' || RESERVEDSTRING.IndexOf(c) >= 0;
  }

  /// <summary>
  /// Parses a header flag value: "1" and "0" are accepted in addition to the
  /// texts understood by bool.Parse.
  /// </summary>
  private static bool ParseFlag(string text)
  {
      if (text == "1")
      {
          return true;
      }

      if (text == "0")
      {
          return false;
      }

      return bool.Parse(text);
  }
  514. #endregion
  /// <summary>
  /// Debug helper: writes every loaded rule to the console. Calls are only
  /// compiled in when the TRACE_ON symbol is defined.
  /// </summary>
  [Conditional("TRACE_ON")]
  private void DumpRules()
  {
      for (int n = 0; n < rules.Count; n++)
      {
          PhonetRule rule = rules[n];
          Console.WriteLine("Rule NO: " + n.ToString());
          Console.WriteLine("Priority: " + rule.Priority.ToString() + " Start:" + rule.Start.ToString() +
              " End:" + rule.End.ToString() + " Unkow:" + rule.Unknown.ToString());

          // Show each matched alternative next to the part of it that is consumed.
          string[] matched = rule.MatchExpression;
          string[] consumed = rule.ToBeReplacedExpression;
          if (matched.Length == consumed.Length)
          {
              for (int i = 0; i < matched.Length; i++)
              {
                  Console.WriteLine(string.Format("{0,5} [{1,5}] ----> {2}", matched[i], consumed[i], rule.ReplaceResult));
              }
          }

          Console.WriteLine("---------------------------------------");
      }
  }
  542. }
  543. }