/demo_version_0.1/Mobwiz.SpellChecker.Core/Phonets/AspellPhonet.cs
C# | 624 lines | 410 code | 80 blank | 134 comment | 74 complexity | 192db82b1f1f2ed77ab50c2db7c5113a MD5 | raw file
- /********************** File Header *****************\
- File Name : <Aspell Phonet>
- Project Name : <SpellChecker>
- Author : <Mobwiz>
- Description : Phonet based on Aspell's phonet.
- Change log :
- ------------------------------------------------
- #1 2012-6-21
- #2
-
- **/
-
- namespace Mobwiz.SpellChecker.Core.Phonets
- {
- using System;
- using System.Collections.Generic;
- using System.Diagnostics;
- using System.IO;
- using System.Linq;
- using System.Text;
-
- public class AspellPhonet : IPhonet
- {
#region Fields

/// <summary>
/// Value of the "version" line of the rule file. It is assigned while loading;
/// nothing in this file reads it back.
/// </summary>
string version;

/// <summary>
/// Value of the "collapse_result" line of the rule file. It is assigned while
/// loading; nothing in this file reads it back.
/// </summary>
bool collapse = false;

/// <summary>
/// When true, Transform looks for a follow-up rule before it accepts a match
/// that covers more than one letter.
/// </summary>
bool followup = true;

/// <summary>
/// Characters taken from the "alphabet" line of the rule file (the text between
/// the '[' and ']' markers). Null until such a line has been read.
/// </summary>
char[] alphabets;

/// <summary>
/// The rules parsed from the phonet file, in file order.
/// </summary>
List<PhonetRule> rules;

/// <summary>
/// Keys that configure the transformer instead of defining a rule.
/// The rule parser relies on this exact order: version, followup, collapse_result.
/// </summary>
private static readonly string[] IGNORED_KEYWORDS = { "version", "followup", "collapse_result" };

/// <summary>
/// Key of the line that declares the alphabet of the language.
/// </summary>
private static readonly string KEYWORD_ALPHBET = "alphabet";

/// <summary>
/// The plain upper-case Latin alphabet.
/// </summary>
private static readonly char[] defaultEnglishAlphabet = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' };

/// <summary>
/// Marks the start of the character list in an "alphabet" line.
/// </summary>
private static readonly char ALPHABET_START = '[';

/// <summary>
/// Marks the end of the character list in an "alphabet" line.
/// </summary>
private static readonly char ALPHABET_END = ']';

/// <summary>
/// Starts a group of alternative characters that may follow the literal part
/// of a match expression.
/// </summary>
private static readonly char STARTMULTI = '(';

/// <summary>
/// Ends a group of alternative characters that may follow the literal part
/// of a match expression.
/// </summary>
private static readonly char ENDMULTI = ')';

/// <summary>
/// Characters that are rule syntax rather than literal text to match. The rule
/// parser stops collecting literal characters at the first of these.
/// '&lt;' is included because the parser treats it as a flag; without it the
/// character was swallowed into the literal match text and the flag was lost.
/// </summary>
private static readonly string RESERVEDSTRING = @"-()^$<[]0123456789";

#endregion
-
- #region Constructors
-
/// <summary>
/// Creates a transformer: starts with an empty rule list and then fills it
/// from the given phonet rule file.
/// </summary>
/// <param name="phonetFile">Path of the phonet rule file to load.</param>
public AspellPhonet(string phonetFile)
{
    // The list must exist before loading, because every parsed rule is appended to it.
    this.rules = new List<PhonetRule>();

    this.loadPhonetFile(phonetFile);
}
-
- #endregion
-
- #region IPhonet Members
-
/// <summary>
/// Collects every distinct character that occurs in the replacement text of
/// any loaded rule.
/// </summary>
/// <returns>
/// The distinct characters, or an empty array when no rule is loaded.
/// </returns>
public char[] GetReplaceList()
{
    if (rules.Count <= 0)
    {
        return new char[0];
    }

    HashSet<char> distinct = new HashSet<char>();
    foreach (PhonetRule rule in rules)
    {
        foreach (char symbol in rule.ReplaceResult.ToCharArray())
        {
            // Add ignores characters that are already present.
            distinct.Add(symbol);
        }
    }

    return distinct.ToArray();
}
-
/// <summary>
/// Converts a word into its phonetic code by walking the upper-cased word from
/// left to right and applying the first acceptable rule at each position.
/// </summary>
/// <param name="OriginalWord">The word to encode.</param>
/// <returns>
/// The concatenated replacement text of every applied rule. Positions at which
/// no rule applies contribute nothing to the result.
/// </returns>
/// <exception cref="ArgumentNullException">The word is null.</exception>
public string Transform(string OriginalWord)
{
    if (OriginalWord == null)
    {
        throw new ArgumentNullException("OriginalWord");
    }

    string wordStr = OriginalWord.ToUpper();
    StringBuilder target = new StringBuilder();
    int i = 0;

    while (i < wordStr.Length)
    {
        char current = wordStr[i];

        // How far the cursor moves after this position. It stays 1 unless a rule
        // is actually accepted, so a match that was rejected in favour of a
        // follow-up rule can no longer skip letters.
        int advance = 1;

        foreach (PhonetRule rule in rules)
        {
            if (!RuleStartsWith(rule, current))
            {
                continue;
            }

            MatchResult result = rule.IsMatching(wordStr, i);
            if (!result.Match)
            {
                continue;
            }

            // A match of two or more letters, from a rule without '-', that does
            // not reach the end of the word, yields to a follow-up rule that
            // starts on its last matched letter and has at least its priority.
            if (followup
                && result.K > 1
                && rule.Reduce == 0
                && i + result.K < wordStr.Length
                && HasWinningFollowup(wordStr, i + result.K - 1, result.P))
            {
                continue;
            }

            if (rule.Unknown)
            {
                // A '<' rule rewrites the word itself and the same position is
                // examined again, so the cursor does not move.
                // NOTE(review): a '<' rule whose replacement matches itself again
                // would repeat forever - confirm rule files cannot contain one.
                wordStr = wordStr.Substring(0, i) + rule.ReplaceResult + wordStr.Substring(i + result.K);
                advance = 0;
            }
            else
            {
                // "_" stands for "replace with nothing".
                if (rule.ReplaceResult != "_")
                {
                    target.Append(rule.ReplaceResult);
                }

                // Always move forward; a rule that consumes zero letters would
                // otherwise be applied at the same position without end.
                advance = Math.Max(result.K, 1);
            }

            break;
        }

        i += advance;
    }

    return target.ToString();
}

/// <summary>
/// Tells whether the first match expression of a rule begins with the given
/// character, compared by character value.
/// </summary>
private static bool RuleStartsWith(PhonetRule rule, char first)
{
    string head = rule.MatchExpression[0];
    return head.Length > 0 && head[0] == first;
}

/// <summary>
/// Tells whether some rule matching at <paramref name="pos"/> covers more than
/// one letter and has a priority of at least <paramref name="priority"/>.
/// </summary>
private bool HasWinningFollowup(string word, int pos, int priority)
{
    char lead = word[pos];

    foreach (PhonetRule candidate in rules)
    {
        if (!RuleStartsWith(candidate, lead))
        {
            continue;
        }

        MatchResult follow = candidate.IsMatching(word, pos);
        if (!follow.Match)
        {
            continue;
        }

        if (follow.K == 1)
        {
            continue; // too short to count as a follow-up
        }

        if (follow.P < priority)
        {
            continue; // priority too low
        }

        return true;
    }

    return false;
}
-
- #endregion
-
- #region Structs for internal use
-
/// <summary>
/// Outcome of testing one rule against a word at one position.
/// </summary>
internal struct MatchResult
{
    /// <summary>
    /// Creates a result with all four values set.
    /// </summary>
    /// <param name="k">Number of letters to replace.</param>
    /// <param name="p">Priority of the rule.</param>
    /// <param name="mindex">Index of the matched entry in the rule's match expressions.</param>
    /// <param name="match">True when the rule matched.</param>
    public MatchResult(int k, int p, int mindex, bool match)
        : this()
    {
        K = k;
        P = p;
        MatchIndex = mindex;
        Match = match;
    }

    /// <summary>
    /// Index of the matched entry in the rule's match expression array.
    /// </summary>
    public int MatchIndex { get; set; }

    /// <summary>
    /// Number of letters to be replaced (named "k" after the Aspell source).
    /// </summary>
    public int K { get; set; }

    /// <summary>
    /// Priority of the rule (named "p" after the Aspell source).
    /// </summary>
    public int P { get; set; }

    /// <summary>
    /// True when the rule matched, false otherwise.
    /// </summary>
    public bool Match { get; set; }
}
-
/// <summary>
/// One phonet rule: a set of alternative match expressions, the part of each
/// that is replaced, the replacement text, and the rule's flags.
/// </summary>
internal struct PhonetRule
{
    readonly string[] matchExps;
    readonly string[] toBeReplacedExp;
    readonly string result;
    readonly bool start;
    readonly bool end;
    readonly bool unknown;
    readonly int priority;
    readonly int reduce;

    /// <summary>
    /// Creates a rule from its already parsed parts.
    /// </summary>
    /// <param name="matchExps">Alternative texts the rule matches.</param>
    /// <param name="replaceExp">For each match text, the part of it that gets replaced.</param>
    /// <param name="result">The replacement text.</param>
    /// <param name="start">True when the rule carries '^'.</param>
    /// <param name="end">True when the rule carries '$'.</param>
    /// <param name="unknown">True when the rule carries '&lt;'.</param>
    /// <param name="priority">Priority of the rule.</param>
    /// <param name="reduce">Number of '-' characters the rule carries.</param>
    public PhonetRule(string[] matchExps, string[] replaceExp, string result,
        bool start, bool end, bool unknown, int priority, int reduce)
    {
        this.matchExps = matchExps;
        this.toBeReplacedExp = replaceExp;
        this.result = result;
        this.start = start;
        this.end = end;
        this.unknown = unknown;
        this.priority = priority;
        this.reduce = reduce;
    }

    /// <summary>
    /// Number of '-' characters in the rule: 1 for one '-', 2 for two, and so on.
    /// </summary>
    public int Reduce
    {
        get { return reduce; }
    }

    /// <summary>
    /// The alternative texts this rule matches.
    /// </summary>
    public string[] MatchExpression
    {
        get { return matchExps; }
    }

    /// <summary>
    /// For each match text, the part of it that gets replaced.
    /// </summary>
    public string[] ToBeReplacedExpression
    {
        get { return toBeReplacedExp; }
    }

    /// <summary>
    /// The replacement text.
    /// </summary>
    public string ReplaceResult
    {
        get { return result; }
    }

    /// <summary>
    /// True when the rule carries '^'.
    /// </summary>
    public bool Start
    {
        get { return start; }
    }

    /// <summary>
    /// True when the rule carries '$'.
    /// </summary>
    public bool End
    {
        get { return end; }
    }

    /// <summary>
    /// True when the rule carries '&lt;'.
    /// </summary>
    public bool Unknown
    {
        get { return unknown; }
    }

    /// <summary>
    /// Priority of the rule.
    /// </summary>
    public int Priority
    {
        get { return priority; }
    }

    /// <summary>
    /// Renders the rule as "[match -> replaced,...]priority To result".
    /// </summary>
    /// <returns>The text form of the rule.</returns>
    public override string ToString()
    {
        StringBuilder text = new StringBuilder("[");
        for (int i = 0; i < matchExps.Length; i++)
        {
            text.Append(matchExps[i]).Append(" -> ").Append(toBeReplacedExp[i]).Append(",");
        }

        text.Append("]").Append(priority.ToString());
        text.Append(" To ");
        text.Append(result);
        return text.ToString();
    }

    /// <summary>
    /// Tests whether this rule matches the word at the given position.
    /// </summary>
    /// <param name="word">The word being transformed.</param>
    /// <param name="startPos">Position in the word at which the match must begin.</param>
    /// <returns>
    /// A result with Match set to true, K set to the length of the replaced
    /// part, P set to the rule priority and MatchIndex set to the index of the
    /// first matching expression; or a non-matching result (-1, -333, -1, false).
    /// </returns>
    public MatchResult IsMatching(string word, int startPos)
    {
        MatchResult outcome = new MatchResult(-1, -333, -1, false);

        // A '^' rule can only apply at the very first letter of the word.
        if (start && startPos > 0)
        {
            return outcome;
        }

        string remainder = word.Length > startPos ? word.Substring(startPos) : "";

        for (int index = 0; index < matchExps.Length; index++)
        {
            string candidate = matchExps[index];

            // Ordinal comparison: rule text is matched character by character,
            // independent of the current culture.
            if (!remainder.StartsWith(candidate, StringComparison.Ordinal))
            {
                continue;
            }

            // A '$' rule must consume the word up to its end. Testing EndsWith
            // alone also accepted words that merely end in the same letters
            // (for example "ABAB" for the rule "AB$" at position 0).
            if (end && remainder.Length != candidate.Length)
            {
                continue;
            }

            outcome.K = toBeReplacedExp[index].Length;
            outcome.P = priority;
            outcome.MatchIndex = index;
            outcome.Match = true;
            break;
        }

        return outcome;
    }
}
-
- #endregion
-
- #region Private functions
-
/// <summary>
/// Reads the phonet file line by line and hands every line that is neither
/// empty nor a '#' comment to BuildPhonetRule.
/// </summary>
/// <param name="phonetFile">Path of the phonet rule file.</param>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
void loadPhonetFile(string phonetFile)
{
    if (!File.Exists(phonetFile))
    {
        // FileNotFoundException still derives from Exception, so callers that
        // catch Exception keep working. The message no longer ends in a literal
        // backslash-n, which the verbatim string used to produce.
        throw new FileNotFoundException("Can't find the file " + phonetFile + "!", phonetFile);
    }

    // The using statement closes and disposes the reader. I/O errors propagate
    // unchanged, with their original stack trace.
    using (TextReader reader = new StreamReader(phonetFile))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            line = line.Trim();

            if (line.Length == 0 || line.StartsWith("#", StringComparison.Ordinal))
            {
                continue;
            }

            BuildPhonetRule(line);
        }
    }
}
-
/// <summary>
/// Interprets one line of the phonet file. The first token is the key and the
/// last token is the value; a line with fewer than two tokens is ignored.
/// A "version", "followup", "collapse_result" or "alphabet" key updates the
/// corresponding field; any other key is parsed as a rule and appended to the
/// rule list.
/// </summary>
/// <param name="line">A trimmed, non-comment line from the phonet file.</param>
/// <exception cref="FormatException">
/// The value of "followup" or "collapse_result" is not 1, 0, true or false.
/// </exception>
private void BuildPhonetRule(string line)
{
    // A null separator splits on any white space, so tab-separated files and
    // lines without any separator are handled instead of throwing.
    string[] tokens = line.ToUpper().Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
    if (tokens.Length < 2)
    {
        return;
    }

    string key = tokens[0];
    string value = tokens[tokens.Length - 1];

    if (key.Equals(IGNORED_KEYWORDS[0], StringComparison.OrdinalIgnoreCase)) // version
    {
        version = value;
        return;
    }

    if (key.Equals(IGNORED_KEYWORDS[1], StringComparison.OrdinalIgnoreCase)) // followup
    {
        followup = ParseFlag(value);
        return;
    }

    if (key.Equals(IGNORED_KEYWORDS[2], StringComparison.OrdinalIgnoreCase)) // collapse_result
    {
        collapse = ParseFlag(value);
        return;
    }

    if (key.Equals(KEYWORD_ALPHBET, StringComparison.OrdinalIgnoreCase))
    {
        int open = value.IndexOf(ALPHABET_START);
        int close = value.LastIndexOf(ALPHABET_END);

        // A value whose markers are missing or out of order is ignored instead
        // of throwing.
        if (close > open)
        {
            alphabets = value.Substring(open + 1, close - open - 1).ToCharArray();
        }

        return;
    }

    // ---- an ordinary rule ----
    int pos = 0;

    // 1. Literal text, up to the first syntax character.
    StringBuilder literal = new StringBuilder();
    while (pos < key.Length && !IsRuleSyntax(key[pos]))
    {
        literal.Append(key[pos]);
        pos++;
    }

    // 2. Optional group "(XYZ)": one alternative per character inside it.
    List<string> matchExps = new List<string>();
    List<string> replaceExps = new List<string>();

    if (pos < key.Length && key[pos] == STARTMULTI)
    {
        pos++;
        while (pos < key.Length && key[pos] != ENDMULTI)
        {
            string alternative = literal.ToString() + key[pos];
            matchExps.Add(alternative);
            replaceExps.Add(alternative);
            pos++;
        }

        pos++; // step over the closing marker (or past the end if it is missing)
    }
    else
    {
        matchExps.Add(literal.ToString());
        replaceExps.Add(literal.ToString());
    }

    // 3. Flags. Each character is examined exactly once, so the parser always
    //    terminates; characters that are not flags are ignored.
    int reduce = 0;
    int priority = 5; // default priority
    bool start = false, end = false, unknown = false;

    for (; pos < key.Length; pos++)
    {
        char flag = key[pos];

        if (flag == '-')
        {
            // Each '-' takes one trailing letter out of the replaced part.
            reduce++;
            for (int k = 0; k < replaceExps.Count; k++)
            {
                if (replaceExps[k].Length > 0)
                {
                    replaceExps[k] = replaceExps[k].Substring(0, replaceExps[k].Length - 1);
                }
            }
        }
        else if (flag == '^')
        {
            start = true;
        }
        else if (flag == '$')
        {
            end = true;
        }
        else if (flag == '<')
        {
            unknown = true;
        }
        else if (char.IsDigit(flag))
        {
            priority = (int)(flag - '0');
        }
    }

    // An empty group "()" yields no alternatives and therefore no rule.
    if (matchExps.Count > 0)
    {
        rules.Add(new PhonetRule(matchExps.ToArray(), replaceExps.ToArray(), value,
            start, end, unknown, priority, reduce));
    }
}

/// <summary>
/// Tells whether a character of a rule key is syntax rather than literal text.
/// '&lt;' is checked explicitly so that it is recognised as a flag here whether
/// or not the reserved-character list contains it.
/// </summary>
private static bool IsRuleSyntax(char c)
{
    return c == '<' || RESERVEDSTRING.IndexOf(c) >= 0;
}

/// <summary>
/// Reads an on/off setting: "1" and "0" are accepted in addition to the
/// spellings bool.Parse understands.
/// </summary>
/// <exception cref="FormatException">The text is none of the accepted spellings.</exception>
private static bool ParseFlag(string text)
{
    if (text == "1")
    {
        return true;
    }

    if (text == "0")
    {
        return false;
    }

    return bool.Parse(text);
}
-
- #endregion
-
/// <summary>
/// Debugging aid: writes every loaded rule to the console, one line per
/// alternative in the form "match [replaced] ----> result". Calls are only
/// compiled in when the TRACE_ON symbol is defined.
/// </summary>
[Conditional("TRACE_ON")]
private void DumpRules()
{
    int n = 0;
    foreach (PhonetRule rule in rules)
    {
        Console.WriteLine("Rule NO: " + n.ToString());
        Console.WriteLine("Priority: " + rule.Priority.ToString() + " Start:" + rule.Start.ToString() +
            " End:" + rule.End.ToString() + " Unkow:" + rule.Unknown.ToString());

        string[] matches = rule.MatchExpression;
        // The second column is the replaced part; it used to repeat the match text.
        string[] replaced = rule.ToBeReplacedExpression;

        if (matches.Length == replaced.Length)
        {
            for (int i = 0; i < matches.Length; i++)
            {
                Console.WriteLine(string.Format("{0,5} [{1,5}] ----> {2}", matches[i], replaced[i], rule.ReplaceResult));
            }
        }

        Console.WriteLine("---------------------------------------");
        n++;
    }
}
- }
- }