PageRenderTime 48ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/Main/Libraries/NHunspell/MyThes.cs

#
C# | 481 lines | 297 code | 72 blank | 112 comment | 49 complexity | be6108038a57869fff54db0729d1c756 MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception
  1. // --------------------------------------------------------------------------------------------------------------------
  2. // <copyright file="MyThes.cs" company="Maierhofer Software and the Hunspell Developers">
  3. // (c) by Maierhofer Software an the Hunspell Developers
  4. // </copyright>
  5. // <summary>
  6. // provides thesaurus functions to get synonyms for a word
  7. // </summary>
  8. // --------------------------------------------------------------------------------------------------------------------
  9. namespace NHunspell
  10. {
  11. using System;
  12. using System.Collections.Generic;
  13. using System.IO;
  14. using System.Text;
  15. /// <summary>
  16. /// provides thesaurus functions to get synonyms for a word
  17. /// </summary>
  18. public class MyThes
  19. {
  20. #region Constants and Fields
  21. /// <summary>
  22. /// The dictionary lock.
  23. /// </summary>
  24. private readonly object dictionaryLock = new object();
  25. /// <summary>
  26. /// The synonyms.
  27. /// </summary>
  28. private readonly Dictionary<string, ThesMeaning[]> synonyms = new Dictionary<string, ThesMeaning[]>();
  29. #endregion
  30. #region Constructors and Destructors
  /// <summary>
  /// Creates an empty thesaurus. Nothing is loaded; use one of the <c>Load</c> overloads
  /// before calling <c>Lookup</c>, which throws while no entries are present.
  /// </summary>
  public MyThes()
  {
  }

  /// <summary>
  /// Creates a thesaurus and immediately loads it from an in-memory dictionary.
  /// </summary>
  /// <param name="datBytes">
  /// The complete content of the thesaurus dictionary (.dat) as raw bytes.
  /// </param>
  public MyThes(byte[] datBytes)
  {
      this.Load(datBytes);
  }

  /// <summary>
  /// Creates a thesaurus and immediately loads it from a dictionary file.
  /// </summary>
  /// <param name="datFile">
  /// The path of the thesaurus dictionary (.dat) file.
  /// </param>
  public MyThes(string datFile)
  {
      this.Load(datFile);
  }

  /// <summary>
  /// Creates a thesaurus and immediately loads it from a dictionary file.
  /// </summary>
  /// <param name="idxFile">
  /// The thesaurus idx file. This argument is ignored.
  /// </param>
  /// <param name="datFile">
  /// The path of the thesaurus dictionary (.dat) file.
  /// </param>
  /// <remarks>
  /// Obsolete: the idx file is no longer needed because <see cref="MyThes"/> keeps the whole
  /// dictionary in memory. This overload simply forwards to the single-path constructor.
  /// </remarks>
  [Obsolete("idx File is not longer needed, MyThes works completely in memory")]
  public MyThes(string idxFile, string datFile)
      : this(datFile)
  {
  }
  74. #endregion
  75. #region Public Methods
  /// <summary>
  /// Maps the encoding name found in a dictionary file to the matching .NET <see cref="Encoding"/>.
  /// </summary>
  /// <param name="encoding">
  /// The encoding name as written in the dictionary (for example "UTF-8" or "ISO8859-1").
  /// Surrounding white space and letter case are ignored.
  /// </param>
  /// <returns>
  /// The <see cref="Encoding"/> obtained from <see cref="Encoding.GetEncoding(int)"/> for the mapped code page.
  /// </returns>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="encoding"/> is null.
  /// </exception>
  /// <exception cref="NotSupportedException">
  /// The name is not one of the names known to this method.
  /// </exception>
  public static Encoding GetEncoding(string encoding)
  {
      if (encoding == null)
      {
          throw new ArgumentNullException("encoding");
      }

      // Encoding names are identifiers, not linguistic text: lower-case them culture-invariantly.
      // With the current culture set to Turkish, ToLower() turns 'I' into a dotless 'ı' and
      // "ISO8859-1" would no longer match any case label below.
      encoding = encoding.Trim().ToLowerInvariant();

      int codePage;
      switch (encoding)
      {
          case "utf-8":
          case "utf8":
              codePage = 65001;
              break;
          case "iso8859-1":
          case "iso-8859-1":
              codePage = 28591;
              break;
          case "iso8859-2":
          case "iso-8859-2":
              codePage = 28592;
              break;
          case "iso8859-3":
          case "iso-8859-3":
              codePage = 28593;
              break;
          case "iso8859-4":
          case "iso-8859-4":
              codePage = 28594;
              break;
          case "iso8859-5":
          case "iso-8859-5":
              codePage = 28595;
              break;
          case "iso8859-6":
          case "iso-8859-6":
              codePage = 28596;
              break;
          case "iso8859-7":
          case "iso-8859-7":
              codePage = 28597;
              break;
          case "iso8859-8":
          case "iso-8859-8":
              codePage = 28598;
              break;
          case "iso8859-9":
          case "iso-8859-9":
              codePage = 28599;
              break;
          case "iso8859-13":
          case "iso-8859-13":
              codePage = 28603;
              break;
          case "iso8859-15":
          case "iso-8859-15":
              codePage = 28605;
              break;
          case "windows-1250":
          case "microsoft-cp1250":
              codePage = 1250;
              break;
          case "windows-1251":
          case "microsoft-cp1251":
              codePage = 1251;
              break;
          case "windows-1252":
          case "microsoft-cp1252":
              codePage = 1252;
              break;
          case "windows-1253":
          case "microsoft-cp1253":
              codePage = 1253;
              break;
          case "windows-1254":
          case "microsoft-cp1254":
              codePage = 1254;
              break;
          case "windows-1255":
          case "microsoft-cp1255":
              codePage = 1255;
              break;
          case "windows-1256":
          case "microsoft-cp1256":
              codePage = 1256;
              break;
          case "windows-1257":
          case "microsoft-cp1257":
              codePage = 1257;
              break;
          case "windows-1258":
          case "microsoft-cp1258":
              codePage = 1258;
              break;
          case "windows-1259":
          case "microsoft-cp1259":
              // NOTE(review): 1259 does not appear in the documented .NET code page list,
              // so the lookup below will probably fail for this name - confirm.
              codePage = 1259;
              break;
          case "koi8-r":
              codePage = 20866;
              break;
          case "koi8-u":
              // KOI8-U has its own code page; 20866 is KOI8-R and decodes the
              // Ukrainian-specific letters incorrectly.
              codePage = 21866;
              break;
          default:
              throw new NotSupportedException("Encoding: " + encoding + " is not supported");
      }

      return Encoding.GetEncoding(codePage);
  }
  /// <summary>
  /// Loads the thesaurus from a dictionary held in memory.
  /// </summary>
  /// <remarks>
  /// The first line is read as ASCII and names the encoding of the remaining lines.
  /// Every following non-empty line is split at '|'. A line whose first token starts with "-"
  /// or is enclosed in parentheses adds a meaning to the current word: the tokens after the
  /// first become the synonyms and the first of them also serves as the description.
  /// Any other line starts a new word, named by its first token.
  /// Words are stored under their lower-cased form; when a word occurs more than once,
  /// only its first occurrence is kept.
  /// </remarks>
  /// <param name="dictionaryBytes">
  /// The complete content of the thesaurus dictionary as raw bytes.
  /// </param>
  /// <exception cref="InvalidOperationException">
  /// Entries have already been loaded into this instance.
  /// </exception>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="dictionaryBytes"/> is null.
  /// </exception>
  /// <exception cref="NotSupportedException">
  /// The encoding named on the first line is not supported.
  /// </exception>
  public void Load(byte[] dictionaryBytes)
  {
      if (this.synonyms.Count > 0)
      {
          throw new InvalidOperationException("Thesaurus already loaded");
      }

      if (dictionaryBytes == null)
      {
          throw new ArgumentNullException("dictionaryBytes");
      }

      // Header line: the name of the encoding used by the rest of the data.
      int currentPos = 0;
      int currentLength = this.GetLineLength(dictionaryBytes, currentPos);
      Encoding enc = GetEncoding(Encoding.ASCII.GetString(dictionaryBytes, currentPos, currentLength));
      currentPos += currentLength;

      string word = string.Empty;
      var meanings = new List<ThesMeaning>();

      while (currentPos < dictionaryBytes.Length)
      {
          // currentPos rests on the line break that ended the previous line; step over it.
          currentPos += this.GetCrLfLength(dictionaryBytes, currentPos);
          currentLength = this.GetLineLength(dictionaryBytes, currentPos);
          string lineText = enc.GetString(dictionaryBytes, currentPos, currentLength).Trim();
          currentPos += currentLength;

          if (lineText.Length == 0)
          {
              continue;
          }

          string[] tokens = lineText.Split('|');
          string head = tokens[0];

          // The markers are plain ASCII characters, so compare ordinally rather than by culture.
          bool meaningLine = head.StartsWith("-", StringComparison.Ordinal)
              || (head.StartsWith("(", StringComparison.Ordinal) && head.EndsWith(")", StringComparison.Ordinal));

          if (!meaningLine)
          {
              // A new word begins: store what was collected for the previous one.
              this.StoreEntry(word, meanings);
              meanings = new List<ThesMeaning>();
              word = head;
              continue;
          }

          var currentSynonyms = new List<string>();
          for (int tokIndex = 1; tokIndex < tokens.Length; ++tokIndex)
          {
              currentSynonyms.Add(tokens[tokIndex]);
          }

          // The first synonym doubles as the description; it stays null when the line has none.
          string description = tokens.Length > 1 ? tokens[1] : null;
          meanings.Add(new ThesMeaning(description, currentSynonyms));
      }

      // The last word has no following word line to trigger its storage.
      this.StoreEntry(word, meanings);
  }

  /// <summary>
  /// Stores the meanings of a word under its lower-cased form unless the word is empty
  /// or an entry for it already exists.
  /// </summary>
  /// <param name="word">
  /// The word as read from the dictionary; an empty string is ignored.
  /// </param>
  /// <param name="meanings">
  /// The meanings collected for the word.
  /// </param>
  private void StoreEntry(string word, List<ThesMeaning> meanings)
  {
      if (word.Length == 0)
      {
          return;
      }

      // Keys are built with ToLower() because the lookup side lower-cases the same way;
      // both sides must agree for a word to be found.
      string key = word.ToLower();
      lock (this.dictionaryLock)
      {
          if (!this.synonyms.ContainsKey(key))
          {
              this.synonyms.Add(key, meanings.ToArray());
          }
      }
  }
  /// <summary>
  /// Loads the thesaurus from the specified dictionary file.
  /// </summary>
  /// <remarks>
  /// The whole file is read into memory and then handed to the byte array overload of Load.
  /// </remarks>
  /// <param name="dictionaryFile">
  /// The path of the dictionary (.dat) file; relative paths are resolved to a full path first.
  /// </param>
  /// <exception cref="FileNotFoundException">
  /// No file exists at the resolved path.
  /// </exception>
  public void Load(string dictionaryFile)
  {
      dictionaryFile = Path.GetFullPath(dictionaryFile);
      if (!File.Exists(dictionaryFile))
      {
          // Also pass the path as FileName so callers can read it without parsing the message.
          throw new FileNotFoundException("DAT File not found: " + dictionaryFile, dictionaryFile);
      }

      // File.ReadAllBytes replaces the manual FileStream/BinaryReader read and its
      // narrowing (int) cast of the stream length.
      byte[] dictionaryData = File.ReadAllBytes(dictionaryFile);
      this.Load(dictionaryData);
  }
  /// <summary>
  /// Looks up the meanings stored for a word.
  /// </summary>
  /// <param name="word">
  /// The word to look up; it is lower-cased before the search.
  /// </param>
  /// <returns>
  /// A <see cref="ThesResult"/> holding a copy of the stored meanings,
  /// or null when the thesaurus has no entry for the word.
  /// </returns>
  /// <exception cref="InvalidOperationException">
  /// No thesaurus entries have been loaded.
  /// </exception>
  public ThesResult Lookup(string word)
  {
      if (this.synonyms.Count == 0)
      {
          throw new InvalidOperationException("Thesaurus not loaded");
      }

      string key = word.ToLower();

      ThesMeaning[] stored;
      bool known;
      lock (this.dictionaryLock)
      {
          known = this.synonyms.TryGetValue(key, out stored);
      }

      if (!known)
      {
          return null;
      }

      // The stored array is copied into a new list for the result.
      return new ThesResult(new List<ThesMeaning>(stored), false);
  }
  /// <summary>
  /// Looks up a word and, when it has no entry of its own, falls back to its stems.
  /// </summary>
  /// <remarks>
  /// A direct hit is returned unchanged. Otherwise each stem reported by
  /// <paramref name="stemming"/> is looked up, and every synonym found is passed to
  /// <c>Generate</c> together with the original word (presumably to inflect the synonym
  /// like the word - confirm against the <see cref="Hunspell"/> documentation).
  /// Meanings that end up without any generated synonym are dropped.
  /// </remarks>
  /// <param name="word">
  /// The word to look up.
  /// </param>
  /// <param name="stemming">
  /// The <see cref="Hunspell"/> object used for stemming and generation.
  /// It is only used when the direct lookup finds nothing.
  /// </param>
  /// <returns>
  /// The direct result when the word has an entry; otherwise a result built from the stems
  /// (constructed with <c>true</c> as its second argument), or null when nothing was found.
  /// </returns>
  /// <exception cref="InvalidOperationException">
  /// No thesaurus entries have been loaded.
  /// </exception>
  public ThesResult Lookup(string word, Hunspell stemming)
  {
      if (this.synonyms.Count == 0)
      {
          throw new InvalidOperationException("Thesaurus not loaded");
      }

      ThesResult direct = this.Lookup(word);
      if (direct != null)
      {
          return direct;
      }

      List<string> stems = stemming.Stem(word);
      if (stems == null || stems.Count == 0)
      {
          return null;
      }

      var generatedMeanings = new List<ThesMeaning>();
      foreach (string stem in stems)
      {
          ThesResult stemResult = this.Lookup(stem);
          if (stemResult == null)
          {
              continue;
          }

          foreach (ThesMeaning stemMeaning in stemResult.Meanings)
          {
              var generatedForms = new List<string>();
              foreach (string synonym in stemMeaning.Synonyms)
              {
                  foreach (string form in stemming.Generate(synonym, word))
                  {
                      generatedForms.Add(form);
                  }
              }

              // Keep the meaning only if at least one synonym could be generated.
              if (generatedForms.Count > 0)
              {
                  generatedMeanings.Add(new ThesMeaning(stemMeaning.Description, generatedForms));
              }
          }
      }

      if (generatedMeanings.Count == 0)
      {
          return null;
      }

      return new ThesResult(generatedMeanings, true);
  }
  348. #endregion
  349. #region Methods
  /// <summary>
  /// Gets the length in bytes of the line break located at the given position.
  /// </summary>
  /// <param name="buffer">
  /// The buffer containing the text.
  /// </param>
  /// <param name="pos">
  /// The index of the first byte of the line break; it must point to a CR (13) or LF (10).
  /// </param>
  /// <returns>
  /// 2 when the byte at <paramref name="pos"/> is directly followed by its counterpart
  /// (CR LF or LF CR), otherwise 1.
  /// </returns>
  /// <exception cref="ArgumentException">
  /// The byte at <paramref name="pos"/> is neither CR nor LF.
  /// </exception>
  private int GetCrLfLength(byte[] buffer, int pos)
  {
      const byte LineFeed = 10;
      const byte CarriageReturn = 13;

      byte first = buffer[pos];
      if (first != LineFeed && first != CarriageReturn)
      {
          throw new ArgumentException("buffer[pos] dosen't point to CR or LF");
      }

      // A two-byte break is the first byte followed by the *other* line break character.
      // The byte to inspect is the one after pos; comparing buffer[pos] again can never match.
      byte counterpart = first == LineFeed ? CarriageReturn : LineFeed;
      bool twoByteBreak = pos + 1 < buffer.Length && buffer[pos + 1] == counterpart;

      return twoByteBreak ? 2 : 1;
  }
  /// <summary>
  /// Measures the line that begins at the given position.
  /// </summary>
  /// <param name="buffer">
  /// The buffer containing the text.
  /// </param>
  /// <param name="start">
  /// The index of the first byte of the line.
  /// </param>
  /// <returns>
  /// The number of bytes from <paramref name="start"/> up to, but not including, the next
  /// CR (13) or LF (10); when no line break follows, the number of bytes left in the buffer.
  /// </returns>
  private int GetLineLength(byte[] buffer, int start)
  {
      int length = 0;
      while (start + length < buffer.Length)
      {
          byte current = buffer[start + length];
          if (current == 10 || current == 13)
          {
              return length;
          }

          ++length;
      }

      // No line break found: the line extends to the end of the buffer.
      return buffer.Length - start;
  }
  407. #endregion
  408. }
  409. }