PageRenderTime 61ms CodeModel.GetById 31ms RepoModel.GetById 1ms app.codeStats 0ms

/Source/FacetedSearch3/FacetedSearch3/FacetedSearch3/Keywords/HdSearchOntologyHelper.cs

#
C# | 305 lines | 170 code | 35 blank | 100 comment | 14 complexity | fb9306fd4fd253e63ed3d5f3e610365a MD5 | raw file
Possible License(s): MIT
  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. using System.Linq;
  5. using System.Text;
  6. using System.Xml;
  7. namespace FacetedSearch3.Keywords
  8. {
  /// <summary>
  /// Loads the ontology and synonyms XML files and refines keyword lists against
  /// the hierarchy stored in the CUAHSI hydrologic ontology XML.
  /// </summary>
  public class HdSearchOntologyHelper
  {
      //todo: Copied from Search2. Need to be refactored.
      private static readonly string _ontologyFilename = Properties.Settings.Default.OntologyFilename;
      private static readonly string _ontologySynonymsFilename = Properties.Settings.Default.SynonymsFilename;

      /// <summary>
      /// Loads the ontology XML file named by the OntologyFilename application setting.
      /// </summary>
      /// <returns>The loaded XML document.</returns>
      public static XmlDocument ReadOntologyXmlFile()
      {
          return ReadXmlFile(_ontologyFilename);
      }

      /// <summary>
      /// Loads an ontology XML file by name.
      /// </summary>
      /// <param name="filename">File name, combined with the folder of the executing assembly.</param>
      /// <returns>The loaded XML document.</returns>
      public static XmlDocument ReadOntologyXmlFile(string filename)
      {
          return ReadXmlFile(filename);
      }

      /// <summary>
      /// Loads the XML file named by the SynonymsFilename application setting.
      /// </summary>
      /// <returns>The loaded XML document.</returns>
      public static XmlDocument ReadOntologySymbologyXmlFile()
      {
          return ReadXmlFile(_ontologySynonymsFilename);
      }

      /// <summary>
      /// Loads a synonyms XML file by name.
      /// </summary>
      /// <param name="filename">File name, combined with the folder of the executing assembly.</param>
      /// <returns>The loaded XML document.</returns>
      public static XmlDocument ReadOntologySymbologyXmlFile(string filename)
      {
          return ReadXmlFile(filename);
      }

      /// <summary>
      /// Loads an XML document from a file located relative to the executing assembly.
      /// </summary>
      /// <param name="filename">File name, combined with the folder of the executing assembly.</param>
      /// <returns>The loaded XML document.</returns>
      private static XmlDocument ReadXmlFile(string filename)
      {
          XmlDocument document = new XmlDocument();
          document.Load(GetPathInAssemblyFolder(filename));
          return document;
      }

      /// <summary>
      /// Combines the folder of the executing assembly with the given file name.
      /// </summary>
      /// <param name="filename">File name to resolve.</param>
      /// <returns>The combined path, as produced by Path.Combine.</returns>
      private static string GetPathInAssemblyFolder(string filename)
      {
          string assemblyFolder = Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location);
          return Path.Combine(assemblyFolder, filename);
      }

      #region Ontology Utilities

      /* For refactoring
       * we need a set of test cases:
       *   - top level terms, which should return an empty list
       *   - a couple of leaf nodes, which should be returned unchanged
       *   - terms that sit under another listed term, where only the top one should be retained
       *   - disjoint terms, which should all be returned
       *   - disjoint terms with a term or two under them
       *
       * Note this is a good method to test (Tim good job).
       * We could make a small sample ontology xml file to represent a subset of
       * the ontology and make a very small set of controlled use cases.
       */

      /// <summary>
      /// Modifies the input keyword list by removing redundant or otherwise unnecessary items for efficient searching.
      /// </summary>
      /// <remarks>
      /// Steps, in order: if any 1st tier keyword is present the list is cleared; duplicates are removed;
      /// keywords with no match in the ontology are removed; keywords that have an ancestor in the list are
      /// removed; each remaining 2nd tier keyword is replaced by its child keywords.
      /// </remarks>
      /// <param name="KeywordList">List of input keywords to refine. Modified in place.</param>
      /// <param name="OntologyXml">XML of the CUAHSI hydrologic ontology.</param>
      /// <exception cref="ArgumentNullException">Either argument is null.</exception>
      /// <exception cref="ArgumentException">The ontology document has no root element.</exception>
      public void RefineKeywordList(List<string> KeywordList, XmlDocument OntologyXml)
      {
          if (KeywordList == null)
          {
              throw new ArgumentNullException("KeywordList");
          }
          if (OntologyXml == null)
          {
              throw new ArgumentNullException("OntologyXml");
          }

          // If searching 1st tier keywords, clear the list.
          List<string> tier1Keywords = GetKeywordsAtTier(1, OntologyXml);
          if (tier1Keywords.Any(tier1Keyword => KeywordList.Contains(tier1Keyword)))
          {
              KeywordList.Clear();
              return;
          }

          // Remove repeated keywords, keeping the first occurrence of each.
          List<string> distinctKeywords = KeywordList.Distinct().ToList();
          if (distinctKeywords.Count != KeywordList.Count)
          {
              KeywordList.Clear();
              KeywordList.AddRange(distinctKeywords);
          }

          // Remove keywords that don't have a match in the ontology.
          RemoveUnmatchedKeywords(KeywordList, OntologyXml);

          // Remove keywords if their ancestors are also in the list.
          RemoveRedundantChildKeywords(KeywordList, OntologyXml);

          // Replace 2nd tier keywords with their 3rd tier child keywords.
          // 2nd tier keywords cannot be searched at HIS Central.
          foreach (string tier2Keyword in GetKeywordsAtTier(2, OntologyXml))
          {
              if (!KeywordList.Contains(tier2Keyword))
              {
                  continue;
              }

              RemoveAllFromList(KeywordList, tier2Keyword);

              foreach (string tier3Keyword in GetChildKeywords(tier2Keyword, OntologyXml))
              {
                  if (!KeywordList.Contains(tier3Keyword))
                  {
                      KeywordList.Add(tier3Keyword);
                  }
              }
          }
      }

      /// <summary>
      /// Gets all child keywords for the given keyword from the ontology XML.
      /// </summary>
      /// <param name="Keyword">The keyword for which child keywords are sought.</param>
      /// <param name="OntologyXml">XML of the CUAHSI hydrologic ontology.</param>
      /// <returns>List of child keywords for the given keyword from the ontology XML.</returns>
      private List<string> GetChildKeywords(string Keyword, XmlDocument OntologyXml)
      {
          XmlNamespaceManager nsmgr = CreateNamespaceManager(OntologyXml);

          // Direct children only: one childNodes level below the matching node.
          string xpathExpression = "//x:OntologyNode[x:keyword=" + ToXPathLiteral(Keyword) + "]/x:childNodes/x:OntologyNode/x:keyword";

          return NodeListToStringList(OntologyXml.SelectNodes(xpathExpression, nsmgr));
      }

      /// <summary>
      /// Gets keywords at a given tier within the hierarchical CUAHSI hydrologic ontology.
      /// </summary>
      /// <param name="Tier">The tier for which keywords are sought. The highest level is tier 1, the next level is tier 2, and so on.</param>
      /// <param name="OntologyXml">XML of the CUAHSI hydrologic ontology.</param>
      /// <returns>List of keywords at the given tier in the ontology XML.</returns>
      /// <exception cref="ArgumentOutOfRangeException">Tier is less than 1.</exception>
      private List<string> GetKeywordsAtTier(int Tier, XmlDocument OntologyXml)
      {
          // Validate inputs.
          if (Tier < 1)
          {
              throw new ArgumentOutOfRangeException("Tier", "Tier must be greater than or equal to 1");
          }

          XmlNamespaceManager nsmgr = CreateNamespaceManager(OntologyXml);

          // Build an absolute path that descends one OntologyNode/childNodes pair per tier below the first.
          StringBuilder expressionBuilder = new StringBuilder();
          for (int i = 2; i <= Tier; i++)
          {
              expressionBuilder.Append("/x:OntologyNode/x:childNodes");
          }
          expressionBuilder.Append("/x:OntologyNode/x:keyword");

          return NodeListToStringList(OntologyXml.SelectNodes(expressionBuilder.ToString(), nsmgr));
      }

      /// <summary>
      /// Gets all ancestor keywords (parent, grandparent, etc.) for the given keyword from the ontology XML.
      /// </summary>
      /// <param name="Keyword">The keyword for which ancestor keywords are sought.</param>
      /// <param name="OntologyXml">XML of the CUAHSI hydrologic ontology.</param>
      /// <returns>List of ancestor keywords for the given keyword from the ontology XML.</returns>
      private List<string> GetAncestorKeywords(string Keyword, XmlDocument OntologyXml)
      {
          XmlNamespaceManager nsmgr = CreateNamespaceManager(OntologyXml);

          string xpathExpression = "//x:OntologyNode[x:keyword=" + ToXPathLiteral(Keyword) + "]/ancestor::x:OntologyNode/x:keyword";

          return NodeListToStringList(OntologyXml.SelectNodes(xpathExpression, nsmgr));
      }

      /// <summary>
      /// Gets keyword nodes from the CUAHSI hydrologic ontology XML that match the given keyword.
      /// </summary>
      /// <param name="Keyword">The keyword for which keyword nodes are sought.</param>
      /// <param name="OntologyXml">XML of the CUAHSI hydrologic ontology.</param>
      /// <returns>Keyword nodes from the CUAHSI hydrologic ontology XML that match the given keyword.</returns>
      private XmlNodeList GetKeywordNodes(string Keyword, XmlDocument OntologyXml)
      {
          XmlNamespaceManager nsmgr = CreateNamespaceManager(OntologyXml);

          string xpathExpression = "//x:keyword[. = " + ToXPathLiteral(Keyword) + "]";

          return OntologyXml.SelectNodes(xpathExpression, nsmgr);
      }

      /// <summary>
      /// Modifies the input list by removing items whose ancestors from the Ontology XML also appear in the list.
      /// </summary>
      /// <param name="KeywordList">List of keywords for which redundant child keywords should be removed.</param>
      /// <param name="OntologyXml">XML of the CUAHSI hydrologic ontology.</param>
      private void RemoveRedundantChildKeywords(List<string> KeywordList, XmlDocument OntologyXml)
      {
          // Snapshot the list first: every keyword is judged against the list as it was on entry,
          // so removing one keyword never changes the verdict for another.
          HashSet<string> listedKeywords = new HashSet<string>(KeywordList);

          HashSet<string> keywordsToRemove = new HashSet<string>();
          foreach (string keyword in KeywordList)
          {
              List<string> ancestorKeywords = GetAncestorKeywords(keyword, OntologyXml);
              if (ancestorKeywords.Any(ancestor => listedKeywords.Contains(ancestor)))
              {
                  keywordsToRemove.Add(keyword);
              }
          }

          // Remove unnecessary keywords.
          KeywordList.RemoveAll(keyword => keywordsToRemove.Contains(keyword));
      }

      /// <summary>
      /// Modifies the input list by removing keywords that do not appear in the CUAHSI hydrologic Ontology.
      /// </summary>
      /// <param name="KeywordList">List of keywords from which unmatched keywords should be removed.</param>
      /// <param name="OntologyXml">XML of the CUAHSI hydrologic ontology.</param>
      private void RemoveUnmatchedKeywords(List<string> KeywordList, XmlDocument OntologyXml)
      {
          // The predicate only queries the ontology, never the list being modified.
          KeywordList.RemoveAll(keyword => GetKeywordNodes(keyword, OntologyXml).Count == 0);
      }

      /// <summary>
      /// Removes all occurrences of a specific string from the System.Collections.Generic.List.
      /// </summary>
      /// <param name="StringList">System.Collections.Generic.List of strings</param>
      /// <param name="Item">The item to remove from the list</param>
      private void RemoveAllFromList(List<string> StringList, string Item)
      {
          // Single pass instead of repeated Contains/Remove scans.
          StringList.RemoveAll(candidate => string.Equals(candidate, Item, StringComparison.Ordinal));
      }

      /// <summary>
      /// Creates a list of InnerText values from the input XML node list.
      /// </summary>
      /// <param name="NodeList">XML node list whose InnerText values will be added to a string list.</param>
      /// <returns>String list of InnerText values from the input XML list.</returns>
      private List<string> NodeListToStringList(XmlNodeList NodeList)
      {
          return NodeList.Cast<XmlNode>().Select(node => node.InnerText).ToList();
      }

      /// <summary>
      /// Creates a namespace manager that maps the prefix "x" to the namespace of the document's root element.
      /// </summary>
      /// <remarks>
      /// A namespace manager is needed to enable XPath searching. Otherwise, no results are returned if a
      /// namespace is present. This works even if no namespace is present.
      /// </remarks>
      /// <param name="OntologyXml">XML of the CUAHSI hydrologic ontology.</param>
      /// <returns>Namespace manager to pass to SelectNodes together with "x:"-prefixed expressions.</returns>
      /// <exception cref="ArgumentNullException">OntologyXml is null.</exception>
      /// <exception cref="ArgumentException">OntologyXml has no root element.</exception>
      private static XmlNamespaceManager CreateNamespaceManager(XmlDocument OntologyXml)
      {
          if (OntologyXml == null)
          {
              throw new ArgumentNullException("OntologyXml");
          }

          XmlElement root = OntologyXml.DocumentElement;
          if (root == null)
          {
              throw new ArgumentException("The ontology XML document has no root element.", "OntologyXml");
          }

          XmlNamespaceManager nsmgr = new XmlNamespaceManager(OntologyXml.NameTable);
          nsmgr.AddNamespace("x", root.NamespaceURI);
          return nsmgr;
      }

      /// <summary>
      /// Converts a string into an XPath 1.0 string literal that is safe to embed in an expression.
      /// </summary>
      /// <remarks>
      /// XPath 1.0 has no escape character inside string literals, so a value containing both quote
      /// characters has to be assembled with concat().
      /// </remarks>
      /// <param name="value">Text to embed. Null is treated as an empty string.</param>
      /// <returns>A quoted literal or a concat() call that evaluates to the value.</returns>
      private static string ToXPathLiteral(string value)
      {
          if (value == null)
          {
              value = string.Empty;
          }

          if (value.IndexOf('\'') < 0)
          {
              return "'" + value + "'";
          }

          if (value.IndexOf('"') < 0)
          {
              return "\"" + value + "\"";
          }

          // Both quote kinds present: split at each apostrophe and emit it as a double-quoted piece.
          return "concat('" + value.Replace("'", "', \"'\", '") + "')";
      }

      /// <summary>
      /// Gets the full path to the XML file storing the CUAHSI hydrologic ontology.
      /// </summary>
      /// <returns>The full path to the XML file storing the CUAHSI hydrologic ontology.</returns>
      private string GetOntologyFilePath()
      {
          // note for refactoring. load file on creation of object
          return GetPathInAssemblyFolder(_ontologyFilename);
      }

      #endregion
  }
  270. }