PageRenderTime 48ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/Sources/DevRain.Data.Extracting.Features/ContentAnalyzer.cs

#
C# | 309 lines | 221 code | 45 blank | 43 comment | 34 complexity | ba143dcc0150ea2120ca5786e63a5e66 MD5 | raw file
  1. // ============================================================================
  2. // <copyright file="ContentAnalyzer.cs" company="DevRain">
  3. // Copyright (c) DevRain 2011. All rights reserved.
  4. // </copyright>
  5. // <author>Oleksandr Krakovetskyi</author>
  6. // <date>11.02.2011</date>
  7. // ============================================================================
  8. namespace DevRain.Data.Extracting.Features
  9. {
  10. using System;
  11. using System.Collections.Generic;
  12. using System.Linq;
  13. using DevRain.Data.Extracting;
  /// <summary>
  /// Analyzes the content of an HTML document: words, sentences, stop words,
  /// words inside links and list items, a keyword based SEO rank and the
  /// uniqueness of a document relative to other documents.
  /// </summary>
  public class ContentAnalyzer : HtmlProcessor
  {
      /// <summary>
      /// Placeholder that stands in for a line break while the text is split into lines.
      /// </summary>
      private const string LineBreakMarker = " ~~ ";

      /// <summary>
      /// A line is searched for sentences only when it contains at least one of these marks.
      /// </summary>
      private static readonly string[] SentenceMarks = new string[] { "...", ".", "!", "?" };

      /// <summary>
      /// Delimiters (terminator followed by a space) used to cut a line into sentences.
      /// </summary>
      private static readonly string[] SentenceDelimiters = new string[] { "... ", ". ", "! ", "? " };

      /// <summary>
      /// Weights of the individual ranking criteria used by <see cref="GetSeoRank"/>.
      /// Not every entry is referenced by the current implementation.
      /// </summary>
      private static readonly double[] SeoRates = new double[]
      {
          0.66, 0.63, 0.60, 0.49, 0.47, 0.45, 0.45, 0.42, 0.38, 0.37, 0.35,
          0.33, 0.33, 0.26, 0.25, 0.23, 0.22, 0.21, 0.19, 0.12, 0.06, 0.05
      };

      /// <summary>
      /// Initializes a new instance of the <see cref="ContentAnalyzer"/> class
      /// from HTML markup, which is forwarded to the <see cref="HtmlProcessor"/> base class.
      /// </summary>
      /// <param name="html">HTML markup to analyze.</param>
      public ContentAnalyzer(string html)
          : base(html)
      {
      }

      /// <summary>
      /// Initializes a new instance of the <see cref="ContentAnalyzer"/> class
      /// from a URI, which is forwarded to the <see cref="HtmlProcessor"/> base class.
      /// </summary>
      /// <param name="uri">Address of the document to analyze.</param>
      public ContentAnalyzer(Uri uri)
          : base(uri)
      {
      }

      /// <summary>
      /// Gets the total number of words in the sentences returned by the most
      /// recent evaluation of <see cref="Sentences"/>.
      /// </summary>
      public int WordsInSentencesCount { get; protected set; }

      /// <summary>
      /// Gets the words of the document's inner text.
      /// </summary>
      public List<string> Words
      {
          get
          {
              return this.InnerText.SplitToWords();
          }
      }

      /// <summary>
      /// Gets the sentences of the document's inner text. Only sentences with
      /// more than 2 and fewer than 30 words are returned.
      /// Each evaluation also refreshes <see cref="WordsInSentencesCount"/>.
      /// </summary>
      public List<string> Sentences
      {
          get
          {
              var sentences = new List<string>();

              // Tally into a local and assign once at the end, so that reading this
              // property repeatedly does not keep inflating WordsInSentencesCount.
              int wordsInSentences = 0;

              // The trailing space lets a terminator at the very end of the text
              // match the "terminator + space" delimiters below.
              string text = this.InnerText.TrimSafe() + " ";

              // Drop the period of common abbreviations so they do not end a sentence.
              text = text.Replace("Dr.", "Dr").Replace("Ms.", "Ms").Replace("Mr.", "Mr").Replace("Dept.", "Dept");
              text = text.Replace(Environment.NewLine, LineBreakMarker);

              string[] lines = text.Split(new string[] { LineBreakMarker }, StringSplitOptions.RemoveEmptyEntries);
              foreach (string line in lines)
              {
                  // Lines without any sentence mark (e.g. captions, menu items) are skipped.
                  if (line.IndexOfAny(SentenceMarks) == -1)
                  {
                      continue;
                  }

                  string[] candidates = line.Split(SentenceDelimiters, StringSplitOptions.RemoveEmptyEntries);
                  foreach (string candidate in candidates)
                  {
                      int wordsCount = candidate.SplitToWords().Count;
                      if (wordsCount > 2 && wordsCount < 30)
                      {
                          sentences.Add(candidate);
                          wordsInSentences += wordsCount;
                      }
                  }
              }

              this.WordsInSentencesCount = wordsInSentences;
              return sentences;
          }
      }

      /// <summary>
      /// Gets the average sentence length in words (only words which are part of
      /// the sentences returned by <see cref="Sentences"/> are taken).
      /// Returns 0 when there are no sentences.
      /// </summary>
      public double SentencesAvgLength
      {
          get
          {
              // Evaluate Sentences exactly once: it re-parses the text and
              // refreshes WordsInSentencesCount as a side effect.
              List<string> sentences = this.Sentences;
              if (sentences.Count == 0)
              {
                  return 0;
              }

              return (double)this.WordsInSentencesCount / sentences.Count;
          }
      }

      /// <summary>
      /// Gets the known stop words (as provided by <c>Extensions.GetStopWords()</c>)
      /// that occur in the document's inner text. Matching is a case-insensitive
      /// substring search, not a whole-word match.
      /// </summary>
      public List<string> StopWords
      {
          get
          {
              // Upper-case the text once instead of once per stop word.
              string upperText = (this.InnerText ?? string.Empty).ToUpperInvariant();

              var found = new List<string>();
              foreach (string stopWord in Extensions.GetStopWords())
              {
                  if (upperText.Contains(stopWord.ToUpperInvariant()))
                  {
                      found.Add(stopWord);
                  }
              }

              return found;
          }
      }

      /// <summary>
      /// Gets the words that appear inside links.
      /// </summary>
      public List<string> WordsAsLinks
      {
          get
          {
              var words = new List<string>();
              foreach (var link in this.Links)
              {
                  words.AddRange(link.InnerText.SplitToWords());
              }

              return words;
          }
      }

      /// <summary>
      /// Gets the words that appear inside list items ("li" elements).
      /// </summary>
      public List<string> WordsAsLists
      {
          get
          {
              var words = new List<string>();
              foreach (var item in this.GetElementsByTagName("li"))
              {
                  words.AddRange(item.InnerText.SplitToWords());
              }

              return words;
          }
      }

      /// <summary>
      /// Gets document content uniqueness.
      /// </summary>
      /// <param name="documents">List of all documents.</param>
      /// <param name="index">Index of the document to be checked.</param>
      /// <param name="minimumSentenseLength">
      /// Minimum sentence length in characters; only longer sentences are taken.
      /// </param>
      /// <returns>
      /// Share of the checked document's sentences that occur in none of the other
      /// documents, in range [0; 1]. Returns 1 when the checked document has no
      /// sentence longer than <paramref name="minimumSentenseLength"/>.
      /// </returns>
      public static double GetDocumentUniqueness(List<string> documents, int index, int minimumSentenseLength)
      {
          // Materialize once: the sentence list is expensive to build and is read several times.
          List<string> candidates = new ContentAnalyzer(documents[index]).Sentences
              .Where(s => s.Length > minimumSentenseLength)
              .ToList();

          if (candidates.Count == 0)
          {
              // Nothing to compare, hence nothing duplicated (avoids 0 / 0 = NaN).
              return 1.0;
          }

          // Collect the sentences of every other document once.
          var foreignSentences = new HashSet<string>();
          for (int i = 0; i < documents.Count; i++)
          {
              if (i == index)
              {
                  continue;
              }

              foreignSentences.UnionWith(new ContentAnalyzer(documents[i]).Sentences);
          }

          // A sentence counts as duplicated once, no matter how many other documents
          // contain it; this keeps the result inside [0; 1].
          int duplicated = candidates.Count(s => foreignSentences.Contains(s));

          return (double)(candidates.Count - duplicated) / candidates.Count;
      }

      /// <summary>
      /// Calculates a keyword based SEO rank of the document. Each criterion
      /// that is met adds its weight to the rank; all keyword comparisons are
      /// ordinal and case-insensitive.
      /// </summary>
      /// <param name="keyword">Keyword to rank the document for.</param>
      /// <returns>Sum of the weights of the met criteria divided by a fixed normalizer.</returns>
      /// <exception cref="ArgumentNullException"><paramref name="keyword"/> is null.</exception>
      public double GetSeoRank(string keyword)
      {
          if (keyword == null)
          {
              throw new ArgumentNullException("keyword");
          }

          const StringComparison IgnoreCase = StringComparison.OrdinalIgnoreCase;
          double rank = 0.0;

          // Title contains keyword.
          if (this.Title.IndexOf(keyword, IgnoreCase) != -1)
          {
              rank += SeoRates[0];
          }

          // Keyword is first in title (case-insensitive, like every other check).
          if (this.Title.StartsWith(keyword, IgnoreCase))
          {
              rank += SeoRates[1];
          }

          // SeoRates[2] (keyword is part of the host name) is currently not evaluated.

          // Any header contains keyword.
          foreach (var header in this.Headers)
          {
              if (header.InnerText.IndexOf(keyword, IgnoreCase) != -1)
              {
                  rank += SeoRates[3];
                  break;
              }
          }

          // Any link text contains keyword.
          foreach (var link in this.Links)
          {
              if (link.InnerText.IndexOf(keyword, IgnoreCase) != -1)
              {
                  rank += SeoRates[4];
                  break;
              }
          }

          // First header that starts with keyword: H1 scores higher than other headers.
          foreach (var header in this.Headers)
          {
              if (header.InnerText.StartsWith(keyword, IgnoreCase))
              {
                  rank += header.TagName.ToUpperInvariant() == "H1" ? SeoRates[5] : SeoRates[10];
                  break;
              }
          }

          // Every occurrence of keyword within the first 100 words adds to the rank.
          foreach (var word in this.Words.Take(100))
          {
              if (word.Equals(keyword, IgnoreCase))
              {
                  rank += SeoRates[6];
              }
          }

          // Any image "alt" text contains keyword.
          // NOTE(review): assumes every image exposes an "alt" attribute - confirm how
          // Attributes behaves for a missing key.
          foreach (var image in this.Images)
          {
              if (image.Attributes["alt"].IndexOf(keyword, IgnoreCase) != -1)
              {
                  rank += SeoRates[11];
                  break;
              }
          }

          // Any image "src" contains keyword.
          foreach (var image in this.Images)
          {
              if (image.Attributes["src"].IndexOf(keyword, IgnoreCase) != -1)
              {
                  rank += SeoRates[12];
                  break;
              }
          }

          // Keyword is a word of a list item.
          foreach (var word in this.WordsAsLists)
          {
              if (word.Equals(keyword, IgnoreCase))
              {
                  rank += SeoRates[15];
                  break;
              }
          }

          if (this.MetaDescription.IndexOf(keyword, IgnoreCase) != -1)
          {
              rank += SeoRates[16];
          }

          if (this.MetaKeywords.IndexOf(keyword, IgnoreCase) != -1)
          {
              rank += SeoRates[17];
          }

          // Normalizer kept from the original implementation (number of rates minus 9).
          return rank / (SeoRates.Length - 9);
      }
  }
  264. }