PageRenderTime 49ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/BlogEngine/DotNetSlave.BusinessLogic/Search.cs

#
C# | 681 lines | 463 code | 58 blank | 160 comment | 20 complexity | e73ab1716db538a0920e63cbcfef6cc8 MD5 | raw file
Possible License(s): LGPL-2.1, Apache-2.0, BSD-3-Clause
  1. namespace BlogEngine.Core
  2. {
  3. using System;
  4. using System.Collections.Generic;
  5. using System.Collections.ObjectModel;
  6. using System.Collections.Specialized;
  7. using System.Globalization;
  8. using System.Linq;
  9. using System.Text;
  10. using System.Text.RegularExpressions;
  11. using System.Web;
  12. using System.Xml;
  13. using BlogEngine.Core.Providers;
    /// <summary>
    /// Searches the post collection and returns a result based on a search term.
    /// </summary>
    /// <remarks>
    /// It is used for related posts and the in-site search feature.
    /// </remarks>
  20. public static class Search
  21. {
  22. #region Constants and Fields
        /// <summary>
        /// The search catalogs, one collection of entries per blog, keyed by blog id.
        /// </summary>
        private static readonly Dictionary<Guid, Collection<Entry>> _catalog = new Dictionary<Guid, Collection<Entry>>();
        /// <summary>
        /// The stop word lists, one per blog, keyed by blog id.
        /// </summary>
        private static readonly Dictionary<Guid, StringCollection> _stopWords = new Dictionary<Guid, StringCollection>();
        /// <summary>
        /// The object locked while the catalog and stop word dictionaries are read or changed.
        /// </summary>
        private static readonly object _syncRoot = new object();
  35. #endregion
  36. #region Properties
  37. private static StringCollection StopWords
  38. {
  39. get
  40. {
  41. Guid blogId = Blog.CurrentInstance.Id;
  42. StringCollection stopWords;
  43. lock (_syncRoot)
  44. {
  45. if (!_stopWords.TryGetValue(blogId, out stopWords))
  46. {
  47. stopWords = BlogService.LoadStopWords();
  48. _stopWords.Add(blogId, stopWords);
  49. }
  50. }
  51. return stopWords;
  52. }
  53. }
  54. private static Collection<Entry> Catalog
  55. {
  56. get
  57. {
  58. Guid blogId = Blog.CurrentInstance.Id;
  59. Collection<Entry> catalog;
  60. lock (_syncRoot)
  61. {
  62. if (!_catalog.TryGetValue(blogId, out catalog))
  63. {
  64. catalog = new Collection<Entry>();
  65. _catalog.Add(blogId, catalog);
  66. BuildCatalog();
  67. }
  68. }
  69. return catalog;
  70. }
  71. }
  72. #endregion
  73. #region Constructors and Destructors
        /// <summary>
        /// Initializes static members of the <see cref="Search"/> class by subscribing to the
        /// events that require the search catalog to be updated.
        /// </summary>
        static Search()
        {
            // Saved posts and pages are either appended to the catalog or trigger a rebuild.
            Post.Saved += Post_Saved;
            Page.Saved += Page_Saved;
            // Changed settings rebuild the whole catalog.
            BlogSettings.Changed += delegate { BuildCatalog(); };
            // Added comments are appended to the catalog by the handler.
            Post.CommentAdded += Post_CommentAdded;
            // Removed comments rebuild the whole catalog.
            Post.CommentRemoved += delegate { BuildCatalog(); };
            // Approved comments go through the same handler as added comments.
            Comment.Approved += Post_CommentAdded;
        }
  86. #endregion
  87. #region Events
        /// <summary>
        /// Occurs after the search index has been built.
        /// </summary>
        public static event EventHandler<EventArgs> IndexBuild;
        /// <summary>
        /// Occurs just before the search index is built.
        /// </summary>
        public static event EventHandler<EventArgs> IndexBuilding;
        /// <summary>
        /// Occurs when a search is performed. The search term is passed as the sender.
        /// </summary>
        public static event EventHandler<EventArgs> Searching;
  100. #endregion
  101. #region Public Methods
  102. /// <summary>
  103. /// Adds an IPublishable item to the search catalog.
  104. /// That will make it immediately searchable.
  105. /// </summary>
  106. /// <param name="item">
  107. /// The item to add.
  108. /// </param>
  109. public static void AddItem(IPublishable item)
  110. {
  111. var entry = new Entry
  112. {
  113. Item = item,
  114. Title = CleanContent(item.Title, false),
  115. Content = HttpUtility.HtmlDecode(CleanContent(item.Content, true))
  116. };
  117. if (item is Comment)
  118. {
  119. entry.Content += HttpUtility.HtmlDecode(CleanContent(item.Author, false));
  120. }
  121. Catalog.Add(entry);
  122. }
  123. // public static List<IPublishable> ApmlMatches(Uri url, int maxInterests)
  124. // {
  125. // using (System.Net.WebClient client = new System.Net.WebClient())
  126. // {
  127. // client.UseDefaultCredentials = true;
  128. // client.Headers.Add(System.Net.HttpRequestHeader.UserAgent, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;)");
  129. // using (StreamReader reader = new StreamReader(client.OpenRead(url)))
  130. // {
  131. // XmlDocument doc = new XmlDocument();
  132. // string content = reader.ReadToEnd();
  133. // string upper = content.ToUpperInvariant();
  134. // if (upper.Contains("<HTML") && upper.Contains("</HTML"))
  135. // {
  136. // Collection<Uri> urls = FindLinks("apml", content);
  137. // if (urls.Count > 0)
  138. // {
  139. // LoadDocument(url, doc, urls[0]);
  140. // }
  141. // else
  142. // {
  143. // throw new NotSupportedException("No APML link on page");
  144. // }
  145. // }
  146. // else
  147. // {
  148. // doc.LoadXml(content);
  149. // }
  150. // return Search.ApmlMatches(doc, 10);
  151. // }
  152. // }
  153. // }
  154. // private static void LoadDocument(Uri url, XmlDocument doc, Uri ApmlUrl)
  155. // {
  156. // if (url.IsAbsoluteUri)
  157. // {
  158. // doc.Load(ApmlUrl.ToString());
  159. // }
  160. // else
  161. // {
  162. // string absoluteUrl = null;
  163. // if (!url.ToString().StartsWith("/"))
  164. // absoluteUrl = (url + ApmlUrl.ToString());
  165. // else
  166. // absoluteUrl = url.Scheme + "://" + url.Authority + ApmlUrl;
  167. // doc.Load(absoluteUrl);
  168. // }
  169. // }
  170. /// <summary>
  171. /// The apml matches.
  172. /// </summary>
  173. /// <param name="apmlFile">
  174. /// The apml file.
  175. /// </param>
  176. /// <param name="maxInterests">
  177. /// The max interests.
  178. /// </param>
  179. /// <returns>
  180. /// A list of IPublishable.
  181. /// </returns>
  182. public static List<IPublishable> ApmlMatches(XmlDocument apmlFile, int maxInterests)
  183. {
  184. var concepts = new Dictionary<string, float>();
  185. var nodes = apmlFile.SelectNodes("//Concept");
  186. if (nodes != null)
  187. {
  188. foreach (XmlNode node in nodes)
  189. {
  190. if (node.Attributes == null)
  191. {
  192. continue;
  193. }
  194. var key = node.Attributes["key"].InnerText.ToLowerInvariant().Trim();
  195. var value = float.Parse(node.Attributes["value"].InnerText, CultureInfo.InvariantCulture);
  196. if (!concepts.ContainsKey(key))
  197. {
  198. concepts.Add(key, value);
  199. }
  200. else if (concepts[key] < value)
  201. {
  202. concepts[key] = value;
  203. }
  204. }
  205. }
  206. concepts = SortDictionary(concepts);
  207. var max = Math.Min(concepts.Count, maxInterests);
  208. var counter = 0;
  209. var resultSet = new List<Result>();
  210. foreach (var key in concepts.Keys)
  211. {
  212. counter++;
  213. var results = BuildResultSet(key, false);
  214. // results = results.FindAll(delegate(Result r) { return r.Rank > 1; });
  215. resultSet.AddRange(results);
  216. if (counter == max)
  217. {
  218. break;
  219. }
  220. }
  221. resultSet.Sort();
  222. var aggregatedResults = new List<Result>();
  223. foreach (var r in resultSet)
  224. {
  225. if (!aggregatedResults.Contains(r))
  226. {
  227. aggregatedResults.Add(r);
  228. }
  229. else
  230. {
  231. var r1 = r;
  232. var existingResult =
  233. aggregatedResults.Find(res => res.GetHashCode() == r1.GetHashCode());
  234. existingResult.Rank += r.Rank;
  235. }
  236. }
  237. aggregatedResults = aggregatedResults.FindAll(r => r.Rank > 1);
  238. var items = aggregatedResults.ConvertAll(ResultToPost);
  239. var uniqueItems = new List<IPublishable>();
  240. foreach (var item in items.Where(item => !uniqueItems.Contains(item)))
  241. {
  242. uniqueItems.Add(item);
  243. }
  244. return uniqueItems;
  245. }
  246. /// <summary>
  247. /// Returns a list of posts that is related to the specified post.
  248. /// </summary>
  249. /// <param name="post">The IPublishable post.</param>
  250. /// <returns>A list of IPublishable.</returns>
  251. public static List<IPublishable> FindRelatedItems(IPublishable post)
  252. {
  253. var term = CleanContent(post.Title, false);
  254. return Hits(term, false);
  255. }
  256. /// <summary>
  257. /// Searches all the posts and returns a ranked result set.
  258. /// </summary>
  259. /// <param name="searchTerm">The term to search for</param>
  260. /// <param name="includeComments">True to include a post's comments and their authors in search</param>
  261. /// <returns>A list of IPublishable.</returns>
  262. public static List<IPublishable> Hits(string searchTerm, bool includeComments)
  263. {
  264. lock (_syncRoot)
  265. {
  266. var results = BuildResultSet(searchTerm, includeComments);
  267. var items = results.ConvertAll(ResultToPost);
  268. results.Clear();
  269. OnSearcing(searchTerm);
  270. return items;
  271. }
  272. }
  273. #endregion
  274. // private const string PATTERN = "<head.*<link( [^>]*title=\"{0}\"[^>]*)>.*</head>";
  275. // private static readonly Regex HREF = new Regex("href=\"(.*)\"", RegexOptions.IgnoreCase | RegexOptions.Compiled);
  276. ///// <summary>
  277. ///// Finds semantic links in a given HTML document.
  278. ///// </summary>
  279. ///// <param name="type">The type of link. Could be foaf, apml or sioc.</param>
  280. ///// <param name="html">The HTML to look through.</param>
  281. ///// <returns></returns>
  282. // public static Collection<Uri> FindLinks(string type, string html)
  283. // {
  284. // MatchCollection matches = Regex.Matches(html, string.Format(PATTERN, type), RegexOptions.IgnoreCase | RegexOptions.Singleline);
  285. // Collection<Uri> urls = new Collection<Uri>();
  286. // foreach (Match match in matches)
  287. // {
  288. // if (match.Groups.Count == 2)
  289. // {
  290. // string link = match.Groups[1].Value;
  291. // Match hrefMatch = HREF.Match(link);
  292. // if (hrefMatch.Groups.Count == 2)
  293. // {
  294. // Uri url;
  295. // string value = hrefMatch.Groups[1].Value;
  296. // if (Uri.TryCreate(value, UriKind.Absolute, out url))
  297. // {
  298. // urls.Add(url);
  299. // }
  300. // }
  301. // }
  302. // }
  303. // return urls;
  304. // }
  305. #region Methods
  306. /// <summary>
  307. /// Builds the catalog so it can be searched.
  308. /// </summary>
  309. private static void BuildCatalog()
  310. {
  311. OnIndexBuilding();
  312. lock (_syncRoot)
  313. {
  314. Catalog.Clear();
  315. foreach (var post in Post.Posts.Where(post => post.IsVisibleToPublic))
  316. {
  317. AddItem(post);
  318. if (!BlogSettings.Instance.EnableCommentSearch)
  319. {
  320. continue;
  321. }
  322. foreach (var comment in post.Comments.Where(comment => comment.IsApproved))
  323. {
  324. AddItem(comment);
  325. }
  326. }
  327. foreach (var page in Page.Pages.Where(page => page.IsVisibleToPublic))
  328. {
  329. AddItem(page);
  330. }
  331. }
  332. OnIndexBuild();
  333. }
  334. /// <summary>
  335. /// Builds the results set and ranks it.
  336. /// </summary>
  337. /// <param name="searchTerm">
  338. /// The search Term.
  339. /// </param>
  340. /// <param name="includeComments">
  341. /// The include Comments.
  342. /// </param>
  343. private static List<Result> BuildResultSet(string searchTerm, bool includeComments)
  344. {
  345. var results = new List<Result>();
  346. var term = CleanContent(searchTerm.ToLowerInvariant().Trim(), false);
  347. var terms = term.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
  348. var regex = new Regex(string.Format(CultureInfo.InvariantCulture, "({0})", string.Join("|", terms)));
  349. foreach (var entry in Catalog)
  350. {
  351. var result = new Result();
  352. if (!(entry.Item is Comment))
  353. {
  354. var titleMatches = regex.Matches(entry.Title).Count;
  355. result.Rank = titleMatches * 20;
  356. var postMatches = regex.Matches(entry.Content).Count;
  357. result.Rank += postMatches;
  358. var descriptionMatches = regex.Matches(entry.Item.Description).Count;
  359. result.Rank += descriptionMatches * 2;
  360. }
  361. else if (includeComments)
  362. {
  363. var commentMatches = regex.Matches(entry.Content + entry.Title).Count;
  364. result.Rank += commentMatches;
  365. }
  366. if (result.Rank > 0)
  367. {
  368. result.Item = entry.Item;
  369. results.Add(result);
  370. }
  371. }
  372. results.Sort();
  373. return results;
  374. }
  375. /// <summary>
  376. /// Removes stop words and HTML from the specified string.
  377. /// </summary>
  378. /// <param name="content">
  379. /// The content.
  380. /// </param>
  381. /// <param name="removeHtml">
  382. /// The remove Html.
  383. /// </param>
  384. /// <returns>
  385. /// The clean content.
  386. /// </returns>
  387. private static string CleanContent(string content, bool removeHtml)
  388. {
  389. if (removeHtml)
  390. {
  391. content = Utils.StripHtml(content);
  392. }
  393. content =
  394. content.Replace("\\", string.Empty).Replace("|", string.Empty).Replace("(", string.Empty).Replace(
  395. ")", string.Empty).Replace("[", string.Empty).Replace("]", string.Empty).Replace("*", string.Empty).
  396. Replace("?", string.Empty).Replace("}", string.Empty).Replace("{", string.Empty).Replace(
  397. "^", string.Empty).Replace("+", string.Empty);
  398. var words = content.Split(new[] { ' ', '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries);
  399. var sb = new StringBuilder();
  400. foreach (var word in
  401. words.Select(t => t.ToLowerInvariant().Trim()).Where(word => word.Length > 1 && !StopWords.Contains(word)))
  402. {
  403. sb.AppendFormat("{0} ", word);
  404. }
  405. return sb.ToString();
  406. }
  407. /// <summary>
  408. /// Raises the event in a safe way
  409. /// </summary>
  410. private static void OnIndexBuild()
  411. {
  412. if (IndexBuild != null)
  413. {
  414. IndexBuild(null, EventArgs.Empty);
  415. }
  416. }
  417. /// <summary>
  418. /// Raises the event in a safe way
  419. /// </summary>
  420. private static void OnIndexBuilding()
  421. {
  422. if (IndexBuilding != null)
  423. {
  424. IndexBuilding(null, EventArgs.Empty);
  425. }
  426. }
  427. /// <summary>
  428. /// Raises the event in a safe way
  429. /// </summary>
  430. /// <param name="searchTerm">
  431. /// The search Term.
  432. /// </param>
  433. private static void OnSearcing(string searchTerm)
  434. {
  435. if (Searching != null)
  436. {
  437. Searching(searchTerm, EventArgs.Empty);
  438. }
  439. }
  440. /// <summary>
  441. /// Handles the Saved event of the Page control.
  442. /// </summary>
  443. /// <param name="sender">The source of the event.</param>
  444. /// <param name="e">The <see cref="BlogEngine.Core.SavedEventArgs"/> instance containing the event data.</param>
  445. private static void Page_Saved(object sender, SavedEventArgs e)
  446. {
  447. lock (_syncRoot)
  448. {
  449. if (e.Action == SaveAction.Insert)
  450. {
  451. AddItem(sender as Page);
  452. }
  453. else
  454. {
  455. BuildCatalog();
  456. }
  457. }
  458. }
  459. /// <summary>
  460. /// Handles the CommentAdded event of the Post control.
  461. /// </summary>
  462. /// <param name="sender">The source of the event.</param>
  463. /// <param name="e">The <see cref="System.EventArgs"/> instance containing the event data.</param>
  464. private static void Post_CommentAdded(object sender, EventArgs e)
  465. {
  466. if (!BlogSettings.Instance.EnableCommentSearch)
  467. {
  468. return;
  469. }
  470. var comment = (Comment)sender;
  471. if (comment.IsApproved)
  472. {
  473. AddItem(comment);
  474. }
  475. }
  476. /// <summary>
  477. /// Handles the Saved event of the Post control.
  478. /// </summary>
  479. /// <param name="sender">The source of the event.</param>
  480. /// <param name="e">The <see cref="BlogEngine.Core.SavedEventArgs"/> instance containing the event data.</param>
  481. private static void Post_Saved(object sender, SavedEventArgs e)
  482. {
  483. lock (_syncRoot)
  484. {
  485. if (e.Action == SaveAction.Insert)
  486. {
  487. AddItem(sender as Post);
  488. }
  489. else
  490. {
  491. BuildCatalog();
  492. }
  493. }
  494. }
  495. /// <summary>
  496. /// A converter delegate used for converting Results to Posts.
  497. /// </summary>
  498. /// <param name="result">The IPublishable result.</param>
  499. /// <returns>An IPublishable.</returns>
  500. private static IPublishable ResultToPost(Result result)
  501. {
  502. return result.Item;
  503. }
  504. /// <summary>
  505. /// The sort dictionary.
  506. /// </summary>
  507. /// <param name="dic">
  508. /// The dictionary of string keys with float values.
  509. /// </param>
  510. /// <returns>
  511. /// A dictionary of string keys with float values.
  512. /// </returns>
  513. private static Dictionary<string, float> SortDictionary(Dictionary<string, float> dic)
  514. {
  515. var list = dic.Keys.Select(key => new KeyValuePair<string, float>(key, dic[key])).ToList();
  516. list.Sort((obj1, obj2) => obj2.Value.CompareTo(obj1.Value));
  517. return list.ToDictionary(pair => pair.Key, pair => pair.Value);
  518. }
  519. #endregion
  520. }
  521. #region Entry and Result structs
    /// <summary>
    /// A catalog entry: a publishable item together with its title and content in the cleaned
    /// form the search runs against.
    /// </summary>
    internal struct Entry
    {
        #region Constants and Fields
        /// <summary>
        /// The content of the item cleansed for stop words and HTML
        /// </summary>
        internal string Content;
        /// <summary>
        /// The reference to the item this entry was created from
        /// </summary>
        internal IPublishable Item;
        /// <summary>
        /// The title of the item cleansed for stop words
        /// </summary>
        internal string Title;
        #endregion
    }
  542. /// <summary>
  543. /// A result is a search result which contains a post and its ranking.
  544. /// </summary>
  545. internal class Result : IComparable<Result>
  546. {
  547. #region Constants and Fields
  548. /// <summary>
  549. /// The post of the result.
  550. /// </summary>
  551. internal IPublishable Item;
  552. /// <summary>
  553. /// The rank of the post based on the search term. The higher the rank, the higher the post is in the result set.
  554. /// </summary>
  555. internal int Rank;
  556. #endregion
  557. #region Public Methods
  558. /// <summary>
  559. /// Returns a hash code for this instance.
  560. /// </summary>
  561. /// <returns>
  562. /// A hash code for this instance, suitable for use in hashing algorithms and data structures like a hash table.
  563. /// </returns>
  564. public override int GetHashCode()
  565. {
  566. return this.Item.Id.GetHashCode();
  567. }
  568. #endregion
  569. #region Implemented Interfaces
  570. #region IComparable<Result>
  571. /// <summary>
  572. /// Compares the current object with another object of the same type.
  573. /// </summary>
  574. /// <param name="other">
  575. /// An object to compare with this object.
  576. /// </param>
  577. /// <returns>
  578. /// A 32-bit signed integer that indicates the relative order of the objects being compared. The return value
  579. /// has the following meanings: Value Meaning Less than zero This object is less than the other parameter.Zero
  580. /// This object is equal to other. Greater than zero This object is greater than other.
  581. /// </returns>
  582. public int CompareTo(Result other)
  583. {
  584. return other.Rank.CompareTo(this.Rank);
  585. }
  586. #endregion
  587. #endregion
  588. }
  589. #endregion
  590. }