PageRenderTime 55ms CodeModel.GetById 2ms app.highlight 46ms RepoModel.GetById 1ms app.codeStats 0ms

/BlogEngine/DotNetSlave.BusinessLogic/Search.cs

#
C# | 681 lines | 463 code | 58 blank | 160 comment | 20 complexity | e73ab1716db538a0920e63cbcfef6cc8 MD5 | raw file
  1namespace BlogEngine.Core
  2{
  3    using System;
  4    using System.Collections.Generic;
  5    using System.Collections.ObjectModel;
  6    using System.Collections.Specialized;
  7    using System.Globalization;
  8    using System.Linq;
  9    using System.Text;
 10    using System.Text.RegularExpressions;
 11    using System.Web;
 12    using System.Xml;
 13
 14    using BlogEngine.Core.Providers;
 15
 16    /// <summary>
 17    /// Searches the post collection and returns a result based on a search term.
 18    ///     <remarks>
 19    /// It is used for related posts and the in-site search feature.
 20    ///     </remarks>
 21    /// </summary>
 22    public static class Search
 23    {
 24        #region Constants and Fields
 25
 26        /// <summary>
 27        /// The search catalogs: one collection of entries per blog, keyed by the blog id.
 28        /// </summary>
 29        private static readonly Dictionary<Guid, Collection<Entry>> _catalog = new Dictionary<Guid, Collection<Entry>>();
 30
 31        /// <summary>
 32        /// The stop word lists: one list per blog, keyed by the blog id and loaded on first use.
 33        /// </summary>
 34        private static readonly Dictionary<Guid, StringCollection> _stopWords = new Dictionary<Guid, StringCollection>();
 35
 36        /// <summary>
 37        /// The lock object taken while the catalog and stop word dictionaries are read or changed.
 38        /// </summary>
 39        private static readonly object _syncRoot = new object();
 40
 41        #endregion
 42
 43        #region Properties
 44
 45        private static StringCollection StopWords
 46        {       
 47            get
 48            {
 49                Guid blogId = Blog.CurrentInstance.Id;
 50                StringCollection stopWords;
 51                lock (_syncRoot)
 52                {
 53                    if (!_stopWords.TryGetValue(blogId, out stopWords))
 54                    {
 55                        stopWords = BlogService.LoadStopWords();
 56                        _stopWords.Add(blogId, stopWords);
 57                    }
 58                }
 59
 60                return stopWords;
 61            }
 62        }
 63
 64        private static Collection<Entry> Catalog
 65        {
 66            get
 67            {
 68                Guid blogId = Blog.CurrentInstance.Id;
 69                Collection<Entry> catalog;
 70                lock (_syncRoot)
 71                {   
 72                    if (!_catalog.TryGetValue(blogId, out catalog))
 73                    {
 74                        catalog = new Collection<Entry>();
 75                        _catalog.Add(blogId, catalog);
 76                        BuildCatalog();
 77                    }
 78                }
 79
 80                return catalog;
 81            }
 82        }
 83
 84        
 85        #endregion
 86
 87        #region Constructors and Destructors
 88
 89        /// <summary>
 90        /// Initializes static members of the <see cref="Search"/> class.
 91        /// </summary>
 92        static Search()
 93        {
 94            Post.Saved += Post_Saved;
 95            Page.Saved += Page_Saved;
 96            BlogSettings.Changed += delegate { BuildCatalog(); };
 97            Post.CommentAdded += Post_CommentAdded;
 98            Post.CommentRemoved += delegate { BuildCatalog(); };
 99            Comment.Approved += Post_CommentAdded;
100        }
101
102        #endregion
103
104        #region Events
105
106        /// <summary>
107        ///     Occurs after the search index has been built.
108        /// </summary>
109        public static event EventHandler<EventArgs> IndexBuild;
110
111        /// <summary>
112        ///     Occurs just before the search index is built.
113        /// </summary>
114        public static event EventHandler<EventArgs> IndexBuilding;
115
116        /// <summary>
117        ///     Occurs when a search is performed; the search term is passed as the sender.
118        /// </summary>
119        public static event EventHandler<EventArgs> Searching;
120
121        #endregion
122
123        #region Public Methods
124
125        /// <summary>
126        /// Adds an IPublishable item to the search catalog. 
127        ///     That will make it immediately searchable.
128        /// </summary>
129        /// <param name="item">
130        /// The item to add.
131        /// </param>
132        public static void AddItem(IPublishable item)
133        {
134            var entry = new Entry
135                {
136                    Item = item,
137                    Title = CleanContent(item.Title, false),
138                    Content = HttpUtility.HtmlDecode(CleanContent(item.Content, true))
139                };
140            if (item is Comment)
141            {
142                entry.Content += HttpUtility.HtmlDecode(CleanContent(item.Author, false));
143            }
144
145            Catalog.Add(entry);
146        }
147
148        // public static List<IPublishable> ApmlMatches(Uri url, int maxInterests)
149        // {
150        // using (System.Net.WebClient client = new System.Net.WebClient())
151        // {
152        // client.UseDefaultCredentials = true;
153        // client.Headers.Add(System.Net.HttpRequestHeader.UserAgent, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;)");
154        // using (StreamReader reader = new StreamReader(client.OpenRead(url)))
155        // {
156        // XmlDocument doc = new XmlDocument();
157        // string content = reader.ReadToEnd();
158        // string upper = content.ToUpperInvariant();
159
160        // if (upper.Contains("<HTML") && upper.Contains("</HTML"))
161        // {
162        // Collection<Uri> urls = FindLinks("apml", content);
163        // if (urls.Count > 0)
164        // {
165        // LoadDocument(url, doc, urls[0]);
166        // }
167        // else
168        // {
169        // throw new NotSupportedException("No APML link on page");
170        // }
171        // }
172        // else
173        // {
174        // doc.LoadXml(content);
175        // }
176
177        // return Search.ApmlMatches(doc, 10);
178        // }
179        // }
180        // }
181
182        // private static void LoadDocument(Uri url, XmlDocument doc, Uri ApmlUrl)
183        // {
184        // if (url.IsAbsoluteUri)
185        // {
186        // doc.Load(ApmlUrl.ToString());
187        // }
188        // else
189        // {
190        // string absoluteUrl = null;
191        // if (!url.ToString().StartsWith("/"))
192        // absoluteUrl = (url + ApmlUrl.ToString());
193        // else
194        // absoluteUrl = url.Scheme + "://" + url.Authority + ApmlUrl;
195
196        // doc.Load(absoluteUrl);
197        // }
198        // }
199
200        /// <summary>
201        /// The apml matches.
202        /// </summary>
203        /// <param name="apmlFile">
204        /// The apml file.
205        /// </param>
206        /// <param name="maxInterests">
207        /// The max interests.
208        /// </param>
209        /// <returns>
210        /// A list of IPublishable.
211        /// </returns>
212        public static List<IPublishable> ApmlMatches(XmlDocument apmlFile, int maxInterests)
213        {
214            var concepts = new Dictionary<string, float>();
215            var nodes = apmlFile.SelectNodes("//Concept");
216            if (nodes != null)
217            {
218                foreach (XmlNode node in nodes)
219                {
220                    if (node.Attributes == null)
221                    {
222                        continue;
223                    }
224
225                    var key = node.Attributes["key"].InnerText.ToLowerInvariant().Trim();
226                    var value = float.Parse(node.Attributes["value"].InnerText, CultureInfo.InvariantCulture);
227                    if (!concepts.ContainsKey(key))
228                    {
229                        concepts.Add(key, value);
230                    }
231                    else if (concepts[key] < value)
232                    {
233                        concepts[key] = value;
234                    }
235                }
236            }
237
238            concepts = SortDictionary(concepts);
239            var max = Math.Min(concepts.Count, maxInterests);
240            var counter = 0;
241            var resultSet = new List<Result>();
242            foreach (var key in concepts.Keys)
243            {
244                counter++;
245                var results = BuildResultSet(key, false);
246
247                // results = results.FindAll(delegate(Result r) { return r.Rank > 1; });
248                resultSet.AddRange(results);
249                if (counter == max)
250                {
251                    break;
252                }
253            }
254
255            resultSet.Sort();
256            var aggregatedResults = new List<Result>();
257            foreach (var r in resultSet)
258            {
259                if (!aggregatedResults.Contains(r))
260                {
261                    aggregatedResults.Add(r);
262                }
263                else
264                {
265                    var r1 = r;
266                    var existingResult =
267                        aggregatedResults.Find(res => res.GetHashCode() == r1.GetHashCode());
268                    existingResult.Rank += r.Rank;
269                }
270            }
271
272            aggregatedResults = aggregatedResults.FindAll(r => r.Rank > 1);
273            var items = aggregatedResults.ConvertAll(ResultToPost);
274            var uniqueItems = new List<IPublishable>();
275
276            foreach (var item in items.Where(item => !uniqueItems.Contains(item)))
277            {
278                uniqueItems.Add(item);
279            }
280
281            return uniqueItems;
282        }
283
284        /// <summary>
285        /// Returns a list of posts that is related to the specified post.
286        /// </summary>
287        /// <param name="post">The IPublishable post.</param>
288        /// <returns>A list of IPublishable.</returns>
289        public static List<IPublishable> FindRelatedItems(IPublishable post)
290        {
291            var term = CleanContent(post.Title, false);
292            return Hits(term, false);
293        }
294
295        /// <summary>
296        /// Searches all the posts and returns a ranked result set.
297        /// </summary>
298        /// <param name="searchTerm">The term to search for</param>
299        /// <param name="includeComments">True to include a post's comments and their authors in search</param>
300        /// <returns>A list of IPublishable.</returns>
301        public static List<IPublishable> Hits(string searchTerm, bool includeComments)
302        {
303            lock (_syncRoot)
304            {
305                var results = BuildResultSet(searchTerm, includeComments);
306                var items = results.ConvertAll(ResultToPost);
307                results.Clear();
308                OnSearcing(searchTerm);
309                return items;
310            }
311        }
312
313        #endregion
314
315        // private const string PATTERN = "<head.*<link( [^>]*title=\"{0}\"[^>]*)>.*</head>";
316        // private static readonly Regex HREF = new Regex("href=\"(.*)\"", RegexOptions.IgnoreCase | RegexOptions.Compiled);
317
318        ///// <summary>
319        ///// Finds semantic links in a given HTML document.
320        ///// </summary>
321        ///// <param name="type">The type of link. Could be foaf, apml or sioc.</param>
322        ///// <param name="html">The HTML to look through.</param>
323        ///// <returns></returns>
324        // public static Collection<Uri> FindLinks(string type, string html)
325        // {
326        // MatchCollection matches = Regex.Matches(html, string.Format(PATTERN, type), RegexOptions.IgnoreCase | RegexOptions.Singleline);
327        // Collection<Uri> urls = new Collection<Uri>();
328
329        // foreach (Match match in matches)
330        // {
331        // if (match.Groups.Count == 2)
332        // {
333        // string link = match.Groups[1].Value;
334        // Match hrefMatch = HREF.Match(link);
335
336        // if (hrefMatch.Groups.Count == 2)
337        // {
338        // Uri url;
339        // string value = hrefMatch.Groups[1].Value;
340        // if (Uri.TryCreate(value, UriKind.Absolute, out url))
341        // {
342        // urls.Add(url);
343        // }
344        // }
345        // }
346        // }
347
348        // return urls;
349        // }
350        #region Methods
351
352        /// <summary>
353        /// Builds the catalog so it can be searched.
354        /// </summary>
355        private static void BuildCatalog()
356        {
357            OnIndexBuilding();
358
359            lock (_syncRoot)
360            {
361                Catalog.Clear();
362                foreach (var post in Post.Posts.Where(post => post.IsVisibleToPublic))
363                {
364                    AddItem(post);
365                    if (!BlogSettings.Instance.EnableCommentSearch)
366                    {
367                        continue;
368                    }
369
370                    foreach (var comment in post.Comments.Where(comment => comment.IsApproved))
371                    {
372                        AddItem(comment);
373                    }
374                }
375
376                foreach (var page in Page.Pages.Where(page => page.IsVisibleToPublic))
377                {
378                    AddItem(page);
379                }
380            }
381
382            OnIndexBuild();
383        }
384
385        /// <summary>
386        /// Builds the results set and ranks it.
387        /// </summary>
388        /// <param name="searchTerm">
389        /// The search Term.
390        /// </param>
391        /// <param name="includeComments">
392        /// The include Comments.
393        /// </param>
394        private static List<Result> BuildResultSet(string searchTerm, bool includeComments)
395        {
396            var results = new List<Result>();
397            var term = CleanContent(searchTerm.ToLowerInvariant().Trim(), false);
398            var terms = term.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
399            var regex = new Regex(string.Format(CultureInfo.InvariantCulture, "({0})", string.Join("|", terms)));
400
401            foreach (var entry in Catalog)
402            {
403                var result = new Result();
404                if (!(entry.Item is Comment))
405                {
406                    var titleMatches = regex.Matches(entry.Title).Count;
407                    result.Rank = titleMatches * 20;
408
409                    var postMatches = regex.Matches(entry.Content).Count;
410                    result.Rank += postMatches;
411
412                    var descriptionMatches = regex.Matches(entry.Item.Description).Count;
413                    result.Rank += descriptionMatches * 2;
414                }
415                else if (includeComments)
416                {
417                    var commentMatches = regex.Matches(entry.Content + entry.Title).Count;
418                    result.Rank += commentMatches;
419                }
420
421                if (result.Rank > 0)
422                {
423                    result.Item = entry.Item;
424                    results.Add(result);
425                }
426            }
427
428            results.Sort();
429            return results;
430        }
431
432        /// <summary>
433        /// Removes stop words and HTML from the specified string.
434        /// </summary>
435        /// <param name="content">
436        /// The content.
437        /// </param>
438        /// <param name="removeHtml">
439        /// The remove Html.
440        /// </param>
441        /// <returns>
442        /// The clean content.
443        /// </returns>
444        private static string CleanContent(string content, bool removeHtml)
445        {
446            if (removeHtml)
447            {
448                content = Utils.StripHtml(content);
449            }
450
451            content =
452                content.Replace("\\", string.Empty).Replace("|", string.Empty).Replace("(", string.Empty).Replace(
453                    ")", string.Empty).Replace("[", string.Empty).Replace("]", string.Empty).Replace("*", string.Empty).
454                    Replace("?", string.Empty).Replace("}", string.Empty).Replace("{", string.Empty).Replace(
455                        "^", string.Empty).Replace("+", string.Empty);
456
457            var words = content.Split(new[] { ' ', '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries);
458            var sb = new StringBuilder();
459            foreach (var word in
460                words.Select(t => t.ToLowerInvariant().Trim()).Where(word => word.Length > 1 && !StopWords.Contains(word)))
461            {
462                sb.AppendFormat("{0} ", word);
463            }
464
465            return sb.ToString();
466        }
467
468        /// <summary>
469        /// Raises the event in a safe way
470        /// </summary>
471        private static void OnIndexBuild()
472        {
473            if (IndexBuild != null)
474            {
475                IndexBuild(null, EventArgs.Empty);
476            }
477        }
478
479        /// <summary>
480        /// Raises the event in a safe way
481        /// </summary>
482        private static void OnIndexBuilding()
483        {
484            if (IndexBuilding != null)
485            {
486                IndexBuilding(null, EventArgs.Empty);
487            }
488        }
489
490        /// <summary>
491        /// Raises the event in a safe way
492        /// </summary>
493        /// <param name="searchTerm">
494        /// The search Term.
495        /// </param>
496        private static void OnSearcing(string searchTerm)
497        {
498            if (Searching != null)
499            {
500                Searching(searchTerm, EventArgs.Empty);
501            }
502        }
503
504        /// <summary>
505        /// Handles the Saved event of the Page control.
506        /// </summary>
507        /// <param name="sender">The source of the event.</param>
508        /// <param name="e">The <see cref="BlogEngine.Core.SavedEventArgs"/> instance containing the event data.</param>
509        private static void Page_Saved(object sender, SavedEventArgs e)
510        {
511            lock (_syncRoot)
512            {
513                if (e.Action == SaveAction.Insert)
514                {
515                    AddItem(sender as Page);
516                }
517                else
518                {
519                    BuildCatalog();
520                }
521            }
522        }
523
524        /// <summary>
525        /// Handles the CommentAdded event of the Post control.
526        /// </summary>
527        /// <param name="sender">The source of the event.</param>
528        /// <param name="e">The <see cref="System.EventArgs"/> instance containing the event data.</param>
529        private static void Post_CommentAdded(object sender, EventArgs e)
530        {
531            if (!BlogSettings.Instance.EnableCommentSearch)
532            {
533                return;
534            }
535
536            var comment = (Comment)sender;
537            if (comment.IsApproved)
538            {
539                AddItem(comment);
540            }
541        }
542
543        /// <summary>
544        /// Handles the Saved event of the Post control.
545        /// </summary>
546        /// <param name="sender">The source of the event.</param>
547        /// <param name="e">The <see cref="BlogEngine.Core.SavedEventArgs"/> instance containing the event data.</param>
548        private static void Post_Saved(object sender, SavedEventArgs e)
549        {
550            lock (_syncRoot)
551            {
552                if (e.Action == SaveAction.Insert)
553                {
554                    AddItem(sender as Post);
555                }
556                else
557                {
558                    BuildCatalog();
559                }
560            }
561        }
562
563        /// <summary>
564        /// A converter delegate used for converting Results to Posts.
565        /// </summary>
566        /// <param name="result">The IPublishable result.</param>
567        /// <returns>An IPublishable.</returns>
568        private static IPublishable ResultToPost(Result result)
569        {
570            return result.Item;
571        }
572
573        /// <summary>
574        /// The sort dictionary.
575        /// </summary>
576        /// <param name="dic">
577        /// The dictionary of string keys with float values.
578        /// </param>
579        /// <returns>
580        /// A dictionary of string keys with float values.
581        /// </returns>
582        private static Dictionary<string, float> SortDictionary(Dictionary<string, float> dic)
583        {
584            var list = dic.Keys.Select(key => new KeyValuePair<string, float>(key, dic[key])).ToList();
585
586            list.Sort((obj1, obj2) => obj2.Value.CompareTo(obj1.Value));
587
588            return list.ToDictionary(pair => pair.Key, pair => pair.Value);
589        }
590
591        #endregion
592    }
593
594    #region Entry and Result structs
595
596    /// <summary>
597    /// A search-optimized catalog entry: a publishable item with its title and content cleansed of HTML and stop words.
598    /// </summary>
599    internal struct Entry
600    {
601        #region Constants and Fields
602
603        /// <summary>
604        ///     The content of the item, cleansed of stop words and HTML.
605        /// </summary>
606        internal string Content;
607
608        /// <summary>
609        ///     The publishable item this entry was built from.
610        /// </summary>
611        internal IPublishable Item;
612
613        /// <summary>
614        ///     The title of the item, cleansed of stop words.
615        /// </summary>
616        internal string Title;
617
618        #endregion
619    }
620
621    /// <summary>
622    /// A result is a search result which contains a post and its ranking.
623    /// </summary>
624    internal class Result : IComparable<Result>
625    {
626        #region Constants and Fields
627
628        /// <summary>
629        ///     The post of the result.
630        /// </summary>
631        internal IPublishable Item;
632
633        /// <summary>
634        ///     The rank of the post based on the search term. The higher the rank, the higher the post is in the result set.
635        /// </summary>
636        internal int Rank;
637
638        #endregion
639
640        #region Public Methods
641
642        /// <summary>
643        /// Returns a hash code for this instance.
644        /// </summary>
645        /// <returns>
646        /// A hash code for this instance, suitable for use in hashing algorithms and data structures like a hash table. 
647        /// </returns>
648        public override int GetHashCode()
649        {
650            return this.Item.Id.GetHashCode();
651        }
652
653        #endregion
654
655        #region Implemented Interfaces
656
657        #region IComparable<Result>
658
659        /// <summary>
660        /// Compares the current object with another object of the same type.
661        /// </summary>
662        /// <param name="other">
663        /// An object to compare with this object.
664        /// </param>
665        /// <returns>
666        /// A 32-bit signed integer that indicates the relative order of the objects being compared. The return value 
667        ///     has the following meanings: Value Meaning Less than zero This object is less than the other parameter.Zero 
668        ///     This object is equal to other. Greater than zero This object is greater than other.
669        /// </returns>
670        public int CompareTo(Result other)
671        {
672            return other.Rank.CompareTo(this.Rank);
673        }
674
675        #endregion
676
677        #endregion
678    }
679
680    #endregion
681}