ContentAnalyzer.cs - Copyright (c) DevRain 2011. All rights…

/Sources/DevRain.Data.Extracting.Features/ContentAnalyzer.cs

# · C# · 309 lines · 221 code · 45 blank · 43 comment · 34 complexity · ba143dcc0150ea2120ca5786e63a5e66 MD5 · raw file

// ============================================================================

// <copyright file="ContentAnalyzer.cs" company="DevRain">

//     Copyright (c) DevRain 2011. All rights reserved.

// </copyright>

// <author>Oleksandr Krakovetskyi</author>

// <date>11.02.2011</date>

// ============================================================================



namespace DevRain.Data.Extracting.Features

{

    using System;

    using System.Collections.Generic;

    using System.Linq;

    using DevRain.Data.Extracting;



    /// <summary>
    /// ContentAnalyzer is used for content analysis of an HTML document:
    /// it exposes the words, sentences, stop words, link words and list words of the
    /// underlying <see cref="HtmlProcessor"/>, plus a keyword SEO rank and a
    /// sentence-based document uniqueness measure.
    /// </summary>
    public class ContentAnalyzer : HtmlProcessor
    {
        /// <summary>
        /// Weights used by <see cref="GetSeoRank"/>. Only indices 0, 1, 3, 4, 5, 6, 10, 11, 12,
        /// 15, 16 and 17 are read; index 2 is reserved for a URL/host check that is
        /// currently disabled. The table length also feeds the final normalisation,
        /// so entries must not be removed.
        /// </summary>
        private static readonly double[] SeoRates = new double[]
        {
            0.66, // [0]  title contains keyword
            0.63, // [1]  title starts with keyword
            0.60, // [2]  (disabled) keyword is part of the URL host
            0.49, // [3]  any header contains keyword
            0.47, // [4]  any link text contains keyword
            0.45, // [5]  first header starting with keyword is an H1
            0.45, // [6]  keyword occurrence within the first 100 words
            0.42,
            0.38,
            0.37,
            0.35, // [10] first header starting with keyword is not an H1
            0.33, // [11] any image "alt" contains keyword
            0.33, // [12] any image "src" contains keyword
            0.26,
            0.25,
            0.23, // [15] keyword is a word of a list item
            0.22, // [16] meta description contains keyword
            0.21, // [17] meta keywords contain keyword
            0.19,
            0.12,
            0.06,
            0.05
        };

        /// <summary>
        /// Initializes a new instance of the <see cref="ContentAnalyzer"/> class,
        /// passing <paramref name="html"/> to the <see cref="HtmlProcessor"/> base constructor.
        /// </summary>
        /// <param name="html">Value forwarded to the base constructor.</param>
        public ContentAnalyzer(string html)
            : base(html)
        {
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="ContentAnalyzer"/> class,
        /// passing <paramref name="uri"/> to the <see cref="HtmlProcessor"/> base constructor.
        /// </summary>
        /// <param name="uri">Value forwarded to the base constructor.</param>
        public ContentAnalyzer(Uri uri)
            : base(uri)
        {
        }

        /// <summary>
        /// Gets collection of words of the inner text (recomputed on every access).
        /// </summary>
        public List<string> Words
        {
            get
            {
                return this.InnerText.SplitToWords();
            }
        }

        /// <summary>
        /// Gets list of sentences of the inner text. Only fragments with more than 2 and
        /// fewer than 30 words are kept. The text is re-parsed on every access, and each
        /// access sets <see cref="WordsInSentencesCount"/> to the word total of the
        /// returned sentences.
        /// </summary>
        public List<string> Sentences
        {
            get
            {
                var sentences = new List<string>();

                // Accumulated locally and assigned once at the end, so repeated reads of this
                // property do not keep inflating WordsInSentencesCount.
                int wordsInSentences = 0;

                string text = this.InnerText.TrimSafe() + " ";

                // Drop the dot of common abbreviations so they are not taken for sentence ends.
                text = text.Replace("Dr.", "Dr").Replace("Ms.", "Ms").Replace("Mr.", "Mr").Replace("Dept.", "Dept");

                // Line breaks become an explicit marker; sentences never span lines.
                text = text.Replace(System.Environment.NewLine, " ~~ ");

                string[] lines = text.Split(new string[] { " ~~ " }, StringSplitOptions.RemoveEmptyEntries);

                foreach (string line in lines)
                {
                    // Lines without any terminator contribute no sentences.
                    if (line.IndexOfAny(new string[] { "...", ".", "!", "?" }) == -1)
                    {
                        continue;
                    }

                    string[] candidates = line.Split(
                        new string[] { "... ", ". ", "! ", "? " }, StringSplitOptions.RemoveEmptyEntries);

                    foreach (string candidate in candidates)
                    {
                        int wordsCount = candidate.SplitToWords().Count;

                        if (wordsCount > 2 && wordsCount < 30)
                        {
                            sentences.Add(candidate);
                            wordsInSentences += wordsCount;
                        }
                    }
                }

                this.WordsInSentencesCount = wordsInSentences;
                return sentences;
            }
        }

        /// <summary>
        /// Gets average sentence length in words (only words which are part of sentences
        /// are taken). Returns 0 when there are no sentences.
        /// </summary>
        public double SentencesAvgLength
        {
            get
            {
                // Read Sentences exactly once: the getter re-parses the text and refreshes
                // WordsInSentencesCount, which is read right after.
                List<string> sentences = this.Sentences;

                if (sentences.Count == 0)
                {
                    return 0;
                }

                return (double)this.WordsInSentencesCount / (double)sentences.Count;
            }
        }

        /// <summary>
        /// Gets list of known stop words that occur in the inner text.
        /// Matching is a case-insensitive substring test, not a whole-word test.
        /// </summary>
        public List<string> StopWords
        {
            get
            {
                List<string> stopWordsList = Extensions.GetStopWords();
                var stopWords = new List<string>();

                // Upper-case the text once instead of once per stop word; a null text is
                // treated as empty.
                string upperText = (this.InnerText ?? string.Empty).ToUpperInvariant();

                foreach (string stopWord in stopWordsList)
                {
                    if (upperText.Contains(stopWord.ToUpperInvariant()))
                    {
                        stopWords.Add(stopWord);
                    }
                }

                return stopWords;
            }
        }

        /// <summary>
        /// Gets the words which are part of link texts.
        /// </summary>
        public List<string> WordsAsLinks
        {
            get
            {
                List<string> words = new List<string>();

                foreach (var link in this.Links)
                {
                    words.AddRange(link.InnerText.SplitToWords());
                }

                return words;
            }
        }

        /// <summary>
        /// Gets words in lists ("li").
        /// </summary>
        public List<string> WordsAsLists
        {
            get
            {
                List<string> words = new List<string>();

                foreach (var li in this.GetElementsByTagName("li"))
                {
                    words.AddRange(li.InnerText.SplitToWords());
                }

                return words;
            }
        }

        /// <summary>
        /// Gets the total number of words in the sentences returned by the most recent
        /// evaluation of <see cref="Sentences"/>.
        /// </summary>
        public int WordsInSentencesCount { get; protected set; }

        /// <summary>
        /// Gets document content uniqueness.
        /// </summary>
        /// <param name="documents">List of all documents.</param>
        /// <param name="index">Index of document to be checked.</param>
        /// <param name="minimumSentenseLength">Minimum sentence length (in characters, exclusive) to be taken.</param>
        /// <returns>
        /// Calculated document uniqueness in range [0; 1]: the share of qualifying sentences
        /// of the checked document that do not occur in any other document. Returns 1 when
        /// the checked document has no qualifying sentences.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="documents"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside the list.</exception>
        public static double GetDocumentUniqueness(List<string> documents, int index, int minimumSentenseLength)
        {
            if (documents == null)
            {
                throw new ArgumentNullException("documents");
            }

            if (index < 0 || index >= documents.Count)
            {
                throw new ArgumentOutOfRangeException("index");
            }

            // Materialise once: Sentences re-parses the document on every access.
            List<string> candidates = new ContentAnalyzer(documents[index]).Sentences
                .Where(s => s.Length > minimumSentenseLength)
                .ToList();

            if (candidates.Count == 0)
            {
                // Nothing to compare, so nothing is duplicated (avoids 0 / 0 = NaN).
                return 1.0;
            }

            // Pool the sentences of every other document for constant-time lookups.
            var otherSentences = new HashSet<string>();

            for (int i = 0; i < documents.Count; i++)
            {
                if (i == index)
                {
                    continue;
                }

                otherSentences.UnionWith(new ContentAnalyzer(documents[i]).Sentences);
            }

            // Each candidate is counted at most once, however many other documents repeat
            // it, which keeps the result inside [0; 1].
            int duplicated = candidates.Count(s => otherSentences.Contains(s));

            return (double)(candidates.Count - duplicated) / (double)candidates.Count;
        }

        /// <summary>
        /// Calculates a SEO rank of the document for the given keyword by summing the
        /// <see cref="SeoRates"/> weights of every criterion the keyword satisfies
        /// (title, headers, links, first 100 words, image attributes, list items,
        /// meta description and meta keywords). All comparisons ignore case.
        /// </summary>
        /// <param name="keyword">Keyword to rank the document for.</param>
        /// <returns>Sum of the matched weights divided by a fixed normalisation constant.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="keyword"/> is null.</exception>
        public double GetSeoRank(string keyword)
        {
            if (keyword == null)
            {
                throw new ArgumentNullException("keyword");
            }

            double rank = 0.0;

            // Title contains keyword
            if (this.Title.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) != -1)
            {
                rank += SeoRates[0];
            }

            // Keyword is first in Title (case-insensitive, like every other check)
            if (this.Title.StartsWith(keyword, StringComparison.OrdinalIgnoreCase))
            {
                rank += SeoRates[1];
            }

            // Any header contains keyword
            foreach (var header in this.Headers)
            {
                if (header.InnerText.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) != -1)
                {
                    rank += SeoRates[3];
                    break;
                }
            }

            // Any link text contains keyword
            foreach (var link in this.Links)
            {
                if (link.InnerText.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) != -1)
                {
                    rank += SeoRates[4];
                    break;
                }
            }

            // First header starting with keyword: an H1 scores higher than any other level
            foreach (var header in this.Headers)
            {
                if (header.InnerText.StartsWith(keyword, StringComparison.OrdinalIgnoreCase))
                {
                    bool isH1 = string.Equals(header.TagName, "H1", StringComparison.OrdinalIgnoreCase);
                    rank += isH1 ? SeoRates[5] : SeoRates[10];
                    break;
                }
            }

            // Every occurrence of the keyword within the first 100 words adds its weight.
            // NOTE(review): unlike the other criteria this one is not capped at a single
            // hit - confirm that is intended.
            foreach (var word in this.Words.Take(100))
            {
                if (word.Equals(keyword, StringComparison.OrdinalIgnoreCase))
                {
                    rank += SeoRates[6];
                }
            }

            // Any image "alt" contains keyword.
            // NOTE(review): the null guard assumes Attributes[...] yields null for a
            // missing attribute - confirm against HtmlProcessor.
            foreach (var image in this.Images)
            {
                var alt = image.Attributes["alt"];

                if (alt != null && alt.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) != -1)
                {
                    rank += SeoRates[11];
                    break;
                }
            }

            // Any image "src" contains keyword (kept as a separate pass so the order in
            // which weights are summed stays fixed).
            foreach (var image in this.Images)
            {
                var src = image.Attributes["src"];

                if (src != null && src.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) != -1)
                {
                    rank += SeoRates[12];
                    break;
                }
            }

            // Keyword is a word of a list item
            foreach (var word in this.WordsAsLists)
            {
                if (word.Equals(keyword, StringComparison.OrdinalIgnoreCase))
                {
                    rank += SeoRates[15];
                    break;
                }
            }

            if (this.MetaDescription.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) != -1)
            {
                rank += SeoRates[16];
            }

            if (this.MetaKeywords.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) != -1)
            {
                rank += SeoRates[17];
            }

            // Normalisation divisor kept as-is: table length minus 9 (i.e. 13).
            // NOTE(review): the rationale for the constant is not documented.
            return rank / (SeoRates.Length - 9);
        }
    }

}