/Source/Shrinkr.Core/Infrastructure/Spam/Google/Lookup.cs

http://shrinkr.codeplex.com · C# · 243 lines · 151 code · 51 blank · 41 comment · 26 complexity · f3d375970e4f372e97ca751bfd822967 MD5 · raw file

  1. // This class is copied from Subkismet
  2. namespace Shrinkr.Infrastructure
  3. {
  4. using System;
  5. using System.Collections.Generic;
  6. using System.Linq;
  7. using System.Text.RegularExpressions;
  /// <summary>
  /// Builds the list of host/path expressions that are looked up for a URL
  /// (per the original comment, following the Google lookup spec): the exact
  /// hostname, shorter hostnames obtained by dropping leading labels, the exact
  /// path with and without its query string, and the leading directory paths.
  /// </summary>
  internal static class Lookup
  {
      // Schemes that are stripped before building lookup keys. Compared
      // case-insensitively, so "HTTP://" and "Https://" are handled as well.
      private static readonly string[] schemePrefixes = { "http://", "https://" };

      // Tokenizes a hostname into its labels (runs of letters, digits and '-').
      // NOTE(review): a port number or the octets of an IPv4 address are
      // tokenized like ordinary labels; confirm whether callers canonicalize first.
      private static readonly Regex fourHostNameExpression = new Regex(@"[a-zA-Z0-9-]+", RegexOptions.Compiled);

      // Matches one "/segment" of a path (a slash followed by non-slash characters).
      private static readonly Regex fourPathExpression = new Regex(@"/[^/]+", RegexOptions.Compiled);

      /// <summary>
      /// Returns a list of matched URLs for lookup process.
      /// </summary>
      /// <param name="url">String value of the incoming URL, with or without an http/https scheme.</param>
      /// <returns>
      /// A duplicate-free list, in this order: exact hostname, shortened hostnames,
      /// exact path (query string kept), exact path without query string, leading
      /// directory paths.
      /// </returns>
      /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
      public static IList<string> GetUrls(string url)
      {
          if (url == null)
          {
              throw new ArgumentNullException("url");
          }

          string hostname = GetExactHostname(url);
          string urlWithoutParameters = GetExactPathWithoutParameters(url);

          List<string> results = new List<string>();
          results.Add(hostname);
          results.AddRange(GetFourHostNames(hostname));
          results.Add(GetExactPath(url));
          results.Add(urlWithoutParameters);
          results.AddRange(GetFourPaths(hostname, urlWithoutParameters));

          return RemoveDuplicates(results);
      }

      /// <summary>
      /// Removes a leading "http://" or "https://" (any letter case) from the URL.
      /// </summary>
      /// <param name="url">The URL.</param>
      /// <returns>The URL without its scheme, or the input unchanged when it has none.</returns>
      private static string StripScheme(string url)
      {
          foreach (string prefix in schemePrefixes)
          {
              if (url.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
              {
                  return url.Substring(prefix.Length);
              }
          }

          return url;
      }

      /// <summary>
      /// Returns the exact hostname for the URL.
      /// </summary>
      /// <param name="url">URL to get its exact hostname.</param>
      /// <returns>Everything between the scheme and the first "/", always terminated with "/".</returns>
      private static string GetExactHostname(string url)
      {
          string withoutScheme = StripScheme(url);
          int firstSlash = withoutScheme.IndexOf("/", StringComparison.Ordinal);

          // No slash at all means the whole remainder is the host.
          string host = firstSlash >= 0 ? withoutScheme.Substring(0, firstSlash) : withoutScheme;

          return host + "/";
      }

      /// <summary>
      /// Returns four hostnames that match the incoming hostname by
      /// removing its components from the left.
      /// </summary>
      /// <param name="hostname">The hostname.</param>
      /// <returns>
      /// A list of up to four hostnames, shortest first, each terminated with "/".
      /// Empty when the hostname has two labels or fewer.
      /// </returns>
      private static IEnumerable<string> GetFourHostNames(string hostname)
      {
          List<string> results = new List<string>();
          MatchCollection labels = fourHostNameExpression.Matches(hostname);
          int count = labels.Count;

          if (count <= 2)
          {
              return results;
          }

          // Start with the last two labels and grow leftwards; the lower bound
          // (count - 6, exclusive) caps the output at four entries.
          for (int start = count - 2; (start > (count - 6)) && (start >= 0); start--)
          {
              string[] components = new string[count - start];
              for (int offset = 0; offset < components.Length; offset++)
              {
                  components[offset] = labels[start + offset].Value;
              }

              results.Add(string.Join(".", components) + "/");
          }

          return results;
      }

      /// <summary>
      /// Returns the exact path for the incoming URL.
      /// </summary>
      /// <param name="url">The URL.</param>
      /// <returns>
      /// The URL without its scheme. When there is no query string and the last
      /// path segment contains no dot, a trailing "/" is appended.
      /// </returns>
      private static string GetExactPath(string url)
      {
          string result = StripScheme(url);

          // A query string is kept verbatim: appending "/" here would alter the
          // last parameter value rather than the path.
          if (result.IndexOf("?", StringComparison.Ordinal) >= 0)
          {
              return result;
          }

          int dotIndex = result.LastIndexOf(".", StringComparison.Ordinal);
          int slashIndex = result.LastIndexOf("/", StringComparison.Ordinal);

          // Last dot before last slash: the final segment has no extension, so
          // it is treated as a directory and terminated with "/".
          if ((dotIndex < slashIndex) && !result.EndsWith("/", StringComparison.Ordinal))
          {
              result = result + "/";
          }

          return result;
      }

      /// <summary>
      /// Returns the exact path for a URL without parameters.
      /// </summary>
      /// <param name="url">The URL.</param>
      /// <returns>The URL without its scheme and without anything from the first "?" onwards.</returns>
      private static string GetExactPathWithoutParameters(string url)
      {
          string result = StripScheme(url);

          // The query starts at the FIRST '?'; later ones belong to the query itself.
          int queryStart = result.IndexOf("?", StringComparison.Ordinal);
          if (queryStart >= 0)
          {
              result = result.Substring(0, queryStart);
          }

          return result;
      }

      /// <summary>
      /// Returns four paths for the hostname and the URL without parameters.
      /// </summary>
      /// <param name="hostname">The hostname, terminated with "/".</param>
      /// <param name="urlWithoutParameters">The URL without scheme and query string.</param>
      /// <returns>
      /// A list of up to four directory paths ("host/a/", "host/a/b/", ...),
      /// shortest first.
      /// </returns>
      private static IEnumerable<string> GetFourPaths(string hostname, string urlWithoutParameters)
      {
          List<string> results = new List<string>();
          MatchCollection segments = fourPathExpression.Matches(urlWithoutParameters);
          int count = segments.Count;

          // Without a trailing slash the last segment is not counted as a directory.
          if (!urlWithoutParameters.EndsWith("/", StringComparison.Ordinal))
          {
              count--;
          }

          if (count <= 0)
          {
              return results;
          }

          // Drop the hostname's trailing "/"; every segment brings its own leading slash.
          string hostPrefix = hostname.Remove(hostname.Length - 1, 1);
          string path = string.Empty;

          for (int index = 0; (index < 4) && (index < count); index++)
          {
              path = path + segments[index].Value;
              results.Add(hostPrefix + path + "/");
          }

          return results;
      }

      /// <summary>
      /// Removes duplicate items from the list.
      /// </summary>
      /// <param name="results">A list of strings with duplicate items.</param>
      /// <returns>A list of strings without duplicate items, first occurrences kept in their original order.</returns>
      private static List<string> RemoveDuplicates(IEnumerable<string> results)
      {
          HashSet<string> seen = new HashSet<string>();
          List<string> finalResults = new List<string>();

          foreach (string item in results)
          {
              // HashSet.Add returns false for an item that is already present.
              if (seen.Add(item))
              {
                  finalResults.Add(item);
              }
          }

          return finalResults;
      }
  }
  192. }