PageRenderTime 83ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/Abot.Tests.Unit/Core/HyperlinkParserTest.cs

https://gitlab.com/kokeiro001/abot
C# | 566 lines | 433 code | 130 blank | 3 comment | 0 complexity | 0760e9b88155cb266b86f72d1e9a991a MD5 | raw file
  1. using Abot.Core;
  2. using Abot.Poco;
  3. using Commoner.Core.Testing;
  4. using NUnit.Framework;
  5. using System;
  6. using System.Collections.Generic;
  7. using System.Collections.Specialized;
  8. using System.Linq;
  9. using System.Net;
  10. namespace Abot.Tests.Unit.Core
  11. {
  /// <summary>
  /// Implementation-agnostic NUnit test suite for <see cref="HyperLinkParser"/>.
  /// Concrete fixtures derive from this class and supply the parser under test
  /// through <see cref="GetInstance"/>; every test here then runs against that parser.
  /// </summary>
  [TestFixture]
  public abstract class HyperLinkParserTest
  {
      /// <summary>Two relative anchors shared by the robots/nofollow/delegate tests.</summary>
      private const string TwoRelativeLinksHtml = "<a href=\"/aaa/a.html\" ></a><a href=\"/bbb/b.html\" /></a>";

      /// <summary>Fake page address that relative links are expected to resolve against.</summary>
      private readonly Uri _uri = new Uri("http://a.com/");

      private HyperLinkParser _unitUnderTest;
      private CrawledPage _crawledPage;

      /// <summary>
      /// Creates the parser implementation to be exercised by this suite.
      /// </summary>
      /// <param name="isRespectMetaRobotsNoFollowEnabled">When true, the tests expect no links from a page whose meta robots content is "nofollow" or "none".</param>
      /// <param name="isRespectAnchorRelNoFollowEnabled">When true, the tests expect anchors carrying rel="nofollow" to be skipped.</param>
      /// <param name="cleanUrlDelegate">Optional function applied to each discovered url before it is returned; null for no rewriting.</param>
      /// <param name="isRespectUrlNamedAnchorOrHashbangEnabled">When true, the tests expect url fragments to be preserved; when false, links differing only by fragment collapse into one.</param>
      /// <param name="isRespectHttpXRobotsTagHeaderNoFollow">When true, the tests expect no links when the response carries an X-Robots-Tag header of "nofollow" or "none".</param>
      /// <returns>The parser instance the tests will call.</returns>
      protected abstract HyperLinkParser GetInstance(bool isRespectMetaRobotsNoFollowEnabled, bool isRespectAnchorRelNoFollowEnabled, Func<string, string> cleanUrlDelegate, bool isRespectUrlNamedAnchorOrHashbangEnabled, bool isRespectHttpXRobotsTagHeaderNoFollow);

      /// <summary>
      /// Builds a fresh crawled page and a default parser (all options off) before each test.
      /// </summary>
      [SetUp]
      public void Setup()
      {
          // NOTE(review): this issues a real request, so the suite presumably needs
          // something answering on localhost.fiddler:1111 - confirm before running in CI.
          _crawledPage = new PageRequester(new CrawlConfiguration()).MakeRequest(new Uri("http://localhost.fiddler:1111/"));

          // Make the real request above look like it came from the fake uri.
          _crawledPage.ParentUri = _uri;
          _crawledPage.HttpWebRequest = (HttpWebRequest)WebRequest.Create(_uri);

          _unitUnderTest = GetInstance(false, false, null, false, false);
      }

      /// <summary>
      /// Sets <paramref name="html"/> as the crawled page's text, runs the current parser
      /// over the page and returns the links materialised into a list, so that callers can
      /// count and index them without re-enumerating the parser's result.
      /// </summary>
      /// <param name="html">Markup to place in the crawled page's content.</param>
      /// <returns>The non-null list of links produced by the parser.</returns>
      private IList<Uri> ParseLinks(string html)
      {
          _crawledPage.Content.Text = html;

          IEnumerable<Uri> links = _unitUnderTest.GetLinks(_crawledPage);
          Assert.IsNotNull(links);

          return links.ToList();
      }

      /// <summary>
      /// Asserts that <paramref name="links"/> contains exactly the expected absolute uris, in order.
      /// </summary>
      /// <param name="links">Links returned by the parser.</param>
      /// <param name="expectedAbsoluteUris">Expected <see cref="Uri.AbsoluteUri"/> values, in order.</param>
      private static void AssertAbsoluteUris(IList<Uri> links, params string[] expectedAbsoluteUris)
      {
          Assert.AreEqual(expectedAbsoluteUris.Length, links.Count);

          for (int i = 0; i < expectedAbsoluteUris.Length; i++)
          {
              Assert.AreEqual(expectedAbsoluteUris[i], links[i].AbsoluteUri);
          }
      }

      [Test]
      public void GetLinks_AnchorTags_ReturnsLinks()
      {
          IList<Uri> links = ParseLinks("<a href=\"http://aaa.com/\" ></a><a href=\"/aaa/a.html\" /></a>");

          AssertAbsoluteUris(links, "http://aaa.com/", "http://a.com/aaa/a.html");
      }

      [Test]
      public void GetLinks_AreaTags_ReturnsLinks()
      {
          IList<Uri> links = ParseLinks("<area href=\"http://bbb.com\" /><area href=\"bbb/b.html\" />");

          AssertAbsoluteUris(links, "http://bbb.com/", "http://a.com/bbb/b.html");
      }

      [Test]
      public void GetLinks_AnchorTagsUpperCase_ReturnsLinks()
      {
          IList<Uri> links = ParseLinks("<A HREF=\"http://aaa.com/\" ></A><A HREF=\"/aaa/a.html\" /></A>");

          AssertAbsoluteUris(links, "http://aaa.com/", "http://a.com/aaa/a.html");
      }

      [Test]
      public void GetLinks_AreaTagsUpperCase_ReturnsLinks()
      {
          IList<Uri> links = ParseLinks("<AREA HREF=\"http://bbb.com\" /><AREA HREF=\"bbb/b.html\" />");

          AssertAbsoluteUris(links, "http://bbb.com/", "http://a.com/bbb/b.html");
      }

      [Test]
      public void GetLinks_NoLinks_NotReturned()
      {
          IList<Uri> links = ParseLinks("<html></html>");

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_AnyScheme_Returned()
      {
          IList<Uri> links = ParseLinks("<a href=\"mailto:aaa@gmail.com\" /><a href=\"tel:+123456789\" /><a href=\"callto:+123456789\" /><a href=\"ftp://user@yourdomainname.com/\" /><a href=\"file:///C:/Users/\" />");

          AssertAbsoluteUris(
              links,
              "mailto:aaa@gmail.com",
              "tel:+123456789",
              "callto:+123456789",
              "ftp://user@yourdomainname.com/",
              "file:///C:/Users/");
      }

      [Test]
      public void GetLinks_InvalidFormatUrl_NotReturned()
      {
          IList<Uri> links = ParseLinks("<a href=\"http://////\" />");

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_LinksInComments_NotReturned()
      {
          IList<Uri> links = ParseLinks(@"<html>
<head>
<!--
<a href='http://a1.com' />
<area href='http://a2.com' />
-->
</head>
<body>
<!--
<a href='http://b1.com' />
<area href='http://b2.com' />
-->
</body>
</html>");

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_LinksInScript_NotReturned()
      {
          IList<Uri> links = ParseLinks(@"<html>
<head>
<script>
<a href='http://a1.com' />
<area href='http://a2.com' />
</script>
</head>
<body>
<script>
<a href='http://b1.com' />
<area href='http://b2.com' />
</script>
</body>
</html>");

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_LinksInStyleTag_NotReturned()
      {
          IList<Uri> links = ParseLinks(@"<html>
<head>
<style>
<a href='http://a1.com' />
<area href='http://a2.com' />
</style>
</head>
<body>
<style>
<a href='http://b1.com' />
<area href='http://b2.com' />
</style>
</body>
</html>");

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_DuplicateLinks_ReturnsOnlyOne()
      {
          IList<Uri> links = ParseLinks("<a href=\"/aaa/a.html\" ></a><a href=\"/aaa/a.html\" /></a>");

          AssertAbsoluteUris(links, "http://a.com/aaa/a.html");
      }

      [Test]
      public void GetLinks_NamedAnchorsOrHashbangs_Ignores()
      {
          // With fragments ignored (the default from Setup) the four anchors collapse to two distinct urls.
          IList<Uri> links = ParseLinks("<a href=\"/aaa/a.html\" ></a><a href=\"/aaa/a.html#top\" ></a><a href=\"/aaa/a.html#bottom\" /></a><a href=\"/aaa/a.html/#someaction/someid\" /></a>");

          AssertAbsoluteUris(links, "http://a.com/aaa/a.html", "http://a.com/aaa/a.html/");
      }

      [Test]
      public void GetLinks_NamedAnchorsOrHashbangs_Enabled_ReturnsLinks()
      {
          _unitUnderTest = GetInstance(false, false, null, true, false);

          IList<Uri> links = ParseLinks("<a href=\"/aaa/a.html\" ></a><a href=\"/aaa/a.html#top\" ></a><a href=\"/aaa/a.html#bottom\" /></a><a href=\"/aaa/a.html/#someaction/someid\" /></a>");

          AssertAbsoluteUris(
              links,
              "http://a.com/aaa/a.html",
              "http://a.com/aaa/a.html#top",
              "http://a.com/aaa/a.html#bottom",
              "http://a.com/aaa/a.html/#someaction/someid");
      }

      [Test]
      public void GetLinks_EmptyHtml()
      {
          IList<Uri> links = ParseLinks("");

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_WhiteSpaceHtml()
      {
          IList<Uri> links = ParseLinks(" ");

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_ValidBaseTagPresent_ReturnsRelativeLinksUsingBase()
      {
          IList<Uri> links = ParseLinks("<base href=\"http://bbb.com\"><a href=\"http://aaa.com/\" ></a><a href=\"/aaa/a.html\" /></a>");

          AssertAbsoluteUris(links, "http://aaa.com/", "http://bbb.com/aaa/a.html");
      }

      [Test]
      public void GetLinks_RelativeBaseTagPresent_ReturnsRelativeLinksPageUri()
      {
          IList<Uri> links = ParseLinks("<base href=\"/images\"><a href=\"http://aaa.com/\" ></a><a href=\"/aaa/a.html\" /></a>");

          AssertAbsoluteUris(links, "http://aaa.com/", "http://a.com/aaa/a.html");
      }

      [Test]
      public void GetLinks_InvalidBaseTagPresent_ReturnsRelativeLinksPageUri()
      {
          IList<Uri> links = ParseLinks("<base href=\"http:http://http:\"><a href=\"http://aaa.com/\" ></a><a href=\"/aaa/a.html\" /></a>");

          AssertAbsoluteUris(links, "http://aaa.com/", "http://a.com/aaa/a.html");
      }

      [Test]
      public void GetLinks_BaseTagNoScheme_ParentPageHttp_AddsParentPageScheme()
      {
          _crawledPage.Uri = new Uri("http://aaa.com/");//http

          IList<Uri> links = ParseLinks("<base href=\"//aaa.com\"><a href=\"/aaa/a.html\" ></a>");

          AssertAbsoluteUris(links, "http://aaa.com/aaa/a.html");
      }

      [Test]
      public void GetLinks_BaseTagNoScheme_ParentPageHttps_AddsParentPageScheme()
      {
          _crawledPage.Uri = new Uri("https://aaa.com/");//https

          IList<Uri> links = ParseLinks("<base href=\"//aaa.com\"><a href=\"/aaa/a.html\" ></a>");

          AssertAbsoluteUris(links, "https://aaa.com/aaa/a.html");
      }

      [Test]
      public void GetLinks_NullCrawledPage()
      {
          // Assert.Throws pins the exception to this exact call and, unlike
          // [ExpectedException], is available in both NUnit 2.5+ and NUnit 3.
          Assert.Throws<ArgumentNullException>(() => _unitUnderTest.GetLinks(null));
      }

      [Test]
      public void GetLinks_ResponseUriDiffFromRequestUri_UsesResponseUri()
      {
          // This sets the Address property's backing field, which does not have a public set method.
          ValueHelper.SetFieldValue(_crawledPage.HttpWebRequest, "_Uri", new Uri("http://zzz.com/"));

          IList<Uri> links = ParseLinks("<a href=\"/aaa/a.html\" ></a><a href=\"/bbb/b.html\" /></a>");

          AssertAbsoluteUris(links, "http://zzz.com/aaa/a.html", "http://zzz.com/bbb/b.html");
      }

      [Test]
      public void GetLinks_HtmlEncodedHref_UrlDecodes()
      {
          // The href carries an html-encoded ampersand; the returned uri is expected to have it decoded.
          IList<Uri> links = ParseLinks("<a href=\"http://a.com/search?rls=en&amp;q=stack+overflow\" ></a>");

          AssertAbsoluteUris(links, "http://a.com/search?rls=en&q=stack+overflow");
      }

      [Test]
      public void GetLinks_MetaNoIndexNoFollowNotSet_ReturnsLinks()
      {
          _unitUnderTest = GetInstance(false, false, null, false, false);

          IList<Uri> links = ParseLinks("<meta name=\"robots\" content=\"noindex, nofollow\" />" + TwoRelativeLinksHtml);

          Assert.AreEqual(2, links.Count);
      }

      [Test]
      public void GetLinks_MetaNoIndexNoFollow_ReturnsEmptyList()
      {
          _unitUnderTest = GetInstance(true, false, null, false, false);

          IList<Uri> links = ParseLinks("<meta name=\"robots\" content=\"noindex, nofollow\" />" + TwoRelativeLinksHtml);

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_MetaNoIndexNoFollowUpperCase_ReturnsEmptyList()
      {
          _unitUnderTest = GetInstance(true, false, null, false, false);

          IList<Uri> links = ParseLinks("<META NAME=\"ROBOTS\" CONTENT=\"NOINDEX, NOFOLLOW\" />" + TwoRelativeLinksHtml);

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_MetaNoIndexNoFollowUsingNone_ReturnsEmptyList()
      {
          _unitUnderTest = GetInstance(true, false, null, false, false);

          IList<Uri> links = ParseLinks("<meta name=\"robots\" content=\"none\" />" + TwoRelativeLinksHtml);

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_MetaNoIndexNoFollowUsingNoneUpperCase_ReturnsEmptyList()
      {
          _unitUnderTest = GetInstance(true, false, null, false, false);

          IList<Uri> links = ParseLinks("<META NAME=\"ROBOTS\" CONTENT=\"NONE\" />" + TwoRelativeLinksHtml);

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_MetaNoFollow_ReturnsEmptyList()
      {
          _unitUnderTest = GetInstance(true, false, null, false, false);

          IList<Uri> links = ParseLinks("<meta name=\"robots\" content=\"nofollow\" />" + TwoRelativeLinksHtml);

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_MetaNoIndex_ReturnsLinks()
      {
          _unitUnderTest = GetInstance(true, false, null, false, false);

          IList<Uri> links = ParseLinks("<meta name=\"robots\" content=\"noindex\" />" + TwoRelativeLinksHtml);

          Assert.AreEqual(2, links.Count);
      }

      [Test]
      public void GetLinks_HttpXRobotsTagHeaderNoIndexNoFollow_ReturnsEmptyList()
      {
          _crawledPage.HttpWebResponse.Headers.Add(new NameValueCollection() { { "X-Robots-Tag", "noindex, nofollow" } });
          _unitUnderTest = GetInstance(false, false, null, false, true);

          IList<Uri> links = ParseLinks(TwoRelativeLinksHtml);

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_HttpXRobotsTagHeaderNoIndexNoFollowUpperCase_ReturnsEmptyList()
      {
          _crawledPage.HttpWebResponse.Headers.Add(new NameValueCollection() { { "X-Robots-Tag", "NOINDEX, NOFOLLOW" } });
          _unitUnderTest = GetInstance(false, false, null, false, true);

          IList<Uri> links = ParseLinks(TwoRelativeLinksHtml);

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_HttpXRobotsTagHeaderNoIndexNoFollowUsingNone_ReturnsEmptyList()
      {
          _crawledPage.HttpWebResponse.Headers.Add(new NameValueCollection() { { "X-Robots-Tag", "none" } });
          _unitUnderTest = GetInstance(false, false, null, false, true);

          IList<Uri> links = ParseLinks(TwoRelativeLinksHtml);

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_HttpXRobotsTagHeaderNoIndexNoFollowUsingNoneUpperCase_ReturnsEmptyList()
      {
          _crawledPage.HttpWebResponse.Headers.Add(new NameValueCollection() { { "X-Robots-Tag", "NONE" } });
          _unitUnderTest = GetInstance(false, false, null, false, true);

          IList<Uri> links = ParseLinks(TwoRelativeLinksHtml);

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_HttpXRobotsNoFollow_ReturnsEmptyList()
      {
          _crawledPage.HttpWebResponse.Headers.Add(new NameValueCollection() { { "X-Robots-Tag", "nofollow" } });
          _unitUnderTest = GetInstance(false, false, null, false, true);

          IList<Uri> links = ParseLinks(TwoRelativeLinksHtml);

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_HttpXRobotsTagHeaderNoIndex_ReturnsLinks()
      {
          _crawledPage.HttpWebResponse.Headers.Add(new NameValueCollection() { { "X-Robots-Tag", "noindex" } });
          _unitUnderTest = GetInstance(false, false, null, false, true);

          IList<Uri> links = ParseLinks(TwoRelativeLinksHtml);

          Assert.AreEqual(2, links.Count);
      }

      [Test]
      public void GetLinks_RelNoFollow_NotReturned()
      {
          _unitUnderTest = GetInstance(false, true, null, false, false);

          IList<Uri> links = ParseLinks("<a href=\"/aaa/a.html\" rel=\"nofollow\"></a><a href=\"/bbb/b.html\" rel=\"nofollow\" /></a>");

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_RelNoFollowUpperCase_NotReturned()
      {
          _unitUnderTest = GetInstance(false, true, null, false, false);

          IList<Uri> links = ParseLinks("<a href=\"/aaa/a.html\" REL=\"NOFOLLOW\"></a><a href=\"/bbb/b.html\" REL=\"NOFOLLOW\" /></a>");

          Assert.AreEqual(0, links.Count);
      }

      [Test]
      public void GetLinks_CleanUrlDelegateSet_ReturnsCleanLinks()
      {
          // The delegate rewrites every 'a' to 'x' and 'b' to 'y', host included.
          _unitUnderTest = GetInstance(false, false, (x) => x.Replace("a", "x").Replace("b", "y"), false, false);

          IList<Uri> links = ParseLinks(TwoRelativeLinksHtml);

          AssertAbsoluteUris(links, "http://x.com/xxx/x.html", "http://x.com/yyy/y.html");
      }

      [Test] //https://github.com/sjdirect/abot/issues/15
      public void GetLinks_ColonInUrl_DoesNotThrowException()
      {
          IList<Uri> links = ParseLinks("<a href=\"http://www.gamespot.com/pc/rpg/numen/index.html?om_act=convert&om_clk=tabs&tag=tabs;summary\" ></a>");

          AssertAbsoluteUris(links, "http://www.gamespot.com/pc/rpg/numen/index.html?om_act=convert&om_clk=tabs&tag=tabs;summary");
      }

      // "Conical" is a typo for "Canonical"; the name is kept so existing test filters keep matching.
      [Test]
      public void GetLinks_LinkRelConical_ReturnsLink()
      {
          IList<Uri> links = ParseLinks("<html><head><link rel=\"canonical\" href=\"http://a.com/page1\" /></head><body><a href=\"http://a.com/page2\"></a></body></html>");

          // The body anchor is expected first, followed by the canonical link from the head.
          AssertAbsoluteUris(links, "http://a.com/page2", "http://a.com/page1");
      }
  }
  436. }