PageRenderTime 52ms CodeModel.GetById 8ms RepoModel.GetById 0ms app.codeStats 0ms

/src/test/java/bixo/fetcher/SimpleHttpFetcherTest.java

http://github.com/sguo/bixo
Java | 435 lines | 348 code | 74 blank | 13 comment | 11 complexity | 1814178e819bad7326c63d5d908c5007 MD5 | raw file
  1. package bixo.fetcher;
  2. import static org.junit.Assert.assertEquals;
  3. import static org.junit.Assert.assertNotNull;
  4. import static org.junit.Assert.assertNull;
  5. import static org.junit.Assert.assertTrue;
  6. import static org.junit.Assert.fail;
  7. import java.io.IOException;
  8. import java.util.HashSet;
  9. import java.util.Set;
  10. import junit.framework.Assert;
  11. import org.apache.http.HttpStatus;
  12. import org.apache.http.conn.HttpHostConnectException;
  13. import org.junit.Test;
  14. import org.mortbay.http.HttpException;
  15. import org.mortbay.http.HttpRequest;
  16. import org.mortbay.http.HttpResponse;
  17. import org.mortbay.http.HttpServer;
  18. import org.mortbay.http.SocketListener;
  19. import org.mortbay.http.handler.AbstractHttpHandler;
  20. import bixo.config.DefaultFetchJobPolicy;
  21. import bixo.config.FetcherPolicy;
  22. import bixo.config.FetcherPolicy.RedirectMode;
  23. import bixo.datum.FetchedDatum;
  24. import bixo.datum.ScoredUrlDatum;
  25. import bixo.exceptions.AbortedFetchException;
  26. import bixo.exceptions.AbortedFetchReason;
  27. import bixo.exceptions.IOFetchException;
  28. import bixo.exceptions.RedirectFetchException;
  29. import bixo.exceptions.RedirectFetchException.RedirectExceptionReason;
  30. import bixo.fetcher.simulation.SimulationWebServer;
  31. import bixo.utils.ConfigUtils;
  32. public class SimpleHttpFetcherTest extends SimulationWebServer {
  33. @SuppressWarnings("serial")
  34. private class RedirectResponseHandler extends AbstractHttpHandler {
  35. private boolean _permanent;
  36. public RedirectResponseHandler() {
  37. this(false);
  38. }
  39. public RedirectResponseHandler(boolean permanent) {
  40. super();
  41. _permanent = permanent;
  42. }
  43. @Override
  44. public void handle(String pathInContext, String pathParams, HttpRequest request, HttpResponse response) throws HttpException, IOException {
  45. if (pathInContext.endsWith("base")) {
  46. if (_permanent) {
  47. // Can't use sendRedirect, as that forces it to be a temp redirect.
  48. response.setStatus(HttpStatus.SC_MOVED_PERMANENTLY);
  49. response.addField("Location", "http://localhost:8089/redirect");
  50. request.setHandled(true);
  51. } else {
  52. response.sendRedirect("http://localhost:8089/redirect");
  53. }
  54. } else {
  55. response.setStatus(HttpStatus.SC_OK);
  56. response.setContentType("text/plain");
  57. String content = "redirected";
  58. response.setContentLength(content.length());
  59. response.getOutputStream().write(content.getBytes());
  60. }
  61. }
  62. }
  63. @SuppressWarnings("serial")
  64. private class LanguageResponseHandler extends AbstractHttpHandler {
  65. private String _englishContent;
  66. private String _foreignContent;
  67. public LanguageResponseHandler(String englishContent, String foreignContent) {
  68. _englishContent = englishContent;
  69. _foreignContent = foreignContent;
  70. }
  71. @Override
  72. public void handle(String pathInContext, String pathParams, HttpRequest request, HttpResponse response) throws HttpException, IOException {
  73. String language = request.getField(HttpHeaderNames.ACCEPT_LANGUAGE);
  74. String content;
  75. if ((language != null) && (language.contains("en"))) {
  76. content = _englishContent;
  77. } else {
  78. content = _foreignContent;
  79. }
  80. response.setStatus(HttpStatus.SC_OK);
  81. response.setContentType("text/plain");
  82. response.setContentLength(content.length());
  83. response.getOutputStream().write(content.getBytes());
  84. }
  85. }
  86. @SuppressWarnings("serial")
  87. private class MimeTypeResponseHandler extends AbstractHttpHandler {
  88. private String _mimeType;
  89. public MimeTypeResponseHandler(String mimeType) {
  90. _mimeType = mimeType;
  91. }
  92. @Override
  93. public void handle(String pathInContext, String pathParams, HttpRequest request, HttpResponse response) throws HttpException, IOException {
  94. String content = "test";
  95. response.setStatus(HttpStatus.SC_OK);
  96. if (_mimeType != null) {
  97. response.setContentType(_mimeType);
  98. }
  99. response.setContentLength(content.length());
  100. response.getOutputStream().write(content.getBytes());
  101. }
  102. }
  103. @Test
  104. public final void testConnectionTimeout() throws Exception {
  105. HttpServer server = startServer(new ResourcesResponseHandler(), 8089);
  106. BaseFetcher fetcher = new SimpleHttpFetcher(1, ConfigUtils.BIXO_TEST_AGENT);
  107. String url = "http://localhost:8088/simple-page.html";
  108. try {
  109. fetcher.get(new ScoredUrlDatum(url));
  110. fail("Exception not thrown");
  111. } catch (IOFetchException e) {
  112. assertTrue(e.getCause() instanceof HttpHostConnectException);
  113. } finally {
  114. server.stop();
  115. }
  116. }
  117. @Test
  118. public final void testStaleConnection() throws Exception {
  119. HttpServer server = startServer(new ResourcesResponseHandler(), 8089);
  120. SocketListener sl = (SocketListener)server.getListeners()[0];
  121. sl.setLingerTimeSecs(-1);
  122. BaseFetcher fetcher = new SimpleHttpFetcher(1, ConfigUtils.BIXO_TEST_AGENT);
  123. String url = "http://localhost:8089/simple-page.html";
  124. fetcher.get(new ScoredUrlDatum(url));
  125. // TODO KKr - control keep-alive (linger?) value for Jetty, so we can set it
  126. // to something short and thus make this sleep delay much shorter.
  127. Thread.sleep(2000);
  128. fetcher.get(new ScoredUrlDatum(url));
  129. server.stop();
  130. }
  131. @Test
  132. public final void testSlowServerTermination() throws Exception {
  133. // Need to read in more than 2 8K blocks currently, due to how
  134. // HttpClientFetcher
  135. // is designed...so use 20K bytes. And the duration is 2 seconds, so 10K
  136. // bytes/sec.
  137. HttpServer server = startServer(new RandomResponseHandler(20000, 2 * 1000L), 8089);
  138. // Set up for a minimum response rate of 20000 bytes/second.
  139. FetcherPolicy policy = new FetcherPolicy();
  140. policy.setMinResponseRate(20000);
  141. BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
  142. String url = "http://localhost:8089/test.html";
  143. try {
  144. fetcher.get(new ScoredUrlDatum(url));
  145. fail("Aborted fetch exception not thrown");
  146. } catch (AbortedFetchException e) {
  147. assertEquals(AbortedFetchReason.SLOW_RESPONSE_RATE, e.getAbortReason());
  148. }
  149. server.stop();
  150. }
  151. @Test
  152. public final void testNotTerminatingSlowServers() throws Exception {
  153. // Return 1K bytes at 2K bytes/second - would normally trigger an
  154. // error.
  155. HttpServer server = startServer(new RandomResponseHandler(1000, 500), 8089);
  156. // Set up for no minimum response rate.
  157. FetcherPolicy policy = new FetcherPolicy();
  158. policy.setMinResponseRate(FetcherPolicy.NO_MIN_RESPONSE_RATE);
  159. BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
  160. String url = "http://localhost:8089/test.html";
  161. fetcher.get(new ScoredUrlDatum(url));
  162. server.stop();
  163. }
  164. @Test
  165. public final void testLargeContent() throws Exception {
  166. FetcherPolicy policy = new FetcherPolicy();
  167. HttpServer server = startServer(new RandomResponseHandler(policy.getMaxContentSize() * 2), 8089);
  168. BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
  169. String url = "http://localhost:8089/test.html";
  170. FetchedDatum result = fetcher.get(new ScoredUrlDatum(url));
  171. server.stop();
  172. assertTrue("Content size should be truncated", result.getContentLength() <= policy.getMaxContentSize());
  173. }
  174. @Test
  175. public final void testTruncationWithKeepAlive() throws Exception {
  176. HttpServer server = startServer(new ResourcesResponseHandler(), 8089);
  177. FetcherPolicy policy = new FetcherPolicy();
  178. BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
  179. fetcher.setDefaultMaxContentSize(1000);
  180. fetcher.setMaxContentSize("image/png", 5000);
  181. ScoredUrlDatum datumToFetch = new ScoredUrlDatum("http://localhost:8089/karlie.html");
  182. FetchedDatum result1 = fetcher.get(datumToFetch);
  183. FetchedDatum result2 = fetcher.get(datumToFetch);
  184. // Verify that we got the same data from each fetch request.
  185. assertEquals(1000, result1.getContentLength());
  186. assertEquals(1000, result2.getContentLength());
  187. byte[] bytes1 = result1.getContentBytes();
  188. byte[] bytes2 = result2.getContentBytes();
  189. for (int i = 0; i < bytes1.length; i++) {
  190. assertEquals(bytes1[i], bytes2[i]);
  191. }
  192. datumToFetch = new ScoredUrlDatum("http://localhost:8089/bixolabs_mining.png");
  193. FetchedDatum result3 = fetcher.get(datumToFetch);
  194. assertTrue(result3.getContentLength() > 1000);
  195. fetcher.setMaxContentSize("image/png", 1500);
  196. try {
  197. fetcher.get(datumToFetch);
  198. fail("Aborted fetch exception not thrown");
  199. } catch (AbortedFetchException e) {
  200. Assert.assertEquals(AbortedFetchReason.CONTENT_SIZE, e.getAbortReason());
  201. }
  202. server.stop();
  203. }
  204. @Test
  205. public final void testLargeHtml() throws Exception {
  206. FetcherPolicy policy = new FetcherPolicy();
  207. HttpServer server = startServer(new ResourcesResponseHandler(), 8089);
  208. BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
  209. String url = "http://localhost:8089/karlie.html";
  210. FetchedDatum result = fetcher.get(new ScoredUrlDatum(url));
  211. server.stop();
  212. assertTrue("Content size should be truncated", result.getContentLength() <= policy.getMaxContentSize());
  213. }
  214. @Test
  215. public final void testContentTypeHeader() throws Exception {
  216. FetcherPolicy policy = new FetcherPolicy();
  217. HttpServer server = startServer(new ResourcesResponseHandler(), 8089);
  218. BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
  219. String url = "http://localhost:8089/simple-page.html";
  220. FetchedDatum result = fetcher.get(new ScoredUrlDatum(url));
  221. server.stop();
  222. String contentType = result.getHeaders().getFirst(HttpHeaderNames.CONTENT_TYPE);
  223. assertNotNull(contentType);
  224. assertEquals("text/html", contentType);
  225. }
  226. @Test
  227. public final void testTempRedirectHandling() throws Exception {
  228. FetcherPolicy policy = new FetcherPolicy();
  229. HttpServer server = startServer(new RedirectResponseHandler(), 8089);
  230. BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
  231. String url = "http://localhost:8089/base";
  232. FetchedDatum result = fetcher.get(new ScoredUrlDatum(url));
  233. server.stop();
  234. assertEquals("Redirected URL", "http://localhost:8089/redirect", result.getFetchedUrl());
  235. assertNull(result.getNewBaseUrl());
  236. assertEquals(1, result.getNumRedirects());
  237. }
  238. @Test
  239. public final void testPermRedirectHandling() throws Exception {
  240. FetcherPolicy policy = new FetcherPolicy();
  241. HttpServer server = startServer(new RedirectResponseHandler(true), 8089);
  242. BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
  243. String url = "http://localhost:8089/base";
  244. ScoredUrlDatum scoredUrl = new ScoredUrlDatum(url);
  245. scoredUrl.setPayloadValue("payload-field-1", 1);
  246. FetchedDatum result = fetcher.get(scoredUrl);
  247. server.stop();
  248. assertEquals("Redirected URL", "http://localhost:8089/redirect", result.getFetchedUrl());
  249. assertEquals("New base URL", "http://localhost:8089/redirect", result.getNewBaseUrl());
  250. assertEquals(1, result.getNumRedirects());
  251. assertEquals(1, result.getPayloadValue("payload-field-1"));
  252. }
  253. @Test
  254. public final void testRedirectPolicy() throws Exception {
  255. FetcherPolicy policy = new FetcherPolicy();
  256. policy.setRedirectMode(RedirectMode.FOLLOW_TEMP);
  257. HttpServer server = startServer(new RedirectResponseHandler(true), 8089);
  258. BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
  259. String url = "http://localhost:8089/base";
  260. try {
  261. fetcher.get(new ScoredUrlDatum(url));
  262. fail("Exception should have been thrown");
  263. } catch (RedirectFetchException e) {
  264. assertEquals("Redirected URL", "http://localhost:8089/redirect", e.getRedirectedUrl());
  265. assertEquals(RedirectExceptionReason.PERM_REDIRECT_DISALLOWED, e.getReason());
  266. } finally {
  267. server.stop();
  268. }
  269. // Now try setting the mode to follow none
  270. policy.setRedirectMode(RedirectMode.FOLLOW_NONE);
  271. server = startServer(new RedirectResponseHandler(false), 8089);
  272. fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
  273. try {
  274. fetcher.get(new ScoredUrlDatum(url));
  275. fail("Exception should have been thrown");
  276. } catch (RedirectFetchException e) {
  277. assertEquals("Redirected URL", "http://localhost:8089/redirect", e.getRedirectedUrl());
  278. assertEquals(RedirectExceptionReason.TEMP_REDIRECT_DISALLOWED, e.getReason());
  279. } finally {
  280. server.stop();
  281. }
  282. }
  283. @Test
  284. public final void testAcceptLanguage() throws Exception {
  285. final String englishContent = "English";
  286. final String foreignContent = "Foreign";
  287. FetcherPolicy policy = new FetcherPolicy();
  288. HttpServer server = startServer(new LanguageResponseHandler(englishContent, foreignContent), 8089);
  289. BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
  290. String url = "http://localhost:8089/";
  291. FetchedDatum result = fetcher.get(new ScoredUrlDatum(url));
  292. server.stop();
  293. String contentStr = new String(result.getContentBytes(), 0, result.getContentLength());
  294. assertTrue( englishContent.equals(contentStr));
  295. }
  296. @Test
  297. public final void testMimeTypeFiltering() throws Exception {
  298. FetcherPolicy policy = new FetcherPolicy();
  299. Set<String> validMimeTypes = new HashSet<String>();
  300. validMimeTypes.add("text/html");
  301. policy.setValidMimeTypes(validMimeTypes);
  302. HttpServer server = startServer(new MimeTypeResponseHandler("text/xml"), 8089);
  303. BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
  304. String url = "http://localhost:8089/";
  305. try {
  306. fetcher.get(new ScoredUrlDatum(url));
  307. fail("Fetch should have failed");
  308. } catch (AbortedFetchException e) {
  309. assertEquals(AbortedFetchReason.INVALID_MIMETYPE, e.getAbortReason());
  310. } finally {
  311. server.stop();
  312. }
  313. }
  314. @Test
  315. public final void testMimeTypeFilteringNoContentType() throws Exception {
  316. FetcherPolicy policy = new FetcherPolicy();
  317. Set<String> validMimeTypes = new HashSet<String>();
  318. validMimeTypes.add("text/html");
  319. validMimeTypes.add(""); // We want unknown (not reported) mime-types too.
  320. policy.setValidMimeTypes(validMimeTypes);
  321. HttpServer server = startServer(new MimeTypeResponseHandler(null), 8089);
  322. BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
  323. String url = "http://localhost:8089/";
  324. try {
  325. fetcher.get(new ScoredUrlDatum(url));
  326. } catch (AbortedFetchException e) {
  327. fail("Fetch should not have failed if no mime-type is specified");
  328. } finally {
  329. server.stop();
  330. }
  331. }
  332. @Test
  333. public final void testMimeTypeFilteringWithCharset() throws Exception {
  334. FetcherPolicy policy = new FetcherPolicy();
  335. Set<String> validMimeTypes = new HashSet<String>();
  336. validMimeTypes.add("text/html");
  337. policy.setValidMimeTypes(validMimeTypes);
  338. HttpServer server = startServer(new MimeTypeResponseHandler("text/html; charset=UTF-8"), 8089);
  339. BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
  340. String url = "http://localhost:8089/";
  341. try {
  342. fetcher.get(new ScoredUrlDatum(url));
  343. } catch (AbortedFetchException e) {
  344. fail("Fetch should have worked");
  345. } finally {
  346. server.stop();
  347. }
  348. }
  349. @Test
  350. public final void testHostAddress() throws Exception {
  351. FetcherPolicy policy = new FetcherPolicy();
  352. HttpServer server = startServer(new ResourcesResponseHandler(), 8089);
  353. BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
  354. String url = "http://localhost:8089/simple-page.html";
  355. FetchedDatum result = fetcher.get(new ScoredUrlDatum(url));
  356. server.stop();
  357. String hostAddress = result.getHostAddress();
  358. assertNotNull(hostAddress);
  359. assertEquals("127.0.0.1", hostAddress);
  360. }
  361. }