PageRenderTime 23ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 1ms

/src/main/java/bixo/fetcher/SimpleHttpFetcher.java

http://github.com/sguo/bixo
Java | 865 lines | 615 code | 128 blank | 122 comment | 102 complexity | 3fe66068dc355ef68be1156fe7d28894 MD5 | raw file
  1. /*
  2. * Copyright (c) 2010-2011 TransPac Software, Inc.
  3. *
  4. * Permission is hereby granted, free of charge, to any person obtaining a copy
  5. * of this software and associated documentation files (the "Software"), to deal
  6. * in the Software without restriction, including without limitation the rights
  7. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8. * copies of the Software, and to permit persons to whom the Software is
  9. * furnished to do so, subject to the following conditions:
  10. *
  11. * The above copyright notice and this permission notice shall be included in
  12. * all copies or substantial portions of the Software.
  13. *
  14. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  20. * SOFTWARE.
  21. *
  22. */
  23. package bixo.fetcher;
  24. import java.io.ByteArrayOutputStream;
  25. import java.io.Closeable;
  26. import java.io.IOException;
  27. import java.io.InputStream;
  28. import java.net.MalformedURLException;
  29. import java.net.URI;
  30. import java.net.URISyntaxException;
  31. import java.net.URL;
  32. import java.security.NoSuchAlgorithmException;
  33. import java.util.HashSet;
  34. import java.util.Set;
  35. import java.util.concurrent.TimeUnit;
  36. import javax.net.ssl.SSLContext;
  37. import javax.net.ssl.SSLException;
  38. import javax.net.ssl.SSLHandshakeException;
  39. import javax.net.ssl.TrustManager;
  40. import org.apache.http.Header;
  41. import org.apache.http.HttpEntity;
  42. import org.apache.http.HttpEntityEnclosingRequest;
  43. import org.apache.http.HttpException;
  44. import org.apache.http.HttpHost;
  45. import org.apache.http.HttpInetConnection;
  46. import org.apache.http.HttpRequest;
  47. import org.apache.http.HttpRequestInterceptor;
  48. import org.apache.http.HttpResponse;
  49. import org.apache.http.HttpStatus;
  50. import org.apache.http.HttpVersion;
  51. import org.apache.http.NoHttpResponseException;
  52. import org.apache.http.ProtocolException;
  53. import org.apache.http.client.ClientProtocolException;
  54. import org.apache.http.client.CookieStore;
  55. import org.apache.http.client.HttpRequestRetryHandler;
  56. import org.apache.http.client.RedirectException;
  57. import org.apache.http.client.methods.HttpGet;
  58. import org.apache.http.client.methods.HttpRequestBase;
  59. import org.apache.http.client.methods.HttpUriRequest;
  60. import org.apache.http.client.params.ClientParamBean;
  61. import org.apache.http.client.params.CookiePolicy;
  62. import org.apache.http.client.params.HttpClientParams;
  63. import org.apache.http.client.protocol.ClientContext;
  64. import org.apache.http.conn.ConnectionPoolTimeoutException;
  65. import org.apache.http.conn.params.ConnManagerParams;
  66. import org.apache.http.conn.params.ConnPerRouteBean;
  67. import org.apache.http.conn.scheme.PlainSocketFactory;
  68. import org.apache.http.conn.scheme.Scheme;
  69. import org.apache.http.conn.scheme.SchemeRegistry;
  70. import org.apache.http.conn.ssl.AbstractVerifier;
  71. import org.apache.http.conn.ssl.SSLSocketFactory;
  72. import org.apache.http.cookie.params.CookieSpecParamBean;
  73. import org.apache.http.impl.client.BasicCookieStore;
  74. import org.apache.http.impl.client.DefaultHttpClient;
  75. import org.apache.http.impl.client.DefaultRedirectHandler;
  76. import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
  77. import org.apache.http.message.BasicHeader;
  78. import org.apache.http.params.BasicHttpParams;
  79. import org.apache.http.params.HttpConnectionParams;
  80. import org.apache.http.params.HttpParams;
  81. import org.apache.http.params.HttpProtocolParams;
  82. import org.apache.http.protocol.BasicHttpContext;
  83. import org.apache.http.protocol.ExecutionContext;
  84. import org.apache.http.protocol.HttpContext;
  85. import org.apache.log4j.Logger;
  86. import org.apache.tika.mime.MediaType;
  87. import org.apache.tika.parser.ParseContext;
  88. import org.apache.tika.parser.Parser;
  89. import org.apache.tika.parser.html.HtmlParser;
  90. import com.bixolabs.cascading.Payload;
  91. import bixo.config.FetcherPolicy;
  92. import bixo.config.UserAgent;
  93. import bixo.config.FetcherPolicy.RedirectMode;
  94. import bixo.datum.ContentBytes;
  95. import bixo.datum.FetchedDatum;
  96. import bixo.datum.HttpHeaders;
  97. import bixo.datum.ScoredUrlDatum;
  98. import bixo.exceptions.AbortedFetchException;
  99. import bixo.exceptions.AbortedFetchReason;
  100. import bixo.exceptions.BaseFetchException;
  101. import bixo.exceptions.HttpFetchException;
  102. import bixo.exceptions.IOFetchException;
  103. import bixo.exceptions.RedirectFetchException;
  104. import bixo.exceptions.UrlFetchException;
  105. import bixo.exceptions.RedirectFetchException.RedirectExceptionReason;
  106. import bixo.utils.EncodingUtils;
  107. import bixo.utils.HttpUtils;
  108. import bixo.utils.EncodingUtils.ExpandedResult;
  109. @SuppressWarnings("serial")
  110. public class SimpleHttpFetcher extends BaseFetcher {
// Logger shared by the fetcher and its nested helper classes.
private static Logger LOGGER = Logger.getLogger(SimpleHttpFetcher.class);

// We tried 10 seconds for all of these, but got a number of connection/read timeouts for
// sites that would have eventually worked, so we bumped it up to 30 seconds.
private static final int DEFAULT_SOCKET_TIMEOUT = 30 * 1000;
private static final int DEFAULT_CONNECTION_TIMEOUT = 30 * 1000;

// Thread count used by the single-argument constructor.
private static final int DEFAULT_MAX_THREADS = 1;

// We normally don't ever hit this timeout, since we manage the number of
// fetcher threads to be <= the maxThreads value used to configure an IHttpFetcher.
// But the limit of connections/host can cause a timeout, when redirects cause
// multiple threads to hit the same domain. So jack the value way up.
private static final long CONNECTION_POOL_TIMEOUT = 100 * 1000L;

// Size (in bytes) of the scratch buffer used by doRequest() for each read from the response stream.
private static final int BUFFER_SIZE = 8 * 1024;

// Initial value of _maxRetryCount, which init() hands to MyRequestRetryHandler.
private static final int DEFAULT_MAX_RETRY_COUNT = 10;

// Initial capacity (in bytes) of the in-memory buffer that doRequest() accumulates the body into.
private static final int DEFAULT_BYTEARRAY_SIZE = 32 * 1024;

// Use the same values as Firefox (except that we don't accept deflate,
// which we're not sure is implemented correctly - see the notes in
// EncodingUtils/EncodingUtilsTest for more details).
private static final String DEFAULT_ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
private static final String DEFAULT_ACCEPT_CHARSET = "utf-8,ISO-8859-1;q=0.7,*;q=0.7";
private static final String DEFAULT_ACCEPT_ENCODING = "x-gzip, gzip";

// Keys used to access data in the Http execution context.
private static final String PERM_REDIRECT_CONTEXT_KEY = "perm-redirect";
private static final String REDIRECT_COUNT_CONTEXT_KEY = "redirect-count";
private static final String HOST_ADDRESS = "host-address";

// SSLContext names that init() tries, in this order, until one can be created and initialized.
private static final String SSL_CONTEXT_NAMES[] = {
    "TLS",
    "Default",
    "SSL",
};

// Mime-types that isTextMimeType() accepts; doRequest() keeps truncated content only for these.
private static final String TEXT_MIME_TYPES[] = {
    "text/html",
    "application/x-asp",
    "application/xhtml+xml",
    "application/vnd.wap.xhtml+xml",
};

// Configuration captured by init() when the HttpClient is created.
private HttpVersion _httpVersion;
private int _socketTimeout;
private int _connectionTimeout;
private int _maxRetryCount;

// Created lazily by init(); transient so that it is never serialized with the fetcher.
transient private DefaultHttpClient _httpClient;
  151. private static class MyRequestRetryHandler implements HttpRequestRetryHandler {
  152. private int _maxRetryCount;
  153. public MyRequestRetryHandler(int maxRetryCount) {
  154. _maxRetryCount = maxRetryCount;
  155. }
  156. @Override
  157. public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
  158. if (LOGGER.isTraceEnabled()) {
  159. LOGGER.trace("Decide about retry #" + executionCount + " for exception " + exception.getMessage());
  160. }
  161. if (executionCount >= _maxRetryCount) {
  162. // Do not retry if over max retry count
  163. return false;
  164. } else if (exception instanceof NoHttpResponseException) {
  165. // Retry if the server dropped connection on us
  166. return true;
  167. } else if (exception instanceof SSLHandshakeException) {
  168. // Do not retry on SSL handshake exception
  169. return false;
  170. }
  171. HttpRequest request = (HttpRequest)context.getAttribute(ExecutionContext.HTTP_REQUEST);
  172. boolean idempotent = !(request instanceof HttpEntityEnclosingRequest);
  173. // Retry if the request is considered idempotent
  174. return idempotent;
  175. }
  176. }
  177. private static class MyRedirectException extends RedirectException {
  178. private URI _uri;
  179. private RedirectExceptionReason _reason;
  180. public MyRedirectException(String message, URI uri, RedirectExceptionReason reason) {
  181. super(message);
  182. _uri = uri;
  183. _reason = reason;
  184. }
  185. public URI getUri() {
  186. return _uri;
  187. }
  188. public RedirectExceptionReason getReason() {
  189. return _reason;
  190. }
  191. }
  192. /**
  193. * Handler to record last permanent redirect (if any) in context.
  194. *
  195. */
  196. private static class MyRedirectHandler extends DefaultRedirectHandler {
  197. private RedirectMode _redirectMode;
  198. public MyRedirectHandler(RedirectMode redirectMode) {
  199. super();
  200. _redirectMode = redirectMode;
  201. }
  202. @Override
  203. public URI getLocationURI(HttpResponse response, HttpContext context) throws ProtocolException {
  204. URI result = super.getLocationURI(response, context);
  205. // HACK - some sites return a redirect with an explicit port number that's the same as
  206. // the default port (e.g. 80 for http), and then when you use this to make the next
  207. // request, the presence of the port in the domain triggers another redirect, so you
  208. // fail with a circular redirect error. Avoid that by converting the port number to
  209. // -1 in that case.
  210. if (result.getScheme().equalsIgnoreCase("http") && (result.getPort() == 80)) {
  211. try {
  212. result = new URI(result.getScheme(), result.getUserInfo(), result.getHost(), -1, result.getPath(), result.getQuery(), result.getFragment());
  213. } catch (URISyntaxException e) {
  214. LOGGER.warn("Unexpected exception removing port from URI", e);
  215. }
  216. }
  217. // Keep track of the number of redirects.
  218. Integer count = (Integer)context.getAttribute(REDIRECT_COUNT_CONTEXT_KEY);
  219. if (count == null) {
  220. count = new Integer(0);
  221. }
  222. context.setAttribute(REDIRECT_COUNT_CONTEXT_KEY, count + 1);
  223. // Record the last permanent redirect
  224. int statusCode = response.getStatusLine().getStatusCode();
  225. if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY) {
  226. context.setAttribute(PERM_REDIRECT_CONTEXT_KEY, result);
  227. }
  228. // Based on the redirect mode, decide how we want to handle this.
  229. boolean isPermRedirect = statusCode == HttpStatus.SC_MOVED_PERMANENTLY;
  230. if ((_redirectMode == RedirectMode.FOLLOW_NONE) ||
  231. ((_redirectMode == RedirectMode.FOLLOW_TEMP) && isPermRedirect)) {
  232. RedirectExceptionReason reason = isPermRedirect ? RedirectExceptionReason.PERM_REDIRECT_DISALLOWED :
  233. RedirectExceptionReason.TEMP_REDIRECT_DISALLOWED;
  234. throw new MyRedirectException("RedirectMode disallowed redirect: " + _redirectMode, result, reason);
  235. }
  236. return result;
  237. }
  238. }
  239. /**
  240. * Interceptor to record host address in context.
  241. *
  242. */
  243. private static class MyRequestInterceptor implements HttpRequestInterceptor {
  244. @Override
  245. public void process(HttpRequest request,
  246. HttpContext context)
  247. throws HttpException, IOException {
  248. HttpInetConnection connection
  249. = (HttpInetConnection)(context.getAttribute(ExecutionContext.HTTP_CONNECTION));
  250. context.setAttribute( HOST_ADDRESS,
  251. connection.getRemoteAddress().getHostAddress());
  252. }
  253. }
  254. private static class DummyX509HostnameVerifier extends AbstractVerifier {
  255. @Override
  256. public void verify(String host, String[] cns, String[] subjectAlts) throws SSLException {
  257. try {
  258. verify(host, cns, subjectAlts, false);
  259. } catch (SSLException e) {
  260. LOGGER.warn("Invalid SSL certificate for " + host + ": " + e.getMessage());
  261. }
  262. }
  263. @Override
  264. public final String toString() {
  265. return "DUMMY_VERIFIER";
  266. }
  267. }
/**
 * Creates a fetcher that uses DEFAULT_MAX_THREADS threads and a default FetcherPolicy.
 *
 * @param userAgent user agent passed on to the superclass
 */
public SimpleHttpFetcher(UserAgent userAgent) {
    this(DEFAULT_MAX_THREADS, userAgent);
}
/**
 * Creates a fetcher with the given thread limit and a newly constructed FetcherPolicy.
 *
 * @param maxThreads thread limit passed on to the superclass
 * @param userAgent user agent passed on to the superclass
 */
public SimpleHttpFetcher(int maxThreads, UserAgent userAgent) {
    this(maxThreads, new FetcherPolicy(), userAgent);
}
  274. public SimpleHttpFetcher(int maxThreads, FetcherPolicy fetcherPolicy, UserAgent userAgent) {
  275. super(maxThreads, fetcherPolicy, userAgent);
  276. _httpVersion = HttpVersion.HTTP_1_1;
  277. _socketTimeout = DEFAULT_SOCKET_TIMEOUT;
  278. _connectionTimeout = DEFAULT_CONNECTION_TIMEOUT;
  279. _maxRetryCount = DEFAULT_MAX_RETRY_COUNT;
  280. // Just to be explicit, we rely on lazy initialization of this so that
  281. // we don't have to worry about serializing it.
  282. _httpClient = null;
  283. }
/**
 * @return the HTTP protocol version that init() configures on the HttpClient
 */
public HttpVersion getHttpVersion() {
    return _httpVersion;
}
  287. public void setHttpVersion(HttpVersion httpVersion) {
  288. if (_httpClient == null) {
  289. _httpVersion = httpVersion;
  290. } else {
  291. throw new IllegalStateException("Can't change HTTP version after HttpClient has been initialized");
  292. }
  293. }
/**
 * @return the socket timeout, in milliseconds, that init() configures on the HttpClient
 */
public int getSocketTimeout() {
    return _socketTimeout;
}
  297. public void setSocketTimeout(int socketTimeoutInMs) {
  298. if (_httpClient == null) {
  299. _socketTimeout = socketTimeoutInMs;
  300. } else {
  301. throw new IllegalStateException("Can't change socket timeout after HttpClient has been initialized");
  302. }
  303. }
/**
 * @return the connection timeout, in milliseconds, that init() configures on the HttpClient
 */
public int getConnectionTimeout() {
    return _connectionTimeout;
}
  307. public void setConnectionTimeout(int connectionTimeoutInMs) {
  308. if (_httpClient == null) {
  309. _connectionTimeout = connectionTimeoutInMs;
  310. } else {
  311. throw new IllegalStateException("Can't change connection timeout after HttpClient has been initialized");
  312. }
  313. }
/**
 * @return the retry limit that init() passes to MyRequestRetryHandler
 */
public int getMaxRetryCount() {
    return _maxRetryCount;
}
  317. public void setMaxRetryCount(int maxRetryCount) {
  318. _maxRetryCount = maxRetryCount;
  319. }
  320. private static FetchedDatum convert(FetchedResult result) {
  321. FetchedDatum datum = new FetchedDatum(result.getBaseUrl(), result.getFetchedUrl(), result.getFetchTime(),
  322. result.getHeaders(), new ContentBytes(result.getContent()), result.getContentType(),
  323. result.getResponseRate());
  324. datum.setNewBaseUrl(result.getNewBaseUrl());
  325. datum.setNumRedirects(result.getNumRedirects());
  326. datum.setHostAddress(result.getHostAddress());
  327. datum.setPayload(result.getPayload());
  328. return datum;
  329. }
/**
 * Fetches the given URL with an HTTP GET and converts the result into a FetchedDatum.
 *
 * @param scoredUrl supplies the URL to fetch and the payload to carry through
 * @return the fetched content and metadata
 * @throws BaseFetchException if the fetch fails or is aborted
 */
@Override
public FetchedDatum get(ScoredUrlDatum scoredUrl) throws BaseFetchException {
    return convert(request(new HttpGet(), scoredUrl));
}
  334. private FetchedResult request(HttpRequestBase request, ScoredUrlDatum scoredUrl) throws BaseFetchException {
  335. init();
  336. try {
  337. return doRequest(request, scoredUrl.getUrl(), scoredUrl.getPayload());
  338. } catch (HttpFetchException e) {
  339. // Don't bother generating a trace for a 404 (not found)
  340. if (LOGGER.isTraceEnabled() && (e.getHttpStatus() != HttpStatus.SC_NOT_FOUND)) {
  341. LOGGER.trace(String.format("Exception fetching %s (%s)", scoredUrl.getUrl(), e.getMessage()));
  342. }
  343. throw e;
  344. } catch (AbortedFetchException e) {
  345. // Don't bother reporting that we bailed because the mime-type wasn't one that we wanted.
  346. if (e.getAbortReason() != AbortedFetchReason.INVALID_MIMETYPE) {
  347. LOGGER.debug(String.format("Exception fetching %s (%s)", scoredUrl.getUrl(), e.getMessage()));
  348. }
  349. throw e;
  350. } catch (BaseFetchException e) {
  351. LOGGER.debug(String.format("Exception fetching %s (%s)", scoredUrl.getUrl(), e.getMessage()));
  352. throw e;
  353. }
  354. }
/**
 * Fetches the given URL with an HTTP GET and a newly created, empty Payload.
 *
 * @param url URL to fetch
 * @return the fetch result
 * @throws BaseFetchException if the fetch fails or is aborted
 */
public FetchedResult fetch(String url) throws BaseFetchException{
    return fetch(new HttpGet(), url, new Payload());
}
  358. public FetchedResult fetch(HttpRequestBase request, String url, Payload payload) throws BaseFetchException{
  359. init();
  360. try {
  361. return doRequest(request, url, payload);
  362. } catch (BaseFetchException e) {
  363. if (LOGGER.isTraceEnabled()) {
  364. LOGGER.trace(String.format("Exception fetching %s", url), e);
  365. }
  366. throw e;
  367. }
  368. }
  369. private FetchedResult doRequest(HttpRequestBase request, String url, Payload payload) throws BaseFetchException {
  370. LOGGER.trace("Fetching " + url);
  371. HttpResponse response;
  372. long readStartTime;
  373. HttpHeaders headerMap = new HttpHeaders();
  374. String redirectedUrl = null;
  375. String newBaseUrl = null;
  376. int numRedirects = 0;
  377. boolean needAbort = true;
  378. String contentType = "";
  379. String mimeType = "";
  380. String hostAddress = null;
  381. // Create a local instance of cookie store, and bind to local context
  382. // Without this we get killed w/lots of threads, due to sync() on single cookie store.
  383. HttpContext localContext = new BasicHttpContext();
  384. CookieStore cookieStore = new BasicCookieStore();
  385. localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore);
  386. StringBuilder fetchTrace = null;
  387. if (LOGGER.isTraceEnabled()) {
  388. fetchTrace = new StringBuilder("Fetched url: " + url);
  389. }
  390. try {
  391. request.setURI(new URI(url));
  392. readStartTime = System.currentTimeMillis();
  393. response = _httpClient.execute(request, localContext);
  394. Header[] headers = response.getAllHeaders();
  395. for (Header header : headers) {
  396. headerMap.add(header.getName(), header.getValue());
  397. }
  398. int httpStatus = response.getStatusLine().getStatusCode();
  399. if (LOGGER.isTraceEnabled()) {
  400. fetchTrace.append("; status code: " + httpStatus);
  401. if (headerMap.getFirst(HttpHeaderNames.CONTENT_LENGTH) != null) {
  402. fetchTrace.append("; Content-Length: " + headerMap.getFirst(HttpHeaderNames.CONTENT_LENGTH));
  403. }
  404. if (headerMap.getFirst(HttpHeaderNames.LOCATION) != null) {
  405. fetchTrace.append("; Location: " + headerMap.getFirst(HttpHeaderNames.LOCATION));
  406. }
  407. }
  408. if ((httpStatus < 200) || (httpStatus >= 300)) {
  409. // We can't just check against SC_OK, as some wackos return 201, 202, etc
  410. throw new HttpFetchException(url, "Error fetching " + url, httpStatus, headerMap);
  411. }
  412. redirectedUrl = extractRedirectedUrl(url, localContext);
  413. URI permRedirectUri = (URI)localContext.getAttribute(PERM_REDIRECT_CONTEXT_KEY);
  414. if (permRedirectUri != null) {
  415. newBaseUrl = permRedirectUri.toURL().toExternalForm();
  416. }
  417. Integer redirects = (Integer)localContext.getAttribute(REDIRECT_COUNT_CONTEXT_KEY);
  418. if (redirects != null) {
  419. numRedirects = redirects.intValue();
  420. }
  421. hostAddress = (String)(localContext.getAttribute(HOST_ADDRESS));
  422. if (hostAddress == null) {
  423. throw new UrlFetchException(url, "Host address not saved in context");
  424. }
  425. Header cth = response.getFirstHeader(HttpHeaderNames.CONTENT_TYPE);
  426. if (cth != null) {
  427. contentType = cth.getValue();
  428. }
  429. // Check if we should abort due to mime-type filtering. Note that this will fail if the server
  430. // doesn't report a mime-type, but that's how we want it as this configuration is typically
  431. // used when only a subset of parsers are installed/enabled, so we don't want the auto-detect
  432. // code in Tika to get triggered & try to process an unsupported type. If you want unknown
  433. // mime-types from the server to be processed, set "" as one of the valid mime-types in FetcherPolicy.
  434. mimeType = HttpUtils.getMimeTypeFromContentType(contentType);
  435. Set<String> mimeTypes = _fetcherPolicy.getValidMimeTypes();
  436. if ((mimeTypes != null) && (mimeTypes.size() > 0)) {
  437. if (!mimeTypes.contains(mimeType)) {
  438. throw new AbortedFetchException(url, "Invalid mime-type: " + mimeType, AbortedFetchReason.INVALID_MIMETYPE);
  439. }
  440. }
  441. needAbort = false;
  442. } catch (ClientProtocolException e) {
  443. // Oleg guarantees that no abort is needed in the case of an IOException (which is is a subclass of)
  444. needAbort = false;
  445. // If the root case was a "too many redirects" error, we want to map this to a specific
  446. // exception that contains the final redirect.
  447. if (e.getCause() instanceof MyRedirectException) {
  448. MyRedirectException mre = (MyRedirectException)e.getCause();
  449. String redirectUrl = url;
  450. try {
  451. redirectUrl = mre.getUri().toURL().toExternalForm();
  452. } catch (MalformedURLException e2) {
  453. LOGGER.warn("Invalid URI saved during redirect handling: " + mre.getUri());
  454. }
  455. throw new RedirectFetchException(url, redirectUrl, mre.getReason());
  456. } else if (e.getCause() instanceof RedirectException) {
  457. throw new RedirectFetchException(url, extractRedirectedUrl(url, localContext), RedirectExceptionReason.TOO_MANY_REDIRECTS);
  458. } else {
  459. throw new IOFetchException(url, e);
  460. }
  461. } catch (IOException e) {
  462. // Oleg guarantees that no abort is needed in the case of an IOException
  463. needAbort = false;
  464. if (e instanceof ConnectionPoolTimeoutException) {
  465. // Should never happen, so let's dump some info about the connection pool.
  466. ThreadSafeClientConnManager cm = (ThreadSafeClientConnManager)_httpClient.getConnectionManager();
  467. int numConnections = cm.getConnectionsInPool();
  468. cm.closeIdleConnections(0, TimeUnit.MILLISECONDS);
  469. LOGGER.error(String.format("Got ConnectionPoolTimeoutException: %d connections before, %d after idle close", numConnections, cm.getConnectionsInPool()));
  470. }
  471. throw new IOFetchException(url, e);
  472. } catch (URISyntaxException e) {
  473. throw new UrlFetchException(url, e.getMessage());
  474. } catch (IllegalStateException e) {
  475. throw new UrlFetchException(url, e.getMessage());
  476. } catch (BaseFetchException e) {
  477. throw e;
  478. } catch (Exception e) {
  479. // Map anything else to a generic IOFetchException
  480. // TODO KKr - create generic fetch exception
  481. throw new IOFetchException(url, new IOException(e));
  482. } finally {
  483. safeAbort(needAbort, request);
  484. }
  485. // Figure out how much data we want to try to fetch.
  486. int maxContentSize = getMaxContentSize(mimeType);
  487. int targetLength = maxContentSize;
  488. boolean truncated = false;
  489. String contentLengthStr = headerMap.getFirst(HttpHeaderNames.CONTENT_LENGTH);
  490. if (contentLengthStr != null) {
  491. try {
  492. int contentLength = Integer.parseInt(contentLengthStr);
  493. if (contentLength > targetLength) {
  494. truncated = true;
  495. } else {
  496. targetLength = contentLength;
  497. }
  498. } catch (NumberFormatException e) {
  499. // Ignore (and log) invalid content length values.
  500. LOGGER.warn("Invalid content length in header: " + contentLengthStr);
  501. }
  502. }
  503. // Now finally read in response body, up to targetLength bytes.
  504. // Note that entity might be null, for zero length responses.
  505. byte[] content = new byte[0];
  506. long readRate = 0;
  507. HttpEntity entity = response.getEntity();
  508. needAbort = true;
  509. if (entity != null) {
  510. InputStream in = null;
  511. try {
  512. in = entity.getContent();
  513. byte[] buffer = new byte[BUFFER_SIZE];
  514. int bytesRead = 0;
  515. int totalRead = 0;
  516. ByteArrayOutputStream out = new ByteArrayOutputStream(DEFAULT_BYTEARRAY_SIZE);
  517. int readRequests = 0;
  518. int minResponseRate = _fetcherPolicy.getMinResponseRate();
  519. // TODO KKr - we need to monitor the rate while reading a
  520. // single block. Look at HttpClient
  521. // metrics support for how to do this. Once we fix this, fix
  522. // the test to read a smaller (< 20K)
  523. // chuck of data.
  524. while ((totalRead < targetLength) &&
  525. ((bytesRead = in.read(buffer, 0, Math.min(buffer.length, targetLength - totalRead))) != -1)) {
  526. readRequests += 1;
  527. totalRead += bytesRead;
  528. out.write(buffer, 0, bytesRead);
  529. // Assume read time is at least one millisecond, to avoid DBZ exception.
  530. long totalReadTime = Math.max(1, System.currentTimeMillis() - readStartTime);
  531. readRate = (totalRead * 1000L) / totalReadTime;
  532. // Don't bail on the first read cycle, as we can get a hiccup starting out.
  533. // Also don't bail if we've read everything we need.
  534. if ((readRequests > 1) && (totalRead < targetLength) && (readRate < minResponseRate)) {
  535. throw new AbortedFetchException(url, "Slow response rate of " + readRate + " bytes/sec", AbortedFetchReason.SLOW_RESPONSE_RATE);
  536. }
  537. // Check to see if we got interrupted.
  538. if (Thread.interrupted()) {
  539. throw new AbortedFetchException(url, AbortedFetchReason.INTERRUPTED);
  540. }
  541. }
  542. content = out.toByteArray();
  543. needAbort = truncated || (in.available() > 0);
  544. } catch (IOException e) {
  545. // We don't need to abort if there's an IOException
  546. throw new IOFetchException(url, e);
  547. } finally {
  548. safeAbort(needAbort, request);
  549. safeClose(in);
  550. }
  551. }
  552. // Toss truncated image content.
  553. if ( (truncated)
  554. && (!isTextMimeType(mimeType))) {
  555. throw new AbortedFetchException(url, "Truncated image", AbortedFetchReason.CONTENT_SIZE);
  556. }
  557. // Now see if we need to uncompress the content.
  558. String contentEncoding = headerMap.getFirst(HttpHeaderNames.CONTENT_ENCODING);
  559. if (contentEncoding != null) {
  560. if (LOGGER.isTraceEnabled()) {
  561. fetchTrace.append("; Content-Encoding: " + contentEncoding);
  562. }
  563. // TODO KKr We might want to just decompress a truncated gzip
  564. // containing text (since we have a max content size to save us
  565. // from any gzip corruption). We might want to break the following
  566. // out into a separate method, by the way (if not refactor this
  567. // entire monolithic method).
  568. //
  569. try {
  570. if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
  571. if (truncated) {
  572. throw new AbortedFetchException(url, "Truncated compressed data", AbortedFetchReason.CONTENT_SIZE);
  573. } else {
  574. ExpandedResult expandedResult = EncodingUtils.processGzipEncoded(content, maxContentSize);
  575. truncated = expandedResult.isTruncated();
  576. if ( (truncated)
  577. && (!isTextMimeType(mimeType))) {
  578. throw new AbortedFetchException(url, "Truncated decompressed image", AbortedFetchReason.CONTENT_SIZE);
  579. } else {
  580. content = expandedResult.getExpanded();
  581. if (LOGGER.isTraceEnabled()) {
  582. fetchTrace.append("; unzipped to " + content.length + " bytes");
  583. }
  584. }
  585. // } else if ("deflate".equals(contentEncoding)) {
  586. // content = EncodingUtils.processDeflateEncoded(content);
  587. // if (LOGGER.isTraceEnabled()) {
  588. // fetchTrace.append("; inflated to " + content.length + " bytes");
  589. // }
  590. }
  591. }
  592. } catch (IOException e) {
  593. throw new IOFetchException(url, e);
  594. }
  595. }
  596. // Finally dump out the trace msg we've been building.
  597. if (LOGGER.isTraceEnabled()) {
  598. LOGGER.trace(fetchTrace.toString());
  599. }
  600. // TODO KKr - Save truncated flag in FetchedResult/FetchedDatum.
  601. return new FetchedResult( url,
  602. redirectedUrl,
  603. System.currentTimeMillis(),
  604. headerMap,
  605. content,
  606. contentType,
  607. (int)readRate,
  608. payload,
  609. newBaseUrl,
  610. numRedirects,
  611. hostAddress);
  612. }
  613. private boolean isTextMimeType(String mimeType) {
  614. for (String textContentType : TEXT_MIME_TYPES) {
  615. if (textContentType.equals(mimeType)) {
  616. return true;
  617. }
  618. }
  619. return false;
  620. }
  621. private String extractRedirectedUrl(String url, HttpContext localContext) {
  622. // This was triggered by HttpClient with the redirect count was exceeded.
  623. HttpHost host = (HttpHost)localContext.getAttribute(ExecutionContext.HTTP_TARGET_HOST);
  624. HttpUriRequest finalRequest = (HttpUriRequest)localContext.getAttribute(ExecutionContext.HTTP_REQUEST);
  625. try {
  626. URL hostUrl = new URI(host.toURI()).toURL();
  627. return new URL(hostUrl, finalRequest.getURI().toString()).toExternalForm();
  628. } catch (MalformedURLException e) {
  629. LOGGER.warn("Invalid host/uri specified in final fetch: " + host + finalRequest.getURI());
  630. return url;
  631. } catch (URISyntaxException e) {
  632. LOGGER.warn("Invalid host/uri specified in final fetch: " + host + finalRequest.getURI());
  633. return url;
  634. }
  635. }
  636. private static void safeClose(Closeable o) {
  637. if (o != null) {
  638. try {
  639. o.close();
  640. } catch (Exception e) {
  641. // Ignore any errors
  642. }
  643. }
  644. }
  645. private static void safeAbort(boolean needAbort, HttpRequestBase request) {
  646. if (needAbort && (request != null)) {
  647. try {
  648. request.abort();
  649. } catch (Throwable t) {
  650. // Ignore any errors
  651. }
  652. }
  653. }
  654. private synchronized void init() {
  655. if (_httpClient == null) {
  656. // Create and initialize HTTP parameters
  657. HttpParams params = new BasicHttpParams();
  658. // TODO KKr - w/4.1, switch to new api (ThreadSafeClientConnManager)
  659. // cm.setMaxTotalConnections(_maxThreads);
  660. // cm.setDefaultMaxPerRoute(Math.max(10, _maxThreads/10));
  661. ConnManagerParams.setMaxTotalConnections(params, _maxThreads);
  662. // Set the maximum time we'll wait for a spare connection in the connection pool. We
  663. // shouldn't actually hit this, as we make sure (in FetcherManager) that the max number
  664. // of active requests doesn't exceed the value returned by getMaxThreads() here.
  665. ConnManagerParams.setTimeout(params, CONNECTION_POOL_TIMEOUT);
  666. // Set the socket and connection timeout to be something reasonable.
  667. HttpConnectionParams.setSoTimeout(params, _socketTimeout);
  668. HttpConnectionParams.setConnectionTimeout(params, _connectionTimeout);
  669. // Even with stale checking enabled, a connection can "go stale" between the check and the
  670. // next request. So we still need to handle the case of a closed socket (from the server side),
  671. // and disabling this check improves performance.
  672. HttpConnectionParams.setStaleCheckingEnabled(params, false);
  673. // FUTURE - set this on a per-route (host) basis when we have per-host policies for
  674. // doing partner crawls. We could define a BixoConnPerRoute class that supports this.
  675. ConnPerRouteBean connPerRoute = new ConnPerRouteBean(_fetcherPolicy.getMaxConnectionsPerHost());
  676. ConnManagerParams.setMaxConnectionsPerRoute(params, connPerRoute);
  677. HttpProtocolParams.setVersion(params, _httpVersion);
  678. HttpProtocolParams.setUserAgent(params, _userAgent.getUserAgentString());
  679. HttpProtocolParams.setContentCharset(params, "UTF-8");
  680. HttpProtocolParams.setHttpElementCharset(params, "UTF-8");
  681. HttpProtocolParams.setUseExpectContinue(params, true);
  682. // TODO KKr - set on connection manager params, or client params?
  683. CookieSpecParamBean cookieParams = new CookieSpecParamBean(params);
  684. cookieParams.setSingleHeader(true);
  685. // Create and initialize scheme registry
  686. SchemeRegistry schemeRegistry = new SchemeRegistry();
  687. schemeRegistry.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));
  688. SSLSocketFactory sf = null;
  689. for (String contextName : SSL_CONTEXT_NAMES) {
  690. try {
  691. SSLContext sslContext = SSLContext.getInstance(contextName);
  692. sslContext.init(null, new TrustManager[] { new DummyX509TrustManager(null) }, null);
  693. sf = new SSLSocketFactory(sslContext);
  694. break;
  695. } catch (NoSuchAlgorithmException e) {
  696. LOGGER.debug("SSLContext algorithm not available: " + contextName);
  697. } catch (Exception e) {
  698. LOGGER.debug("SSLContext can't be initialized: " + contextName, e);
  699. }
  700. }
  701. if (sf != null) {
  702. sf.setHostnameVerifier(new DummyX509HostnameVerifier());
  703. schemeRegistry.register(new Scheme("https", sf, 443));
  704. } else {
  705. LOGGER.warn("No valid SSLContext found for https");
  706. }
  707. // Use ThreadSafeClientConnManager since more than one thread will be using the HttpClient.
  708. ThreadSafeClientConnManager cm = new ThreadSafeClientConnManager(params, schemeRegistry);
  709. _httpClient = new DefaultHttpClient(cm, params);
  710. _httpClient.setHttpRequestRetryHandler(new MyRequestRetryHandler(_maxRetryCount));
  711. _httpClient.setRedirectHandler(new MyRedirectHandler(_fetcherPolicy.getRedirectMode()));
  712. _httpClient.addRequestInterceptor(new MyRequestInterceptor());
  713. params = _httpClient.getParams();
  714. // FUTURE KKr - support authentication
  715. HttpClientParams.setAuthenticating(params, false);
  716. HttpClientParams.setCookiePolicy(params, CookiePolicy.BEST_MATCH);
  717. ClientParamBean clientParams = new ClientParamBean(params);
  718. if (_fetcherPolicy.getMaxRedirects() == 0) {
  719. clientParams.setHandleRedirects(false);
  720. } else {
  721. clientParams.setHandleRedirects(true);
  722. clientParams.setMaxRedirects(_fetcherPolicy.getMaxRedirects());
  723. }
  724. // Set up default headers. This helps us get back from servers what we want.
  725. HashSet<Header> defaultHeaders = new HashSet<Header>();
  726. defaultHeaders.add(new BasicHeader(HttpHeaderNames.ACCEPT_LANGUAGE, _fetcherPolicy.getAcceptLanguage()));
  727. defaultHeaders.add(new BasicHeader(HttpHeaderNames.ACCEPT_CHARSET, DEFAULT_ACCEPT_CHARSET));
  728. defaultHeaders.add(new BasicHeader(HttpHeaderNames.ACCEPT_ENCODING, DEFAULT_ACCEPT_ENCODING));
  729. defaultHeaders.add(new BasicHeader(HttpHeaderNames.ACCEPT, DEFAULT_ACCEPT));
  730. clientParams.setDefaultHeaders(defaultHeaders);
  731. }
  732. }
/**
 * Currently a no-op: no attempt is made to abort requests that are in progress.
 */
@Override
public void abort() {
    // TODO Actually try to abort
}
  737. }