PageRenderTime 40ms CodeModel.GetById 7ms RepoModel.GetById 0ms app.codeStats 0ms

/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java

https://code.google.com/
Java | 276 lines | 210 code | 43 blank | 23 comment | 43 complexity | d810d4b08eaff4189c8627f10aa2c8b8 MD5 | raw file
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package edu.uci.ics.crawler4j.fetcher;
  18. import java.io.IOException;
  19. import java.io.InputStream;
  20. import java.util.Date;
  21. import java.util.zip.GZIPInputStream;
  22. import org.apache.http.Header;
  23. import org.apache.http.HeaderElement;
  24. import org.apache.http.HttpEntity;
  25. import org.apache.http.HttpException;
  26. import org.apache.http.HttpHost;
  27. import org.apache.http.HttpResponse;
  28. import org.apache.http.HttpResponseInterceptor;
  29. import org.apache.http.HttpStatus;
  30. import org.apache.http.HttpVersion;
  31. import org.apache.http.auth.AuthScope;
  32. import org.apache.http.auth.UsernamePasswordCredentials;
  33. import org.apache.http.client.HttpClient;
  34. import org.apache.http.client.methods.HttpGet;
  35. import org.apache.http.client.params.ClientPNames;
  36. import org.apache.http.client.params.CookiePolicy;
  37. import org.apache.http.conn.params.ConnRoutePNames;
  38. import org.apache.http.conn.scheme.PlainSocketFactory;
  39. import org.apache.http.conn.scheme.Scheme;
  40. import org.apache.http.conn.scheme.SchemeRegistry;
  41. import org.apache.http.conn.ssl.SSLSocketFactory;
  42. import org.apache.http.entity.HttpEntityWrapper;
  43. import org.apache.http.impl.client.DefaultHttpClient;
  44. import org.apache.http.impl.conn.PoolingClientConnectionManager;
  45. import org.apache.http.params.BasicHttpParams;
  46. import org.apache.http.params.CoreConnectionPNames;
  47. import org.apache.http.params.CoreProtocolPNames;
  48. import org.apache.http.params.HttpParams;
  49. import org.apache.http.params.HttpProtocolParamBean;
  50. import org.apache.http.protocol.HttpContext;
  51. import org.apache.log4j.Logger;
  52. import edu.uci.ics.crawler4j.crawler.Configurable;
  53. import edu.uci.ics.crawler4j.crawler.CrawlConfig;
  54. import edu.uci.ics.crawler4j.url.URLCanonicalizer;
  55. import edu.uci.ics.crawler4j.url.WebURL;
  56. /**
  57. * @author Yasser Ganjisaffar <lastname at gmail dot com>
  58. */
  59. public class PageFetcher extends Configurable {
  60. protected static final Logger logger = Logger.getLogger(PageFetcher.class);
  61. protected PoolingClientConnectionManager connectionManager;
  62. protected DefaultHttpClient httpClient;
  63. protected final Object mutex = new Object();
  64. protected long lastFetchTime = 0;
  65. protected IdleConnectionMonitorThread connectionMonitorThread = null;
  66. public PageFetcher(CrawlConfig config) {
  67. super(config);
  68. HttpParams params = new BasicHttpParams();
  69. HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
  70. paramsBean.setVersion(HttpVersion.HTTP_1_1);
  71. paramsBean.setContentCharset("UTF-8");
  72. paramsBean.setUseExpectContinue(false);
  73. params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);
  74. params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString());
  75. params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout());
  76. params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout());
  77. params.setBooleanParameter("http.protocol.handle-redirects", false);
  78. SchemeRegistry schemeRegistry = new SchemeRegistry();
  79. schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
  80. if (config.isIncludeHttpsPages()) {
  81. schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
  82. }
  83. connectionManager = new PoolingClientConnectionManager(schemeRegistry);
  84. connectionManager.setMaxTotal(config.getMaxTotalConnections());
  85. connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());
  86. httpClient = new DefaultHttpClient(connectionManager, params);
  87. if (config.getProxyHost() != null) {
  88. if (config.getProxyUsername() != null) {
  89. httpClient.getCredentialsProvider().setCredentials(
  90. new AuthScope(config.getProxyHost(), config.getProxyPort()),
  91. new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword()));
  92. }
  93. HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort());
  94. httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
  95. }
  96. httpClient.addResponseInterceptor(new HttpResponseInterceptor() {
  97. @Override
  98. public void process(final HttpResponse response, final HttpContext context) throws HttpException,
  99. IOException {
  100. HttpEntity entity = response.getEntity();
  101. Header contentEncoding = entity.getContentEncoding();
  102. if (contentEncoding != null) {
  103. HeaderElement[] codecs = contentEncoding.getElements();
  104. for (HeaderElement codec : codecs) {
  105. if (codec.getName().equalsIgnoreCase("gzip")) {
  106. response.setEntity(new GzipDecompressingEntity(response.getEntity()));
  107. return;
  108. }
  109. }
  110. }
  111. }
  112. });
  113. if (connectionMonitorThread == null) {
  114. connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager);
  115. }
  116. connectionMonitorThread.start();
  117. }
  118. public PageFetchResult fetchHeader(WebURL webUrl) {
  119. PageFetchResult fetchResult = new PageFetchResult();
  120. String toFetchURL = webUrl.getURL();
  121. HttpGet get = null;
  122. try {
  123. get = new HttpGet(toFetchURL);
  124. synchronized (mutex) {
  125. long now = (new Date()).getTime();
  126. if (now - lastFetchTime < config.getPolitenessDelay()) {
  127. Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime));
  128. }
  129. lastFetchTime = (new Date()).getTime();
  130. }
  131. get.addHeader("Accept-Encoding", "gzip");
  132. HttpResponse response = httpClient.execute(get);
  133. fetchResult.setEntity(response.getEntity());
  134. fetchResult.setResponseHeaders(response.getAllHeaders());
  135. int statusCode = response.getStatusLine().getStatusCode();
  136. if (statusCode != HttpStatus.SC_OK) {
  137. if (statusCode != HttpStatus.SC_NOT_FOUND) {
  138. if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
  139. Header header = response.getFirstHeader("Location");
  140. if (header != null) {
  141. String movedToUrl = header.getValue();
  142. movedToUrl = URLCanonicalizer.getCanonicalURL(movedToUrl, toFetchURL);
  143. fetchResult.setMovedToUrl(movedToUrl);
  144. }
  145. fetchResult.setStatusCode(statusCode);
  146. return fetchResult;
  147. }
  148. logger.info("Failed: " + response.getStatusLine().toString() + ", while fetching " + toFetchURL);
  149. }
  150. fetchResult.setStatusCode(response.getStatusLine().getStatusCode());
  151. return fetchResult;
  152. }
  153. fetchResult.setFetchedUrl(toFetchURL);
  154. String uri = get.getURI().toString();
  155. if (!uri.equals(toFetchURL)) {
  156. if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
  157. fetchResult.setFetchedUrl(uri);
  158. }
  159. }
  160. if (fetchResult.getEntity() != null) {
  161. long size = fetchResult.getEntity().getContentLength();
  162. if (size == -1) {
  163. Header length = response.getLastHeader("Content-Length");
  164. if (length == null) {
  165. length = response.getLastHeader("Content-length");
  166. }
  167. if (length != null) {
  168. size = Integer.parseInt(length.getValue());
  169. } else {
  170. size = -1;
  171. }
  172. }
  173. if (size > config.getMaxDownloadSize()) {
  174. fetchResult.setStatusCode(CustomFetchStatus.PageTooBig);
  175. get.abort();
  176. return fetchResult;
  177. }
  178. fetchResult.setStatusCode(HttpStatus.SC_OK);
  179. return fetchResult;
  180. }
  181. get.abort();
  182. } catch (IOException e) {
  183. logger.error("Fatal transport error: " + e.getMessage() + " while fetching " + toFetchURL
  184. + " (link found in doc #" + webUrl.getParentDocid() + ")");
  185. fetchResult.setStatusCode(CustomFetchStatus.FatalTransportError);
  186. return fetchResult;
  187. } catch (IllegalStateException e) {
  188. // ignoring exceptions that occur because of not registering https
  189. // and other schemes
  190. } catch (Exception e) {
  191. if (e.getMessage() == null) {
  192. logger.error("Error while fetching " + webUrl.getURL());
  193. } else {
  194. logger.error(e.getMessage() + " while fetching " + webUrl.getURL());
  195. }
  196. } finally {
  197. try {
  198. if (fetchResult.getEntity() == null && get != null) {
  199. get.abort();
  200. }
  201. } catch (Exception e) {
  202. e.printStackTrace();
  203. }
  204. }
  205. fetchResult.setStatusCode(CustomFetchStatus.UnknownError);
  206. return fetchResult;
  207. }
  208. public synchronized void shutDown() {
  209. if (connectionMonitorThread != null) {
  210. connectionManager.shutdown();
  211. connectionMonitorThread.shutdown();
  212. }
  213. }
  214. public HttpClient getHttpClient() {
  215. return httpClient;
  216. }
  217. private static class GzipDecompressingEntity extends HttpEntityWrapper {
  218. public GzipDecompressingEntity(final HttpEntity entity) {
  219. super(entity);
  220. }
  221. @Override
  222. public InputStream getContent() throws IOException, IllegalStateException {
  223. // the wrapped entity's getContent() decides about repeatability
  224. InputStream wrappedin = wrappedEntity.getContent();
  225. return new GZIPInputStream(wrappedin);
  226. }
  227. @Override
  228. public long getContentLength() {
  229. // length of ungzipped content is not known
  230. return -1;
  231. }
  232. }
  233. }