/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
Java | 276 lines | 210 code | 43 blank | 23 comment | 43 complexity | d810d4b08eaff4189c8627f10aa2c8b8 MD5 | raw file
- /**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package edu.uci.ics.crawler4j.fetcher;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.Date;
- import java.util.zip.GZIPInputStream;
- import org.apache.http.Header;
- import org.apache.http.HeaderElement;
- import org.apache.http.HttpEntity;
- import org.apache.http.HttpException;
- import org.apache.http.HttpHost;
- import org.apache.http.HttpResponse;
- import org.apache.http.HttpResponseInterceptor;
- import org.apache.http.HttpStatus;
- import org.apache.http.HttpVersion;
- import org.apache.http.auth.AuthScope;
- import org.apache.http.auth.UsernamePasswordCredentials;
- import org.apache.http.client.HttpClient;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.client.params.ClientPNames;
- import org.apache.http.client.params.CookiePolicy;
- import org.apache.http.conn.params.ConnRoutePNames;
- import org.apache.http.conn.scheme.PlainSocketFactory;
- import org.apache.http.conn.scheme.Scheme;
- import org.apache.http.conn.scheme.SchemeRegistry;
- import org.apache.http.conn.ssl.SSLSocketFactory;
- import org.apache.http.entity.HttpEntityWrapper;
- import org.apache.http.impl.client.DefaultHttpClient;
- import org.apache.http.impl.conn.PoolingClientConnectionManager;
- import org.apache.http.params.BasicHttpParams;
- import org.apache.http.params.CoreConnectionPNames;
- import org.apache.http.params.CoreProtocolPNames;
- import org.apache.http.params.HttpParams;
- import org.apache.http.params.HttpProtocolParamBean;
- import org.apache.http.protocol.HttpContext;
- import org.apache.log4j.Logger;
- import edu.uci.ics.crawler4j.crawler.Configurable;
- import edu.uci.ics.crawler4j.crawler.CrawlConfig;
- import edu.uci.ics.crawler4j.url.URLCanonicalizer;
- import edu.uci.ics.crawler4j.url.WebURL;
- /**
- * @author Yasser Ganjisaffar <lastname at gmail dot com>
- */
- public class PageFetcher extends Configurable {
- protected static final Logger logger = Logger.getLogger(PageFetcher.class);
- protected PoolingClientConnectionManager connectionManager;
- protected DefaultHttpClient httpClient;
- protected final Object mutex = new Object();
- protected long lastFetchTime = 0;
- protected IdleConnectionMonitorThread connectionMonitorThread = null;
- public PageFetcher(CrawlConfig config) {
- super(config);
- HttpParams params = new BasicHttpParams();
- HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
- paramsBean.setVersion(HttpVersion.HTTP_1_1);
- paramsBean.setContentCharset("UTF-8");
- paramsBean.setUseExpectContinue(false);
- params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);
- params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString());
- params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout());
- params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout());
- params.setBooleanParameter("http.protocol.handle-redirects", false);
- SchemeRegistry schemeRegistry = new SchemeRegistry();
- schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
- if (config.isIncludeHttpsPages()) {
- schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
- }
- connectionManager = new PoolingClientConnectionManager(schemeRegistry);
- connectionManager.setMaxTotal(config.getMaxTotalConnections());
- connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());
- httpClient = new DefaultHttpClient(connectionManager, params);
- if (config.getProxyHost() != null) {
- if (config.getProxyUsername() != null) {
- httpClient.getCredentialsProvider().setCredentials(
- new AuthScope(config.getProxyHost(), config.getProxyPort()),
- new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword()));
- }
- HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort());
- httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
- }
- httpClient.addResponseInterceptor(new HttpResponseInterceptor() {
- @Override
- public void process(final HttpResponse response, final HttpContext context) throws HttpException,
- IOException {
- HttpEntity entity = response.getEntity();
- Header contentEncoding = entity.getContentEncoding();
- if (contentEncoding != null) {
- HeaderElement[] codecs = contentEncoding.getElements();
- for (HeaderElement codec : codecs) {
- if (codec.getName().equalsIgnoreCase("gzip")) {
- response.setEntity(new GzipDecompressingEntity(response.getEntity()));
- return;
- }
- }
- }
- }
- });
- if (connectionMonitorThread == null) {
- connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager);
- }
- connectionMonitorThread.start();
- }
- public PageFetchResult fetchHeader(WebURL webUrl) {
- PageFetchResult fetchResult = new PageFetchResult();
- String toFetchURL = webUrl.getURL();
- HttpGet get = null;
- try {
- get = new HttpGet(toFetchURL);
- synchronized (mutex) {
- long now = (new Date()).getTime();
- if (now - lastFetchTime < config.getPolitenessDelay()) {
- Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime));
- }
- lastFetchTime = (new Date()).getTime();
- }
- get.addHeader("Accept-Encoding", "gzip");
- HttpResponse response = httpClient.execute(get);
- fetchResult.setEntity(response.getEntity());
- fetchResult.setResponseHeaders(response.getAllHeaders());
-
- int statusCode = response.getStatusLine().getStatusCode();
- if (statusCode != HttpStatus.SC_OK) {
- if (statusCode != HttpStatus.SC_NOT_FOUND) {
- if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
- Header header = response.getFirstHeader("Location");
- if (header != null) {
- String movedToUrl = header.getValue();
- movedToUrl = URLCanonicalizer.getCanonicalURL(movedToUrl, toFetchURL);
- fetchResult.setMovedToUrl(movedToUrl);
- }
- fetchResult.setStatusCode(statusCode);
- return fetchResult;
- }
- logger.info("Failed: " + response.getStatusLine().toString() + ", while fetching " + toFetchURL);
- }
- fetchResult.setStatusCode(response.getStatusLine().getStatusCode());
- return fetchResult;
- }
- fetchResult.setFetchedUrl(toFetchURL);
- String uri = get.getURI().toString();
- if (!uri.equals(toFetchURL)) {
- if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
- fetchResult.setFetchedUrl(uri);
- }
- }
- if (fetchResult.getEntity() != null) {
- long size = fetchResult.getEntity().getContentLength();
- if (size == -1) {
- Header length = response.getLastHeader("Content-Length");
- if (length == null) {
- length = response.getLastHeader("Content-length");
- }
- if (length != null) {
- size = Integer.parseInt(length.getValue());
- } else {
- size = -1;
- }
- }
- if (size > config.getMaxDownloadSize()) {
- fetchResult.setStatusCode(CustomFetchStatus.PageTooBig);
- get.abort();
- return fetchResult;
- }
- fetchResult.setStatusCode(HttpStatus.SC_OK);
- return fetchResult;
- }
-
- get.abort();
-
- } catch (IOException e) {
- logger.error("Fatal transport error: " + e.getMessage() + " while fetching " + toFetchURL
- + " (link found in doc #" + webUrl.getParentDocid() + ")");
- fetchResult.setStatusCode(CustomFetchStatus.FatalTransportError);
- return fetchResult;
- } catch (IllegalStateException e) {
- // ignoring exceptions that occur because of not registering https
- // and other schemes
- } catch (Exception e) {
- if (e.getMessage() == null) {
- logger.error("Error while fetching " + webUrl.getURL());
- } else {
- logger.error(e.getMessage() + " while fetching " + webUrl.getURL());
- }
- } finally {
- try {
- if (fetchResult.getEntity() == null && get != null) {
- get.abort();
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- fetchResult.setStatusCode(CustomFetchStatus.UnknownError);
- return fetchResult;
- }
- public synchronized void shutDown() {
- if (connectionMonitorThread != null) {
- connectionManager.shutdown();
- connectionMonitorThread.shutdown();
- }
- }
-
- public HttpClient getHttpClient() {
- return httpClient;
- }
- private static class GzipDecompressingEntity extends HttpEntityWrapper {
- public GzipDecompressingEntity(final HttpEntity entity) {
- super(entity);
- }
- @Override
- public InputStream getContent() throws IOException, IllegalStateException {
- // the wrapped entity's getContent() decides about repeatability
- InputStream wrappedin = wrappedEntity.getContent();
- return new GZIPInputStream(wrappedin);
- }
- @Override
- public long getContentLength() {
- // length of ungzipped content is not known
- return -1;
- }
- }
- }