PageRenderTime 48ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/projects/heritrix-1.14.4/src/java/org/archive/crawler/fetcher/FetchHTTP.java

https://gitlab.com/essere.lab.public/qualitas.class-corpus
Java | 1175 lines | 785 code | 94 blank | 296 comment | 113 complexity | a01a24a28b00f2521529b8ddac671b0f MD5 | raw file
  1. /* FetchHTTP.java
  2. *
  3. * $Id: FetchHTTP.java 6803 2010-04-02 01:03:46Z gojomo $
  4. *
  5. * Created on Jun 5, 2003
  6. *
  7. * Copyright (C) 2003 Internet Archive.
  8. *
  9. * This file is part of the Heritrix web crawler (crawler.archive.org).
  10. *
  11. * Heritrix is free software; you can redistribute it and/or modify
  12. * it under the terms of the GNU Lesser Public License as published by
  13. * the Free Software Foundation; either version 2.1 of the License, or
  14. * any later version.
  15. *
  16. * Heritrix is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. * GNU Lesser Public License for more details.
  20. *
  21. * You should have received a copy of the GNU Lesser Public License
  22. * along with Heritrix; if not, write to the Free Software
  23. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  24. */
  25. package org.archive.crawler.fetcher;
  26. import it.unimi.dsi.mg4j.util.MutableString;
  27. import java.io.File;
  28. import java.io.FileNotFoundException;
  29. import java.io.FileOutputStream;
  30. import java.io.IOException;
  31. import java.io.ObjectInputStream;
  32. import java.io.ObjectOutputStream;
  33. import java.io.RandomAccessFile;
  34. import java.net.InetAddress;
  35. import java.net.UnknownHostException;
  36. import java.security.KeyManagementException;
  37. import java.security.KeyStoreException;
  38. import java.security.MessageDigest;
  39. import java.security.NoSuchAlgorithmException;
  40. import java.util.Collection;
  41. import java.util.HashSet;
  42. import java.util.Iterator;
  43. import java.util.List;
  44. import java.util.ListIterator;
  45. import java.util.Map;
  46. import java.util.Set;
  47. import java.util.logging.Level;
  48. import java.util.logging.Logger;
  49. import javax.management.AttributeNotFoundException;
  50. import javax.management.MBeanException;
  51. import javax.management.ReflectionException;
  52. import javax.net.ssl.SSLContext;
  53. import javax.net.ssl.SSLSocketFactory;
  54. import javax.net.ssl.TrustManager;
  55. import org.apache.commons.httpclient.Cookie;
  56. import org.apache.commons.httpclient.Header;
  57. import org.apache.commons.httpclient.HostConfiguration;
  58. import org.apache.commons.httpclient.HttpClient;
  59. import org.apache.commons.httpclient.HttpConnection;
  60. import org.apache.commons.httpclient.HttpConnectionManager;
  61. import org.apache.commons.httpclient.HttpException;
  62. import org.apache.commons.httpclient.HttpMethod;
  63. import org.apache.commons.httpclient.HttpMethodBase;
  64. import org.apache.commons.httpclient.HttpState;
  65. import org.apache.commons.httpclient.HttpStatus;
  66. import org.apache.commons.httpclient.HttpVersion;
  67. import org.apache.commons.httpclient.auth.AuthChallengeParser;
  68. import org.apache.commons.httpclient.auth.AuthScheme;
  69. import org.apache.commons.httpclient.auth.BasicScheme;
  70. import org.apache.commons.httpclient.auth.DigestScheme;
  71. import org.apache.commons.httpclient.auth.MalformedChallengeException;
  72. import org.apache.commons.httpclient.cookie.CookiePolicy;
  73. import org.apache.commons.httpclient.params.HttpClientParams;
  74. import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
  75. import org.apache.commons.httpclient.params.HttpMethodParams;
  76. import org.apache.commons.httpclient.protocol.Protocol;
  77. import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
  78. import org.archive.crawler.Heritrix;
  79. import org.archive.crawler.datamodel.CoreAttributeConstants;
  80. import org.archive.crawler.datamodel.CrawlHost;
  81. import org.archive.crawler.datamodel.CrawlOrder;
  82. import org.archive.crawler.datamodel.CrawlServer;
  83. import org.archive.crawler.datamodel.CrawlURI;
  84. import org.archive.crawler.datamodel.CredentialStore;
  85. import org.archive.crawler.datamodel.FetchStatusCodes;
  86. import org.archive.crawler.datamodel.ServerCache;
  87. import org.archive.crawler.datamodel.credential.Credential;
  88. import org.archive.crawler.datamodel.credential.CredentialAvatar;
  89. import org.archive.crawler.datamodel.credential.Rfc2617Credential;
  90. import org.archive.crawler.deciderules.DecideRule;
  91. import org.archive.crawler.deciderules.DecideRuleSequence;
  92. import org.archive.crawler.event.CrawlStatusListener;
  93. import org.archive.crawler.extractor.Link;
  94. import org.archive.crawler.framework.Processor;
  95. import org.archive.crawler.settings.SettingsHandler;
  96. import org.archive.crawler.settings.SimpleType;
  97. import org.archive.crawler.settings.StringList;
  98. import org.archive.crawler.settings.Type;
  99. import org.archive.httpclient.ConfigurableX509TrustManager;
  100. import org.archive.httpclient.HttpRecorderGetMethod;
  101. import org.archive.httpclient.HttpRecorderMethod;
  102. import org.archive.httpclient.HttpRecorderPostMethod;
  103. import org.archive.httpclient.SingleHttpConnectionManager;
  104. import org.archive.io.ObjectPlusFilesInputStream;
  105. import org.archive.io.RecorderLengthExceededException;
  106. import org.archive.io.RecorderTimeoutException;
  107. import org.archive.io.RecorderTooMuchHeaderException;
  108. import org.archive.util.ArchiveUtils;
  109. import org.archive.util.HttpRecorder;
  110. import org.archive.util.bdbje.EnhancedEnvironment;
  111. import st.ata.util.AList;
  112. import com.sleepycat.bind.serial.SerialBinding;
  113. import com.sleepycat.bind.serial.StoredClassCatalog;
  114. import com.sleepycat.bind.tuple.StringBinding;
  115. import com.sleepycat.collections.StoredSortedMap;
  116. import com.sleepycat.je.Database;
  117. import com.sleepycat.je.DatabaseConfig;
  118. import com.sleepycat.je.DatabaseException;
/**
 * HTTP fetcher that uses <a
 * href="http://jakarta.apache.org/commons/httpclient/">Apache Jakarta Commons
 * HttpClient</a> library.
 *
 * @author Gordon Mohr
 * @author Igor Ranitovic
 * @author others
 * @version $Id: FetchHTTP.java 6803 2010-04-02 01:03:46Z gojomo $
 */
public class FetchHTTP extends Processor
implements CoreAttributeConstants, FetchStatusCodes, CrawlStatusListener {
    // Stable UID derived from the class name: be robust against trivial
    // implementation changes.
    private static final long serialVersionUID =
        ArchiveUtils.classnameBasedUID(FetchHTTP.class,1);

    private static Logger logger = Logger.getLogger(FetchHTTP.class.getName());

    // Operator-settable attribute names (keys into the settings framework).
    public static final String ATTR_HTTP_PROXY_HOST = A_HTTP_PROXY_HOST;
    public static final String ATTR_HTTP_PROXY_PORT = A_HTTP_PROXY_PORT;
    public static final String ATTR_TIMEOUT_SECONDS = "timeout-seconds";
    public static final String ATTR_SOTIMEOUT_MS = "sotimeout-ms";
    public static final String ATTR_MAX_LENGTH_BYTES = "max-length-bytes";
    public static final String ATTR_LOAD_COOKIES = "load-cookies-from-file";
    public static final String ATTR_SAVE_COOKIES = "save-cookies-to-file";
    public static final String ATTR_ACCEPT_HEADERS = "accept-headers";
    public static final String ATTR_DEFAULT_ENCODING = "default-encoding";
    public static final String ATTR_DIGEST_CONTENT = "digest-content";
    public static final String ATTR_DIGEST_ALGORITHM = "digest-algorithm";
    public static final String ATTR_FETCH_BANDWIDTH_MAX = "fetch-bandwidth";

    // Descriptions surfaced to the operator in the settings UI.
    public static final String DESC_DIGEST_CONTENT = "Whether or not to"
        + " perform an on-the-fly digest hash of retrieved content-bodies.";
    public static final String DESC_DIGEST_ALGORITHM = "Which algorithm (for"
        + " example MD5 or SHA-1) to use to perform an on-the-fly digest hash"
        + " of retrieved content-bodies.";

    /**
     * SSL trust level setting attribute name.
     */
    public static final String ATTR_TRUST = "trust-level";

    // Defaults for the numeric settings above.
    private static Integer DEFAULT_TIMEOUT_SECONDS = new Integer(1200);
    private static Integer DEFAULT_SOTIMEOUT_MS = new Integer(20000);
    private static Long DEFAULT_MAX_LENGTH_BYTES = new Long(0);
    private static Integer DEFAULT_FETCH_BANDWIDTH_MAX = 0;

    /**
     * This is the default value pre-1.4 (Long.MAX_VALUE). Needs special
     * handling else treated as negative number doing math later in
     * processing.
     */
    private static long OLD_DEFAULT_MAX_LENGTH_BYTES = 9223372036854775807L;

    /**
     * Default character encoding to use for pages that do not specify.
     */
    private static String DEFAULT_CONTENT_CHARSET = Heritrix.DEFAULT_ENCODING;

    /**
     * Default whether to perform on-the-fly digest hashing of content-bodies.
     */
    static Boolean DEFAULT_DIGEST_CONTENT = new Boolean(true);

    /**
     * The different digest algorithms to choose between,
     * SHA-1 or MD-5 at the moment.
     */
    public static final String SHA1 = "sha1";
    public static final String MD5 = "md5";
    public static String [] DIGEST_ALGORITHMS = {SHA1, MD5};

    /**
     * Default algorithm to use for message digesting.
     */
    public static final String DEFAULT_DIGEST_ALGORITHM = SHA1;

    // Shared HttpClient instance; transient, so not serialized with the
    // processor.
    private transient HttpClient http = null;

    /**
     * How many 'instant retries' of HttpRecoverableExceptions have occurred
     *
     * Would like it to be 'long', but longs aren't atomic
     */
    private int recoveryRetries = 0;

    /**
     * Count of crawl uris handled.
     * Would like to be 'long', but longs aren't atomic
     */
    private int curisHandled = 0;

    /**
     * Rules to apply mid-fetch, just after receipt of the response
     * headers before we start to download body.
     */
    public static final String ATTR_MIDFETCH_DECIDE_RULES = "midfetch-decide-rules";

    /**
     * Annotation to log if midfetch abort.
     */
    private static final String MIDFETCH_ABORT_LOG = "midFetchAbort";

    public static final String ATTR_SEND_CONNECTION_CLOSE =
        "send-connection-close";
    // Reusable header instance sent when send-connection-close is enabled.
    private static final Header HEADER_SEND_CONNECTION_CLOSE =
        new Header("Connection", "close");
    public static final String ATTR_SEND_REFERER = "send-referer";
    public static final String ATTR_SEND_RANGE = "send-range";
    public static final String ATTR_SEND_IF_MODIFIED_SINCE = "send-if-modified-since";
    public static final String ATTR_SEND_IF_NONE_MATCH = "send-if-none-match";

    // Request-header names/values used when the corresponding settings
    // are enabled.
    public static final String REFERER = "Referer";
    public static final String RANGE = "Range";
    public static final String RANGE_PREFIX = "bytes=0-";
    public static final String HTTP_SCHEME = "http";
    public static final String HTTPS_SCHEME = "https";

    public static final String ATTR_IGNORE_COOKIES = "ignore-cookies";
    private static Boolean DEFAULT_IGNORE_COOKIES = new Boolean(false);
    public static final String ATTR_BDB_COOKIES = "use-bdb-for-cookies";
    private static Boolean DEFAULT_BDB_COOKIES = new Boolean(true);
    public static final String ATTR_HTTP_BIND_ADDRESS = A_HTTP_BIND_ADDRESS;

    /**
     * Database backing cookie map, if using BDB
     */
    protected Database cookieDb;

    /**
     * Name of cookie BDB Database
     */
    public static final String COOKIEDB_NAME = "http_cookies";

    // Register Heritrix's own protocol socket factories for http/https so
    // all HttpClient connections are created through them.
    static {
        Protocol.registerProtocol("http", new Protocol("http",
            new HeritrixProtocolSocketFactory(), 80));
        try {
            Protocol.registerProtocol("https",
                new Protocol("https", ((ProtocolSocketFactory)
                    new HeritrixSSLProtocolSocketFactory()), 443));
        } catch (KeyManagementException e) {
            e.printStackTrace();
        } catch (KeyStoreException e) {
            e.printStackTrace();
        } catch (NoSuchAlgorithmException e) {
            e.printStackTrace();
        }
    }

    static final String SERVER_CACHE_KEY = "heritrix.server.cache";
    static final String SSL_FACTORY_KEY = "heritrix.ssl.factory";

    /**
     * Socket factory that has the configurable trust manager installed.
     */
    private SSLSocketFactory sslfactory = null;
  252. /**
  253. * Constructor.
  254. *
  255. * @param name Name of this processor.
  256. */
  257. public FetchHTTP(String name) {
  258. super(name, "HTTP Fetcher");
  259. addElementToDefinition(
  260. new DecideRuleSequence(ATTR_MIDFETCH_DECIDE_RULES,
  261. "DecideRules which, if final decision is REJECT, " +
  262. "abort fetch after headers before all content is" +
  263. "read."));
  264. addElementToDefinition(new SimpleType(ATTR_TIMEOUT_SECONDS,
  265. "If the fetch is not completed in this number of seconds, "
  266. + "even if it is making progress, give up. The URI will be "
  267. + "annotated as timeTrunc. Set to zero for no timeout. "
  268. + "(This is not recommended: threads could wait indefinitely "
  269. + "for the fetch to end.)",
  270. DEFAULT_TIMEOUT_SECONDS));
  271. Type e = addElementToDefinition(new SimpleType(ATTR_SOTIMEOUT_MS,
  272. "If a socket is unresponsive for this number of milliseconds, " +
  273. "give up on that connects/read. (This does not necessarily give " +
  274. "up on the fetch immediately; connects are subject to retries " +
  275. "and reads will be retried until " + ATTR_TIMEOUT_SECONDS +
  276. " have elapsed. Set to zero for no socket timeout. (This is " +
  277. "note recommended: a socket operation could hand indefinitely.",
  278. DEFAULT_SOTIMEOUT_MS));
  279. e.setExpertSetting(true);
  280. e = addElementToDefinition(new SimpleType(ATTR_FETCH_BANDWIDTH_MAX,
  281. "The maximum KB/sec to use when fetching data from a server. " +
  282. "0 means no maximum. Default: "+ DEFAULT_FETCH_BANDWIDTH_MAX
  283. + ".", DEFAULT_FETCH_BANDWIDTH_MAX));
  284. e.setExpertSetting(true);
  285. e.setOverrideable(true);
  286. addElementToDefinition(new SimpleType(ATTR_MAX_LENGTH_BYTES,
  287. "Maximum length in bytes to fetch.\n" +
  288. "Fetch is truncated at this length. A value of 0 means no limit.",
  289. DEFAULT_MAX_LENGTH_BYTES));
  290. e = addElementToDefinition(new SimpleType(ATTR_IGNORE_COOKIES,
  291. "Disable cookie-handling.", DEFAULT_IGNORE_COOKIES));
  292. e.setOverrideable(true);
  293. e.setExpertSetting(true);
  294. e = addElementToDefinition(new SimpleType(ATTR_BDB_COOKIES,
  295. "Store cookies in BDB-backed map.", DEFAULT_BDB_COOKIES));
  296. e.setExpertSetting(true);
  297. e = addElementToDefinition(new SimpleType(ATTR_LOAD_COOKIES,
  298. "File to preload cookies from", ""));
  299. e.setExpertSetting(true);
  300. e = addElementToDefinition(new SimpleType(ATTR_SAVE_COOKIES,
  301. "When crawl finishes save cookies to this file", ""));
  302. e.setExpertSetting(true);
  303. e = addElementToDefinition(new SimpleType(ATTR_TRUST,
  304. "SSL certificate trust level. Range is from the default 'open'"
  305. + " (trust all certs including expired, selfsigned, and those for"
  306. + " which we do not have a CA) through 'loose' (trust all valid"
  307. + " certificates including selfsigned), 'normal' (all valid"
  308. + " certificates not including selfsigned) to 'strict' (Cert is"
  309. + " valid and DN must match servername)",
  310. ConfigurableX509TrustManager.DEFAULT,
  311. ConfigurableX509TrustManager.LEVELS_AS_ARRAY));
  312. e.setOverrideable(false);
  313. e.setExpertSetting(true);
  314. e = addElementToDefinition(new StringList(ATTR_ACCEPT_HEADERS,
  315. "Accept Headers to include in each request. Each must be the"
  316. + " complete header, e.g., 'Accept-Language: en'"));
  317. e.setExpertSetting(true);
  318. e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_HOST,
  319. "Proxy host IP (set only if needed).", ""));
  320. e.setExpertSetting(true);
  321. e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_PORT,
  322. "Proxy port (set only if needed)", ""));
  323. e.setExpertSetting(true);
  324. e = addElementToDefinition(new SimpleType(ATTR_DEFAULT_ENCODING,
  325. "The character encoding to use for files that do not have one" +
  326. " specified in the HTTP response headers. Default: " +
  327. DEFAULT_CONTENT_CHARSET + ".",
  328. DEFAULT_CONTENT_CHARSET));
  329. e.setExpertSetting(true);
  330. e = addElementToDefinition(new SimpleType(ATTR_DIGEST_CONTENT, DESC_DIGEST_CONTENT,
  331. DEFAULT_DIGEST_CONTENT));
  332. e.setExpertSetting(true);
  333. e = addElementToDefinition(new SimpleType(ATTR_DIGEST_ALGORITHM, DESC_DIGEST_ALGORITHM,
  334. DEFAULT_DIGEST_ALGORITHM, DIGEST_ALGORITHMS));
  335. e.setExpertSetting(true);
  336. e = addElementToDefinition(new SimpleType(ATTR_SEND_IF_MODIFIED_SINCE,
  337. "Send 'If-Modified-Since' header, if previous 'Last-Modified' " +
  338. "fetch history information is available in URI history.",
  339. new Boolean(true)));
  340. e.setOverrideable(true);
  341. e.setExpertSetting(true);
  342. e = addElementToDefinition(new SimpleType(ATTR_SEND_IF_NONE_MATCH,
  343. "Send 'If-None-Match' header, if previous 'Etag' fetch " +
  344. "history information is available in URI history.",
  345. new Boolean(true)));
  346. e.setOverrideable(true);
  347. e.setExpertSetting(true);
  348. e = addElementToDefinition(new SimpleType(ATTR_SEND_CONNECTION_CLOSE,
  349. "Send 'Connection: close' header with every request.",
  350. new Boolean(true)));
  351. e.setOverrideable(true);
  352. e.setExpertSetting(true);
  353. e = addElementToDefinition(new SimpleType(ATTR_SEND_REFERER,
  354. "Send 'Referer' header with every request.\n" +
  355. "The 'Referer' header contans the location the crawler came " +
  356. " from, " +
  357. "the page the current URI was discovered in. The 'Referer' " +
  358. "usually is " +
  359. "logged on the remote server and can be of assistance to " +
  360. "webmasters trying to figure how a crawler got to a " +
  361. "particular area on a site.",
  362. new Boolean(true)));
  363. e.setOverrideable(true);
  364. e.setExpertSetting(true);
  365. e = addElementToDefinition(new SimpleType(ATTR_SEND_RANGE,
  366. "Send 'Range' header when a limit (" + ATTR_MAX_LENGTH_BYTES +
  367. ") on document size.\n" +
  368. "Be polite to the HTTP servers and send the 'Range' header," +
  369. "stating that you are only interested in the first n bytes. " +
  370. "Only pertinent if " + ATTR_MAX_LENGTH_BYTES + " > 0. " +
  371. "Sending the 'Range' header results in a " +
  372. "'206 Partial Content' status response, which is better than " +
  373. "just cutting the response mid-download. On rare occasion, " +
  374. " sending 'Range' will " +
  375. "generate '416 Request Range Not Satisfiable' response.",
  376. new Boolean(false)));
  377. e.setOverrideable(true);
  378. e.setExpertSetting(true);
  379. e = addElementToDefinition(new SimpleType(ATTR_HTTP_BIND_ADDRESS,
  380. "Local IP address or hostname to use when making connections " +
  381. "(binding sockets). When not specified, uses default local" +
  382. "address(es).", ""));
  383. e.setExpertSetting(true);
  384. }
    /**
     * Fetch the given URI over HTTP, recording the transaction into the
     * thread's HttpRecorder and annotating the CrawlURI with fetch status,
     * timing, sizes and (optionally) a content digest.
     *
     * @param curi CrawlURI to fetch.
     * @throws InterruptedException if the fetch thread is interrupted.
     */
    protected void innerProcess(final CrawlURI curi)
    throws InterruptedException {
        if (!canFetch(curi)) {
            // Cannot fetch this, due to protocol, retries, or other problems
            return;
        }
        this.curisHandled++;
        // Note begin time
        curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());
        // Get a reference to the HttpRecorder that is set into this ToeThread.
        HttpRecorder rec = HttpRecorder.getHttpRecorder();
        // Shall we get a digest on the content downloaded?
        boolean digestContent = ((Boolean)getUncheckedAttribute(curi,
            ATTR_DIGEST_CONTENT)).booleanValue();
        String algorithm = null;
        if (digestContent) {
            algorithm = ((String)getUncheckedAttribute(curi,
                ATTR_DIGEST_ALGORITHM));
            rec.getRecordedInput().setDigest(algorithm);
        } else {
            // Clear any digest left on the recorder from prior use.
            rec.getRecordedInput().setDigest((MessageDigest)null);
        }
        // Below we do two inner classes that add check of midfetch
        // filters just as we're about to receive the response body.
        String curiString = curi.getUURI().toString();
        HttpMethodBase method = null;
        if (curi.isPost()) {
            method = new HttpRecorderPostMethod(curiString, rec) {
                protected void readResponseBody(HttpState state,
                        HttpConnection conn)
                        throws IOException, HttpException {
                    // Record status/content-type first so midfetch rules
                    // can consult them.
                    addResponseContent(this, curi);
                    if (checkMidfetchAbort(curi, this.httpRecorderMethod, conn)) {
                        doAbort(curi, this, MIDFETCH_ABORT_LOG);
                    } else {
                        super.readResponseBody(state, conn);
                    }
                }
            };
        } else {
            method = new HttpRecorderGetMethod(curiString, rec) {
                protected void readResponseBody(HttpState state,
                        HttpConnection conn)
                        throws IOException, HttpException {
                    addResponseContent(this, curi);
                    if (checkMidfetchAbort(curi, this.httpRecorderMethod,
                            conn)) {
                        doAbort(curi, this, MIDFETCH_ABORT_LOG);
                    } else {
                        super.readResponseBody(state, conn);
                    }
                }
            };
        }
        HostConfiguration customConfigOrNull = configureMethod(curi, method);
        // Set httpRecorder into curi. Subsequent code both here and later
        // in extractors expects to find the HttpRecorder in the CrawlURI.
        curi.setHttpRecorder(rec);
        // Populate credentials. Set config so auth. is not automatic.
        boolean addedCredentials = populateCredentials(curi, method);
        method.setDoAuthentication(addedCredentials);
        // set hardMax on bytes (if set by operator)
        long hardMax = getMaxLength(curi);
        // set overall timeout (if set by operator)
        long timeoutMs = 1000 * getTimeout(curi);
        // Get max fetch rate (bytes/ms). It comes in in KB/sec
        long maxRateKBps = getMaxFetchRate(curi);
        rec.getRecordedInput().setLimits(hardMax, timeoutMs, maxRateKBps);
        try {
            this.http.executeMethod(customConfigOrNull, method);
        } catch (RecorderTooMuchHeaderException ex) {
            // when too much header material, abort like other truncations
            doAbort(curi, method, HEADER_TRUNC);
        } catch (IOException e) {
            failedExecuteCleanup(method, curi, e);
            return;
        } catch (ArrayIndexOutOfBoundsException e) {
            // For weird windows-only ArrayIndex exceptions in native
            // code... see
            // http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
            // treating as if it were an IOException
            failedExecuteCleanup(method, curi, e);
            return;
        }
        // set softMax on bytes to get (if implied by content-length)
        long softMax = method.getResponseContentLength();
        try {
            if (!method.isAborted()) {
                // Force read-to-end, so that any socket hangs occur here,
                // not in later modules.
                rec.getRecordedInput().readFullyOrUntil(softMax);
            }
        } catch (RecorderTimeoutException ex) {
            doAbort(curi, method, TIMER_TRUNC);
        } catch (RecorderLengthExceededException ex) {
            doAbort(curi, method, LENGTH_TRUNC);
        } catch (IOException e) {
            cleanup(curi, e, "readFully", S_CONNECT_LOST);
            return;
        } catch (ArrayIndexOutOfBoundsException e) {
            // For weird windows-only ArrayIndex exceptions from native code
            // see http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
            // treating as if it were an IOException
            cleanup(curi, e, "readFully", S_CONNECT_LOST);
            return;
        } finally {
            // ensure recording has stopped
            rec.closeRecorders();
            if (!method.isAborted()) {
                method.releaseConnection();
            }
            // Note completion time
            curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
            // Set the response charset into the HttpRecord if available.
            setCharacterEncoding(rec, method);
            setSizes(curi, rec);
        }
        if (digestContent) {
            curi.setContentDigest(algorithm,
                rec.getRecordedInput().getDigestValue());
        }
        if (logger.isLoggable(Level.INFO)) {
            logger.info((curi.isPost()? "POST": "GET") + " " +
                curi.getUURI().toString() + " " + method.getStatusCode() +
                " " + rec.getRecordedInput().getSize() + " " +
                curi.getContentType());
        }
        if (curi.isSuccess() && addedCredentials) {
            // Promote the credentials from the CrawlURI to the CrawlServer
            // so they are available for all subsequent CrawlURIs on this
            // server.
            promoteCredentials(curi);
            if (logger.isLoggable(Level.FINE)) {
                // Print out the cookie. Might help with the debugging.
                Header setCookie = method.getResponseHeader("set-cookie");
                if (setCookie != null) {
                    logger.fine(setCookie.toString().trim());
                }
            }
        } else if (method.getStatusCode() == HttpStatus.SC_UNAUTHORIZED) {
            // 401 is not 'success'.
            handle401(method, curi);
        }
        if (rec.getRecordedInput().isOpen()) {
            // Should not happen: method release above is expected to have
            // closed the recorded-input stream. Log and close defensively.
            logger.severe(curi.toString() + " RIS still open. Should have" +
                " been closed by method release: " +
                Thread.currentThread().getName());
            try {
                rec.getRecordedInput().close();
            } catch (IOException e) {
                logger.log(Level.SEVERE,"second-chance RIS close failed",e);
            }
        }
    }
  540. /**
  541. * Update CrawlURI internal sizes based on current transaction (and
  542. * in the case of 304s, history)
  543. *
  544. * @param curi CrawlURI
  545. * @param rec HttpRecorder
  546. */
  547. protected void setSizes(final CrawlURI curi, HttpRecorder rec) {
  548. // set reporting size
  549. curi.setContentSize(rec.getRecordedInput().getSize());
  550. // special handling for 304-not modified
  551. if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED
  552. && curi.containsKey(A_FETCH_HISTORY)) {
  553. AList history[] = curi.getAList().getAListArray(A_FETCH_HISTORY);
  554. if (history[0] != null
  555. && history[0]
  556. .containsKey(CoreAttributeConstants.A_REFERENCE_LENGTH)) {
  557. long referenceLength = history[0].getLong(A_REFERENCE_LENGTH);
  558. // carry-forward previous 'reference-length' for future
  559. curi.putLong(A_REFERENCE_LENGTH, referenceLength);
  560. // increase content-size to virtual-size for reporting
  561. curi.setContentSize(rec.getRecordedInput().getSize()
  562. + referenceLength);
  563. }
  564. }
  565. }
    /**
     * Cut a fetch short: annotate the CrawlURI, close its recorder, and
     * abort the in-flight HTTP method.
     *
     * @param curi CrawlURI being fetched.
     * @param method in-flight method to abort.
     * @param annotation annotation token recorded on the URI (e.g. the
     *     midfetch-abort or truncation markers).
     */
    protected void doAbort(CrawlURI curi, HttpMethod method,
            String annotation) {
        curi.addAnnotation(annotation);
        // Close the recorder first, then abort the underlying method.
        curi.getHttpRecorder().close();
        method.abort();
    }
  572. protected boolean checkMidfetchAbort(CrawlURI curi,
  573. HttpRecorderMethod method, HttpConnection conn) {
  574. if (curi.isPrerequisite() || rulesAccept(getMidfetchRule(curi), curi)) {
  575. return false;
  576. }
  577. method.markContentBegin(conn);
  578. return true;
  579. }
    /**
     * Look up the midfetch DecideRule sequence for the given settings
     * context.
     *
     * @param o settings context (e.g. a CrawlURI).
     * @return the configured midfetch DecideRule.
     * @throws RuntimeException wrapping AttributeNotFoundException if the
     *     attribute is missing (not expected: it is added in the
     *     constructor).
     */
    protected DecideRule getMidfetchRule(Object o) {
        try {
            return (DecideRule)getAttribute(o, ATTR_MIDFETCH_DECIDE_RULES);
        } catch (AttributeNotFoundException e) {
            throw new RuntimeException(e);
        }
    }
  587. /**
  588. * This method populates <code>curi</code> with response status and
  589. * content type.
  590. * @param curi CrawlURI to populate.
  591. * @param method Method to get response status and headers from.
  592. */
  593. protected void addResponseContent (HttpMethod method, CrawlURI curi) {
  594. curi.setFetchStatus(method.getStatusCode());
  595. Header ct = method.getResponseHeader("content-type");
  596. curi.setContentType((ct == null)? null: ct.getValue());
  597. // Save method into curi too. Midfetch filters may want to leverage
  598. // info in here.
  599. curi.putObject(A_HTTP_TRANSACTION, method);
  600. }
  601. /**
  602. * Set the character encoding based on the result headers or default.
  603. *
  604. * The HttpClient returns its own default encoding ("ISO-8859-1") if one
  605. * isn't specified in the Content-Type response header. We give the user
  606. * the option of overriding this, so we need to detect the case where the
  607. * default is returned.
  608. *
  609. * Now, it may well be the case that the default returned by HttpClient
  610. * and the default defined by the user are the same.
  611. *
  612. * @param rec Recorder for this request.
  613. * @param method Method used for the request.
  614. */
  615. private void setCharacterEncoding(final HttpRecorder rec,
  616. final HttpMethod method) {
  617. String encoding = null;
  618. try {
  619. encoding = ((HttpMethodBase) method).getResponseCharSet();
  620. if (encoding == null ||
  621. encoding.equals(DEFAULT_CONTENT_CHARSET)) {
  622. encoding = (String) getAttribute(ATTR_DEFAULT_ENCODING);
  623. }
  624. } catch (Exception e) {
  625. logger.warning("Failed get default encoding: " +
  626. e.getLocalizedMessage());
  627. }
  628. rec.setCharacterEncoding(encoding);
  629. }
    /**
     * Cleanup after a failed method execute.
     * @param curi CrawlURI we failed on.
     * @param method Method we failed on.
     * @param exception Exception we failed with.
     */
    private void failedExecuteCleanup(final HttpMethod method,
            final CrawlURI curi, final Exception exception) {
        // If the request went out, the connection was lost mid-flight;
        // otherwise the connect itself failed.
        cleanup(curi, exception, "executeMethod", (method.isRequestSent() ? S_CONNECT_LOST : S_CONNECT_FAILED));
        method.releaseConnection();
    }
    /**
     * Cleanup after a failed method execute: record the error on the URI,
     * set the fetch status, and close the URI's recorder.
     *
     * @param curi CrawlURI we failed on.
     * @param exception Exception we failed with.
     * @param message Message to log with failure.
     * @param status Status to set on the fetch.
     */
    private void cleanup(final CrawlURI curi, final Exception exception,
            final String message, final int status) {
        curi.addLocalizedError(this.getName(), exception, message);
        curi.setFetchStatus(status);
        curi.getHttpRecorder().close();
    }
  654. /**
  655. * Can this processor fetch the given CrawlURI. May set a fetch
  656. * status if this processor would usually handle the CrawlURI,
  657. * but cannot in this instance.
  658. *
  659. * @param curi
  660. * @return True if processor can fetch.
  661. */
  662. private boolean canFetch(CrawlURI curi) {
  663. if(curi.getFetchStatus()<0) {
  664. // already marked as errored, this pass through
  665. // skip to end
  666. curi.skipToProcessorChain(getController().getPostprocessorChain());
  667. return false;
  668. }
  669. String scheme = curi.getUURI().getScheme();
  670. if (!(scheme.equals("http") || scheme.equals("https"))) {
  671. // handles only plain http and https
  672. return false;
  673. }
  674. CrawlHost host = getController().getServerCache().getHostFor(curi);
  675. // make sure the dns lookup succeeded
  676. if (host.getIP() == null && host.hasBeenLookedUp()) {
  677. curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
  678. return false;
  679. }
  680. return true;
  681. }
    /**
     * Configure the HttpMethod setting options and headers.
     *
     * Applies per-URI settings: cookie policy, HTTP version, User-Agent/From
     * headers, retry handler, optional Range/Connection:close/Referer and
     * conditional-GET headers. Also builds the HostConfiguration (proxy,
     * bind address) to use for this request.
     *
     * @param curi CrawlURI from which we pull configuration.
     * @param method The Method to configure.
     * @return HostConfiguration copy customized for this CrawlURI
     */
    protected HostConfiguration configureMethod(CrawlURI curi, HttpMethod method) {
        // Don't auto-follow redirects; the crawler handles redirects itself.
        method.setFollowRedirects(false);
//        // set soTimeout
//        method.getParams().setSoTimeout(
//            ((Integer) getUncheckedAttribute(curi, ATTR_SOTIMEOUT_MS))
//                .intValue());
        // Set cookie policy: either ignore cookies entirely or emulate
        // common-browser handling, per the ATTR_IGNORE_COOKIES setting.
        method.getParams().setCookiePolicy(
            (((Boolean)getUncheckedAttribute(curi, ATTR_IGNORE_COOKIES)).
                booleanValue())?
                    CookiePolicy.IGNORE_COOKIES:
                    CookiePolicy.BROWSER_COMPATIBILITY);
        // Use only HTTP/1.0 (to avoid receiving chunked responses)
        method.getParams().setVersion(HttpVersion.HTTP_1_0);
        CrawlOrder order = getSettingsHandler().getOrder();
        // Prefer a per-URI user agent when one has been set; otherwise use
        // the crawl order's configured value.
        String userAgent = curi.getUserAgent();
        if (userAgent == null) {
            userAgent = order.getUserAgent(curi);
        }
        method.setRequestHeader("User-Agent", userAgent);
        method.setRequestHeader("From", order.getFrom(curi));
        // Set retry handler.
        method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
            new HeritrixHttpMethodRetryHandler());
        // Optionally advertise the document-size cap via a Range header.
        final long maxLength = getMaxLength(curi);
        if(maxLength > 0 &&
                ((Boolean)getUncheckedAttribute(curi, ATTR_SEND_RANGE)).
                    booleanValue()) {
            method.addRequestHeader(RANGE,
                RANGE_PREFIX.concat(Long.toString(maxLength - 1)));
        }
        if (((Boolean)getUncheckedAttribute(curi,
                ATTR_SEND_CONNECTION_CLOSE)).booleanValue()) {
            method.addRequestHeader(HEADER_SEND_CONNECTION_CLOSE);
        }
        // Send a Referer unless disabled, or unless this URI was discovered
        // via a miscellaneous prerequisite (no meaningful referer then).
        if (((Boolean)getUncheckedAttribute(curi,
                ATTR_SEND_REFERER)).booleanValue()
                && (curi.getViaContext()==null ||
                    !Link.PREREQ_MISC.equals(
                        curi.getViaContext().toString()))) {
            // RFC2616 says no referer header if referer is https and the url
            // is not
            String via = curi.flattenVia();
            if (via != null && via.length() > 0 &&
                    !(via.startsWith(HTTPS_SCHEME) &&
                        curi.getUURI().getScheme().equals(HTTP_SCHEME))) {
                method.setRequestHeader(REFERER, via);
            }
        }
        // Conditional-GET headers only make sense for ordinary fetches,
        // not for prerequisites (robots.txt, dns, login, etc.).
        if(!curi.isPrerequisite()) {
            setConditionalGetHeader(curi, method, ATTR_SEND_IF_MODIFIED_SINCE,
                CoreAttributeConstants.A_LAST_MODIFIED_HEADER, "If-Modified-Since");
            setConditionalGetHeader(curi, method, ATTR_SEND_IF_NONE_MATCH,
                CoreAttributeConstants.A_ETAG_HEADER, "If-None-Match");
        }
        // TODO: What happens if below method adds a header already
        // added above: e.g. Connection, Range, or Referer?
        setAcceptHeaders(curi, method);
        // Start from the client's shared HostConfiguration, then overlay
        // per-URI proxy and local-bind-address settings on a private copy.
        HostConfiguration config =
            new HostConfiguration(http.getHostConfiguration());
        configureProxy(curi, config);
        configureBindAddress(curi, config);
        return config;
    }
  753. /**
  754. * Set the given conditional-GET header, if the setting is enabled and
  755. * a suitable value is available in the URI history.
  756. * @param curi source CrawlURI
  757. * @param method HTTP operation pending
  758. * @param setting true/false enablement setting name to consult
  759. * @param sourceHeader header to consult in URI history
  760. * @param targetHeader header to set if possible
  761. */
  762. protected void setConditionalGetHeader(CrawlURI curi, HttpMethod method,
  763. String setting, String sourceHeader, String targetHeader) {
  764. if(((Boolean)getUncheckedAttribute(curi,setting))) {
  765. try {
  766. int previousStatus = curi.getAList().getAListArray(
  767. A_FETCH_HISTORY)[0].getInt(A_STATUS);
  768. if(previousStatus<=0) {
  769. // do not reuse headers from any broken fetch
  770. return;
  771. }
  772. String previousValue = curi.getAList().getAListArray(
  773. A_FETCH_HISTORY)[0].getString(sourceHeader);
  774. if(previousValue!=null) {
  775. method.setRequestHeader(targetHeader, previousValue);
  776. }
  777. } catch (RuntimeException e) {
  778. // for absent key, bad index, etc. just do nothing
  779. }
  780. }
  781. }
  782. /**
  783. * Setup proxy, based on attributes in CrawlURI and settings,
  784. * in the given HostConfiguration
  785. */
  786. private void configureProxy(CrawlURI curi, HostConfiguration config) {
  787. String proxy = (String) getAttributeEither(curi, ATTR_HTTP_PROXY_HOST);
  788. int port = -1;
  789. if(proxy.length()==0) {
  790. proxy = null;
  791. } else {
  792. String portString = (String)getAttributeEither(curi, ATTR_HTTP_PROXY_PORT);
  793. port = portString.length()>0 ? Integer.parseInt(portString) : -1;
  794. }
  795. if(proxy!=null) {
  796. config.setProxy(proxy,port);
  797. }
  798. }
  799. /**
  800. * Setup local bind address, based on attributes in CrawlURI and settings,
  801. * in the given HostConfiguration
  802. */
  803. private void configureBindAddress(CrawlURI curi, HostConfiguration config) {
  804. String addressString = (String) getAttributeEither(curi, ATTR_HTTP_BIND_ADDRESS);
  805. if(addressString != null && addressString.length() > 0) {
  806. try {
  807. InetAddress localAddress = InetAddress.getByName(addressString);
  808. config.setLocalAddress(localAddress);
  809. } catch (UnknownHostException e) {
  810. // Convert all to RuntimeException so get an exception out
  811. // if initialization fails.
  812. throw new RuntimeException("Unknown host " + addressString
  813. + " in " + ATTR_HTTP_BIND_ADDRESS);
  814. }
  815. }
  816. }
  817. /**
  818. * Get a value either from inside the CrawlURI instance, or from
  819. * settings (module attributes).
  820. *
  821. * @param curi CrawlURI to consult
  822. * @param key key to lookup
  823. * @return value from either CrawlURI (preferred) or settings
  824. */
  825. protected Object getAttributeEither(CrawlURI curi, String key) {
  826. Object obj = curi!=null ? curi.getObject(key) : null;
  827. if(obj==null) {
  828. obj = getUncheckedAttribute(curi, key);
  829. }
  830. return obj;
  831. }
  832. /**
  833. * Add credentials if any to passed <code>method</code>.
  834. *
  835. * Do credential handling. Credentials are in two places. 1. Credentials
  836. * that succeeded are added to the CrawlServer (Or rather, avatars for
  837. * credentials are whats added because its not safe to keep around
  838. * references to credentials). 2. Credentials to be tried are in the curi.
  839. * Returns true if found credentials to be tried.
  840. *
  841. * @param curi Current CrawlURI.
  842. * @param method The method to add to.
  843. * @return True if prepopulated <code>method</code> with credentials AND the
  844. * credentials came from the <code>curi</code>, not from the CrawlServer.
  845. * The former is special in that if the <code>curi</curi> credentials
  846. * succeed, then the caller needs to promote them from the CrawlURI to the
  847. * CrawlServer so they are available for all subsequent CrawlURIs on this
  848. * server.
  849. */
  850. private boolean populateCredentials(CrawlURI curi, HttpMethod method) {
  851. // First look at the server avatars. Add any that are to be volunteered
  852. // on every request (e.g. RFC2617 credentials). Every time creds will
  853. // return true when we call 'isEveryTime().
  854. CrawlServer server =
  855. getController().getServerCache().getServerFor(curi);
  856. if (server.hasCredentialAvatars()) {
  857. Set avatars = server.getCredentialAvatars();
  858. for (Iterator i = avatars.iterator(); i.hasNext();) {
  859. CredentialAvatar ca = (CredentialAvatar)i.next();
  860. Credential c = ca.getCredential(getSettingsHandler(), curi);
  861. if (c.isEveryTime()) {
  862. c.populate(curi, this.http, method, ca.getPayload());
  863. }
  864. }
  865. }
  866. boolean result = false;
  867. // Now look in the curi. The Curi will have credentials loaded either
  868. // by the handle401 method if its a rfc2617 or it'll have been set into
  869. // the curi by the preconditionenforcer as this login uri came through.
  870. if (curi.hasCredentialAvatars()) {
  871. Set avatars = curi.getCredentialAvatars();
  872. for (Iterator i = avatars.iterator(); i.hasNext();) {
  873. CredentialAvatar ca = (CredentialAvatar)i.next();
  874. Credential c = ca.getCredential(getSettingsHandler(), curi);
  875. if (c.populate(curi, this.http, method, ca.getPayload())) {
  876. result = true;
  877. }
  878. }
  879. }
  880. return result;
  881. }
  882. /**
  883. * Promote successful credential to the server.
  884. *
  885. * @param curi CrawlURI whose credentials we are to promote.
  886. */
  887. private void promoteCredentials(final CrawlURI curi) {
  888. if (!curi.hasCredentialAvatars()) {
  889. logger.severe("No credentials to promote when there should be " +
  890. curi);
  891. } else {
  892. Set avatars = curi.getCredentialAvatars();
  893. for (Iterator i = avatars.iterator(); i.hasNext();) {
  894. CredentialAvatar ca = (CredentialAvatar)i.next();
  895. curi.removeCredentialAvatar(ca);
  896. // The server to attach too may not be the server that hosts
  897. // this passed curi. It might be of another subdomain.
  898. // The avatar needs to be added to the server that is dependent
  899. // on this precondition. Find it by name. Get the name from
  900. // the credential this avatar represents.
  901. Credential c = ca.getCredential(getSettingsHandler(), curi);
  902. String cd = null;
  903. try {
  904. cd = c.getCredentialDomain(curi);
  905. }
  906. catch (AttributeNotFoundException e) {
  907. logger.severe("Failed to get cred domain for " + curi +
  908. " for " + ca + ": " + e.getMessage());
  909. }
  910. if (cd != null) {
  911. CrawlServer cs
  912. = getController().getServerCache().getServerFor(cd);
  913. if (cs != null) {
  914. cs.addCredentialAvatar(ca);
  915. }
  916. }
  917. }
  918. }
  919. }
    /**
     * Server is looking for basic/digest auth credentials (RFC2617). If we have
     * any, put them into the CrawlURI and have it come around again. Presence
     * of the credential serves as flag to frontier to requeue promptly. If we
     * already tried this domain and still got a 401, then our credentials are
     * bad. Remove them and let this curi die.
     *
     * @param method Method that got a 401.
     * @param curi CrawlURI that got a 401.
     */
    protected void handle401(final HttpMethod method, final CrawlURI curi) {
        // Parse the WWW-Authenticate challenge; bail out if none usable.
        AuthScheme authscheme = getAuthScheme(method, curi);
        if (authscheme == null) {
            return;
        }
        String realm = authscheme.getRealm();
        // Look to see if this curi had rfc2617 avatars loaded. If so, are
        // any of them for this realm? If so, then the credential failed
        // if we got a 401 and it should be let die a natural 401 death.
        Set curiRfc2617Credentials = getCredentials(getSettingsHandler(),
            curi, Rfc2617Credential.class);
        Rfc2617Credential extant = Rfc2617Credential.
            getByRealm(curiRfc2617Credentials, realm, curi);
        if (extant != null) {
            // Then, already tried this credential. Remove ANY rfc2617
            // credential since presence of a rfc2617 credential serves
            // as flag to frontier to requeue this curi and let the curi
            // die a natural death.
            extant.detachAll(curi);
            logger.warning("Auth failed (401) though supplied realm " +
                realm + " to " + curi.toString());
        } else {
            // Look see if we have a credential that corresponds to this
            // realm in credential store. Filter by type and credential
            // domain. If not, let this curi die. Else, add it to the
            // curi and let it come around again. Add in the AuthScheme
            // we got too. Its needed when we go to run the Auth on
            // second time around.
            CredentialStore cs =
                CredentialStore.getCredentialStore(getSettingsHandler());
            if (cs == null) {
                logger.severe("No credential store for " + curi);
            } else {
                // Scope the store lookup to this curi's server by name.
                CrawlServer server = getController().getServerCache().
                    getServerFor(curi);
                Set storeRfc2617Credentials = cs.subset(curi,
                    Rfc2617Credential.class, server.getName());
                if (storeRfc2617Credentials == null ||
                        storeRfc2617Credentials.size() <= 0) {
                    logger.info("No rfc2617 credentials for " + curi);
                } else {
                    Rfc2617Credential found = Rfc2617Credential.
                        getByRealm(storeRfc2617Credentials, realm, curi);
                    if (found == null) {
                        logger.info("No rfc2617 credentials for realm " +
                            realm + " in " + curi);
                    } else {
                        // Attach the matching credential; its presence makes
                        // the frontier requeue this curi for a retry fetch.
                        found.attach(curi, authscheme.getRealm());
                        logger.info("Found credential for realm " + realm +
                            " in store for " + curi.toString());
                    }
                }
            }
        }
    }
  985. /**
  986. * @param method Method that got a 401.
  987. * @param curi CrawlURI that got a 401.
  988. * @return Returns first wholesome authscheme found else null.
  989. */
  990. protected AuthScheme getAuthScheme(final HttpMethod method,
  991. final CrawlURI curi) {
  992. Header [] headers = method.getResponseHeaders("WWW-Authenticate");
  993. if (headers == null || headers.length <= 0) {
  994. logger.info("We got a 401 but no WWW-Authenticate challenge: " +
  995. curi.toString());
  996. return null;
  997. }
  998. Map authschemes = null;
  999. try {
  1000. authschemes = AuthChallengeParser.parseChallenges(headers);
  1001. } catch(MalformedChallengeException e) {
  1002. logger.info("Failed challenge parse: " + e.getMessage());
  1003. }
  1004. if (authschemes == null || authschemes.size() <= 0) {
  1005. logger.info("We got a 401 and WWW-Authenticate challenge" +
  1006. " but failed parse of the header " + curi.toString());
  1007. return null;
  1008. }
  1009. AuthScheme result = null;
  1010. // Use the first auth found.
  1011. for (Iterator i = authschemes.keySet().iterator();
  1012. result == null && i.hasNext();) {
  1013. String key = (String)i.next();
  1014. String challenge = (String)authschemes.get(key);
  1015. if (key == null || key.length() <= 0 || challenge == null ||
  1016. challenge.length() <= 0) {
  1017. logger.warning("Empty scheme: " + curi.toString() +
  1018. ": " + headers);
  1019. }
  1020. AuthScheme authscheme = null;
  1021. if (key.equals("basic")) {
  1022. authscheme = new BasicScheme();
  1023. } else if (key.equals("digest")) {
  1024. authscheme = new DigestScheme();
  1025. } else {
  1026. logger.info("Unsupported scheme: " + key);
  1027. continue;
  1028. }
  1029. try {
  1030. authscheme.processChallenge(challenge);
  1031. } catch (MalformedChallengeException e) {
  1032. logger.info(e.getMessage() + " " + curi + " " + headers);
  1033. continue;
  1034. }
  1035. if (authscheme.isConnectionBased()) {
  1036. logger.info("Connection based " + authscheme);
  1037. continue;
  1038. }
  1039. if (authscheme.getRealm() == null ||
  1040. authscheme.getRealm().length() <= 0) {
  1041. logger.info("Empty realm " + authscheme + " for " + curi);
  1042. continue;
  1043. }
  1044. result = authscheme;
  1045. }
  1046. return result;
  1047. }
  1048. /**
  1049. * @param handler Settings Handler.
  1050. * @param curi CrawlURI that got a 401.
  1051. * @param type Class of credential to get from curi.
  1052. * @return Set of credentials attached to this curi.
  1053. */
  1054. private Set<Credential> getCredentials(SettingsHandler handler,
  1055. CrawlURI curi, Class type) {
  1056. Set<Credential> result = null;
  1057. if (curi.hasCredentialAvatars()) {
  1058. for (Iterator i = curi.getCredentialAvatars().iterator();
  1059. i.hasNext();) {
  1060. CredentialAvatar ca = (CredentialAvatar)i.next();
  1061. if (ca.match(type)) {
  1062. if (result == null) {
  1063. result = new HashSet<Credential>();
  1064. }
  1065. result.add(ca.getCredential(handler, curi));
  1066. }
  1067. }
  1068. }
  1069. return result;
  1070. }
  1071. public void initialTasks() {
  1072. super.initialTasks();
  1073. this.getController().addCrawlStatusListener(this);
  1074. configureHttp();
  1075. // load cookies from a file if specified in the order file.
  1076. loadCookies();
  1077. // I tried to get the default KeyManagers but doesn't work unless you
  1078. // point at a physical keystore. Passing null seems to do the right
  1079. // thing so we'll go w/ that.
  1080. try {
  1081. SSLContext context = SSLContext.getInstance("S