PageRenderTime 55ms CodeModel.GetById 28ms RepoModel.GetById 1ms app.codeStats 0ms

/projects/heritrix-1.14.4/src/java/org/archive/crawler/fetcher/FetchDNS.java

https://gitlab.com/essere.lab.public/qualitas.class-corpus
Java | 333 lines | 253 code | 23 blank | 57 comment | 44 complexity | 16e9a44c40f587c8b474c29344272c04 MD5 | raw file
  1. /* Copyright (C) 2003 Internet Archive.
  2. *
  3. * This file is part of the Heritrix web crawler (crawler.archive.org).
  4. *
  5. * Heritrix is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or
  8. * any later version.
  9. *
  10. * Heritrix is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU Lesser Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser Public License
  16. * along with Heritrix; if not, write to the Free Software
  17. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. *
  19. * FetchDNS
  20. * Created on Jun 5, 2003
  21. *
  22. * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/fetcher/FetchDNS.java,v 1.29.4.1 2007/01/13 01:31:17 stack-sf Exp $
  23. */
  24. package org.archive.crawler.fetcher;
  25. import java.io.ByteArrayInputStream;
  26. import java.io.ByteArrayOutputStream;
  27. import java.io.IOException;
  28. import java.io.InputStream;
  29. import java.net.InetAddress;
  30. import java.net.UnknownHostException;
  31. import java.security.MessageDigest;
  32. import java.util.logging.Level;
  33. import java.util.logging.Logger;
  34. import java.util.regex.Matcher;
  35. import org.apache.commons.httpclient.URIException;
  36. import org.archive.crawler.datamodel.CoreAttributeConstants;
  37. import org.archive.crawler.datamodel.CrawlHost;
  38. import org.archive.crawler.datamodel.CrawlURI;
  39. import org.archive.crawler.datamodel.FetchStatusCodes;
  40. import org.archive.crawler.framework.Processor;
  41. import org.archive.crawler.settings.SimpleType;
  42. import org.archive.util.ArchiveUtils;
  43. import org.archive.util.HttpRecorder;
  44. import org.archive.util.InetAddressUtil;
  45. import org.xbill.DNS.ARecord;
  46. import org.xbill.DNS.DClass;
  47. import org.xbill.DNS.Lookup;
  48. import org.xbill.DNS.Record;
  49. import org.xbill.DNS.ResolverConfig;
  50. import org.xbill.DNS.TextParseException;
  51. import org.xbill.DNS.Type;
  52. /**
  53. * Processor to resolve 'dns:' URIs.
  54. *
  55. * TODO: Refactor to use org.archive.util.DNSJavaUtils.
  56. *
  57. * @author multiple
  58. */
  59. public class FetchDNS extends Processor
  60. implements CoreAttributeConstants, FetchStatusCodes {
  61. private static final long serialVersionUID = 4686199203459704426L;
  62. private Logger logger = Logger.getLogger(this.getClass().getName());
  63. // Defaults.
  64. private short ClassType = DClass.IN;
  65. private short TypeType = Type.A;
  66. protected InetAddress serverInetAddr = null;
  67. private static final String ATTR_ACCEPT_NON_DNS_RESOLVES =
  68. "accept-non-dns-resolves";
  69. private static final Boolean DEFAULT_ACCEPT_NON_DNS_RESOLVES =
  70. Boolean.FALSE;
  71. private static final long DEFAULT_TTL_FOR_NON_DNS_RESOLVES
  72. = 6 * 60 * 60; // 6 hrs
  73. private byte [] reusableBuffer = new byte[1024];
  74. /**
  75. * Create a new instance of FetchDNS.
  76. *
  77. * @param name the name of this attribute.
  78. */
  79. public FetchDNS(String name) {
  80. super(name, "DNS Fetcher. Handles DNS lookups.");
  81. org.archive.crawler.settings.Type e =
  82. addElementToDefinition(new SimpleType(ATTR_ACCEPT_NON_DNS_RESOLVES,
  83. "If a DNS lookup fails, whether or not to fallback to " +
  84. "InetAddress resolution, which may use local 'hosts' files " +
  85. "or other mechanisms.", DEFAULT_ACCEPT_NON_DNS_RESOLVES));
  86. e.setExpertSetting(true);
  87. e = addElementToDefinition(new SimpleType(FetchHTTP.ATTR_DIGEST_CONTENT,
  88. FetchHTTP.DESC_DIGEST_CONTENT, FetchHTTP.DEFAULT_DIGEST_CONTENT));
  89. e.setExpertSetting(true);
  90. e = addElementToDefinition(new SimpleType(
  91. FetchHTTP.ATTR_DIGEST_ALGORITHM,
  92. FetchHTTP.DESC_DIGEST_ALGORITHM,
  93. FetchHTTP.DEFAULT_DIGEST_ALGORITHM,
  94. FetchHTTP.DIGEST_ALGORITHMS));
  95. e.setExpertSetting(true);
  96. }
  97. protected void innerProcess(CrawlURI curi) {
  98. if (!curi.getUURI().getScheme().equals("dns")) {
  99. // Only handles dns
  100. return;
  101. }
  102. Record[] rrecordSet = null; // Retrieved dns records
  103. String dnsName = null;
  104. try {
  105. dnsName = curi.getUURI().getReferencedHost();
  106. } catch (URIException e) {
  107. logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e);
  108. }
  109. if(dnsName == null) {
  110. curi.setFetchStatus(S_UNFETCHABLE_URI);
  111. return;
  112. }
  113. // Make sure we're in "normal operating mode", e.g. a cache +
  114. // controller exist to assist us.
  115. CrawlHost targetHost = null;
  116. if (getController() != null &&
  117. getController().getServerCache() != null) {
  118. targetHost = getController().getServerCache().getHostFor(dnsName);
  119. } else {
  120. // Standalone operation (mostly for test cases/potential other uses)
  121. targetHost = new CrawlHost(dnsName);
  122. }
  123. if (isQuadAddress(curi, dnsName, targetHost)) {
  124. // We're done processing.
  125. return;
  126. }
  127. // Do actual DNS lookup.
  128. curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());
  129. // Try to get the records for this host (assume domain name)
  130. // TODO: Bug #935119 concerns potential hang here
  131. try {
  132. rrecordSet = (new Lookup(dnsName, TypeType, ClassType)).run();
  133. } catch (TextParseException e) {
  134. rrecordSet = null;
  135. }
  136. curi.setContentType("text/dns");
  137. if (rrecordSet != null) {
  138. if (logger.isLoggable(Level.FINE)) {
  139. logger.fine("Found recordset for " + dnsName);
  140. }
  141. storeDNSRecord(curi, dnsName, targetHost, rrecordSet);
  142. } else {
  143. if (logger.isLoggable(Level.FINE)) {
  144. logger.fine("Failed find of recordset for " + dnsName);
  145. }
  146. if (((Boolean)getUncheckedAttribute(null,
  147. ATTR_ACCEPT_NON_DNS_RESOLVES)).booleanValue()) {
  148. // Do lookup that bypasses javadns.
  149. InetAddress address = null;
  150. try {
  151. address = InetAddress.getByName(dnsName);
  152. } catch (UnknownHostException e1) {
  153. address = null;
  154. }
  155. if (address != null) {
  156. targetHost.setIP(address, DEFAULT_TTL_FOR_NON_DNS_RESOLVES);
  157. curi.setFetchStatus(S_GETBYNAME_SUCCESS);
  158. if (logger.isLoggable(Level.FINE)) {
  159. logger.fine("Found address for " + dnsName +
  160. " using native dns.");
  161. }
  162. } else {
  163. if (logger.isLoggable(Level.FINE)) {
  164. logger.fine("Failed find of address for " + dnsName +
  165. " using native dns.");
  166. }
  167. setUnresolvable(curi, targetHost);
  168. }
  169. } else {
  170. setUnresolvable(curi, targetHost);
  171. }
  172. }
  173. curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
  174. }
  175. protected void storeDNSRecord(final CrawlURI curi, final String dnsName,
  176. final CrawlHost targetHost, final Record[] rrecordSet) {
  177. // Get TTL and IP info from the first A record (there may be
  178. // multiple, e.g. www.washington.edu) then update the CrawlServer
  179. ARecord arecord = getFirstARecord(rrecordSet);
  180. if (arecord == null) {
  181. throw new NullPointerException("Got null arecord for " +
  182. dnsName);
  183. }
  184. targetHost.setIP(arecord.getAddress(), arecord.getTTL());
  185. try {
  186. recordDNS(curi, rrecordSet);
  187. curi.setFetchStatus(S_DNS_SUCCESS);
  188. curi.putString(A_DNS_SERVER_IP_LABEL, ResolverConfig.getCurrentConfig().server());
  189. } catch (IOException e) {
  190. logger.log(Level.SEVERE, "Failed store of DNS Record for " +
  191. curi.toString(), e);
  192. setUnresolvable(curi, targetHost);
  193. }
  194. }
  195. protected boolean isQuadAddress(final CrawlURI curi, final String dnsName,
  196. final CrawlHost targetHost) {
  197. boolean result = false;
  198. Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName);
  199. // If it's an ip no need to do a lookup
  200. if (matcher == null || !matcher.matches()) {
  201. return result;
  202. }
  203. result = true;
  204. // Ideally this branch would never be reached: no CrawlURI
  205. // would be created for numerical IPs
  206. if (logger.isLoggable(Level.WARNING)) {
  207. logger.warning("Unnecessary DNS CrawlURI created: " + curi);
  208. }
  209. try {
  210. targetHost.setIP(InetAddress.getByAddress(dnsName, new byte[] {
  211. (byte) (new Integer(matcher.group(1)).intValue()),
  212. (byte) (new Integer(matcher.group(2)).intValue()),
  213. (byte) (new Integer(matcher.group(3)).intValue()),
  214. (byte) (new Integer(matcher.group(4)).intValue()) }),
  215. CrawlHost.IP_NEVER_EXPIRES); // Never expire numeric IPs
  216. curi.setFetchStatus(S_DNS_SUCCESS);
  217. } catch (UnknownHostException e) {
  218. logger.log(Level.SEVERE, "Should never be " + e.getMessage(), e);
  219. setUnresolvable(curi, targetHost);
  220. }
  221. return result;
  222. }
  223. protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet)
  224. throws IOException {
  225. final byte[] dnsRecord =
  226. getDNSRecord(curi.getLong(A_FETCH_BEGAN_TIME), rrecordSet);
  227. HttpRecorder rec = HttpRecorder.getHttpRecorder();
  228. // Shall we get a digest on the content downloaded?
  229. boolean digestContent = ((Boolean)getUncheckedAttribute(curi,
  230. FetchHTTP.ATTR_DIGEST_CONTENT)).booleanValue();
  231. String algorithm = null;
  232. if (digestContent) {
  233. algorithm = ((String)getUncheckedAttribute(curi,
  234. FetchHTTP.ATTR_DIGEST_ALGORITHM));
  235. rec.getRecordedInput().setDigest(algorithm);
  236. } else {
  237. // clear
  238. rec.getRecordedInput().setDigest((MessageDigest)null);
  239. }
  240. curi.setHttpRecorder(rec);
  241. InputStream is = curi.getHttpRecorder().inputWrap(
  242. new ByteArrayInputStream(dnsRecord));
  243. if(digestContent) {
  244. rec.getRecordedInput().startDigest();
  245. }
  246. // Reading from the wrapped stream, behind the scenes, will write
  247. // files into scratch space
  248. try {
  249. while (is.read(this.reusableBuffer) != -1) {
  250. continue;
  251. }
  252. } finally {
  253. is.close();
  254. rec.closeRecorders();
  255. }
  256. curi.setContentSize(dnsRecord.length);
  257. if (digestContent) {
  258. curi.setContentDigest(algorithm,
  259. rec.getRecordedInput().getDigestValue());
  260. }
  261. }
  262. protected byte [] getDNSRecord(final long fetchStart,
  263. final Record[] rrecordSet)
  264. throws IOException {
  265. ByteArrayOutputStream baos = new ByteArrayOutputStream();
  266. // Start the record with a 14-digit date per RFC 2540
  267. byte[] fetchDate = ArchiveUtils.get14DigitDate(fetchStart).getBytes();
  268. baos.write(fetchDate);
  269. // Don't forget the newline
  270. baos.write("\n".getBytes());
  271. int recordLength = fetchDate.length + 1;
  272. if (rrecordSet != null) {
  273. for (int i = 0; i < rrecordSet.length; i++) {
  274. byte[] record = rrecordSet[i].toString().getBytes();
  275. recordLength += record.length;
  276. baos.write(record);
  277. // Add the newline between records back in
  278. baos.write("\n".getBytes());
  279. recordLength += 1;
  280. }
  281. }
  282. return baos.toByteArray();
  283. }
  284. protected void setUnresolvable(CrawlURI curi, CrawlHost host) {
  285. host.setIP(null, 0);
  286. curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE);
  287. }
  288. protected ARecord getFirstARecord(Record[] rrecordSet) {
  289. ARecord arecord = null;
  290. if (rrecordSet == null || rrecordSet.length == 0) {
  291. if (logger.isLoggable(Level.FINEST)) {
  292. logger.finest("rrecordSet is null or zero length: " +
  293. rrecordSet);
  294. }
  295. return arecord;
  296. }
  297. for (int i = 0; i < rrecordSet.length; i++) {
  298. if (rrecordSet[i].getType() != Type.A) {
  299. if (logger.isLoggable(Level.FINEST)) {
  300. logger.finest("Record " + Integer.toString(i) +
  301. " is not A type but " + rrecordSet[i].getType());
  302. }
  303. continue;
  304. }
  305. arecord = (ARecord) rrecordSet[i];
  306. break;
  307. }
  308. return arecord;
  309. }
  310. }