PageRenderTime 1749ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/projects/heritrix-1.14.4/src/java/org/archive/crawler/writer/WARCWriterProcessor.java

https://gitlab.com/essere.lab.public/qualitas.class-corpus
Java | 682 lines | 490 code | 51 blank | 141 comment | 66 complexity | 80256c1532d1bb184f13e08cbbc79cca MD5 | raw file
  1. /* $Id: ExperimentalWARCWriterProcessor.java 4935 2007-02-23 00:27:24Z gojomo $
  2. *
  3. * Created on August 1st, 2006.
  4. *
  5. * Copyright (C) 2006 Internet Archive.
  6. *
  7. * This file is part of the Heritrix web crawler (crawler.archive.org).
  8. *
  9. * Heritrix is free software; you can redistribute it and/or modify
  10. * it under the terms of the GNU Lesser Public License as published by
  11. * the Free Software Foundation; either version 2.1 of the License, or
  12. * any later version.
  13. *
  14. * Heritrix is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. * GNU Lesser Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser Public License
  20. * along with Heritrix; if not, write to the Free Software
  21. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  22. */
  23. package org.archive.crawler.writer;
  24. import java.io.ByteArrayInputStream;
  25. import java.io.File;
  26. import java.io.IOException;
  27. import java.net.InetAddress;
  28. import java.net.URI;
  29. import java.net.URISyntaxException;
  30. import java.net.UnknownHostException;
  31. import java.util.Collection;
  32. import java.util.HashMap;
  33. import java.util.Map;
  34. import java.util.concurrent.atomic.AtomicInteger;
  35. import java.util.logging.Level;
  36. import java.util.logging.Logger;
  37. import org.apache.commons.httpclient.Header;
  38. import org.apache.commons.httpclient.HttpMethodBase;
  39. import org.apache.commons.httpclient.HttpStatus;
  40. import org.apache.commons.lang.StringUtils;
  41. import org.archive.crawler.Heritrix;
  42. import org.archive.crawler.datamodel.CoreAttributeConstants;
  43. import org.archive.crawler.datamodel.CrawlURI;
  44. import org.archive.crawler.datamodel.FetchStatusCodes;
  45. import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
  46. import org.archive.crawler.event.CrawlStatusListener;
  47. import org.archive.crawler.extractor.Link;
  48. import org.archive.crawler.framework.WriterPoolProcessor;
  49. import org.archive.crawler.settings.SimpleType;
  50. import org.archive.crawler.settings.Type;
  51. import org.archive.io.ReplayInputStream;
  52. import org.archive.io.WriterPoolMember;
  53. import org.archive.io.WriterPoolSettings;
  54. import org.archive.io.warc.WARCConstants;
  55. import org.archive.io.warc.WARCWriter;
  56. import org.archive.io.warc.WARCWriterPool;
  57. import org.archive.uid.GeneratorFactory;
  58. import org.archive.util.ArchiveUtils;
  59. import org.archive.util.XmlUtils;
  60. import org.archive.util.anvl.ANVLRecord;
  61. import org.w3c.dom.Document;
/**
 * WARCWriterProcessor.
 *
 * <p>Writes records against (i.e. targeting) the 0.18 version of the WARC
 * specification, which is functionally identical to 0.17 except in the
 * protocol identifier string.
 * See http://archive-access.sourceforge.net/warc/
 *
 * <p>TODO: Remove ANVLRecord. Rename it NameValue, or use RFC822
 * (commons-httpclient?), or find something else.
 *
 * @author stack
 */
  74. public class WARCWriterProcessor extends WriterPoolProcessor
  75. implements CoreAttributeConstants, CrawlStatusListener,
  76. WriterPoolSettings, FetchStatusCodes, WARCConstants {
  77. private static final long serialVersionUID = 6182850087635847443L;
  78. private final Logger logger = Logger.getLogger(this.getClass().getName());
  79. public long getDefaultMaxFileSize() {
  80. return 1000000000L; // 1 SI giga-byte (109 bytes), per WARC appendix A
  81. }
  82. /**
  83. * Key for whether to write 'request' type records where possible
  84. */
  85. public static final String ATTR_WRITE_REQUESTS =
  86. "write-requests";
  87. /**
  88. * Key for whether to write 'metadata' type records where possible
  89. */
  90. public static final String ATTR_WRITE_METADATA =
  91. "write-metadata";
  92. /**
  93. * Key for whether to write 'revisit' type records when
  94. * consecutive identical digest
  95. */
  96. public static final String ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS =
  97. "write-revisit-for-identical-digests";
  98. /**
  99. * Key for whether to write 'revisit' type records for server
  100. * "304 not modified" responses
  101. */
  102. public static final String ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED =
  103. "write-revisit-for-not-modified";
  104. /**
  105. * Default path list.
  106. */
  107. private static final String [] DEFAULT_PATH = {"warcs"};
  108. protected String [] getDefaultPath() {
  109. return DEFAULT_PATH;
  110. }
  111. /**
  112. * @param name Name of this writer.
  113. */
  114. public WARCWriterProcessor(final String name) {
  115. super(name, "Experimental WARCWriter processor (Version 0.17)");
  116. Type e = addElementToDefinition(
  117. new SimpleType(ATTR_WRITE_REQUESTS,
  118. "Whether to write 'request' type records. " +
  119. "Default is true.", new Boolean(true)));
  120. e.setOverrideable(true);
  121. e.setExpertSetting(true);
  122. e = addElementToDefinition(
  123. new SimpleType(ATTR_WRITE_METADATA,
  124. "Whether to write 'metadata' type records. " +
  125. "Default is true.", new Boolean(true)));
  126. e.setOverrideable(true);
  127. e.setExpertSetting(true);
  128. e = addElementToDefinition(
  129. new SimpleType(ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS,
  130. "Whether to write 'revisit' type records when a URI's " +
  131. "history indicates the previous fetch had an identical " +
  132. "content digest. " +
  133. "Default is true.", new Boolean(true)));
  134. e.setOverrideable(true);
  135. e.setExpertSetting(true);
  136. e = addElementToDefinition(
  137. new SimpleType(ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED,
  138. "Whether to write 'revisit' type records when a " +
  139. "304-Not Modified response is received. " +
  140. "Default is true.", new Boolean(true)));
  141. e.setOverrideable(true);
  142. e.setExpertSetting(true);
  143. }
  144. protected void setupPool(final AtomicInteger serialNo) {
  145. setPool(new WARCWriterPool(serialNo, this, getPoolMaximumActive(),
  146. getPoolMaximumWait()));
  147. }
  148. /**
  149. * Writes a CrawlURI and its associated data to store file.
  150. *
  151. * Currently this method understands the following uri types: dns, http, and
  152. * https.
  153. *
  154. * @param curi CrawlURI to process.
  155. *
  156. */
  157. protected void innerProcess(CrawlURI curi) {
  158. // If failure, or we haven't fetched the resource yet, return
  159. if (curi.getFetchStatus() <= 0) {
  160. return;
  161. }
  162. // If no recorded content at all, don't write record. Except FTP, which
  163. // can have empty content, since the "headers" don't count as content.
  164. String scheme = curi.getUURI().getScheme().toLowerCase();
  165. long recordLength = curi.getContentSize();
  166. if (recordLength <= 0 && !scheme.equals("ftp")) {
  167. // getContentSize() should be > 0 if any material (even just
  168. // HTTP headers with zero-length body) is available.
  169. return;
  170. }
  171. try {
  172. if (shouldWrite(curi)) {
  173. write(scheme, curi);
  174. } else {
  175. logger.info("This writer does not write out scheme " +
  176. scheme + " content");
  177. }
  178. } catch (IOException e) {
  179. curi.addLocalizedError(this.getName(), e, "WriteRecord: " +
  180. curi.toString());
  181. logger.log(Level.SEVERE, "Failed write of Record: " +
  182. curi.toString(), e);
  183. }
  184. }
  185. protected void write(final String lowerCaseScheme, final CrawlURI curi)
  186. throws IOException {
  187. logger.info("writing warc record for " + curi);
  188. WriterPoolMember writer = getPool().borrowFile();
  189. long position = writer.getPosition();
  190. // See if we need to open a new file because we've exceeed maxBytes.
  191. // Call to checkFileSize will open new file if we're at maximum for
  192. // current file.
  193. writer.checkSize();
  194. if (writer.getPosition() != position) {
  195. // We just closed the file because it was larger than maxBytes.
  196. // Add to the totalBytesWritten the size of the first record
  197. // in the file, if any.
  198. setTotalBytesWritten(getTotalBytesWritten() +
  199. (writer.getPosition() - position));
  200. position = writer.getPosition();
  201. }
  202. WARCWriter w = (WARCWriter)writer;
  203. try {
  204. // Write a request, response, and metadata all in the one
  205. // 'transaction'.
  206. final URI baseid = getRecordID();
  207. final String timestamp =
  208. ArchiveUtils.getLog14Date(curi.getLong(A_FETCH_BEGAN_TIME));
  209. if (lowerCaseScheme.startsWith("http")) {
  210. writeHttpRecords(w, curi, baseid, timestamp);
  211. } else if (lowerCaseScheme.equals("dns")) {
  212. writeDnsRecords(w, curi, baseid, timestamp);
  213. } else if (lowerCaseScheme.equals("ftp")) {
  214. writeFtpRecords(w, curi, baseid, timestamp);
  215. } else {
  216. logger.warning("No handler for scheme " + lowerCaseScheme);
  217. }
  218. } catch (IOException e) {
  219. // Invalidate this file (It gets a '.invalid' suffix).
  220. getPool().invalidateFile(writer);
  221. // Set the writer to null otherwise the pool accounting
  222. // of how many active writers gets skewed if we subsequently
  223. // do a returnWriter call on this object in the finally block.
  224. writer = null;
  225. throw e;
  226. } finally {
  227. if (writer != null) {
  228. setTotalBytesWritten(getTotalBytesWritten() +
  229. (writer.getPosition() - position));
  230. getPool().returnFile(writer);
  231. }
  232. }
  233. checkBytesWritten();
  234. }
  235. private void writeFtpRecords(WARCWriter w, final CrawlURI curi, final URI baseid,
  236. final String timestamp) throws IOException {
  237. ANVLRecord headers = new ANVLRecord(3);
  238. headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));
  239. String controlConversation = curi.getString(A_FTP_CONTROL_CONVERSATION);
  240. URI rid = writeFtpControlConversation(w, timestamp, baseid, curi, headers, controlConversation);
  241. if (curi.getContentDigest() != null) {
  242. headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
  243. curi.getContentDigestSchemeString());
  244. }
  245. if (curi.getHttpRecorder() != null) {
  246. if (IdenticalDigestDecideRule.hasIdenticalDigest(curi) &&
  247. ((Boolean)getUncheckedAttribute(curi,
  248. ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS))) {
  249. rid = writeRevisitDigest(w, timestamp, null,
  250. baseid, curi, headers);
  251. } else {
  252. headers = new ANVLRecord(3);
  253. if (curi.isTruncatedFetch()) {
  254. String value = curi.isTimeTruncatedFetch()?
  255. NAMED_FIELD_TRUNCATED_VALUE_TIME:
  256. curi.isLengthTruncatedFetch()?
  257. NAMED_FIELD_TRUNCATED_VALUE_LENGTH:
  258. curi.isHeaderTruncatedFetch()?
  259. NAMED_FIELD_TRUNCATED_VALUE_HEAD:
  260. // TODO: Add this to spec.
  261. TRUNCATED_VALUE_UNSPECIFIED;
  262. headers.addLabelValue(HEADER_KEY_TRUNCATED, value);
  263. }
  264. if (curi.getContentDigest() != null) {
  265. headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
  266. curi.getContentDigestSchemeString());
  267. }
  268. headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>');
  269. rid = writeResource(w, timestamp, curi.getContentType(), baseid, curi, headers);
  270. }
  271. }
  272. if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_METADATA))) {
  273. headers = new ANVLRecord(1);
  274. headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>');
  275. writeMetadata(w, timestamp, baseid, curi, headers);
  276. }
  277. }
  278. private void writeDnsRecords(WARCWriter w, final CrawlURI curi, final URI baseid,
  279. final String timestamp) throws IOException {
  280. ANVLRecord headers = null;
  281. String ip = curi.getString(A_DNS_SERVER_IP_LABEL);
  282. if (ip != null && ip.length() > 0) {
  283. headers = new ANVLRecord(1);
  284. headers.addLabelValue(HEADER_KEY_IP, ip);
  285. }
  286. writeResponse(w, timestamp, curi.getContentType(), baseid,
  287. curi, headers);
  288. }
  289. private void writeHttpRecords(WARCWriter w, final CrawlURI curi, final URI baseid,
  290. final String timestamp) throws IOException {
  291. // Add named fields for ip, checksum, and relate the metadata
  292. // and request to the resource field.
  293. // TODO: Use other than ANVL (or rename ANVL as NameValue or
  294. // use RFC822 (commons-httpclient?).
  295. ANVLRecord headers = new ANVLRecord(5);
  296. if (curi.getContentDigest() != null) {
  297. headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
  298. curi.getContentDigestSchemeString());
  299. }
  300. headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));
  301. URI rid;
  302. if (IdenticalDigestDecideRule.hasIdenticalDigest(curi) &&
  303. ((Boolean)getUncheckedAttribute(curi,
  304. ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS))) {
  305. rid = writeRevisitDigest(w, timestamp, HTTP_RESPONSE_MIMETYPE,
  306. baseid, curi, headers);
  307. } else if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED &&
  308. ((Boolean)getUncheckedAttribute(curi,
  309. ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED))) {
  310. rid = writeRevisitNotModified(w, timestamp,
  311. baseid, curi, headers);
  312. } else {
  313. if (curi.isTruncatedFetch()) {
  314. String value = curi.isTimeTruncatedFetch()?
  315. NAMED_FIELD_TRUNCATED_VALUE_TIME:
  316. curi.isLengthTruncatedFetch()?
  317. NAMED_FIELD_TRUNCATED_VALUE_LENGTH:
  318. curi.isHeaderTruncatedFetch()?
  319. NAMED_FIELD_TRUNCATED_VALUE_HEAD:
  320. // TODO: Add this to spec.
  321. TRUNCATED_VALUE_UNSPECIFIED;
  322. headers.addLabelValue(HEADER_KEY_TRUNCATED, value);
  323. }
  324. rid = writeResponse(w, timestamp, HTTP_RESPONSE_MIMETYPE,
  325. baseid, curi, headers);
  326. }
  327. headers = new ANVLRecord(1);
  328. headers.addLabelValue(HEADER_KEY_CONCURRENT_TO,
  329. '<' + rid.toString() + '>');
  330. if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_REQUESTS))) {
  331. writeRequest(w, timestamp, HTTP_REQUEST_MIMETYPE,
  332. baseid, curi, headers);
  333. }
  334. if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_METADATA))) {
  335. writeMetadata(w, timestamp, baseid, curi, headers);
  336. }
  337. }
  338. protected URI writeFtpControlConversation(WARCWriter w, String timestamp, URI baseid,
  339. CrawlURI curi, ANVLRecord headers, String controlConversation)
  340. throws IOException {
  341. final URI uid = qualifyRecordID(baseid, TYPE, METADATA);
  342. byte[] b = controlConversation.getBytes("UTF-8");
  343. w.writeMetadataRecord(curi.toString(), timestamp, FTP_CONTROL_CONVERSATION_MIMETYPE,
  344. uid, headers, new ByteArrayInputStream(b), b.length);
  345. return uid;
  346. }
  347. protected URI writeRequest(final WARCWriter w,
  348. final String timestamp, final String mimetype,
  349. final URI baseid, final CrawlURI curi,
  350. final ANVLRecord namedFields)
  351. throws IOException {
  352. final URI uid = qualifyRecordID(baseid, TYPE, REQUEST);
  353. ReplayInputStream ris =
  354. curi.getHttpRecorder().getRecordedOutput().getReplayInputStream();
  355. try {
  356. w.writeRequestRecord(curi.toString(), timestamp, mimetype, uid,
  357. namedFields, ris,
  358. curi.getHttpRecorder().getRecordedOutput().getSize());
  359. } finally {
  360. if (ris != null) {
  361. ris.close();
  362. }
  363. }
  364. return uid;
  365. }
  366. protected URI writeResponse(final WARCWriter w,
  367. final String timestamp, final String mimetype,
  368. final URI baseid, final CrawlURI curi,
  369. final ANVLRecord namedFields)
  370. throws IOException {
  371. ReplayInputStream ris =
  372. curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
  373. try {
  374. w.writeResponseRecord(curi.toString(), timestamp, mimetype, baseid,
  375. namedFields, ris,
  376. curi.getHttpRecorder().getRecordedInput().getSize());
  377. } finally {
  378. if (ris != null) {
  379. ris.close();
  380. }
  381. }
  382. return baseid;
  383. }
  384. protected URI writeResource(final WARCWriter w,
  385. final String timestamp, final String mimetype,
  386. final URI baseid, final CrawlURI curi,
  387. final ANVLRecord namedFields)
  388. throws IOException {
  389. ReplayInputStream ris =
  390. curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
  391. try {
  392. w.writeResourceRecord(curi.toString(), timestamp, mimetype, baseid,
  393. namedFields, ris,
  394. curi.getHttpRecorder().getRecordedInput().getSize());
  395. } finally {
  396. if (ris != null) {
  397. ris.close();
  398. }
  399. }
  400. return baseid;
  401. }
  402. protected URI writeRevisitDigest(final WARCWriter w,
  403. final String timestamp, final String mimetype,
  404. final URI baseid, final CrawlURI curi,
  405. final ANVLRecord namedFields)
  406. throws IOException {
  407. namedFields.addLabelValue(
  408. HEADER_KEY_PROFILE, PROFILE_REVISIT_IDENTICAL_DIGEST);
  409. namedFields.addLabelValue(
  410. HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH);
  411. ReplayInputStream ris = null;
  412. long revisedLength = 0;
  413. // null mimetype implies no payload
  414. if (mimetype != null) {
  415. ris = curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
  416. revisedLength = curi.getHttpRecorder().getRecordedInput().getContentBegin();
  417. revisedLength = revisedLength > 0
  418. ? revisedLength
  419. : curi.getHttpRecorder().getRecordedInput().getSize();
  420. }
  421. try {
  422. w.writeRevisitRecord(curi.toString(), timestamp, mimetype, baseid,
  423. namedFields, ris, revisedLength);
  424. } finally {
  425. if (ris != null) {
  426. ris.close();
  427. }
  428. }
  429. curi.addAnnotation("warcRevisit:digest");
  430. return baseid;
  431. }
  432. protected URI writeRevisitNotModified(final WARCWriter w,
  433. final String timestamp,
  434. final URI baseid, final CrawlURI curi,
  435. final ANVLRecord namedFields)
  436. throws IOException {
  437. namedFields.addLabelValue(
  438. HEADER_KEY_PROFILE, PROFILE_REVISIT_NOT_MODIFIED);
  439. // save just enough context to understand basis of not-modified
  440. if(curi.containsKey(A_HTTP_TRANSACTION)) {
  441. HttpMethodBase method =
  442. (HttpMethodBase) curi.getObject(A_HTTP_TRANSACTION);
  443. saveHeader(A_ETAG_HEADER,method,namedFields,HEADER_KEY_ETAG);
  444. saveHeader(A_LAST_MODIFIED_HEADER,method,namedFields,
  445. HEADER_KEY_LAST_MODIFIED);
  446. }
  447. // truncate to zero-length (all necessary info is above)
  448. namedFields.addLabelValue(HEADER_KEY_TRUNCATED,
  449. NAMED_FIELD_TRUNCATED_VALUE_LENGTH);
  450. ReplayInputStream ris =
  451. curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
  452. try {
  453. w.writeRevisitRecord(curi.toString(), timestamp, null, baseid,
  454. namedFields, ris, 0);
  455. } finally {
  456. if (ris != null) {
  457. ris.close();
  458. }
  459. }
  460. curi.addAnnotation("warcRevisit:notModified");
  461. return baseid;
  462. }
  463. /**
  464. * Save a header from the given HTTP operation into the
  465. * provider headers under a new name
  466. *
  467. * @param origName header name to get if present
  468. * @param method http operation containing headers
  469. */
  470. protected void saveHeader(String origName, HttpMethodBase method,
  471. ANVLRecord headers, String newName) {
  472. Header header = method.getResponseHeader(origName);
  473. if(header!=null) {
  474. headers.addLabelValue(newName, header.getValue());
  475. }
  476. }
  477. protected URI writeMetadata(final WARCWriter w,
  478. final String timestamp,
  479. final URI baseid, final CrawlURI curi,
  480. final ANVLRecord namedFields)
  481. throws IOException {
  482. final URI uid = qualifyRecordID(baseid, TYPE, METADATA);
  483. // Get some metadata from the curi.
  484. // TODO: Get all curi metadata.
  485. // TODO: Use other than ANVL (or rename ANVL as NameValue or use
  486. // RFC822 (commons-httpclient?).
  487. ANVLRecord r = new ANVLRecord();
  488. if (curi.isSeed()) {
  489. r.addLabel("seed");
  490. } else {
  491. if (curi.forceFetch()) {
  492. r.addLabel("force-fetch");
  493. }
  494. r.addLabelValue("via", curi.flattenVia());
  495. r.addLabelValue("hopsFromSeed", curi.getPathFromSeed());
  496. if (curi.containsKey(A_SOURCE_TAG)) {
  497. r.addLabelValue("sourceTag", curi.getString(A_SOURCE_TAG));
  498. }
  499. }
  500. long duration = curi.getFetchDuration();
  501. if(duration>-1) {
  502. r.addLabelValue("fetchTimeMs", Long.toString(duration));
  503. }
  504. if (curi.containsKey(A_FTP_FETCH_STATUS)) {
  505. r.addLabelValue("ftpFetchStatus", curi.getString(A_FTP_FETCH_STATUS));
  506. }
  507. // Add outlinks though they are effectively useless without anchor text.
  508. Collection<Link> links = curi.getOutLinks();
  509. if (links != null && links.size() > 0) {
  510. for (Link link: links) {
  511. r.addLabelValue("outlink", link.toString());
  512. }
  513. }
  514. // TODO: Other curi fields to write to metadata.
  515. //
  516. // Credentials
  517. //
  518. // fetch-began-time: 1154569278774
  519. // fetch-completed-time: 1154569281816
  520. //
  521. // Annotations.
  522. byte [] b = r.getUTF8Bytes();
  523. w.writeMetadataRecord(curi.toString(), timestamp, ANVLRecord.MIMETYPE,
  524. uid, namedFields, new ByteArrayInputStream(b), b.length);
  525. return uid;
  526. }
  527. protected URI getRecordID() throws IOException {
  528. URI result;
  529. try {
  530. result = GeneratorFactory.getFactory().getRecordID();
  531. } catch (URISyntaxException e) {
  532. throw new IOException(e.toString());
  533. }
  534. return result;
  535. }
  536. protected URI qualifyRecordID(final URI base, final String key,
  537. final String value)
  538. throws IOException {
  539. URI result;
  540. Map<String, String> qualifiers = new HashMap<String, String>(1);
  541. qualifiers.put(key, value);
  542. try {
  543. result = GeneratorFactory.getFactory().
  544. qualifyRecordID(base, qualifiers);
  545. } catch (URISyntaxException e) {
  546. throw new IOException(e.toString());
  547. }
  548. return result;
  549. }
  550. @Override
  551. protected String getFirstrecordStylesheet() {
  552. return "/warcinfobody.xsl";
  553. }
  554. /**
  555. * Return relevant values as header-like fields (here ANVLRecord, but
  556. * spec-defined "application/warc-fields" type when written). Field
  557. * names from from DCMI Terms and the WARC/0.17 specification.
  558. *
  559. * @see org.archive.crawler.framework.WriterPoolProcessor#getFirstrecordBody(java.io.File)
  560. */
  561. @Override
  562. protected String getFirstrecordBody(File orderFile) {
  563. ANVLRecord record = new ANVLRecord(7);
  564. record.addLabelValue("software", "Heritrix/" +
  565. Heritrix.getVersion() + " http://crawler.archive.org");
  566. try {
  567. InetAddress host = InetAddress.getLocalHost();
  568. record.addLabelValue("ip", host.getHostAddress());
  569. record.addLabelValue("hostname", host.getCanonicalHostName());
  570. } catch (UnknownHostException e) {
  571. logger.log(Level.WARNING,"unable top obtain local crawl engine host",e);
  572. }
  573. // conforms to ISO 28500:2009 as of May 2009
  574. // as described at http://bibnum.bnf.fr/WARC/
  575. // latest draft as of November 2008
  576. record.addLabelValue("format","WARC File Format 1.0");
  577. record.addLabelValue("conformsTo","http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
  578. // Get other values from order.xml
  579. try {
  580. Document doc = XmlUtils.getDocument(orderFile);
  581. addIfNotBlank(record,"operator",
  582. XmlUtils.xpathOrNull(doc,"//meta/operator"));
  583. addIfNotBlank(record,"publisher",
  584. XmlUtils.xpathOrNull(doc,"//meta/organization"));
  585. addIfNotBlank(record,"audience",
  586. XmlUtils.xpathOrNull(doc,"//meta/audience"));
  587. addIfNotBlank(record,"isPartOf",
  588. XmlUtils.xpathOrNull(doc,"//meta/name"));
  589. // disabling "created" field per HER-1634
  590. // though it's theoretically useful as a means of distinguishing
  591. // one crawl from another, the current usage/specification is too
  592. // vague... in particular a 'created' field in the 'warcinfo' is
  593. // reasonable to interpret as applying to the WARC-unit, rather
  594. // than the crawl-job-unit so we remove it and see if anyone
  595. // complains or makes a case for restoring it in a less-ambiguous
  596. // manner
  597. // String rawDate = XmlUtils.xpathOrNull(doc,"//meta/date");
  598. // if(StringUtils.isNotBlank(rawDate)) {
  599. // Date date;
  600. // try {
  601. // date = ArchiveUtils.parse14DigitDate(rawDate);
  602. // addIfNotBlank(record,"created",ArchiveUtils.getLog14Date(date));
  603. // } catch (ParseException e) {
  604. // logger.log(Level.WARNING,"obtaining warc created date",e);
  605. // }
  606. // }
  607. addIfNotBlank(record,"description",
  608. XmlUtils.xpathOrNull(doc,"//meta/description"));
  609. addIfNotBlank(record,"robots",
  610. XmlUtils.xpathOrNull(doc,
  611. "//newObject[@name='robots-honoring-policy']/string[@name='type']"));
  612. addIfNotBlank(record,"http-header-user-agent",
  613. XmlUtils.xpathOrNull(doc,
  614. "//map[@name='http-headers']/string[@name='user-agent']"));
  615. addIfNotBlank(record,"http-header-from",
  616. XmlUtils.xpathOrNull(doc,
  617. "//map[@name='http-headers']/string[@name='from']"));
  618. } catch (IOException e) {
  619. logger.log(Level.WARNING,"obtaining warcinfo",e);
  620. }
  621. // really ugly to return as string, when it may just be merged with
  622. // a couple other fields at write time, but changing would require
  623. // larger refactoring
  624. return record.toString();
  625. }
  626. protected void addIfNotBlank(ANVLRecord record, String label, String value) {
  627. if(StringUtils.isNotBlank(value)) {
  628. record.addLabelValue(label, value);
  629. }
  630. }
  631. }