PageRenderTime 55ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/projects/heritrix-1.14.4/src/java/org/archive/crawler/framework/WriterPoolProcessor.java

https://gitlab.com/essere.lab.public/qualitas.class-corpus
Java | 734 lines | 467 code | 68 blank | 199 comment | 54 complexity | 0885d20474542027d80d371e67a2051b MD5 | raw file
  1. /* WriterPoolProcessor
  2. *
  3. * $Id: WriterPoolProcessor.java 6631 2009-11-09 21:10:20Z gojomo $
  4. *
  5. * Created on July 19th, 2006
  6. *
  7. * Copyright (C) 2006 Internet Archive.
  8. *
  9. * This file is part of the Heritrix web crawler (crawler.archive.org).
  10. *
  11. * Heritrix is free software; you can redistribute it and/or modify
  12. * it under the terms of the GNU Lesser Public License as published by
  13. * the Free Software Foundation; either version 2.1 of the License, or
  14. * any later version.
  15. *
  16. * Heritrix is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. * GNU Lesser Public License for more details.
  20. *
  21. * You should have received a copy of the GNU Lesser Public License
  22. * along with Heritrix; if not, write to the Free Software
  23. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  24. */
  25. package org.archive.crawler.framework;
  26. import java.io.DataInputStream;
  27. import java.io.DataOutputStream;
  28. import java.io.File;
  29. import java.io.FileInputStream;
  30. import java.io.FileNotFoundException;
  31. import java.io.FileOutputStream;
  32. import java.io.IOException;
  33. import java.io.ObjectInputStream;
  34. import java.io.StringWriter;
  35. import java.net.InetAddress;
  36. import java.net.UnknownHostException;
  37. import java.util.ArrayList;
  38. import java.util.Arrays;
  39. import java.util.Iterator;
  40. import java.util.List;
  41. import java.util.concurrent.atomic.AtomicInteger;
  42. import java.util.logging.Logger;
  43. import javax.management.AttributeNotFoundException;
  44. import javax.management.MBeanException;
  45. import javax.management.ReflectionException;
  46. import javax.xml.transform.SourceLocator;
  47. import javax.xml.transform.Templates;
  48. import javax.xml.transform.Transformer;
  49. import javax.xml.transform.TransformerConfigurationException;
  50. import javax.xml.transform.TransformerException;
  51. import javax.xml.transform.TransformerFactory;
  52. import javax.xml.transform.stream.StreamResult;
  53. import javax.xml.transform.stream.StreamSource;
  54. import org.archive.crawler.Heritrix;
  55. import org.archive.crawler.datamodel.CoreAttributeConstants;
  56. import org.archive.crawler.datamodel.CrawlHost;
  57. import org.archive.crawler.datamodel.CrawlOrder;
  58. import org.archive.crawler.datamodel.CrawlURI;
  59. import org.archive.crawler.datamodel.FetchStatusCodes;
  60. import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
  61. import org.archive.crawler.event.CrawlStatusListener;
  62. import org.archive.crawler.settings.SimpleType;
  63. import org.archive.crawler.settings.StringList;
  64. import org.archive.crawler.settings.Type;
  65. import org.archive.crawler.settings.XMLSettingsHandler;
  66. import org.archive.io.ObjectPlusFilesInputStream;
  67. import org.archive.io.WriterPool;
  68. import org.archive.io.WriterPoolMember;
  69. /**
  70. * Abstract implementation of a file pool processor.
  71. * Subclass to implement for a particular {@link WriterPoolMember} instance.
  72. * @author Parker Thompson
  73. * @author stack
  74. */
  75. public abstract class WriterPoolProcessor extends Processor
  76. implements CoreAttributeConstants, CrawlStatusListener, FetchStatusCodes {
  77. private static final long serialVersionUID = 1L;
  78. private final Logger logger = Logger.getLogger(this.getClass().getName());
  79. /**
  80. * Key to use asking settings for file compression value.
  81. */
  82. public static final String ATTR_COMPRESS = "compress";
  83. /**
  84. * Default as to whether we do compression of files.
  85. */
  86. public static final boolean DEFAULT_COMPRESS = true;
  87. /**
  88. * Key to use asking settings for file prefix value.
  89. */
  90. public static final String ATTR_PREFIX = "prefix";
  91. /**
  92. * Key to use asking settings for arc path value.
  93. */
  94. public static final String ATTR_PATH ="path";
  95. /**
  96. * Key to use asking settings for file suffix value.
  97. */
  98. public static final String ATTR_SUFFIX = "suffix";
  99. /**
  100. * Key to use asking settings for file max size value.
  101. */
  102. public static final String ATTR_MAX_SIZE_BYTES = "max-size-bytes";
  103. /**
  104. * Key to get maximum pool size.
  105. *
  106. * This key is for maximum files active in the pool.
  107. */
  108. public static final String ATTR_POOL_MAX_ACTIVE = "pool-max-active";
  109. /**
  110. * Key to get maximum wait on pool object before we give up and
  111. * throw IOException.
  112. */
  113. public static final String ATTR_POOL_MAX_WAIT = "pool-max-wait";
  114. /**
  115. * Key for the maximum bytes to write attribute.
  116. */
  117. public static final String ATTR_MAX_BYTES_WRITTEN =
  118. "total-bytes-to-write";
  119. /**
  120. * Key for whether to skip writing records of content-digest repeats
  121. */
  122. public static final String ATTR_SKIP_IDENTICAL_DIGESTS =
  123. "skip-identical-digests";
  124. /**
  125. * CrawlURI annotation indicating no record was written
  126. */
  127. protected static final String ANNOTATION_UNWRITTEN = "unwritten";
/**
 * Default maximum file size.
 * @return Size in bytes at which a writer file should be rolled over
 * to a new file (used when the ATTR_MAX_SIZE_BYTES setting is absent).
 */
public abstract long getDefaultMaxFileSize();
  132. /**
  133. * Default path list.
  134. *
  135. * TODO: Confirm this one gets picked up.
  136. */
  137. private static final String [] DEFAULT_PATH = {"crawl-store"};
  138. /**
  139. * Reference to pool.
  140. */
  141. transient private WriterPool pool = null;
  142. /**
  143. * Total number of bytes written to disc.
  144. */
  145. private long totalBytesWritten = 0;
  146. /**
  147. * Calculate metadata once only.
  148. */
  149. transient private List<String> cachedMetadata = null;
/**
 * Convenience constructor supplying a generic description.
 * @param name Name of this processor.
 */
public WriterPoolProcessor(String name) {
    this(name, "Pool of files processor");
}
  156. /**
  157. * @param name Name of this processor.
  158. * @param description Description for this processor.
  159. */
  160. public WriterPoolProcessor(final String name,
  161. final String description) {
  162. super(name, description);
  163. Type e = addElementToDefinition(
  164. new SimpleType(ATTR_COMPRESS, "Compress files when " +
  165. "writing to disk.", new Boolean(DEFAULT_COMPRESS)));
  166. e.setOverrideable(false);
  167. e = addElementToDefinition(
  168. new SimpleType(ATTR_PREFIX,
  169. "File prefix. " +
  170. "The text supplied here will be used as a prefix naming " +
  171. "writer files. For example if the prefix is 'IAH', " +
  172. "then file names will look like " +
  173. "IAH-20040808101010-0001-HOSTNAME.arc.gz " +
  174. "...if writing ARCs (The prefix will be " +
  175. "separated from the date by a hyphen).",
  176. WriterPoolMember.DEFAULT_PREFIX));
  177. e = addElementToDefinition(
  178. new SimpleType(ATTR_SUFFIX, "Suffix to tag onto " +
  179. "files. '${HOSTNAME_ADMINPORT}' in the suffix " +
  180. "will be replaced with the local hostname and " +
  181. "web UI port. '${HOSTNAME}' in the suffix will be " +
  182. "replaced with the local hostname. If empty, no "+
  183. "suffix will be added.",
  184. WriterPoolMember.DEFAULT_SUFFIX));
  185. e.setOverrideable(false);
  186. e = addElementToDefinition(
  187. new SimpleType(ATTR_MAX_SIZE_BYTES, "Max size of each file",
  188. new Long(getDefaultMaxFileSize())));
  189. e.setOverrideable(false);
  190. e = addElementToDefinition(
  191. new StringList(ATTR_PATH, "Where to files. " +
  192. "Supply absolute or relative path. If relative, files " +
  193. "will be written relative to " +
  194. "the " + CrawlOrder.ATTR_DISK_PATH + "setting." +
  195. " If more than one path specified, we'll round-robin" +
  196. " dropping files to each. This setting is safe" +
  197. " to change midcrawl (You can remove and add new dirs" +
  198. " as the crawler progresses).", getDefaultPath()));
  199. e.setOverrideable(false);
  200. e = addElementToDefinition(new SimpleType(ATTR_POOL_MAX_ACTIVE,
  201. "Maximum active files in pool. " +
  202. "This setting cannot be varied over the life of a crawl.",
  203. new Integer(WriterPool.DEFAULT_MAX_ACTIVE)));
  204. e.setOverrideable(false);
  205. e = addElementToDefinition(new SimpleType(ATTR_POOL_MAX_WAIT,
  206. "Maximum time to wait on pool element" +
  207. " (milliseconds). This setting cannot be varied over the life" +
  208. " of a crawl.",
  209. new Integer(WriterPool.DEFAULT_MAXIMUM_WAIT)));
  210. e.setOverrideable(false);
  211. e = addElementToDefinition(new SimpleType(ATTR_MAX_BYTES_WRITTEN,
  212. "Total file bytes to write to disk." +
  213. " Once the size of all files on disk has exceeded this " +
  214. "limit, this processor will stop the crawler. " +
  215. "A value of zero means no upper limit.", new Long(0)));
  216. e.setOverrideable(false);
  217. e.setExpertSetting(true);
  218. e = addElementToDefinition(new SimpleType(ATTR_SKIP_IDENTICAL_DIGESTS,
  219. "Whether to skip the writing of a record when URI " +
  220. "history information is available and indicates the " +
  221. "prior fetch had an identical content digest. " +
  222. "Default is false.", new Boolean(false)));
  223. e.setOverrideable(true);
  224. e.setExpertSetting(true);
  225. }
/**
 * @return Default output directory list; relative entries are later
 * resolved against the crawl's disk path in {@link #getOutputDirs()}.
 */
protected String [] getDefaultPath() {
    return DEFAULT_PATH;
}
/**
 * One-time startup: register for crawl-status events, create the
 * writer pool, and — when the crawl is a checkpoint recovery —
 * restore the saved writer serial number.
 */
public synchronized void initialTasks() {
    // Add this class to crawl state listeners and setup pool.
    getSettingsHandler().getOrder().getController().
        addCrawlStatusListener(this);
    setupPool(new AtomicInteger());
    // Run checkpoint recovery code.
    if (getSettingsHandler().getOrder().getController().
            isCheckpointRecover()) {
        checkpointRecover();
    }
}
  240. protected AtomicInteger getSerialNo() {
  241. return ((WriterPool)getPool()).getSerialNo();
  242. }
/**
 * Set up pool of files.
 * Implementations construct the concrete pool and are expected to make
 * it available via {@link #getPool()}.
 * @param serialNo Serial-number source for naming new writer files.
 */
protected abstract void setupPool(final AtomicInteger serialNo);
/**
 * Writes a CrawlURI and its associated data to store file.
 *
 * Currently this method understands the following uri types: dns, http,
 * and https.
 *
 * @param curi CrawlURI to process.
 */
protected abstract void innerProcess(CrawlURI curi);
  256. protected void checkBytesWritten() {
  257. long max = getMaxToWrite();
  258. if (max <= 0) {
  259. return;
  260. }
  261. if (max <= this.totalBytesWritten) {
  262. getController().requestCrawlStop("Finished - Maximum bytes (" +
  263. Long.toString(max) + ") written");
  264. }
  265. }
  266. /**
  267. * Whether the given CrawlURI should be written to archive files.
  268. * Annotates CrawlURI with a reason for any negative answer.
  269. *
  270. * @param curi CrawlURI
  271. * @return true if URI should be written; false otherwise
  272. */
  273. protected boolean shouldWrite(CrawlURI curi) {
  274. // check for duplicate content write suppression
  275. if(((Boolean)getUncheckedAttribute(curi, ATTR_SKIP_IDENTICAL_DIGESTS))
  276. && IdenticalDigestDecideRule.hasIdenticalDigest(curi)) {
  277. curi.addAnnotation(ANNOTATION_UNWRITTEN + ":identicalDigest");
  278. return false;
  279. }
  280. String scheme = curi.getUURI().getScheme().toLowerCase();
  281. // TODO: possibly move this sort of isSuccess() test into CrawlURI
  282. boolean retVal;
  283. if (scheme.equals("dns")) {
  284. retVal = curi.getFetchStatus() == S_DNS_SUCCESS;
  285. } else if (scheme.equals("http") || scheme.equals("https")) {
  286. retVal = curi.getFetchStatus() > 0 && curi.isHttpTransaction();
  287. } else if (scheme.equals("ftp")) {
  288. retVal = curi.getFetchStatus() > 0;
  289. } else {
  290. // unsupported scheme
  291. curi.addAnnotation(ANNOTATION_UNWRITTEN + ":scheme");
  292. return false;
  293. }
  294. if (retVal == false) {
  295. // status not deserving writing
  296. curi.addAnnotation(ANNOTATION_UNWRITTEN + ":status");
  297. return false;
  298. }
  299. return true;
  300. }
  301. /**
  302. * Return IP address of given URI suitable for recording (as in a
  303. * classic ARC 5-field header line).
  304. *
  305. * @param curi CrawlURI
  306. * @return String of IP address
  307. */
  308. protected String getHostAddress(CrawlURI curi) {
  309. // special handling for DNS URIs: want address of DNS server
  310. if(curi.getUURI().getScheme().toLowerCase().equals("dns")) {
  311. return curi.getString(A_DNS_SERVER_IP_LABEL);
  312. }
  313. // otherwise, host referenced in URI
  314. CrawlHost h = getController().getServerCache().getHostFor(curi);
  315. if (h == null) {
  316. throw new NullPointerException("Crawlhost is null for " +
  317. curi + " " + curi.getVia());
  318. }
  319. InetAddress a = h.getIP();
  320. if (a == null) {
  321. throw new NullPointerException("Address is null for " +
  322. curi + " " + curi.getVia() + ". Address " +
  323. ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP)?
  324. "was never looked up.":
  325. (System.currentTimeMillis() - h.getIpFetched()) +
  326. " ms ago."));
  327. }
  328. return h.getIP().getHostAddress();
  329. }
  330. /**
  331. * Version of getAttributes that catches and logs exceptions
  332. * and returns null if failure to fetch the attribute.
  333. * @param name Attribute name.
  334. * @return Attribute or null.
  335. */
  336. public Object getAttributeUnchecked(String name) {
  337. Object result = null;
  338. try {
  339. result = super.getAttribute(name);
  340. } catch (AttributeNotFoundException e) {
  341. logger.warning(e.getLocalizedMessage());
  342. } catch (MBeanException e) {
  343. logger.warning(e.getLocalizedMessage());
  344. } catch (ReflectionException e) {
  345. logger.warning(e.getLocalizedMessage());
  346. }
  347. return result;
  348. }
  349. /**
  350. * Max size we want files to be (bytes).
  351. *
  352. * Default is ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE. Note that ARC
  353. * files will usually be bigger than maxSize; they'll be maxSize + length
  354. * to next boundary.
  355. * @return ARC maximum size.
  356. */
  357. public long getMaxSize() {
  358. Object obj = getAttributeUnchecked(ATTR_MAX_SIZE_BYTES);
  359. return (obj == null)? getDefaultMaxFileSize(): ((Long)obj).longValue();
  360. }
  361. public String getPrefix() {
  362. Object obj = getAttributeUnchecked(ATTR_PREFIX);
  363. return (obj == null)? WriterPoolMember.DEFAULT_PREFIX: (String)obj;
  364. }
  365. @SuppressWarnings("unchecked")
  366. public List<File> getOutputDirs() {
  367. Object obj = getAttributeUnchecked(ATTR_PATH);
  368. List list = (obj == null)? Arrays.asList(DEFAULT_PATH): (StringList)obj;
  369. ArrayList<File> results = new ArrayList<File>();
  370. for (Iterator i = list.iterator(); i.hasNext();) {
  371. String path = (String)i.next();
  372. File f = new File(path);
  373. if (!f.isAbsolute()) {
  374. f = new File(getController().getDisk(), path);
  375. }
  376. if (!f.exists()) {
  377. try {
  378. f.mkdirs();
  379. } catch (Exception e) {
  380. e.printStackTrace();
  381. continue;
  382. }
  383. }
  384. results.add(f);
  385. }
  386. return results;
  387. }
  388. public boolean isCompressed() {
  389. Object obj = getAttributeUnchecked(ATTR_COMPRESS);
  390. return (obj == null)? DEFAULT_COMPRESS:
  391. ((Boolean)obj).booleanValue();
  392. }
  393. /**
  394. * @return Returns the poolMaximumActive.
  395. */
  396. public int getPoolMaximumActive() {
  397. Object obj = getAttributeUnchecked(ATTR_POOL_MAX_ACTIVE);
  398. return (obj == null)? WriterPool.DEFAULT_MAX_ACTIVE:
  399. ((Integer)obj).intValue();
  400. }
  401. /**
  402. * @return Returns the poolMaximumWait.
  403. */
  404. public int getPoolMaximumWait() {
  405. Object obj = getAttributeUnchecked(ATTR_POOL_MAX_WAIT);
  406. return (obj == null)? WriterPool.DEFAULT_MAXIMUM_WAIT:
  407. ((Integer)obj).intValue();
  408. }
  409. private String getHostname() {
  410. String hostname = "localhost.localdomain";
  411. try {
  412. hostname = InetAddress.getLocalHost().getCanonicalHostName();
  413. } catch (UnknownHostException ue) {
  414. logger.severe("Failed getHostAddress for this host: " + ue);
  415. }
  416. return hostname;
  417. }
  418. private int getPort() {
  419. if (Heritrix.getHttpServer() != null) {
  420. return Heritrix.getHttpServer().getPort();
  421. } else {
  422. return 0;
  423. }
  424. }
  425. public String getSuffix() {
  426. Object obj = getAttributeUnchecked(ATTR_SUFFIX);
  427. String sfx = (obj == null)?
  428. WriterPoolMember.DEFAULT_SUFFIX: (String)obj;
  429. sfx = sfx.trim();
  430. if (sfx.contains(WriterPoolMember.HOSTNAME_ADMINPORT_VARIABLE)
  431. || sfx.contains(WriterPoolMember.HOSTNAME_VARIABLE)) {
  432. String hostname = getHostname();
  433. sfx = sfx.replace(WriterPoolMember.HOSTNAME_ADMINPORT_VARIABLE, hostname + "-" + getPort());
  434. sfx = sfx.replace(WriterPoolMember.HOSTNAME_VARIABLE, hostname);
  435. }
  436. return sfx;
  437. }
  438. public long getMaxToWrite() {
  439. Object obj = getAttributeUnchecked(ATTR_MAX_BYTES_WRITTEN);
  440. return (obj == null)? 0: ((Long)obj).longValue();
  441. }
/**
 * CrawlStatusListener callback: crawl is about to end. No-op here.
 * @param sExitMessage Exit message (unused).
 */
public void crawlEnding(String sExitMessage) {
}
  444. public void crawlEnded(String sExitMessage) {
  445. // sExitMessage is unused.
  446. this.pool.close();
  447. }
/**
 * CrawlStatusListener callback: crawl has started. No-op here.
 * @param message Start message (unused).
 * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
 */
public void crawlStarted(String message) {
    // TODO Auto-generated method stub
}
  454. protected String getCheckpointStateFile() {
  455. return this.getClass().getName() + ".state";
  456. }
/**
 * CrawlStatusListener callback: checkpoint in progress. Saves the
 * writer serial number into the checkpoint directory, closes every
 * pool member so its file is complete on disk, then reopens the pool
 * seeded with the saved serial.
 * @param checkpointDir Directory to save checkpoint state into.
 * @throws IOException If the serial number cannot be persisted.
 */
public void crawlCheckpoint(File checkpointDir) throws IOException {
    int serial = getSerialNo().get();
    if (this.pool.getNumActive() > 0) {
        // If we have open active Archive files, up the serial number
        // so after checkpoint, we start at one past current number and
        // so the number we serialize, is one past current serialNo.
        // All this serial number manipulation should be fine in here since
        // we're paused checkpointing (Revisit if this assumption changes).
        serial = getSerialNo().incrementAndGet();
    }
    saveCheckpointSerialNumber(checkpointDir, serial);
    // Close all ARCs on checkpoint.
    try {
        this.pool.close();
    } finally {
        // Reopen on checkpoint.
        setupPool(new AtomicInteger(serial));
    }
}
/**
 * CrawlStatusListener callback: crawl is pausing. No-op here.
 * @param statusMessage Status message (unused).
 */
public void crawlPausing(String statusMessage) {
    // statusMessage is unused.
}
/**
 * CrawlStatusListener callback: crawl is paused. No-op here.
 * @param statusMessage Status message (unused).
 */
public void crawlPaused(String statusMessage) {
    // statusMessage is unused.
}
/**
 * CrawlStatusListener callback: crawl is resuming. No-op here.
 * @param statusMessage Status message (unused).
 */
public void crawlResuming(String statusMessage) {
    // statusMessage is unused.
}
/**
 * Custom deserialization: after the default state is read, register a
 * finish-task that recreates the transient writer pool once the whole
 * object graph has been restored.
 * @param stream Stream to read from; must actually be an
 * {@link ObjectPlusFilesInputStream} (the cast below relies on it).
 * @throws IOException On read failure.
 * @throws ClassNotFoundException On missing class during read.
 */
private void readObject(ObjectInputStream stream)
throws IOException, ClassNotFoundException {
    stream.defaultReadObject();
    ObjectPlusFilesInputStream coistream =
        (ObjectPlusFilesInputStream)stream;
    // Pool setup is deferred until the full graph is restored.
    coistream.registerFinishTask( new Runnable() {
        public void run() {
            setupPool(new AtomicInteger());
        }
    });
}
/** @return The writer pool (null until {@link #setupPool} has run). */
protected WriterPool getPool() {
    return pool;
}
/** @param pool Pool for this processor to draw writers from. */
protected void setPool(WriterPool pool) {
    this.pool = pool;
}
/** @return Running total of bytes written to disk by this processor. */
protected long getTotalBytesWritten() {
    return totalBytesWritten;
}
/** @param totalBytesWritten New running total of bytes written. */
protected void setTotalBytesWritten(long totalBytesWritten) {
    this.totalBytesWritten = totalBytesWritten;
}
/**
 * Called out of {@link #initialTasks()} when recovering a checkpoint.
 * Restore state.
 */
protected void checkpointRecover() {
    int serialNo = loadCheckpointSerialNumber();
    // -1 means the state file was missing/unreadable; keep the
    // pool's default serial in that case.
    if (serialNo != -1) {
        getSerialNo().set(serialNo);
    }
}
  518. /**
  519. * @return Serial number from checkpoint state file or if unreadable, -1
  520. * (Client should check for -1).
  521. */
  522. protected int loadCheckpointSerialNumber() {
  523. int result = -1;
  524. // If in recover mode, read in the Writer serial number saved
  525. // off when we checkpointed.
  526. File stateFile = new File(getSettingsHandler().getOrder()
  527. .getController().getCheckpointRecover().getDirectory(),
  528. getCheckpointStateFile());
  529. if (!stateFile.exists()) {
  530. logger.info(stateFile.getAbsolutePath()
  531. + " doesn't exist so cannot restore Writer serial number.");
  532. } else {
  533. DataInputStream dis = null;
  534. try {
  535. dis = new DataInputStream(new FileInputStream(stateFile));
  536. result = dis.readShort();
  537. } catch (FileNotFoundException e) {
  538. e.printStackTrace();
  539. } catch (IOException e) {
  540. e.printStackTrace();
  541. } finally {
  542. try {
  543. if (dis != null) {
  544. dis.close();
  545. }
  546. } catch (IOException e) {
  547. e.printStackTrace();
  548. }
  549. }
  550. }
  551. return result;
  552. }
/**
 * Persist the current writer serial number into the checkpoint
 * directory so {@link #loadCheckpointSerialNumber()} can restore it
 * on recovery.
 * NOTE(review): value is written with writeShort and read back with
 * readShort, so serials beyond Short.MAX_VALUE would be truncated --
 * confirm that is acceptable before changing either side.
 * @param checkpointDir Directory to write state file into.
 * @param serialNo Serial number to save.
 * @throws IOException On failure writing the state file.
 */
protected void saveCheckpointSerialNumber(final File checkpointDir,
        final int serialNo)
throws IOException {
    // Write out the current state of the ARCWriter serial number.
    File f = new File(checkpointDir, getCheckpointStateFile());
    DataOutputStream dos = new DataOutputStream(new FileOutputStream(f));
    try {
        dos.writeShort(serialNo);
    } finally {
        dos.close();
    }
}
  565. /**
  566. * Return list of metadatas to add to first arc file metadata record.
  567. *
  568. * Default is to stylesheet the order file. To specify stylesheet,
  569. * override {@link #getFirstrecordStylesheet()}.
  570. *
  571. * Get xml files from settingshandler. Currently order file is the
  572. * only xml file. We're NOT adding seeds to meta data.
  573. *
  574. * @return List of strings and/or files to add to arc file as metadata or
  575. * null.
  576. */
  577. public synchronized List<String> getMetadata() {
  578. if (this.cachedMetadata != null) {
  579. return this.cachedMetadata;
  580. }
  581. return cacheMetadata();
  582. }
  583. protected synchronized List<String> cacheMetadata() {
  584. // If no stylesheet, return empty metadata.
  585. if (getFirstrecordStylesheet() == null ||
  586. getFirstrecordStylesheet().length() == 0) {
  587. this.cachedMetadata = new ArrayList<String>(1);
  588. this.cachedMetadata.add("");
  589. return this.cachedMetadata;
  590. }
  591. List<String> result = null;
  592. if (!XMLSettingsHandler.class.isInstance(getSettingsHandler())) {
  593. logger.warning("Expected xml settings handler (No warcinfo).");
  594. // Early return
  595. return result;
  596. }
  597. XMLSettingsHandler xsh = (XMLSettingsHandler)getSettingsHandler();
  598. File orderFile = xsh.getOrderFile();
  599. if (!orderFile.exists() || !orderFile.canRead()) {
  600. logger.severe("File " + orderFile.getAbsolutePath() +
  601. " is does not exist or is not readable.");
  602. } else {
  603. result = new ArrayList<String>(1);
  604. result.add(getFirstrecordBody(orderFile));
  605. }
  606. this.cachedMetadata = result;
  607. return this.cachedMetadata;
  608. }
/**
 * @return Full path to stylesheet (Its read off the CLASSPATH
 * as resource), or null (the default) meaning "no stylesheet"; in that
 * case {@link #cacheMetadata()} produces empty metadata.
 */
protected String getFirstrecordStylesheet() {
    return null;
}
  616. /**
  617. * Write the arc metadata body content.
  618. *
  619. * Its based on the order xml file but into this base we'll add other info
  620. * such as machine ip.
  621. *
  622. * @param orderFile Order file.
  623. *
  624. * @return String that holds the arc metaheader body.
  625. */
  626. protected String getFirstrecordBody(File orderFile) {
  627. String result = null;
  628. TransformerFactory factory = TransformerFactory.newInstance();
  629. Templates templates = null;
  630. Transformer xformer = null;
  631. try {
  632. templates = factory.newTemplates(new StreamSource(
  633. this.getClass().getResourceAsStream(getFirstrecordStylesheet())));
  634. xformer = templates.newTransformer();
  635. // Below parameter names must match what is in the stylesheet.
  636. xformer.setParameter("software", "Heritrix " +
  637. Heritrix.getVersion() + " http://crawler.archive.org");
  638. xformer.setParameter("ip",
  639. InetAddress.getLocalHost().getHostAddress());
  640. xformer.setParameter("hostname",
  641. InetAddress.getLocalHost().getCanonicalHostName());
  642. StreamSource source = new StreamSource(
  643. new FileInputStream(orderFile));
  644. StringWriter writer = new StringWriter();
  645. StreamResult target = new StreamResult(writer);
  646. xformer.transform(source, target);
  647. result= writer.toString();
  648. } catch (TransformerConfigurationException e) {
  649. logger.severe("Failed transform " + e);
  650. } catch (FileNotFoundException e) {
  651. logger.severe("Failed transform, file not found " + e);
  652. } catch (UnknownHostException e) {
  653. logger.severe("Failed transform, unknown host " + e);
  654. } catch(TransformerException e) {
  655. SourceLocator locator = e.getLocator();
  656. int col = locator.getColumnNumber();
  657. int line = locator.getLineNumber();
  658. String publicId = locator.getPublicId();
  659. String systemId = locator.getSystemId();
  660. logger.severe("Transform error " + e + ", col " + col + ", line " +
  661. line + ", publicId " + publicId + ", systemId " + systemId);
  662. }
  663. return result;
  664. }
  665. }