PageRenderTime 51ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/projects/heritrix-1.14.4/src/java/org/archive/crawler/writer/Kw3WriterProcessor.java

https://gitlab.com/essere.lab.public/qualitas.class-corpus
Java | 466 lines | 276 code | 56 blank | 134 comment | 26 complexity | 45bbcb438e6154de29dcddb205c32c51 MD5 | raw file
  1. /* Created on 2006-okt-03
  2. *
  3. * Copyright (C) 2006 National Library of Sweden.
  4. *
  5. * This program is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public License
  7. * as published by the Free Software Foundation; either version 2
  8. * of the License, or (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  18. */
  19. package org.archive.crawler.writer;
  20. import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
  21. import java.io.ByteArrayOutputStream;
  22. import java.io.File;
  23. import java.io.FileOutputStream;
  24. import java.io.IOException;
  25. import java.io.OutputStream;
  26. import java.net.InetAddress;
  27. import java.security.MessageDigest;
  28. import java.security.NoSuchAlgorithmException;
  29. import java.util.logging.Level;
  30. import java.util.logging.Logger;
  31. import javax.management.AttributeNotFoundException;
  32. import javax.management.MBeanException;
  33. import javax.management.ReflectionException;
  34. import org.archive.crawler.datamodel.CoreAttributeConstants;
  35. import org.archive.crawler.datamodel.CrawlHost;
  36. import org.archive.crawler.datamodel.CrawlURI;
  37. import org.archive.crawler.framework.Processor;
  38. import org.archive.crawler.settings.SimpleType;
  39. import org.archive.crawler.settings.Type;
  40. import org.archive.io.ReplayInputStream;
  41. import org.archive.crawler.writer.Kw3Constants;
  42. /**
  43. * Processor module that writes the results of successful fetches to
  44. * files on disk. These files are MIME-files of the type used by the
  45. * Swedish National Library's Kulturarw3 web harvesting [http://www.kb.se/kw3/].
  46. *
  47. * Each URI gets written to its own file and has a path consisting of:
  48. * <ul>
  49. * <li> A dir named with the first two chars of the website's md5. </li>
  50. * <li> A dir named after the website. </li>
  51. * <li> 'current' - a dir indicating that this is the directory being written
  52. * to by the ongoing crawl. </li>
  53. * <li> A file on the format <md5 of url>.<fetchtime in seconds> </li>
  54. * </ul>
  55. * Example: '/53/www.kb.se/current/6879ad79c0ccf886ee8ca55d80e5d6a1.1169211837'
  56. *
  57. * The MIME-file itself consists of three parts:
  58. * <ul>
  59. * <li> 1. ArchiveInfo - Metadata about the file and its content. </li>
  60. * <li> 2. Header - The HTTP response header. </li>
  61. * <li> 3. Content - The HTTP response content, plus content-type. </li>
  62. * </ul>
  63. *
  64. * @author oskar
  65. */
  66. public class Kw3WriterProcessor extends Processor implements
  67. CoreAttributeConstants, Kw3Constants {
  68. private static final long serialVersionUID = 7171448068924684594L;
  69. private static String COLON = ":";
  70. private static String WS = " ";
  71. private static String LF = "\n";
  72. /**
  73. * Logger.
  74. */
  75. private static final Logger logger =
  76. Logger.getLogger(Kw3WriterProcessor.class.getName());
  77. /**
  78. * Key to use asking settings for arc path value.
  79. */
  80. public static final String ATTR_PATH ="path";
  81. /**
  82. * Default path.
  83. */
  84. private static final String DEFAULT_PATH = "arcs";
  85. /**
  86. * Key to use asking settings for max size value.
  87. */
  88. public static final String ATTR_MAX_SIZE_BYTES = "max-size-bytes";
  89. /**
  90. * Default max file size.
  91. */
  92. public static final int DEFAULT_MAX_FILE_SIZE = 10000000;
  93. /**
  94. * Key to use asking settings if chmod should be execuated .
  95. */
  96. public static final String ATTR_CHMOD = "chmod";
  97. /**
  98. * Key to use asking settings for the new chmod value.
  99. */
  100. public static final String ATTR_CHMOD_VALUE = "chmod-value";
  101. /**
  102. * Default value for permissions.
  103. */
  104. public static final String DEFAULT_CHMOD_VALUE = "777";
  105. /**
  106. * Key for the maximum ARC bytes to write attribute.
  107. */
  108. public static final String ATTR_MAX_BYTES_WRITTEN = "total-bytes-to-write";
  109. /**
  110. * Key for the collection attribute.
  111. */
  112. public static final String ATTR_COLLECTION = "collection";
  113. /**
  114. * Default value for collection.
  115. */
  116. public static final String DEFAULT_COLLECTION_VALUE = "kw3";
  117. /**
  118. * Key for the harvester attribute.
  119. */
  120. public static final String ATTR_HARVESTER = "harvester";
  121. /**
  122. * Default value for harvester.
  123. */
  124. public static final String DEFAULT_HARVESTER_VALUE = "heritrix";
  125. private static String BOUNDARY_START = "KulturArw3_";
  126. /*
  127. * Private members for settings
  128. */
  129. private File arcsDir;
  130. private boolean chmod;
  131. private String chmodValue;
  132. private int maxSize;
  133. private String collection;
  134. private String harvester;
  135. /**
  136. * @param name Name of this processor.
  137. */
  138. public Kw3WriterProcessor(String name) {
  139. super(name, "Kw3Writer processor. " +
  140. "A writer that writes files in the MIME format of The " +
  141. "Swedish National Library. See this class's javadoc for" +
  142. "format exposition.");
  143. Type e;
  144. e = addElementToDefinition(new SimpleType(ATTR_PATH,
  145. "Top-level directory for archive files.", DEFAULT_PATH));
  146. e.setOverrideable(false);
  147. e = addElementToDefinition(new SimpleType(ATTR_COLLECTION,
  148. "Name of collection.", DEFAULT_COLLECTION_VALUE));
  149. e.setOverrideable(false);
  150. e = addElementToDefinition(new SimpleType(ATTR_HARVESTER,
  151. "Name of the harvester that is used for the web harvesting.",
  152. DEFAULT_HARVESTER_VALUE));
  153. e.setOverrideable(false);
  154. e = addElementToDefinition(new SimpleType(ATTR_MAX_SIZE_BYTES,
  155. "Max size of each file", new Integer(DEFAULT_MAX_FILE_SIZE)));
  156. e.setOverrideable(false);
  157. e = addElementToDefinition(new SimpleType(ATTR_CHMOD,
  158. "Should permissions be changed for the newly created dirs",
  159. new Boolean(true)));
  160. e.setOverrideable(false);
  161. e = addElementToDefinition(new SimpleType(ATTR_CHMOD_VALUE,
  162. "What should the permissions be set to." +
  163. " Given as three octal digits, as to the UNIX 'chmod' command." +
  164. " Ex. 777 for all permissions to everyone.",
  165. DEFAULT_CHMOD_VALUE));
  166. e.setOverrideable(false);
  167. }
  168. protected void initialTasks () {
  169. try {
  170. String arcsDirPath = (String) getAttribute(ATTR_PATH);
  171. this.arcsDir = new File(arcsDirPath);
  172. if (!this.arcsDir.isAbsolute())
  173. this.arcsDir = new File(getController().getDisk(), arcsDirPath);
  174. this.collection = (String) getAttribute(ATTR_COLLECTION);
  175. this.harvester = (String) getAttribute(ATTR_HARVESTER);
  176. this.chmod = (Boolean) getAttribute(ATTR_CHMOD);
  177. this.chmodValue = (String) getAttribute(ATTR_CHMOD_VALUE);
  178. this.maxSize = (Integer) getAttribute(ATTR_MAX_SIZE_BYTES);
  179. } catch (AttributeNotFoundException e) {
  180. logger.log(Level.WARNING, "attribute error", e);
  181. } catch (MBeanException e) {
  182. logger.log(Level.WARNING, "attribute error", e);
  183. } catch (ReflectionException e) {
  184. logger.log(Level.WARNING, "attribute error", e);
  185. }
  186. }
  187. protected void innerProcess(CrawlURI curi) {
  188. // Only successful fetches are written.
  189. if (!curi.isSuccess())
  190. return;
  191. // Only http and https schemes are supported.
  192. String scheme = curi.getUURI().getScheme().toLowerCase();
  193. if (!"http".equalsIgnoreCase(scheme) && !"https".equalsIgnoreCase(scheme))
  194. return;
  195. // Write the MIME-file
  196. try {
  197. writeMimeFile(curi);
  198. } catch (IOException e) {
  199. logger.log(Level.WARNING, "i/o error", e);
  200. }
  201. }
  202. /*
  203. * The actual writing of the Kulturarw3 MIME-file.
  204. *
  205. * The MIME-file consists of three parts:
  206. * 1. ArchiveInfo - Metadata about the file and its content.
  207. * 2. Header - The HTTP response header.
  208. * 3. Content - The HTTP response content, plus content-type.
  209. *
  210. * For more on this format, see '?'.
  211. */
  212. protected void writeMimeFile(CrawlURI curi) throws IOException {
  213. ReplayInputStream ris = null;
  214. OutputStream out = null;
  215. try {
  216. String boundary = BOUNDARY_START + stringToMD5(curi.toString());
  217. ris = curi.getHttpRecorder().getRecordedInput().
  218. getReplayInputStream();
  219. out = initOutputStream(curi);
  220. // Part 1: Archive info
  221. writeArchiveInfoPart(boundary, curi, ris, out);
  222. // Part 2: Header info + HTTP header
  223. writeHeaderPart(boundary, ris, out);
  224. // Part 3: Content info + HTTP content
  225. writeContentPart(boundary, curi, ris, out);
  226. // And finally the terminator string
  227. String terminator = "\n--" + boundary + "--\n";
  228. out.write(terminator.getBytes());
  229. } finally {
  230. if (ris != null)
  231. ris.close();
  232. if (out != null)
  233. out.close();
  234. }
  235. }
  236. /*
  237. * Get the OutputStream for the file to write to.
  238. *
  239. * It has a path consisting of:
  240. * 1. A dir named with the first two chars of the website's md5.
  241. * 2. A dir named after the website.
  242. * 3. 'current' - a dir indicating that this is the directory being written
  243. * to by the ongoing crawl.
  244. * 4. A file on the format <md5 of url>.<fetchtime in seconds>
  245. *
  246. * Example: '/53/www.kb.se/current/6879ad79c0ccf886ee8ca55d80e5d6a1.1169211837'
  247. */
  248. protected OutputStream initOutputStream(CrawlURI curi) throws IOException {
  249. String uri = curi.toString();
  250. int port = curi.getUURI().getPort();
  251. String host = (port == 80 || port <= 0) ?
  252. curi.getUURI().getHost() : curi.getUURI().getHost() + ":" + port;
  253. long fetchTime = curi.getLong(A_FETCH_BEGAN_TIME) / 1000;
  254. String md5 = stringToMD5(host);
  255. File dir = new File(this.arcsDir, md5.substring(0, 2) + "/" + host +
  256. "/current");
  257. if (!dir.exists()) {
  258. dir.mkdirs();
  259. if (this.chmod)
  260. chmods(dir, this.arcsDir);
  261. }
  262. md5 = stringToMD5(uri);
  263. File arcFile = new File(dir, md5 + "." + fetchTime);
  264. return new FastBufferedOutputStream(new FileOutputStream(arcFile));
  265. }
  266. protected void writeArchiveInfoPart(String boundary, CrawlURI curi,
  267. ReplayInputStream ris, OutputStream out)
  268. throws IOException {
  269. // Get things we need to write in this part
  270. String uri = curi.toString();
  271. String ip = getHostAddress(curi);
  272. long headerLength = ris.getHeaderSize();
  273. long contentLength = ris.getContentSize();
  274. long archiveTime = System.currentTimeMillis() / 1000; // Fetchtime in seconds
  275. int statusCode = curi.getFetchStatus();
  276. String headerMd5 = null;
  277. Object contentMd5 = null;
  278. // Get headerMd5
  279. ByteArrayOutputStream baos = new ByteArrayOutputStream();
  280. ris.readHeaderTo(baos);
  281. headerMd5 = stringToMD5(baos.toString());
  282. // Get contentMd5
  283. contentMd5 = curi.getContentDigest();
  284. if (contentMd5 != null)
  285. contentMd5 = getHexString((byte[]) contentMd5);
  286. StringBuffer buffer = new StringBuffer();
  287. buffer.append("MIME-version: 1.1" + LF);
  288. buffer.append("Content-Type: multipart/mixed; boundary=" + boundary + LF);
  289. buffer.append("HTTP-Part: ArchiveInfo" + LF);
  290. buffer.append(COLLECTION_KEY + COLON + WS + this.collection + LF);
  291. buffer.append(HARVESTER_KEY + COLON + WS + this.harvester + LF);
  292. buffer.append(URL_KEY + COLON + WS + uri + LF);
  293. buffer.append(IP_ADDRESS_KEY + COLON + WS + ip + LF);
  294. buffer.append(HEADER_LENGTH_KEY + COLON + WS + headerLength + LF);
  295. buffer.append(HEADER_MD5_KEY + COLON + WS + headerMd5 + LF);
  296. buffer.append(CONTENT_LENGTH_KEY + COLON + WS + contentLength + LF);
  297. buffer.append(CONTENT_MD5_KEY + COLON + WS + contentMd5 + LF);
  298. buffer.append(ARCHIVE_TIME_KEY + COLON + WS+ archiveTime + LF);
  299. buffer.append(STATUS_CODE_KEY + COLON + WS + statusCode + LF + LF);
  300. out.write(buffer.toString().getBytes());
  301. }
  302. protected void writeHeaderPart(String boundary, ReplayInputStream ris,
  303. OutputStream out)
  304. throws IOException {
  305. StringBuffer buffer = new StringBuffer();
  306. buffer.append("--" + boundary + LF);
  307. buffer.append("Content-Type: text/plain; charset=\"US-ascii\"" + LF);
  308. buffer.append("HTTP-Part: Header" + LF + LF );
  309. out.write(buffer.toString().getBytes());
  310. ris.readHeaderTo(out);
  311. }
  312. protected void writeContentPart(String boundary, CrawlURI curi,
  313. ReplayInputStream ris, OutputStream out)
  314. throws IOException {
  315. // Get things we need to write in this part
  316. String uri = curi.toString();
  317. String contentType = curi.getContentType();
  318. long contentLength = ris.getContentSize();
  319. // Only write content if there is some
  320. if (contentLength == 0) return;
  321. StringBuffer buffer = new StringBuffer();
  322. buffer.append("--" + boundary + LF);
  323. buffer.append("Content-Type: " + contentType + LF);
  324. buffer.append("HTTP-Part: Content" + LF + LF);
  325. out.write(buffer.toString().getBytes());
  326. if (contentLength > this.maxSize) {
  327. ris.readContentTo(out, this.maxSize);
  328. logger.info(" Truncated url: " + uri + ", Size: " + contentLength +
  329. ", Content-type: " + contentType);
  330. } else {
  331. ris.readContentTo(out);
  332. }
  333. }
  334. // --- Private helper functions --- //
  335. /*
  336. * Get a MD5 checksum based on a String.
  337. */
  338. private String stringToMD5(String str) {
  339. try {
  340. byte b[] = str.getBytes();
  341. MessageDigest md = MessageDigest.getInstance("MD5");
  342. md.update(b);
  343. byte[] digest = md.digest();
  344. return getHexString(digest);
  345. } catch (NoSuchAlgorithmException e) {
  346. logger.log(Level.WARNING, "md5 error", e);
  347. }
  348. return null;
  349. }
  350. /*
  351. * Fast convert a byte array to a hex string with possible leading zero.
  352. */
  353. private String getHexString(byte[] b) {
  354. StringBuffer sb = new StringBuffer();
  355. for (int i = 0; i < b.length; i++) {
  356. String tmp = Integer.toHexString(b[i] & 0xff);
  357. if (tmp.length() < 2)
  358. sb.append("0" + tmp);
  359. else
  360. sb.append(tmp);
  361. }
  362. return sb.toString();
  363. }
  364. /*
  365. * Chmods for all newly created directories.
  366. */
  367. private void chmods(File dir, File arcsDir) {
  368. String topdir = arcsDir.getAbsolutePath();
  369. chmod(dir, this.chmodValue);
  370. File parent = dir.getParentFile();
  371. while (!parent.getAbsolutePath().equalsIgnoreCase((topdir))) {
  372. chmod(parent, this.chmodValue);
  373. parent = parent.getParentFile();
  374. }
  375. }
  376. /*
  377. * Chmod for a specific file or directory.
  378. */
  379. private void chmod(File file, String permissions) {
  380. Process proc = null;
  381. try {
  382. proc = Runtime.getRuntime().exec("chmod " + permissions + " " +
  383. file.getAbsolutePath());
  384. proc.waitFor();
  385. proc.getInputStream().close();
  386. proc.getOutputStream().close();
  387. proc.getErrorStream().close();
  388. } catch (IOException e) {
  389. logger.log(Level.WARNING, "chmod failed", e);
  390. } catch (InterruptedException e) {
  391. logger.log(Level.WARNING, "chmod failed", e);
  392. }
  393. }
  394. private String getHostAddress(CrawlURI curi) {
  395. CrawlHost h = getController().getServerCache().getHostFor(curi);
  396. if (h == null) {
  397. throw new NullPointerException("Crawlhost is null for " + curi + " " +
  398. curi.getVia());
  399. }
  400. InetAddress a = h.getIP();
  401. if (a == null) {
  402. throw new NullPointerException("Address is null for " + curi + " " +
  403. curi.getVia() + ". Address " +
  404. ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP) ?
  405. "was never looked up." :
  406. (System.currentTimeMillis() - h.getIpFetched()) + " ms ago."));
  407. }
  408. return h.getIP().getHostAddress();
  409. }
  410. }