PageRenderTime 25ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/projects/heritrix-1.14.4/src/java/org/archive/io/warc/WARCReader.java

https://gitlab.com/essere.lab.public/qualitas.class-corpus
Java | 290 lines | 170 code | 26 blank | 94 comment | 16 complexity | c5b5b4d181d9375202ccedb25f62292d MD5 | raw file
  1. /* $Id: WARCReader.java 4754 2006-11-28 02:03:03Z stack-sf $
  2. *
  3. * Created Aug 23, 2006
  4. *
  5. * Copyright (C) 2006 Internet Archive.
  6. *
  7. * This file is part of the Heritrix web crawler (crawler.archive.org).
  8. *
  9. * Heritrix is free software; you can redistribute it and/or modify
  10. * it under the terms of the GNU Lesser Public License as published by
  11. * the Free Software Foundation; either version 2.1 of the License, or
  12. * any later version.
  13. *
  14. * Heritrix is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. * GNU Lesser Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser Public License
  20. * along with Heritrix; if not, write to the Free Software
  21. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  22. */
  23. package org.archive.io.warc;
  24. import java.io.File;
  25. import java.io.IOException;
  26. import java.io.InputStream;
  27. import java.util.Iterator;
  28. import java.util.List;
  29. import org.apache.commons.cli.CommandLine;
  30. import org.apache.commons.cli.HelpFormatter;
  31. import org.apache.commons.cli.Option;
  32. import org.apache.commons.cli.Options;
  33. import org.apache.commons.cli.ParseException;
  34. import org.apache.commons.cli.PosixParser;
  35. import org.apache.commons.lang.NotImplementedException;
  36. import org.archive.io.ArchiveReader;
  37. import org.archive.io.ArchiveRecord;
  38. import org.archive.io.warc.WARCConstants;
  39. /**
  40. * WARCReader.
  41. * Go via {@link WARCReaderFactory} to get instance.
  42. * @author stack
  43. * @version $Date: 2006-11-27 18:03:03 -0800 (Mon, 27 Nov 2006) $ $Version$
  44. */
  45. public class WARCReader extends ArchiveReader implements WARCConstants {
  46. WARCReader() {
  47. super();
  48. }
  49. @Override
  50. protected void initialize(String i) {
  51. super.initialize(i);
  52. setVersion(WARC_VERSION);
  53. }
  54. /**
  55. * Skip over any trailing new lines at end of the record so we're lined up
  56. * ready to read the next.
  57. * @param record
  58. * @throws IOException
  59. */
  60. protected void gotoEOR(ArchiveRecord record) throws IOException {
  61. if (record.available() != 0) {
  62. throw new IOException("Record should be exhausted before coming " +
  63. "in here");
  64. }
  65. // Records end in 2*CRLF. Suck it up.
  66. readExpectedChar(getIn(), CRLF.charAt(0));
  67. readExpectedChar(getIn(), CRLF.charAt(1));
  68. readExpectedChar(getIn(), CRLF.charAt(0));
  69. readExpectedChar(getIn(), CRLF.charAt(1));
  70. }
  71. protected void readExpectedChar(final InputStream is, final int expected)
  72. throws IOException {
  73. int c = is.read();
  74. if (c != expected) {
  75. throw new IOException("Unexpected character " +
  76. Integer.toHexString(c) + "(Expecting " +
  77. Integer.toHexString(expected) + ")");
  78. }
  79. }
  80. /**
  81. * Create new WARC record.
  82. * Encapsulate housekeeping that has to do w/ creating new Record.
  83. * @param is InputStream to use.
  84. * @param offset Absolute offset into WARC file.
  85. * @return A WARCRecord.
  86. * @throws IOException
  87. */
  88. protected WARCRecord createArchiveRecord(InputStream is, long offset)
  89. throws IOException {
  90. return (WARCRecord)currentRecord(new WARCRecord(is,
  91. getReaderIdentifier(), offset, isDigest(), isStrict()));
  92. }
  93. @Override
  94. public void dump(boolean compress)
  95. throws IOException, java.text.ParseException {
  96. for (final Iterator<ArchiveRecord> i = iterator(); i.hasNext();) {
  97. ArchiveRecord r = i.next();
  98. System.out.println(r.getHeader().toString());
  99. r.dump();
  100. System.out.println();
  101. }
  102. }
  103. @Override
  104. public ArchiveReader getDeleteFileOnCloseReader(final File f) {
  105. throw new NotImplementedException("TODO");
  106. }
  107. @Override
  108. public String getDotFileExtension() {
  109. return DOT_WARC_FILE_EXTENSION;
  110. }
  111. @Override
  112. public String getFileExtension() {
  113. return WARC_FILE_EXTENSION;
  114. }
  115. // Static methods follow. Mostly for command-line processing.
  116. /**
  117. *
  118. * @param formatter Help formatter instance.
  119. * @param options Usage options.
  120. * @param exitCode Exit code.
  121. */
  122. private static void usage(HelpFormatter formatter, Options options,
  123. int exitCode) {
  124. formatter.printHelp("java org.archive.io.arc.WARCReader" +
  125. " [--digest=true|false] \\\n" +
  126. " [--format=cdx|cdxfile|dump|gzipdump]" +
  127. " [--offset=#] \\\n[--strict] WARC_FILE|WARC_URL",
  128. options);
  129. System.exit(exitCode);
  130. }
  131. /**
  132. * Write out the arcfile.
  133. *
  134. * @param reader
  135. * @param format Format to use outputting.
  136. * @throws IOException
  137. * @throws java.text.ParseException
  138. */
  139. protected static void output(WARCReader reader, String format)
  140. throws IOException, java.text.ParseException {
  141. if (!reader.output(format)) {
  142. throw new IOException("Unsupported format: " + format);
  143. }
  144. }
  145. /**
  146. * Generate a CDX index file for an ARC file.
  147. *
  148. * @param urlOrPath The ARC file to generate a CDX index for
  149. * @throws IOException
  150. * @throws java.text.ParseException
  151. */
  152. public static void createCDXIndexFile(String urlOrPath)
  153. throws IOException, java.text.ParseException {
  154. WARCReader r = WARCReaderFactory.get(urlOrPath);
  155. r.setStrict(false);
  156. r.setDigest(true);
  157. output(r, CDX_FILE);
  158. }
  159. /**
  160. * Command-line interface to WARCReader.
  161. *
  162. * Here is the command-line interface:
  163. * <pre>
  164. * usage: java org.archive.io.arc.WARCReader [--offset=#] ARCFILE
  165. * -h,--help Prints this message and exits.
  166. * -o,--offset Outputs record at this offset into arc file.</pre>
  167. *
  168. * <p>Outputs using a pseudo-CDX format as described here:
  169. * <a href="http://www.archive.org/web/researcher/cdx_legend.php">CDX
  170. * Legent</a> and here
  171. * <a href="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.
  172. * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
  173. * Hash is hard-coded straight SHA-1 hash of content.
  174. *
  175. * @param args Command-line arguments.
  176. * @throws ParseException Failed parse of the command line.
  177. * @throws IOException
  178. * @throws java.text.ParseException
  179. */
  180. public static void main(String [] args)
  181. throws ParseException, IOException, java.text.ParseException {
  182. Options options = getOptions();
  183. PosixParser parser = new PosixParser();
  184. CommandLine cmdline = parser.parse(options, args, false);
  185. List cmdlineArgs = cmdline.getArgList();
  186. Option [] cmdlineOptions = cmdline.getOptions();
  187. HelpFormatter formatter = new HelpFormatter();
  188. // If no args, print help.
  189. if (cmdlineArgs.size() <= 0) {
  190. usage(formatter, options, 0);
  191. }
  192. // Now look at options passed.
  193. long offset = -1;
  194. boolean digest = false;
  195. boolean strict = false;
  196. String format = CDX;
  197. for (int i = 0; i < cmdlineOptions.length; i++) {
  198. switch(cmdlineOptions[i].getId()) {
  199. case 'h':
  200. usage(formatter, options, 0);
  201. break;
  202. case 'o':
  203. offset =
  204. Long.parseLong(cmdlineOptions[i].getValue());
  205. break;
  206. case 's':
  207. strict = true;
  208. break;
  209. case 'd':
  210. digest = getTrueOrFalse(cmdlineOptions[i].getValue());
  211. break;
  212. case 'f':
  213. format = cmdlineOptions[i].getValue().toLowerCase();
  214. boolean match = false;
  215. // List of supported formats.
  216. final String [] supportedFormats =
  217. {CDX, DUMP, GZIP_DUMP, CDX_FILE};
  218. for (int ii = 0; ii < supportedFormats.length; ii++) {
  219. if (supportedFormats[ii].equals(format)) {
  220. match = true;
  221. break;
  222. }
  223. }
  224. if (!match) {
  225. usage(formatter, options, 1);
  226. }
  227. break;
  228. default:
  229. throw new RuntimeException("Unexpected option: " +
  230. + cmdlineOptions[i].getId());
  231. }
  232. }
  233. if (offset >= 0) {
  234. if (cmdlineArgs.size() != 1) {
  235. System.out.println("Error: Pass one arcfile only.");
  236. usage(formatter, options, 1);
  237. }
  238. WARCReader r = WARCReaderFactory.get(
  239. new File((String)cmdlineArgs.get(0)), offset);
  240. r.setStrict(strict);
  241. outputRecord(r, format);
  242. } else {
  243. for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {
  244. String urlOrPath = (String)i.next();
  245. try {
  246. WARCReader r = WARCReaderFactory.get(urlOrPath);
  247. r.setStrict(strict);
  248. r.setDigest(digest);
  249. output(r, format);
  250. } catch (RuntimeException e) {
  251. // Write out name of file we failed on to help with
  252. // debugging. Then print stack trace and try to keep
  253. // going. We do this for case where we're being fed
  254. // a bunch of ARCs; just note the bad one and move
  255. // on to the next.
  256. System.err.println("Exception processing " + urlOrPath +
  257. ": " + e.getMessage());
  258. e.printStackTrace(System.err);
  259. System.exit(1);
  260. }
  261. }
  262. }
  263. }
  264. }