WARCWriterProcessor.java

/projects/heritrix-1.14.4/src/java/org/archive/crawler/writer/WARCWriterProcessor.java

https://gitlab.com/essere.lab.public/qualitas.class-corpus · Java · 682 lines · 490 code · 51 blank · 141 comment · 66 complexity · 80256c1532d1bb184f13e08cbbc79cca MD5 · raw file

/* $Id: ExperimentalWARCWriterProcessor.java 4935 2007-02-23 00:27:24Z gojomo $
 *
 * Created on August 1st, 2006.
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.writer;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpMethodBase;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.lang.StringUtils;
import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.framework.WriterPoolProcessor;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.io.WriterPoolSettings;
import org.archive.io.warc.WARCConstants;
import org.archive.io.warc.WARCWriter;
import org.archive.io.warc.WARCWriterPool;
import org.archive.uid.GeneratorFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.XmlUtils;
import org.archive.util.anvl.ANVLRecord;
import org.w3c.dom.Document;

/**
 * WARCWriterProcessor.
 * Writes records targeting the 0.18 version of the WARC specification
 * (which is functionally identical to 0.17 except in the protocol
 * identifier string).
 * See http://archive-access.sourceforge.net/warc/
 * 
 * <p>TODO: Remove ANVLRecord. Rename NameValue or use RFC822
 * (commons-httpclient?) or find something else.
 * 
 * @author stack
 */
public class WARCWriterProcessor extends WriterPoolProcessor
implements CoreAttributeConstants, CrawlStatusListener,
WriterPoolSettings, FetchStatusCodes, WARCConstants {
    /** Serialization version identifier for this class. */
    private static final long serialVersionUID = 6182850087635847443L;

    /** Per-instance logger, named after the runtime class of this instance. */
    private final Logger logger = Logger.getLogger(this.getClass().getName());
    
    /**
     * Returns the default maximum size of a single WARC file.
     *
     * @return 1,000,000,000 bytes, i.e. one SI gigabyte (10^9 bytes), per
     * WARC appendix A
     */
    public long getDefaultMaxFileSize() {
        // Spelled as a product so the power of ten is obvious at a glance.
        final long oneSiGigabyte = 1000L * 1000L * 1000L;
        return oneSiGigabyte;
    }
    
    /**
     * Settings key: whether to write 'request' type records where possible.
     * Registered in the constructor as a boolean setting.
     */
    public static final String ATTR_WRITE_REQUESTS =
        "write-requests";
    
    /**
     * Settings key: whether to write 'metadata' type records where possible.
     * Registered in the constructor as a boolean setting.
     */
    public static final String ATTR_WRITE_METADATA =
        "write-metadata";
    
    /**
     * Settings key: whether to write 'revisit' type records when a URI's
     * history indicates the previous fetch had an identical content digest.
     */
    public static final String ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS =
        "write-revisit-for-identical-digests";
    
    /**
     * Settings key: whether to write 'revisit' type records for server
     * "304 not modified" responses.
     */
    public static final String ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED =
        "write-revisit-for-not-modified";
    
    /**
     * Default path list. Kept private and never handed out directly, because
     * array contents are mutable even when the reference is final.
     */
    private static final String [] DEFAULT_PATH = {"warcs"};

    /**
     * Returns the default list of output paths for this writer.
     *
     * @return a fresh copy of the default path list (currently the single
     * entry {@code "warcs"}); callers may modify the returned array without
     * affecting the defaults seen by other callers
     */
    protected String [] getDefaultPath() {
        // Defensive copy: returning the shared static array would let any
        // caller alter the default for every instance.
        return DEFAULT_PATH.clone();
    }
    
    /**
     * Creates the processor and registers its four boolean settings
     * (write requests, write metadata, write revisit records for identical
     * digests, write revisit records for 304 responses). Each setting
     * defaults to true and is marked overrideable and expert.
     *
     * @param name Name of this writer.
     */
    public WARCWriterProcessor(final String name) {
        super(name, "Experimental WARCWriter processor (Version 0.17)");
        addExpertBooleanSetting(ATTR_WRITE_REQUESTS,
                "Whether to write 'request' type records. " +
                "Default is true.");
        addExpertBooleanSetting(ATTR_WRITE_METADATA,
                "Whether to write 'metadata' type records. " +
                "Default is true.");
        addExpertBooleanSetting(ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS,
                "Whether to write 'revisit' type records when a URI's " +
                "history indicates the previous fetch had an identical " +
                "content digest. " +
                "Default is true.");
        addExpertBooleanSetting(ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED,
                "Whether to write 'revisit' type records when a " +
                "304-Not Modified response is received. " +
                "Default is true.");
    }

    /**
     * Registers one boolean setting that defaults to true and flags it as
     * overrideable and expert.
     *
     * @param key settings key to register
     * @param description human-readable description of the setting
     */
    private void addExpertBooleanSetting(final String key,
            final String description) {
        // Boolean.TRUE instead of new Boolean(true): same value, no needless
        // allocation, and no use of the deprecated constructor.
        final Type setting = addElementToDefinition(
                new SimpleType(key, description, Boolean.TRUE));
        setting.setOverrideable(true);
        setting.setExpertSetting(true);
    }

    /**
     * Installs a new {@link WARCWriterPool} as this processor's writer pool,
     * using this processor as the pool settings and the configured maximum
     * active/wait values.
     *
     * @param serialNo serial number counter handed to the pool
     */
    protected void setupPool(final AtomicInteger serialNo) {
        final WARCWriterPool warcPool = new WARCWriterPool(
                serialNo, this, getPoolMaximumActive(), getPoolMaximumWait());
        setPool(warcPool);
    }
    
    /**
     * Writes a CrawlURI and its associated data to the store file.
     *
     * Nothing is written when the fetch status is not positive, or when no
     * content was recorded (FTP excepted, since its "headers" do not count
     * as content). An IOException during the write is recorded on the
     * CrawlURI as a localized error and logged at SEVERE; it is not rethrown.
     *
     * @param curi CrawlURI to process.
     */
    protected void innerProcess(CrawlURI curi) {
        // Failure, or resource not fetched yet: nothing to write.
        final boolean fetched = curi.getFetchStatus() > 0;
        if (!fetched) {
            return;
        }

        final String scheme = curi.getUURI().getScheme().toLowerCase();
        final long recordLength = curi.getContentSize();
        // getContentSize() should be > 0 if any material (even just HTTP
        // headers with a zero-length body) is available; FTP is allowed to
        // have empty content.
        if (!(recordLength > 0 || scheme.equals("ftp"))) {
            return;
        }

        try {
            if (!shouldWrite(curi)) {
                logger.info("This writer does not write out scheme " +
                        scheme + " content");
                return;
            }
            write(scheme, curi);
        } catch (IOException ioe) {
            curi.addLocalizedError(this.getName(), ioe, "WriteRecord: " +
                curi.toString());
            logger.log(Level.SEVERE, "Failed write of Record: " +
                curi.toString(), ioe);
        }
    }
    
    /**
     * Borrows a writer from the pool and writes the WARC record(s) for the
     * given URI, dispatching on scheme: anything starting with "http", "dns",
     * or "ftp". Other schemes are logged as a warning and write nothing.
     *
     * <p>The initial position read and the file-size check run inside the
     * guarded region, so that an IOException raised while checking or rolling
     * the file invalidates the borrowed writer instead of leaving it checked
     * out of the pool forever.
     *
     * @param lowerCaseScheme lower-cased scheme of the URI being written
     * @param curi CrawlURI whose records are written
     * @throws IOException if borrowing, size-checking or writing fails; on a
     * failure after the borrow, the writer's file is invalidated first
     */
    protected void write(final String lowerCaseScheme, final CrawlURI curi)
    throws IOException {
        logger.info("writing warc record for " + curi);
        WriterPoolMember writer = getPool().borrowFile();
        long position = 0;
        try {
            position = writer.getPosition();
            // See if we need to open a new file because we've exceeded
            // maxBytes. The call to checkSize will open a new file if we're
            // at the maximum for the current file.
            writer.checkSize();
            if (writer.getPosition() != position) {
                // We just closed the file because it was larger than maxBytes.
                // Add to the totalBytesWritten the size of the first record
                // in the file, if any.
                setTotalBytesWritten(getTotalBytesWritten() +
                    (writer.getPosition() - position));
                position = writer.getPosition();
            }

            WARCWriter w = (WARCWriter)writer;
            // Write a request, response, and metadata all in the one
            // 'transaction'.
            final URI baseid = getRecordID();
            final String timestamp =
                ArchiveUtils.getLog14Date(curi.getLong(A_FETCH_BEGAN_TIME));
            if (lowerCaseScheme.startsWith("http")) {
                writeHttpRecords(w, curi, baseid, timestamp);
            } else if (lowerCaseScheme.equals("dns")) {
                writeDnsRecords(w, curi, baseid, timestamp);
            } else if (lowerCaseScheme.equals("ftp")) {
                writeFtpRecords(w, curi, baseid, timestamp);
            } else {
                logger.warning("No handler for scheme " + lowerCaseScheme);
            }
        } catch (IOException e) {
            // Invalidate this file (It gets a '.invalid' suffix).
            getPool().invalidateFile(writer);
            // Set the writer to null otherwise the pool accounting
            // of how many active writers gets skewed if we subsequently
            // do a returnFile call on this object in the finally block.
            writer = null;
            throw e;
        } finally {
            if (writer != null) {
                setTotalBytesWritten(getTotalBytesWritten() +
                     (writer.getPosition() - position));
                getPool().returnFile(writer);
            }
        }
        checkBytesWritten();
    }

    /**
     * Writes the records for one FTP fetch: a metadata record holding the
     * control conversation, then (when a recorder is present) either a
     * 'revisit' record or a 'resource' record for the payload, and finally an
     * optional metadata record concurrent to the last record written.
     *
     * @param w writer to write the records with
     * @param curi CrawlURI that was fetched
     * @param baseid base record id for this group of records
     * @param timestamp record timestamp
     * @throws IOException if any record write fails
     */
    private void writeFtpRecords(WARCWriter w, final CrawlURI curi, final URI baseid,
            final String timestamp) throws IOException {
        final ANVLRecord controlHeaders = new ANVLRecord(3);
        controlHeaders.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));
        final String conversation = curi.getString(A_FTP_CONTROL_CONVERSATION);
        URI lastId = writeFtpControlConversation(
                w, timestamp, baseid, curi, controlHeaders, conversation);

        // Added after the control conversation is written; these headers are
        // only reused below for the revisit record.
        if (curi.getContentDigest() != null) {
            controlHeaders.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
                curi.getContentDigestSchemeString());
        }

        if (curi.getHttpRecorder() != null) {
            final boolean revisit =
                IdenticalDigestDecideRule.hasIdenticalDigest(curi)
                && ((Boolean) getUncheckedAttribute(curi,
                        ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS)).booleanValue();
            if (revisit) {
                // null mimetype: no payload is written with the revisit.
                lastId = writeRevisitDigest(w, timestamp, null,
                        baseid, curi, controlHeaders);
            } else {
                final ANVLRecord resourceHeaders = new ANVLRecord(3);
                if (curi.isTruncatedFetch()) {
                    final String reason;
                    if (curi.isTimeTruncatedFetch()) {
                        reason = NAMED_FIELD_TRUNCATED_VALUE_TIME;
                    } else if (curi.isLengthTruncatedFetch()) {
                        reason = NAMED_FIELD_TRUNCATED_VALUE_LENGTH;
                    } else if (curi.isHeaderTruncatedFetch()) {
                        reason = NAMED_FIELD_TRUNCATED_VALUE_HEAD;
                    } else {
                        // TODO: Add this to spec.
                        reason = TRUNCATED_VALUE_UNSPECIFIED;
                    }
                    resourceHeaders.addLabelValue(HEADER_KEY_TRUNCATED, reason);
                }
                if (curi.getContentDigest() != null) {
                    resourceHeaders.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
                            curi.getContentDigestSchemeString());
                }
                resourceHeaders.addLabelValue(HEADER_KEY_CONCURRENT_TO,
                        '<' + lastId.toString() + '>');
                lastId = writeResource(w, timestamp, curi.getContentType(),
                        baseid, curi, resourceHeaders);
            }
        }

        final boolean wantMetadata =
            ((Boolean) getUncheckedAttribute(curi, ATTR_WRITE_METADATA))
                .booleanValue();
        if (wantMetadata) {
            final ANVLRecord metadataHeaders = new ANVLRecord(1);
            metadataHeaders.addLabelValue(HEADER_KEY_CONCURRENT_TO,
                    '<' + lastId.toString() + '>');
            writeMetadata(w, timestamp, baseid, curi, metadataHeaders);
        }
    }

    /**
     * Writes a single 'response' record for a DNS lookup. The DNS server IP
     * is attached as a named field when the CrawlURI carries a non-empty one;
     * otherwise no named fields are passed.
     *
     * @param w writer to write the record with
     * @param curi CrawlURI that was fetched
     * @param baseid record id for the response record
     * @param timestamp record timestamp
     * @throws IOException if the record write fails
     */
    private void writeDnsRecords(WARCWriter w, final CrawlURI curi, final URI baseid,
            final String timestamp) throws IOException {
        final String serverIp = curi.getString(A_DNS_SERVER_IP_LABEL);
        final boolean noServerIp = serverIp == null || serverIp.length() == 0;
        ANVLRecord namedFields = null;
        if (!noServerIp) {
            namedFields = new ANVLRecord(1);
            namedFields.addLabelValue(HEADER_KEY_IP, serverIp);
        }
        writeResponse(w, timestamp, curi.getContentType(), baseid,
            curi, namedFields);
    }

    /**
     * Writes the records for one HTTP(S) fetch. The primary record is a
     * 'revisit' (identical digest, or 304 Not Modified, each subject to its
     * setting) or otherwise a 'response'. Optional 'request' and 'metadata'
     * records follow, both marked concurrent to the primary record.
     *
     * @param w writer to write the records with
     * @param curi CrawlURI that was fetched
     * @param baseid base record id for this group of records
     * @param timestamp record timestamp
     * @throws IOException if any record write fails
     */
    private void writeHttpRecords(WARCWriter w, final CrawlURI curi, final URI baseid,
            final String timestamp) throws IOException {
        // Named fields for checksum and ip on the primary record.
        // TODO: Use other than ANVL (or rename ANVL as NameValue or
        // use RFC822 (commons-httpclient?).
        final ANVLRecord primaryFields = new ANVLRecord(5);
        if (curi.getContentDigest() != null) {
            primaryFields.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
                curi.getContentDigestSchemeString());
        }
        primaryFields.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));

        final URI primaryId;
        if (IdenticalDigestDecideRule.hasIdenticalDigest(curi)
                && ((Boolean) getUncheckedAttribute(curi,
                        ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS)).booleanValue()) {
            primaryId = writeRevisitDigest(w, timestamp,
                    HTTP_RESPONSE_MIMETYPE, baseid, curi, primaryFields);
        } else if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED
                && ((Boolean) getUncheckedAttribute(curi,
                        ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED)).booleanValue()) {
            primaryId = writeRevisitNotModified(w, timestamp,
                    baseid, curi, primaryFields);
        } else {
            if (curi.isTruncatedFetch()) {
                final String reason;
                if (curi.isTimeTruncatedFetch()) {
                    reason = NAMED_FIELD_TRUNCATED_VALUE_TIME;
                } else if (curi.isLengthTruncatedFetch()) {
                    reason = NAMED_FIELD_TRUNCATED_VALUE_LENGTH;
                } else if (curi.isHeaderTruncatedFetch()) {
                    reason = NAMED_FIELD_TRUNCATED_VALUE_HEAD;
                } else {
                    // TODO: Add this to spec.
                    reason = TRUNCATED_VALUE_UNSPECIFIED;
                }
                primaryFields.addLabelValue(HEADER_KEY_TRUNCATED, reason);
            }
            primaryId = writeResponse(w, timestamp, HTTP_RESPONSE_MIMETYPE,
                    baseid, curi, primaryFields);
        }

        // One shared set of fields relates request and metadata back to the
        // primary record.
        final ANVLRecord relatedFields = new ANVLRecord(1);
        relatedFields.addLabelValue(HEADER_KEY_CONCURRENT_TO,
            '<' + primaryId.toString() + '>');

        final Boolean wantRequests =
            (Boolean) getUncheckedAttribute(curi, ATTR_WRITE_REQUESTS);
        if (wantRequests.booleanValue()) {
            writeRequest(w, timestamp, HTTP_REQUEST_MIMETYPE,
                    baseid, curi, relatedFields);
        }
        final Boolean wantMetadata =
            (Boolean) getUncheckedAttribute(curi, ATTR_WRITE_METADATA);
        if (wantMetadata.booleanValue()) {
            writeMetadata(w, timestamp, baseid, curi, relatedFields);
        }
    }
    
    /**
     * Writes the FTP control conversation as a 'metadata' record whose body
     * is the conversation text encoded as UTF-8.
     *
     * @param w writer to write the record with
     * @param timestamp record timestamp
     * @param baseid base record id; the written record uses a metadata-qualified id
     * @param curi CrawlURI that was fetched
     * @param headers named fields to attach to the record
     * @param controlConversation text of the control conversation
     * @return the qualified id of the record written
     * @throws IOException if the record write fails
     */
    protected URI writeFtpControlConversation(WARCWriter w, String timestamp, URI baseid,
            CrawlURI curi, ANVLRecord headers, String controlConversation) 
    throws IOException {
        final URI recordId = qualifyRecordID(baseid, TYPE, METADATA);
        final byte[] payload = controlConversation.getBytes("UTF-8");
        final ByteArrayInputStream body = new ByteArrayInputStream(payload);
        w.writeMetadataRecord(curi.toString(), timestamp,
            FTP_CONTROL_CONVERSATION_MIMETYPE, recordId, headers, body,
            payload.length);
        return recordId;
    }

    /**
     * Writes a 'request' record from the recorded output of the CrawlURI's
     * recorder, under a request-qualified record id.
     *
     * @param w writer to write the record with
     * @param timestamp record timestamp
     * @param mimetype content type of the record
     * @param baseid base record id to qualify
     * @param curi CrawlURI that was fetched
     * @param namedFields named fields to attach to the record
     * @return the qualified id of the record written
     * @throws IOException if the record write or the stream close fails
     */
    protected URI writeRequest(final WARCWriter w,
            final String timestamp, final String mimetype,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields) 
    throws IOException {
        final URI requestId = qualifyRecordID(baseid, TYPE, REQUEST);
        final ReplayInputStream replay =
            curi.getHttpRecorder().getRecordedOutput().getReplayInputStream();
        try {
            final long length =
                curi.getHttpRecorder().getRecordedOutput().getSize();
            w.writeRequestRecord(curi.toString(), timestamp, mimetype,
                requestId, namedFields, replay, length);
        } finally {
            // Always release the replay stream, even when the write fails.
            if (replay != null) {
                replay.close();
            }
        }
        return requestId;
    }
    
    /**
     * Writes a 'response' record from the recorded input of the CrawlURI's
     * recorder, using the base record id unqualified.
     *
     * @param w writer to write the record with
     * @param timestamp record timestamp
     * @param mimetype content type of the record
     * @param baseid record id to write the record under
     * @param curi CrawlURI that was fetched
     * @param namedFields named fields to attach to the record (may be null,
     * as the DNS path passes null when it has no fields)
     * @return {@code baseid}
     * @throws IOException if the record write or the stream close fails
     */
    protected URI writeResponse(final WARCWriter w,
            final String timestamp, final String mimetype,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields) 
    throws IOException {
        final ReplayInputStream replay =
            curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
        try {
            final long length =
                curi.getHttpRecorder().getRecordedInput().getSize();
            w.writeResponseRecord(curi.toString(), timestamp, mimetype,
                baseid, namedFields, replay, length);
        } finally {
            // Always release the replay stream, even when the write fails.
            if (replay != null) {
                replay.close();
            }
        }
        return baseid;
    }
    
    /**
     * Writes a 'resource' record from the recorded input of the CrawlURI's
     * recorder, using the base record id unqualified.
     *
     * @param w writer to write the record with
     * @param timestamp record timestamp
     * @param mimetype content type of the record
     * @param baseid record id to write the record under
     * @param curi CrawlURI that was fetched
     * @param namedFields named fields to attach to the record
     * @return {@code baseid}
     * @throws IOException if the record write or the stream close fails
     */
    protected URI writeResource(final WARCWriter w,
            final String timestamp, final String mimetype,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields) 
    throws IOException {
        final ReplayInputStream replay =
            curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
        try {
            final long length =
                curi.getHttpRecorder().getRecordedInput().getSize();
            w.writeResourceRecord(curi.toString(), timestamp, mimetype,
                baseid, namedFields, replay, length);
        } finally {
            // Always release the replay stream, even when the write fails.
            if (replay != null) {
                replay.close();
            }
        }
        return baseid;
    }
    
    /**
     * Writes a 'revisit' record with the identical-digest profile. The record
     * is flagged as length-truncated. With a non-null mimetype the recorded
     * input is written up to the content begin offset (or in full when that
     * offset is not positive); with a null mimetype no payload is written.
     * The CrawlURI is annotated with {@code warcRevisit:digest} afterwards.
     *
     * @param w writer to write the record with
     * @param timestamp record timestamp
     * @param mimetype content type of the record, or null for no payload
     * @param baseid record id to write the record under
     * @param curi CrawlURI that was fetched
     * @param namedFields named fields for the record; profile and truncation
     * fields are added to it by this method
     * @return {@code baseid}
     * @throws IOException if the record write or the stream close fails
     */
    protected URI writeRevisitDigest(final WARCWriter w,
            final String timestamp, final String mimetype,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields) 
    throws IOException {
        namedFields.addLabelValue(
                HEADER_KEY_PROFILE, PROFILE_REVISIT_IDENTICAL_DIGEST);
        namedFields.addLabelValue(
                HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH);

        long lengthToWrite = 0;
        ReplayInputStream replay = null;
        final boolean hasPayload = mimetype != null;
        if (hasPayload) {
            replay = curi.getHttpRecorder().getRecordedInput()
                .getReplayInputStream();
            final long contentBegin =
                curi.getHttpRecorder().getRecordedInput().getContentBegin();
            if (contentBegin > 0) {
                lengthToWrite = contentBegin;
            } else {
                lengthToWrite =
                    curi.getHttpRecorder().getRecordedInput().getSize();
            }
        }

        try {
            w.writeRevisitRecord(curi.toString(), timestamp, mimetype, baseid,
                namedFields, replay, lengthToWrite);
        } finally {
            if (replay != null) {
                replay.close();
            }
        }
        curi.addAnnotation("warcRevisit:digest");
        return baseid;
    }
    
    /**
     * Writes a zero-length 'revisit' record with the not-modified profile.
     * When the HTTP transaction is available, its ETag and Last-Modified
     * response headers are copied into the named fields. The CrawlURI is
     * annotated with {@code warcRevisit:notModified} afterwards.
     *
     * @param w writer to write the record with
     * @param timestamp record timestamp
     * @param baseid record id to write the record under
     * @param curi CrawlURI that was fetched
     * @param namedFields named fields for the record; profile, header and
     * truncation fields are added to it by this method
     * @return {@code baseid}
     * @throws IOException if the record write or the stream close fails
     */
    protected URI writeRevisitNotModified(final WARCWriter w,
            final String timestamp, 
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields) 
    throws IOException {
        namedFields.addLabelValue(
                HEADER_KEY_PROFILE, PROFILE_REVISIT_NOT_MODIFIED);

        // Save just enough context to understand the basis of not-modified.
        final boolean haveTransaction = curi.containsKey(A_HTTP_TRANSACTION);
        if (haveTransaction) {
            final HttpMethodBase transaction =
                (HttpMethodBase) curi.getObject(A_HTTP_TRANSACTION);
            saveHeader(A_ETAG_HEADER, transaction, namedFields,
                    HEADER_KEY_ETAG);
            saveHeader(A_LAST_MODIFIED_HEADER, transaction, namedFields,
                    HEADER_KEY_LAST_MODIFIED);
        }

        // Truncate to zero-length (all necessary info is above).
        namedFields.addLabelValue(HEADER_KEY_TRUNCATED,
            NAMED_FIELD_TRUNCATED_VALUE_LENGTH);

        final ReplayInputStream replay =
            curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
        try {
            w.writeRevisitRecord(curi.toString(), timestamp, null, baseid,
                namedFields, replay, 0);
        } finally {
            if (replay != null) {
                replay.close();
            }
        }
        curi.addAnnotation("warcRevisit:notModified");
        return baseid;
    }
    
    /**
     * Copies one response header from the given HTTP operation into the
     * provided named fields under a new name. Does nothing when the
     * operation has no such response header.
     *
     * @param origName response header name to look up
     * @param method http operation containing the response headers
     * @param headers named fields to add the value to
     * @param newName label to store the header value under
     */
    protected void saveHeader(String origName, HttpMethodBase method, 
            ANVLRecord headers, String newName) {
        final Header found = method.getResponseHeader(origName);
        if (found == null) {
            return;
        }
        headers.addLabelValue(newName, found.getValue());
    }

	/**
	 * Writes a 'metadata' record describing the fetch: seed status or
	 * via/hops/source-tag information, fetch duration, FTP fetch status and
	 * outlinks, serialized as an ANVL record under a metadata-qualified id.
	 *
	 * @param w writer to write the record with
	 * @param timestamp record timestamp
	 * @param baseid base record id to qualify
	 * @param curi CrawlURI that was fetched
	 * @param namedFields named fields to attach to the record
	 * @return the qualified id of the record written
	 * @throws IOException if the record write fails
	 */
	protected URI writeMetadata(final WARCWriter w,
            final String timestamp,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields) 
    throws IOException {
        final URI metadataId = qualifyRecordID(baseid, TYPE, METADATA);
        // Get some metadata from the curi.
        // TODO: Get all curi metadata.
        // TODO: Use other than ANVL (or rename ANVL as NameValue or use
        // RFC822 (commons-httpclient?).
        final ANVLRecord body = new ANVLRecord();
        if (!curi.isSeed()) {
            if (curi.forceFetch()) {
                body.addLabel("force-fetch");
            }
            body.addLabelValue("via", curi.flattenVia());
            body.addLabelValue("hopsFromSeed", curi.getPathFromSeed());
            if (curi.containsKey(A_SOURCE_TAG)) {
                body.addLabelValue("sourceTag", curi.getString(A_SOURCE_TAG));
            }
        } else {
            body.addLabel("seed");
        }

        final long fetchMillis = curi.getFetchDuration();
        if (fetchMillis >= 0) {
            body.addLabelValue("fetchTimeMs", String.valueOf(fetchMillis));
        }

        if (curi.containsKey(A_FTP_FETCH_STATUS)) {
            body.addLabelValue("ftpFetchStatus",
                curi.getString(A_FTP_FETCH_STATUS));
        }

        // Add outlinks though they are effectively useless without anchor text.
        final Collection<Link> outlinks = curi.getOutLinks();
        if (outlinks != null) {
            for (Link outlink : outlinks) {
                body.addLabelValue("outlink", outlink.toString());
            }
        }

        // TODO: Other curi fields to write to metadata.
        //
        // Credentials
        //
        // fetch-began-time: 1154569278774
        // fetch-completed-time: 1154569281816
        //
        // Annotations.

        final byte[] serialized = body.getUTF8Bytes();
        w.writeMetadataRecord(curi.toString(), timestamp, ANVLRecord.MIMETYPE,
            metadataId, namedFields, new ByteArrayInputStream(serialized),
            serialized.length);
        return metadataId;
    }
    
    /**
     * Obtains a new record id from the generator factory.
     *
     * @return the generated record id
     * @throws IOException if the generator produces a syntactically invalid
     * URI; the original {@link URISyntaxException} is attached as the cause
     */
    protected URI getRecordID() throws IOException {
        try {
            return GeneratorFactory.getFactory().getRecordID();
        } catch (URISyntaxException e) {
            // Keep the message callers already see, but preserve the cause so
            // the original stack trace is not lost. initCause is used rather
            // than the two-argument constructor to stay compatible with
            // older Java runtimes.
            final IOException wrapped = new IOException(e.toString());
            wrapped.initCause(e);
            throw wrapped;
        }
    }
    
    /**
     * Derives a qualified record id from a base id by applying a single
     * key/value qualifier through the generator factory.
     *
     * @param base record id to qualify
     * @param key qualifier name
     * @param value qualifier value
     * @return the qualified record id
     * @throws IOException if qualification produces a syntactically invalid
     * URI; the original {@link URISyntaxException} is attached as the cause
     */
    protected URI qualifyRecordID(final URI base, final String key,
            final String value)
    throws IOException {
        final Map<String, String> qualifiers = new HashMap<String, String>(1);
        qualifiers.put(key, value);
        try {
            return GeneratorFactory.getFactory().
                qualifyRecordID(base, qualifiers);
        } catch (URISyntaxException e) {
            // Keep the message callers already see, but preserve the cause so
            // the original stack trace is not lost. initCause is used rather
            // than the two-argument constructor to stay compatible with
            // older Java runtimes.
            final IOException wrapped = new IOException(e.toString());
            wrapped.initCause(e);
            throw wrapped;
        }
    }  
    
    /**
     * Returns the first-record stylesheet name for this writer.
     * NOTE(review): presumably resolved and applied by the superclass when
     * building the first record of each file; confirm against
     * WriterPoolProcessor.
     *
     * @return the fixed value {@code /warcinfobody.xsl}
     */
    @Override
    protected String getFirstrecordStylesheet() {
        return "/warcinfobody.xsl";
    }

    /**
     * Return relevant values as header-like fields (here ANVLRecord, but 
     * spec-defined "application/warc-fields" type when written). Field
     * names are taken from DCMI Terms and the WARC/0.17 specification.
     *
     * <p>Failures to resolve the local host or to read the order file are
     * logged at WARNING and the corresponding fields are simply omitted.
     *
     * @param orderFile crawl order file (order.xml) to read job metadata from
     * @return the collected fields rendered as a string
     * @see org.archive.crawler.framework.WriterPoolProcessor#getFirstrecordBody(java.io.File)
     */
    @Override
    protected String getFirstrecordBody(File orderFile) {
        ANVLRecord record = new ANVLRecord(7);
        record.addLabelValue("software", "Heritrix/" +
                Heritrix.getVersion() + " http://crawler.archive.org");
        try {
            InetAddress host = InetAddress.getLocalHost();
            record.addLabelValue("ip", host.getHostAddress());
            record.addLabelValue("hostname", host.getCanonicalHostName());
        } catch (UnknownHostException e) {
            logger.log(Level.WARNING,"unable to obtain local crawl engine host",e);
        }

        // conforms to ISO 28500:2009 as of May 2009
        // as described at http://bibnum.bnf.fr/WARC/ 
        // latest draft as of November 2008
        record.addLabelValue("format","WARC File Format 1.0"); 
        record.addLabelValue("conformsTo","http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");

        // Get other values from order.xml 
        try {
            Document doc = XmlUtils.getDocument(orderFile);
            addIfNotBlank(record,"operator",
                    XmlUtils.xpathOrNull(doc,"//meta/operator"));
            addIfNotBlank(record,"publisher",
                    XmlUtils.xpathOrNull(doc,"//meta/organization"));
            addIfNotBlank(record,"audience",
                    XmlUtils.xpathOrNull(doc,"//meta/audience"));
            addIfNotBlank(record,"isPartOf",
                    XmlUtils.xpathOrNull(doc,"//meta/name"));

            // The "created" field (formerly derived from //meta/date) is
            // deliberately not written, per HER-1634. Though it's
            // theoretically useful as a means of distinguishing one crawl
            // from another, the current usage/specification is too vague...
            // in particular a 'created' field in the 'warcinfo' is
            // reasonable to interpret as applying to the WARC-unit, rather
            // than the crawl-job-unit, so it was removed until someone
            // complains or makes a case for restoring it in a less-ambiguous
            // manner.

            addIfNotBlank(record,"description",
                    XmlUtils.xpathOrNull(doc,"//meta/description"));
            addIfNotBlank(record,"robots",
                    XmlUtils.xpathOrNull(doc, 
                            "//newObject[@name='robots-honoring-policy']/string[@name='type']"));
            addIfNotBlank(record,"http-header-user-agent",
                    XmlUtils.xpathOrNull(doc, 
                            "//map[@name='http-headers']/string[@name='user-agent']"));
            addIfNotBlank(record,"http-header-from",
                    XmlUtils.xpathOrNull(doc, 
                            "//map[@name='http-headers']/string[@name='from']"));
        } catch (IOException e) {
            logger.log(Level.WARNING,"obtaining warcinfo",e);
        } 
        // really ugly to return as string, when it may just be merged with 
        // a couple other fields at write time, but changing would require 
        // larger refactoring
        return record.toString();
    }


    /**
     * Adds a label/value pair to the record, skipping values that
     * {@code StringUtils.isNotBlank} rejects.
     *
     * @param record record to add the field to
     * @param label field label
     * @param value field value; the field is skipped when this is blank
     */
    protected void addIfNotBlank(ANVLRecord record, String label, String value) {
        final boolean usable = StringUtils.isNotBlank(value);
        if (!usable) {
            return;
        }
        record.addLabelValue(label, value);
    }
}
Tech Fingerprint

Alerts (15)

'=' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
80 256 273 318 608
Complexity hotspot; lines 266 to 267 (total complexity: 4)
266 267
Complexity hotspot; line 331 (total complexity: 4)
331
'.close()' Manual .close() call detected. Prefer using try-with-resources (Java 7+) for automatic and safer resource management, especially handling exceptions during close.
389 408 427 460 492
Complexity hotspot; lines 548 to 549 (total complexity: 4)
548 549