Atom_0_3_Parser.java

/projects/informa-0.7.0-alpha2/src/de/nava/informa/parsers/Atom_0_3_Parser.java

https://gitlab.com/essere.lab.public/qualitas.class-corpus
Java | 374 lines | 199 code | 79 blank | 96 comment | 56 complexity | 85647e9beabdf53ba1abdbef7ad1c855 MD5 | raw file

//
// Informa -- RSS Library for Java
// Copyright (c) 2002 by Niko Schmuck
//
// Niko Schmuck
// http://sourceforge.net/projects/informa
// mailto:niko_schmuck@users.sourceforge.net
//
// This library is free software.
//
// You may redistribute it and/or modify it under the terms of the GNU
// Lesser General Public License as published by the Free Software Foundation.
//
// Version 2.1 of the license should be included with this distribution in
// the file LICENSE. If the license is not included with this distribution,
// you may find a copy at the FSF web site at 'www.gnu.org' or 'www.fsf.org',
// or you may write to the Free Software Foundation, 675 Mass Ave, Cambridge,
// MA 02139 USA.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//

// $Id: Atom_0_3_Parser.java,v 1.15 2007/01/06 22:18:13 niko_schmuck Exp $
package de.nava.informa.parsers;

import java.net.URL;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jdom.Element;
import org.jdom.Namespace;

import de.nava.informa.core.ChannelBuilderIF;
import de.nava.informa.core.ChannelFormat;
import de.nava.informa.core.ChannelIF;
import de.nava.informa.core.ChannelParserIF;
import de.nava.informa.core.ItemIF;
import de.nava.informa.core.ParseException;
import de.nava.informa.utils.AtomParserUtils;
import de.nava.informa.utils.ParserUtils;


/**
 * Parser which reads in document instances according to the Atom 0.3
 * specification and generates a news channel object. Currently the
 * support for the atom syntax is not complete.
 *
 * <p>The class declares no instance fields and is exposed as a singleton
 * through {@link #getInstance()}.</p>
 *
 * @author Niko Schmuck
 */
class Atom_0_3_Parser implements ChannelParserIF {
  public static final Log LOGGER = LogFactory.getLog(Atom_0_3_Parser.class);

  /**
   * Private constructor suppresses generation of a (public) default constructor.
   */
  private Atom_0_3_Parser() {}

  /**
   * Holder of the Atom_0_3_Parser instance. The instance is created when
   * this nested class is initialized, i.e. on the first call to
   * {@link Atom_0_3_Parser#getInstance()}.
   */
  private static class Atom_0_3_ParserHolder {
    private static final Atom_0_3_Parser instance = new Atom_0_3_Parser();
  }

  /**
   * Get the Atom_0_3_Parser instance.
   *
   * @return the shared parser instance.
   */
  public static Atom_0_3_Parser getInstance() {
    return Atom_0_3_ParserHolder.instance;
  }

  /**
   * Returns the value of the element, decoded by {@link AtomParserUtils}
   * according to the element's <code>mode</code> attribute.
   *
   * @param elt element to read; must not be <code>null</code>.
   *
   * @return value of the element.
   */
  static String getValue(Element elt) {
    return AtomParserUtils.getValue(elt, elt.getAttributeValue("mode"));
  }

  /**
   * Returns the content from content element.
   *
   * @param elt content element, may be <code>null</code>.
   *
   * @return empty string if the element is <code>null</code>; otherwise the
   *         element value, passed through {@link ParserUtils#escape} when
   *         the content type is 'text/plain'.
   */
  static String getContent(Element elt) {
    if (elt == null) {
      return "";
    }

    String value = getValue(elt);

    if ("text/plain".equals(getContentType(elt))) {
      value = ParserUtils.escape(value);
    }

    return value;
  }

  /** Returns the content type of element. Default is 'text/plain' according to Atom draft 0.3. */
  private static String getContentType(Element elt) {
    String type = elt.getAttributeValue("type");

    return (type == null) ? "text/plain" : type;
  }

  /**
   * Returns copyright from element. The copyright element is read with the
   * same rules as a title element.
   *
   * @param elt copyright element, may be <code>null</code>.
   *
   * @return copyright text, or empty string if the element is <code>null</code>.
   */
  static String getCopyright(Element elt) {
    return getTitle(elt);
  }

  /**
   * Looks for "content" elements and takes first from them or looks for "summary" element if
   * "content" not found.
   *
   * @param item      item element.
   * @param namespace namespace.
   *
   * @return description for item, or empty string if neither element is present.
   */
  public static String getDescription(Element item, Namespace namespace) {
    List contents = item.getChildren("content", namespace);
    Element elDesc;

    if (contents.size() > 0) {
      elDesc = (Element) contents.get(0);
    } else {
      elDesc = item.getChild("summary", namespace);
    }

    return (elDesc == null) ? "" : getValue(elDesc);
  }

  /**
   * Returns the title from title element.
   *
   * @param elt title element, may be <code>null</code>.
   *
   * @return empty string if the element is <code>null</code>; the raw text
   *         value for 'application/xhtml+xml' titles; otherwise the decoded
   *         value, additionally passed through {@link ParserUtils#unEscape}
   *         when the type is not 'text/plain'.
   */
  static String getTitle(Element elt) {
    if (elt == null) {
      return "";
    }

    String type = getContentType(elt);

    if ("application/xhtml+xml".equals(type)) {
      return elt.getValue();
    }

    String value = getValue(elt);

    if (!"text/plain".equals(type)) {
      value = ParserUtils.unEscape(value);
    }

    return value;
  }

  /**
   * Builds a channel, including one item per "entry" child, from the given
   * Atom 0.3 feed root element.
   *
   * @param cBuilder builder used to create the channel and its items;
   *                 must not be <code>null</code>.
   * @param channel  root element of the feed document.
   *
   * @return the populated channel.
   *
   * @throws RuntimeException if <code>cBuilder</code> is <code>null</code>.
   * @throws ParseException   declared by the parser interface.
   *
   * @see de.nava.informa.core.ChannelParserIF#parse(de.nava.informa.core.ChannelBuilderIF, org.jdom.Element)
   */
  public ChannelIF parse(ChannelBuilderIF cBuilder, Element channel)
      throws ParseException {
    if (cBuilder == null) {
      throw new RuntimeException("Without builder no channel can " +
                                 "be created.");
    }

    // One timestamp is shared by the channel and all of its items.
    Date dateParsed = new Date();
    Namespace defNS = ParserUtils.getDefaultNS(channel);

    if (defNS == null) {
      defNS = Namespace.NO_NAMESPACE;
      LOGGER.info("No default namespace found.");
    }

    // RSS 1.0 Dublin Core Module namespace
    Namespace dcNS = ParserUtils.getNamespace(channel, "dc");

    if (dcNS == null) {
      LOGGER.debug("No namespace for dublin core found");
      dcNS = defNS;
    }

    LOGGER.debug("start parsing.");

    // get version attribute (only used for logging)
    String formatVersion = "0.3";

    if (channel.getAttribute("version") != null) {
      formatVersion = channel.getAttribute("version").getValue().trim();
      LOGGER.debug("Atom version " + formatVersion + " specified in document.");
    } else {
      LOGGER.info("No format version specified, using default.");
    }

    // --- read in channel information

    // Lower the case of these tags to simulate case-insensitive parsing
    ParserUtils.matchCaseOfChildren(channel,
                                    new String[] {
                                      "title", "description", "tagline", "ttl",
                                      "modified", "author", "generator",
                                      "copyright", "link", "entry"
                                    });

    // title element
    // TODO: support attributes: type, mode
    ChannelIF chnl = cBuilder.createChannel(channel,
                                            channel.getChildTextTrim("title",
                                                                     defNS));

    chnl.setFormat(ChannelFormat.ATOM_0_3);

    // language
    String language = channel.getAttributeValue("lang", Namespace.XML_NAMESPACE);

    if (language != null) {
      chnl.setLanguage(language);
    }

    // description element; the presence test must use the same namespace
    // as the read below, otherwise namespaced feeds always hit the fallback
    if (channel.getChild("description", defNS) != null) {
      chnl.setDescription(channel.getChildTextTrim("description", defNS));
    } else {
      // fallback
      chnl.setDescription(channel.getChildTextTrim("tagline", defNS));
    }

    // ttl in dc namespace
    applyTtl(chnl, channel.getChild("ttl", dcNS));

    //  lastbuild element : modified ?
    Element modified = channel.getChild("modified", defNS);

    if (modified != null) {
      chnl.setPubDate(ParserUtils.getDate(modified.getTextTrim()));
    }

    // TODO : issued value (map the "issued" element to the last build date)

    // author element
    Element author = channel.getChild("author", defNS);

    if (author != null) {
      ParserUtils.matchCaseOfChildren(author, "name");
      chnl.setCreator(author.getChildTextTrim("name", defNS));
    }

    // generator element
    Element generator = channel.getChild("generator", defNS);

    if (generator != null) {
      chnl.setGenerator(generator.getTextTrim());
    }

    // copyright element
    Element copyright = channel.getChild("copyright", defNS);

    if (copyright != null) {
      chnl.setCopyright(getCopyright(copyright));
    }

    // n link elements
    applySite(chnl, channel.getChildren("link", defNS));

    // 1..n entry elements
    Iterator i = channel.getChildren("entry", defNS).iterator();

    while (i.hasNext()) {
      parseEntry(cBuilder, chnl, (Element) i.next(), defNS, dcNS, dateParsed);
    }

    // set to current date
    chnl.setLastUpdated(dateParsed);

    return chnl;
  }

  /**
   * Sets the channel's time-to-live from the given element. An absent,
   * empty or non-numeric value is skipped (the latter with a warning)
   * instead of aborting the whole parse with a NumberFormatException.
   */
  private static void applyTtl(ChannelIF chnl, Element ttl) {
    if (ttl == null) {
      return;
    }

    String ttlString = ttl.getTextTrim();

    if ((ttlString == null) || (ttlString.length() == 0)) {
      return;
    }

    try {
      chnl.setTtl(Integer.parseInt(ttlString));
    } catch (NumberFormatException e) {
      LOGGER.warn("Ignoring non-numeric ttl value '" + ttlString + "'.", e);
    }
  }

  /**
   * Sets the channel's site from the first link element that has an href
   * and rel="alternate"; leaves the channel untouched if there is none.
   */
  private static void applySite(ChannelIF chnl, List links)
      throws ParseException {
    // TODO : type attribut of link (text, application...)
    Iterator i = links.iterator();

    while (i.hasNext()) {
      Element linkElement = (Element) i.next();
      String rel = linkElement.getAttributeValue("rel");
      String href = linkElement.getAttributeValue("href");

      // use first 'alternate' link
      if ((href != null) && "alternate".equals(rel)) {
        URL linkURL = ParserUtils.getURL(href);

        chnl.setSite(linkURL);

        return;
      }

      // TODO: further extraction of link information
    }
  }

  /**
   * Creates one news item on the channel from an "entry" element, reading
   * its title, link, description, date and subject.
   */
  private static void parseEntry(ChannelBuilderIF cBuilder, ChannelIF chnl,
                                 Element item, Namespace defNS,
                                 Namespace dcNS, Date dateParsed)
      throws ParseException {
    // Lower the case of these tags to simulate case-insensitive parsing
    ParserUtils.matchCaseOfChildren(item,
                                    new String[] {
                                      "title", "link", "content", "summary",
                                      "issued", "subject"
                                    });

    // get title element
    // TODO : deal with type attribut
    Element elTitle = item.getChild("title", defNS);
    String strTitle = "<No Title>";

    if (elTitle != null) {
      strTitle = getTitle(elTitle);
    }

    // Guarded so the message strings are only built when debug is enabled.
    if (LOGGER.isDebugEnabled()) {
      if (elTitle != null) {
        LOGGER.debug("Parsing title " + elTitle.getTextTrim() + "->" +
                     strTitle);
      }

      LOGGER.debug("Entry element found (" + strTitle + ").");
    }

    // get link element
    String strLink = AtomParserUtils.getItemLink(item, defNS);

    // get description element
    String strDesc = getDescription(item, defNS);

    // generate new news item (link to article)
    ItemIF curItem = cBuilder.createItem(item, chnl, strTitle, strDesc,
                                         ParserUtils.getURL(strLink));

    curItem.setFound(dateParsed);

    // get issued element (required)
    Element elDate = item.getChild("issued", defNS);

    if (elDate == null) {
      // [adewale@gmail.com, 01-May-2005] Fix for blogs which have
      // 'created' dates, but not 'issued' dates -- in clear contravention
      // of the Atom 0.3 spec.
      elDate = item.getChild("created", defNS);
    }

    if (elDate != null) {
      curItem.setDate(ParserUtils.getDate(elDate.getTextTrim()));
    }

    // get subject element
    Element elSubject = item.getChild("subject", dcNS);

    if (elSubject != null) {
      // TODO: Multiple subject elements not handled currently
      curItem.setSubject(elSubject.getTextTrim());
    }
  }
}