PageRenderTime 91ms CodeModel.GetById 54ms app.highlight 32ms RepoModel.GetById 0ms app.codeStats 0ms

/projects/informa-0.7.0-alpha2/src/de/nava/informa/parsers/Atom_0_3_Parser.java

https://gitlab.com/essere.lab.public/qualitas.class-corpus
Java | 374 lines | 199 code | 79 blank | 96 comment | 56 complexity | 85647e9beabdf53ba1abdbef7ad1c855 MD5 | raw file
  1//
  2// Informa -- RSS Library for Java
  3// Copyright (c) 2002 by Niko Schmuck
  4//
  5// Niko Schmuck
  6// http://sourceforge.net/projects/informa
  7// mailto:niko_schmuck@users.sourceforge.net
  8//
  9// This library is free software.
 10//
 11// You may redistribute it and/or modify it under the terms of the GNU
 12// Lesser General Public License as published by the Free Software Foundation.
 13//
 14// Version 2.1 of the license should be included with this distribution in
 15// the file LICENSE. If the license is not included with this distribution,
 16// you may find a copy at the FSF web site at 'www.gnu.org' or 'www.fsf.org',
 17// or you may write to the Free Software Foundation, 675 Mass Ave, Cambridge,
 18// MA 02139 USA.
 19//
 20// This library is distributed in the hope that it will be useful,
 21// but WITHOUT ANY WARRANTY; without even the implied warranty of
 22// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 23// Lesser General Public License for more details.
 24//
 25
 26// $Id: Atom_0_3_Parser.java,v 1.15 2007/01/06 22:18:13 niko_schmuck Exp $
 27package de.nava.informa.parsers;
 28
 29import java.net.URL;
 30import java.util.Date;
 31import java.util.Iterator;
 32import java.util.List;
 33
 34import org.apache.commons.logging.Log;
 35import org.apache.commons.logging.LogFactory;
 36import org.jdom.Element;
 37import org.jdom.Namespace;
 38
 39import de.nava.informa.core.ChannelBuilderIF;
 40import de.nava.informa.core.ChannelFormat;
 41import de.nava.informa.core.ChannelIF;
 42import de.nava.informa.core.ChannelParserIF;
 43import de.nava.informa.core.ItemIF;
 44import de.nava.informa.core.ParseException;
 45import de.nava.informa.utils.AtomParserUtils;
 46import de.nava.informa.utils.ParserUtils;
 47
 48
 49/**
 50 * Parser which reads in document instances according to the Atom 0.3
 51 * specification and generates a news channel object. Currently the
 52 * support for the atom syntax is not complete.
 53 *
 54 * @author Niko Schmuck
 55 */
 56class Atom_0_3_Parser implements ChannelParserIF {
 57  static public final Log LOGGER = LogFactory.getLog(Atom_0_3_Parser.class);
 58
 59  /**
 60   * Private constructor suppresses generation of a (public) default constructor.
 61   */
 62  private Atom_0_3_Parser() {}
 63
 64  /**
 65   * Holder of the Atom_0_3_Parser instance.
 66   */
 67  private static class Atom_0_3_ParserHolder {
 68    private static Atom_0_3_Parser instance = new Atom_0_3_Parser();
 69  } 
 70
 71  /**
 72   * Get the Atom_0_3_Parser instance.
 73   */
 74  public static Atom_0_3_Parser getInstance() {
 75    return Atom_0_3_ParserHolder.instance;
 76  }
 77  
 78  static String getValue(Element elt) {
 79      return AtomParserUtils.getValue(elt, elt.getAttributeValue("mode"));
 80  }
 81
 82  /** Returns the content from content element. */
 83  static String getContent(Element elt) {
 84    if (elt == null) {
 85      return "";
 86    }
 87
 88    String value = getValue(elt);
 89    String type = getContentType(elt);
 90
 91    if ("text/plain".equals(type)) {
 92      value = ParserUtils.escape(value);
 93    }
 94
 95    return value;
 96  }
 97
 98  /** Returns the content type of element. Default is 'text/plain' according to Atom draft 0.3. */
 99  private static String getContentType(Element elt) {
100    String type = elt.getAttributeValue("type");
101
102    return (type == null) ? "text/plain" : type;
103  }
104
105  /** Returns copyright from element. */
106  static String getCopyright(Element elt) {
107    return getTitle(elt);
108  }
109
110  /**
111   * Looks for "content" elements and takes first from them or looks for "summary" element if
112   * "content" not found.
113   *
114   * @param item      item element.
115   * @param namespace namespace.
116   *
117   * @return description for item.
118   */
119  public static String getDescription(Element item, Namespace namespace) {
120    String strDesc = "";
121    Element elDesc;
122
123    List contents = item.getChildren("content", namespace);
124
125    if (contents.size() > 0) {
126      elDesc = (Element) contents.get(0);
127    } else {
128      elDesc = item.getChild("summary", namespace);
129    }
130
131    if (elDesc != null) {
132      strDesc = getValue(elDesc);
133    }
134
135    return strDesc;
136  }
137
138  /** Returns the title from title element. */
139  static String getTitle(Element elt) {
140    if (elt == null) {
141      return "";
142    }
143
144    String type = getContentType(elt);
145    String value;
146
147    if ("application/xhtml+xml".equals(type)) {
148      value = elt.getValue();
149    } else {
150      value = AtomParserUtils.getValue(elt, elt.getAttributeValue("mode"));
151
152      if (!"text/plain".equals(type)) {
153        value = ParserUtils.unEscape(value);
154      }
155    }
156
157    return value;
158  }
159
160  /**
161   * @see de.nava.informa.core.ChannelParserIF#parse(de.nava.informa.core.ChannelBuilderIF, org.jdom.Element)
162   */
163  public ChannelIF parse(ChannelBuilderIF cBuilder, Element channel)
164      throws ParseException {
165    if (cBuilder == null) {
166      throw new RuntimeException("Without builder no channel can " +
167                                 "be created.");
168    }
169
170    Date dateParsed = new Date();
171    Namespace defNS = ParserUtils.getDefaultNS(channel);
172
173    if (defNS == null) {
174      defNS = Namespace.NO_NAMESPACE;
175      LOGGER.info("No default namespace found.");
176    }
177
178    // RSS 1.0 Dublin Core Module namespace
179    Namespace dcNS = ParserUtils.getNamespace(channel, "dc");
180
181    if (dcNS == null) {
182      LOGGER.debug("No namespace for dublin core found");
183      dcNS = defNS;
184    }
185
186    LOGGER.debug("start parsing.");
187
188    // get version attribute
189    String formatVersion = "0.3";
190
191    if (channel.getAttribute("version") != null) {
192      formatVersion = channel.getAttribute("version").getValue().trim();
193      LOGGER.debug("Atom version " + formatVersion + " specified in document.");
194    } else {
195      LOGGER.info("No format version specified, using default.");
196    }
197
198    // --- read in channel information
199
200    // Lower the case of these tags to simulate case-insensitive parsing
201    ParserUtils.matchCaseOfChildren(channel,
202                                    new String[] {
203                                      "title", "description", "tagline", "ttl",
204                                      "modified", "author", "generator",
205                                      "copyright", "link", "entry"
206                                    });
207
208    // title element
209    ChannelIF chnl = cBuilder.createChannel(channel,
210                                            channel.getChildTextTrim("title",
211                                                                     defNS));
212
213    // TODO: support attributes: type, mode
214    chnl.setFormat(ChannelFormat.ATOM_0_3);
215
216    // language
217    String language = channel.getAttributeValue("lang", Namespace.XML_NAMESPACE);
218
219    if (language != null) {
220      chnl.setLanguage(language);
221    }
222
223    // description element
224    if (channel.getChild("description") != null) {
225      chnl.setDescription(channel.getChildTextTrim("description", defNS));
226    } else {
227      // fallback
228      chnl.setDescription(channel.getChildTextTrim("tagline", defNS));
229    }
230
231    // ttl in dc namespace
232    Element ttl = channel.getChild("ttl", dcNS);
233
234    if (ttl != null) {
235      String ttlString = ttl.getTextTrim();
236
237      if (ttlString != null) {
238        chnl.setTtl(Integer.parseInt(ttlString));
239      }
240    }
241
242    //  lastbuild element : modified ?
243    Element modified = channel.getChild("modified", defNS);
244
245    if (modified != null) {
246      chnl.setPubDate(ParserUtils.getDate(modified.getTextTrim()));
247    }
248
249    // TODO : issued value
250    /*
251    if (modified != null) {
252      modified = channel.getChild("issued", defNS);
253      chnl.setLastBuildDate (ParserUtils.getDate(modified.getTextTrim()));
254    }
255    */
256
257    // author element
258    Element author = channel.getChild("author", defNS);
259
260    if (author != null) {
261      ParserUtils.matchCaseOfChildren(author, "name");
262      chnl.setCreator(author.getChildTextTrim("name", defNS));
263    }
264
265    // generator element
266    Element generator = channel.getChild("generator", defNS);
267
268    if (generator != null) {
269      chnl.setGenerator(generator.getTextTrim());
270    }
271
272    // copyright element
273    Element copyright = channel.getChild("copyright", defNS);
274
275    if (copyright != null) {
276      chnl.setCopyright(getCopyright(copyright));
277    }
278
279    // n link elements
280    // TODO : type attribut of link (text, application...)
281    List links = channel.getChildren("link", defNS);
282    Iterator i = links.iterator();
283
284    while (i.hasNext()) {
285      Element linkElement = (Element) i.next();
286
287      // use first 'alternate' link
288      String rel = linkElement.getAttributeValue("rel");
289      String href = linkElement.getAttributeValue("href");
290
291      if ((rel != null) && (href != null) && rel.equals("alternate")) {
292        URL linkURL = ParserUtils.getURL(href);
293
294        chnl.setSite(linkURL);
295
296        break;
297      }
298
299      // TODO: further extraction of link information
300    }
301
302    // 1..n entry elements
303    List items = channel.getChildren("entry", defNS);
304
305    i = items.iterator();
306
307    while (i.hasNext()) {
308      Element item = (Element) i.next();
309
310      // Lower the case of these tags to simulate case-insensitive parsing
311      ParserUtils.matchCaseOfChildren(item,
312                                      new String[] {
313                                        "title", "link", "content", "summary",
314                                        "issued", "subject"
315                                      });
316
317      // get title element
318      // TODO : deal with type attribut
319      Element elTitle = item.getChild("title", defNS);
320      String strTitle = "<No Title>";
321
322      if (elTitle != null) {
323        strTitle = getTitle(elTitle);
324        LOGGER.debug("Parsing title " + elTitle.getTextTrim() + "->" +
325                     strTitle);
326      }
327
328      if (LOGGER.isDebugEnabled()) {
329        LOGGER.debug("Entry element found (" + strTitle + ").");
330      }
331
332      // get link element
333      String strLink = AtomParserUtils.getItemLink(item, defNS);
334
335      // get description element
336      String strDesc = getDescription(item, defNS);
337
338      // generate new news item (link to article)
339      ItemIF curItem = cBuilder.createItem(item, chnl, strTitle, strDesc,
340                                           ParserUtils.getURL(strLink));
341
342      curItem.setFound(dateParsed);
343
344      // get issued element (required)
345      Element elIssued = item.getChild("issued", defNS);
346
347      if (elIssued == null) {
348        // [adewale@gmail.com, 01-May-2005] Fix for blogs which have
349        // 'created' dates, but not 'issued' dates -- in clear contravention
350        // of the Atom 0.3 spec.
351        Element elCreated = item.getChild("created", defNS);
352
353        if (elCreated != null) {
354          curItem.setDate(ParserUtils.getDate(elCreated.getTextTrim()));
355        }
356      } else {
357        curItem.setDate(ParserUtils.getDate(elIssued.getTextTrim()));
358      }
359
360      // get subject element
361      Element elSubject = item.getChild("subject", dcNS);
362
363      if (elSubject != null) {
364        // TODO: Mulitple subject elements not handled currently
365        curItem.setSubject(elSubject.getTextTrim());
366      }
367    }
368
369    // set to current date
370    chnl.setLastUpdated(dateParsed);
371
372    return chnl;
373  }
374}