PageRenderTime 858ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/projects/informa-0.7.0-alpha2/src/de/nava/informa/parsers/Atom_0_3_Parser.java

https://gitlab.com/essere.lab.public/qualitas.class-corpus
Java | 374 lines | 199 code | 79 blank | 96 comment | 56 complexity | 85647e9beabdf53ba1abdbef7ad1c855 MD5 | raw file
  1. //
  2. // Informa -- RSS Library for Java
  3. // Copyright (c) 2002 by Niko Schmuck
  4. //
  5. // Niko Schmuck
  6. // http://sourceforge.net/projects/informa
  7. // mailto:niko_schmuck@users.sourceforge.net
  8. //
  9. // This library is free software.
  10. //
  11. // You may redistribute it and/or modify it under the terms of the GNU
  12. // Lesser General Public License as published by the Free Software Foundation.
  13. //
  14. // Version 2.1 of the license should be included with this distribution in
  15. // the file LICENSE. If the license is not included with this distribution,
  16. // you may find a copy at the FSF web site at 'www.gnu.org' or 'www.fsf.org',
  17. // or you may write to the Free Software Foundation, 675 Mass Ave, Cambridge,
  18. // MA 02139 USA.
  19. //
  20. // This library is distributed in the hope that it will be useful,
  21. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  22. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  23. // Lesser General Public License for more details.
  24. //
  25. // $Id: Atom_0_3_Parser.java,v 1.15 2007/01/06 22:18:13 niko_schmuck Exp $
  26. package de.nava.informa.parsers;
  27. import java.net.URL;
  28. import java.util.Date;
  29. import java.util.Iterator;
  30. import java.util.List;
  31. import org.apache.commons.logging.Log;
  32. import org.apache.commons.logging.LogFactory;
  33. import org.jdom.Element;
  34. import org.jdom.Namespace;
  35. import de.nava.informa.core.ChannelBuilderIF;
  36. import de.nava.informa.core.ChannelFormat;
  37. import de.nava.informa.core.ChannelIF;
  38. import de.nava.informa.core.ChannelParserIF;
  39. import de.nava.informa.core.ItemIF;
  40. import de.nava.informa.core.ParseException;
  41. import de.nava.informa.utils.AtomParserUtils;
  42. import de.nava.informa.utils.ParserUtils;
  43. /**
  44. * Parser which reads in document instances according to the Atom 0.3
  45. * specification and generates a news channel object. Currently the
  46. * support for the atom syntax is not complete.
  47. *
  48. * @author Niko Schmuck
  49. */
  50. class Atom_0_3_Parser implements ChannelParserIF {
  51. static public final Log LOGGER = LogFactory.getLog(Atom_0_3_Parser.class);
  52. /**
  53. * Private constructor suppresses generation of a (public) default constructor.
  54. */
  55. private Atom_0_3_Parser() {}
  56. /**
  57. * Holder of the Atom_0_3_Parser instance.
  58. */
  59. private static class Atom_0_3_ParserHolder {
  60. private static Atom_0_3_Parser instance = new Atom_0_3_Parser();
  61. }
  62. /**
  63. * Get the Atom_0_3_Parser instance.
  64. */
  65. public static Atom_0_3_Parser getInstance() {
  66. return Atom_0_3_ParserHolder.instance;
  67. }
  68. static String getValue(Element elt) {
  69. return AtomParserUtils.getValue(elt, elt.getAttributeValue("mode"));
  70. }
  71. /** Returns the content from content element. */
  72. static String getContent(Element elt) {
  73. if (elt == null) {
  74. return "";
  75. }
  76. String value = getValue(elt);
  77. String type = getContentType(elt);
  78. if ("text/plain".equals(type)) {
  79. value = ParserUtils.escape(value);
  80. }
  81. return value;
  82. }
  83. /** Returns the content type of element. Default is 'text/plain' according to Atom draft 0.3. */
  84. private static String getContentType(Element elt) {
  85. String type = elt.getAttributeValue("type");
  86. return (type == null) ? "text/plain" : type;
  87. }
  88. /** Returns copyright from element. */
  89. static String getCopyright(Element elt) {
  90. return getTitle(elt);
  91. }
  92. /**
  93. * Looks for "content" elements and takes first from them or looks for "summary" element if
  94. * "content" not found.
  95. *
  96. * @param item item element.
  97. * @param namespace namespace.
  98. *
  99. * @return description for item.
  100. */
  101. public static String getDescription(Element item, Namespace namespace) {
  102. String strDesc = "";
  103. Element elDesc;
  104. List contents = item.getChildren("content", namespace);
  105. if (contents.size() > 0) {
  106. elDesc = (Element) contents.get(0);
  107. } else {
  108. elDesc = item.getChild("summary", namespace);
  109. }
  110. if (elDesc != null) {
  111. strDesc = getValue(elDesc);
  112. }
  113. return strDesc;
  114. }
  115. /** Returns the title from title element. */
  116. static String getTitle(Element elt) {
  117. if (elt == null) {
  118. return "";
  119. }
  120. String type = getContentType(elt);
  121. String value;
  122. if ("application/xhtml+xml".equals(type)) {
  123. value = elt.getValue();
  124. } else {
  125. value = AtomParserUtils.getValue(elt, elt.getAttributeValue("mode"));
  126. if (!"text/plain".equals(type)) {
  127. value = ParserUtils.unEscape(value);
  128. }
  129. }
  130. return value;
  131. }
  132. /**
  133. * @see de.nava.informa.core.ChannelParserIF#parse(de.nava.informa.core.ChannelBuilderIF, org.jdom.Element)
  134. */
  135. public ChannelIF parse(ChannelBuilderIF cBuilder, Element channel)
  136. throws ParseException {
  137. if (cBuilder == null) {
  138. throw new RuntimeException("Without builder no channel can " +
  139. "be created.");
  140. }
  141. Date dateParsed = new Date();
  142. Namespace defNS = ParserUtils.getDefaultNS(channel);
  143. if (defNS == null) {
  144. defNS = Namespace.NO_NAMESPACE;
  145. LOGGER.info("No default namespace found.");
  146. }
  147. // RSS 1.0 Dublin Core Module namespace
  148. Namespace dcNS = ParserUtils.getNamespace(channel, "dc");
  149. if (dcNS == null) {
  150. LOGGER.debug("No namespace for dublin core found");
  151. dcNS = defNS;
  152. }
  153. LOGGER.debug("start parsing.");
  154. // get version attribute
  155. String formatVersion = "0.3";
  156. if (channel.getAttribute("version") != null) {
  157. formatVersion = channel.getAttribute("version").getValue().trim();
  158. LOGGER.debug("Atom version " + formatVersion + " specified in document.");
  159. } else {
  160. LOGGER.info("No format version specified, using default.");
  161. }
  162. // --- read in channel information
  163. // Lower the case of these tags to simulate case-insensitive parsing
  164. ParserUtils.matchCaseOfChildren(channel,
  165. new String[] {
  166. "title", "description", "tagline", "ttl",
  167. "modified", "author", "generator",
  168. "copyright", "link", "entry"
  169. });
  170. // title element
  171. ChannelIF chnl = cBuilder.createChannel(channel,
  172. channel.getChildTextTrim("title",
  173. defNS));
  174. // TODO: support attributes: type, mode
  175. chnl.setFormat(ChannelFormat.ATOM_0_3);
  176. // language
  177. String language = channel.getAttributeValue("lang", Namespace.XML_NAMESPACE);
  178. if (language != null) {
  179. chnl.setLanguage(language);
  180. }
  181. // description element
  182. if (channel.getChild("description") != null) {
  183. chnl.setDescription(channel.getChildTextTrim("description", defNS));
  184. } else {
  185. // fallback
  186. chnl.setDescription(channel.getChildTextTrim("tagline", defNS));
  187. }
  188. // ttl in dc namespace
  189. Element ttl = channel.getChild("ttl", dcNS);
  190. if (ttl != null) {
  191. String ttlString = ttl.getTextTrim();
  192. if (ttlString != null) {
  193. chnl.setTtl(Integer.parseInt(ttlString));
  194. }
  195. }
  196. // lastbuild element : modified ?
  197. Element modified = channel.getChild("modified", defNS);
  198. if (modified != null) {
  199. chnl.setPubDate(ParserUtils.getDate(modified.getTextTrim()));
  200. }
  201. // TODO : issued value
  202. /*
  203. if (modified != null) {
  204. modified = channel.getChild("issued", defNS);
  205. chnl.setLastBuildDate (ParserUtils.getDate(modified.getTextTrim()));
  206. }
  207. */
  208. // author element
  209. Element author = channel.getChild("author", defNS);
  210. if (author != null) {
  211. ParserUtils.matchCaseOfChildren(author, "name");
  212. chnl.setCreator(author.getChildTextTrim("name", defNS));
  213. }
  214. // generator element
  215. Element generator = channel.getChild("generator", defNS);
  216. if (generator != null) {
  217. chnl.setGenerator(generator.getTextTrim());
  218. }
  219. // copyright element
  220. Element copyright = channel.getChild("copyright", defNS);
  221. if (copyright != null) {
  222. chnl.setCopyright(getCopyright(copyright));
  223. }
  224. // n link elements
  225. // TODO : type attribut of link (text, application...)
  226. List links = channel.getChildren("link", defNS);
  227. Iterator i = links.iterator();
  228. while (i.hasNext()) {
  229. Element linkElement = (Element) i.next();
  230. // use first 'alternate' link
  231. String rel = linkElement.getAttributeValue("rel");
  232. String href = linkElement.getAttributeValue("href");
  233. if ((rel != null) && (href != null) && rel.equals("alternate")) {
  234. URL linkURL = ParserUtils.getURL(href);
  235. chnl.setSite(linkURL);
  236. break;
  237. }
  238. // TODO: further extraction of link information
  239. }
  240. // 1..n entry elements
  241. List items = channel.getChildren("entry", defNS);
  242. i = items.iterator();
  243. while (i.hasNext()) {
  244. Element item = (Element) i.next();
  245. // Lower the case of these tags to simulate case-insensitive parsing
  246. ParserUtils.matchCaseOfChildren(item,
  247. new String[] {
  248. "title", "link", "content", "summary",
  249. "issued", "subject"
  250. });
  251. // get title element
  252. // TODO : deal with type attribut
  253. Element elTitle = item.getChild("title", defNS);
  254. String strTitle = "<No Title>";
  255. if (elTitle != null) {
  256. strTitle = getTitle(elTitle);
  257. LOGGER.debug("Parsing title " + elTitle.getTextTrim() + "->" +
  258. strTitle);
  259. }
  260. if (LOGGER.isDebugEnabled()) {
  261. LOGGER.debug("Entry element found (" + strTitle + ").");
  262. }
  263. // get link element
  264. String strLink = AtomParserUtils.getItemLink(item, defNS);
  265. // get description element
  266. String strDesc = getDescription(item, defNS);
  267. // generate new news item (link to article)
  268. ItemIF curItem = cBuilder.createItem(item, chnl, strTitle, strDesc,
  269. ParserUtils.getURL(strLink));
  270. curItem.setFound(dateParsed);
  271. // get issued element (required)
  272. Element elIssued = item.getChild("issued", defNS);
  273. if (elIssued == null) {
  274. // [adewale@gmail.com, 01-May-2005] Fix for blogs which have
  275. // 'created' dates, but not 'issued' dates -- in clear contravention
  276. // of the Atom 0.3 spec.
  277. Element elCreated = item.getChild("created", defNS);
  278. if (elCreated != null) {
  279. curItem.setDate(ParserUtils.getDate(elCreated.getTextTrim()));
  280. }
  281. } else {
  282. curItem.setDate(ParserUtils.getDate(elIssued.getTextTrim()));
  283. }
  284. // get subject element
  285. Element elSubject = item.getChild("subject", dcNS);
  286. if (elSubject != null) {
  287. // TODO: Mulitple subject elements not handled currently
  288. curItem.setSubject(elSubject.getTextTrim());
  289. }
  290. }
  291. // set to current date
  292. chnl.setLastUpdated(dateParsed);
  293. return chnl;
  294. }
  295. }