/projects/informa-0.7.0-alpha2/src/de/nava/informa/parsers/Atom_0_3_Parser.java
Java | 374 lines | 199 code | 79 blank | 96 comment | 56 complexity | 85647e9beabdf53ba1abdbef7ad1c855 MD5 | raw file
1//
2// Informa -- RSS Library for Java
3// Copyright (c) 2002 by Niko Schmuck
4//
5// Niko Schmuck
6// http://sourceforge.net/projects/informa
7// mailto:niko_schmuck@users.sourceforge.net
8//
9// This library is free software.
10//
11// You may redistribute it and/or modify it under the terms of the GNU
12// Lesser General Public License as published by the Free Software Foundation.
13//
14// Version 2.1 of the license should be included with this distribution in
15// the file LICENSE. If the license is not included with this distribution,
16// you may find a copy at the FSF web site at 'www.gnu.org' or 'www.fsf.org',
17// or you may write to the Free Software Foundation, 675 Mass Ave, Cambridge,
18// MA 02139 USA.
19//
20// This library is distributed in the hope that it will be useful,
21// but WITHOUT ANY WARRANTY; without even the implied warranty of
22// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23// Lesser General Public License for more details.
24//
25
26// $Id: Atom_0_3_Parser.java,v 1.15 2007/01/06 22:18:13 niko_schmuck Exp $
27package de.nava.informa.parsers;
28
29import java.net.URL;
30import java.util.Date;
31import java.util.Iterator;
32import java.util.List;
33
34import org.apache.commons.logging.Log;
35import org.apache.commons.logging.LogFactory;
36import org.jdom.Element;
37import org.jdom.Namespace;
38
39import de.nava.informa.core.ChannelBuilderIF;
40import de.nava.informa.core.ChannelFormat;
41import de.nava.informa.core.ChannelIF;
42import de.nava.informa.core.ChannelParserIF;
43import de.nava.informa.core.ItemIF;
44import de.nava.informa.core.ParseException;
45import de.nava.informa.utils.AtomParserUtils;
46import de.nava.informa.utils.ParserUtils;
47
48
49/**
50 * Parser which reads in document instances according to the Atom 0.3
51 * specification and generates a news channel object. Currently the
52 * support for the atom syntax is not complete.
53 *
54 * @author Niko Schmuck
55 */
56class Atom_0_3_Parser implements ChannelParserIF {
57 static public final Log LOGGER = LogFactory.getLog(Atom_0_3_Parser.class);
58
59 /**
60 * Private constructor suppresses generation of a (public) default constructor.
61 */
62 private Atom_0_3_Parser() {}
63
64 /**
65 * Holder of the Atom_0_3_Parser instance.
66 */
67 private static class Atom_0_3_ParserHolder {
68 private static Atom_0_3_Parser instance = new Atom_0_3_Parser();
69 }
70
71 /**
72 * Get the Atom_0_3_Parser instance.
73 */
74 public static Atom_0_3_Parser getInstance() {
75 return Atom_0_3_ParserHolder.instance;
76 }
77
78 static String getValue(Element elt) {
79 return AtomParserUtils.getValue(elt, elt.getAttributeValue("mode"));
80 }
81
82 /** Returns the content from content element. */
83 static String getContent(Element elt) {
84 if (elt == null) {
85 return "";
86 }
87
88 String value = getValue(elt);
89 String type = getContentType(elt);
90
91 if ("text/plain".equals(type)) {
92 value = ParserUtils.escape(value);
93 }
94
95 return value;
96 }
97
98 /** Returns the content type of element. Default is 'text/plain' according to Atom draft 0.3. */
99 private static String getContentType(Element elt) {
100 String type = elt.getAttributeValue("type");
101
102 return (type == null) ? "text/plain" : type;
103 }
104
105 /** Returns copyright from element. */
106 static String getCopyright(Element elt) {
107 return getTitle(elt);
108 }
109
110 /**
111 * Looks for "content" elements and takes first from them or looks for "summary" element if
112 * "content" not found.
113 *
114 * @param item item element.
115 * @param namespace namespace.
116 *
117 * @return description for item.
118 */
119 public static String getDescription(Element item, Namespace namespace) {
120 String strDesc = "";
121 Element elDesc;
122
123 List contents = item.getChildren("content", namespace);
124
125 if (contents.size() > 0) {
126 elDesc = (Element) contents.get(0);
127 } else {
128 elDesc = item.getChild("summary", namespace);
129 }
130
131 if (elDesc != null) {
132 strDesc = getValue(elDesc);
133 }
134
135 return strDesc;
136 }
137
138 /** Returns the title from title element. */
139 static String getTitle(Element elt) {
140 if (elt == null) {
141 return "";
142 }
143
144 String type = getContentType(elt);
145 String value;
146
147 if ("application/xhtml+xml".equals(type)) {
148 value = elt.getValue();
149 } else {
150 value = AtomParserUtils.getValue(elt, elt.getAttributeValue("mode"));
151
152 if (!"text/plain".equals(type)) {
153 value = ParserUtils.unEscape(value);
154 }
155 }
156
157 return value;
158 }
159
160 /**
161 * @see de.nava.informa.core.ChannelParserIF#parse(de.nava.informa.core.ChannelBuilderIF, org.jdom.Element)
162 */
163 public ChannelIF parse(ChannelBuilderIF cBuilder, Element channel)
164 throws ParseException {
165 if (cBuilder == null) {
166 throw new RuntimeException("Without builder no channel can " +
167 "be created.");
168 }
169
170 Date dateParsed = new Date();
171 Namespace defNS = ParserUtils.getDefaultNS(channel);
172
173 if (defNS == null) {
174 defNS = Namespace.NO_NAMESPACE;
175 LOGGER.info("No default namespace found.");
176 }
177
178 // RSS 1.0 Dublin Core Module namespace
179 Namespace dcNS = ParserUtils.getNamespace(channel, "dc");
180
181 if (dcNS == null) {
182 LOGGER.debug("No namespace for dublin core found");
183 dcNS = defNS;
184 }
185
186 LOGGER.debug("start parsing.");
187
188 // get version attribute
189 String formatVersion = "0.3";
190
191 if (channel.getAttribute("version") != null) {
192 formatVersion = channel.getAttribute("version").getValue().trim();
193 LOGGER.debug("Atom version " + formatVersion + " specified in document.");
194 } else {
195 LOGGER.info("No format version specified, using default.");
196 }
197
198 // --- read in channel information
199
200 // Lower the case of these tags to simulate case-insensitive parsing
201 ParserUtils.matchCaseOfChildren(channel,
202 new String[] {
203 "title", "description", "tagline", "ttl",
204 "modified", "author", "generator",
205 "copyright", "link", "entry"
206 });
207
208 // title element
209 ChannelIF chnl = cBuilder.createChannel(channel,
210 channel.getChildTextTrim("title",
211 defNS));
212
213 // TODO: support attributes: type, mode
214 chnl.setFormat(ChannelFormat.ATOM_0_3);
215
216 // language
217 String language = channel.getAttributeValue("lang", Namespace.XML_NAMESPACE);
218
219 if (language != null) {
220 chnl.setLanguage(language);
221 }
222
223 // description element
224 if (channel.getChild("description") != null) {
225 chnl.setDescription(channel.getChildTextTrim("description", defNS));
226 } else {
227 // fallback
228 chnl.setDescription(channel.getChildTextTrim("tagline", defNS));
229 }
230
231 // ttl in dc namespace
232 Element ttl = channel.getChild("ttl", dcNS);
233
234 if (ttl != null) {
235 String ttlString = ttl.getTextTrim();
236
237 if (ttlString != null) {
238 chnl.setTtl(Integer.parseInt(ttlString));
239 }
240 }
241
242 // lastbuild element : modified ?
243 Element modified = channel.getChild("modified", defNS);
244
245 if (modified != null) {
246 chnl.setPubDate(ParserUtils.getDate(modified.getTextTrim()));
247 }
248
249 // TODO : issued value
250 /*
251 if (modified != null) {
252 modified = channel.getChild("issued", defNS);
253 chnl.setLastBuildDate (ParserUtils.getDate(modified.getTextTrim()));
254 }
255 */
256
257 // author element
258 Element author = channel.getChild("author", defNS);
259
260 if (author != null) {
261 ParserUtils.matchCaseOfChildren(author, "name");
262 chnl.setCreator(author.getChildTextTrim("name", defNS));
263 }
264
265 // generator element
266 Element generator = channel.getChild("generator", defNS);
267
268 if (generator != null) {
269 chnl.setGenerator(generator.getTextTrim());
270 }
271
272 // copyright element
273 Element copyright = channel.getChild("copyright", defNS);
274
275 if (copyright != null) {
276 chnl.setCopyright(getCopyright(copyright));
277 }
278
279 // n link elements
280 // TODO : type attribut of link (text, application...)
281 List links = channel.getChildren("link", defNS);
282 Iterator i = links.iterator();
283
284 while (i.hasNext()) {
285 Element linkElement = (Element) i.next();
286
287 // use first 'alternate' link
288 String rel = linkElement.getAttributeValue("rel");
289 String href = linkElement.getAttributeValue("href");
290
291 if ((rel != null) && (href != null) && rel.equals("alternate")) {
292 URL linkURL = ParserUtils.getURL(href);
293
294 chnl.setSite(linkURL);
295
296 break;
297 }
298
299 // TODO: further extraction of link information
300 }
301
302 // 1..n entry elements
303 List items = channel.getChildren("entry", defNS);
304
305 i = items.iterator();
306
307 while (i.hasNext()) {
308 Element item = (Element) i.next();
309
310 // Lower the case of these tags to simulate case-insensitive parsing
311 ParserUtils.matchCaseOfChildren(item,
312 new String[] {
313 "title", "link", "content", "summary",
314 "issued", "subject"
315 });
316
317 // get title element
318 // TODO : deal with type attribut
319 Element elTitle = item.getChild("title", defNS);
320 String strTitle = "<No Title>";
321
322 if (elTitle != null) {
323 strTitle = getTitle(elTitle);
324 LOGGER.debug("Parsing title " + elTitle.getTextTrim() + "->" +
325 strTitle);
326 }
327
328 if (LOGGER.isDebugEnabled()) {
329 LOGGER.debug("Entry element found (" + strTitle + ").");
330 }
331
332 // get link element
333 String strLink = AtomParserUtils.getItemLink(item, defNS);
334
335 // get description element
336 String strDesc = getDescription(item, defNS);
337
338 // generate new news item (link to article)
339 ItemIF curItem = cBuilder.createItem(item, chnl, strTitle, strDesc,
340 ParserUtils.getURL(strLink));
341
342 curItem.setFound(dateParsed);
343
344 // get issued element (required)
345 Element elIssued = item.getChild("issued", defNS);
346
347 if (elIssued == null) {
348 // [adewale@gmail.com, 01-May-2005] Fix for blogs which have
349 // 'created' dates, but not 'issued' dates -- in clear contravention
350 // of the Atom 0.3 spec.
351 Element elCreated = item.getChild("created", defNS);
352
353 if (elCreated != null) {
354 curItem.setDate(ParserUtils.getDate(elCreated.getTextTrim()));
355 }
356 } else {
357 curItem.setDate(ParserUtils.getDate(elIssued.getTextTrim()));
358 }
359
360 // get subject element
361 Element elSubject = item.getChild("subject", dcNS);
362
363 if (elSubject != null) {
364 // TODO: Mulitple subject elements not handled currently
365 curItem.setSubject(elSubject.getTextTrim());
366 }
367 }
368
369 // set to current date
370 chnl.setLastUpdated(dateParsed);
371
372 return chnl;
373 }
374}