PageRenderTime 48ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/src/com/atlassian/uwc/converters/xml/XmlConverter.java

https://bitbucket.org/dodok1/uwc
Java | 272 lines | 200 code | 25 blank | 47 comment | 21 complexity | aeccc0f32f8f0a4eeb64a65aa3e64872 MD5 | raw file
  1. package com.atlassian.uwc.converters.xml;
  2. import java.io.ByteArrayInputStream;
  3. import java.io.ByteArrayOutputStream;
  4. import java.io.InputStream;
  5. import java.io.StringReader;
  6. import java.io.UnsupportedEncodingException;
  7. import java.util.Enumeration;
  8. import java.util.Properties;
  9. import java.util.regex.Matcher;
  10. import java.util.regex.Pattern;
  11. import org.apache.log4j.Logger;
  12. import org.w3c.tidy.Configuration;
  13. import org.w3c.tidy.Tidy;
  14. import org.xml.sax.InputSource;
  15. import org.xml.sax.SAXException;
  16. import org.xml.sax.XMLReader;
  17. import org.xml.sax.helpers.XMLReaderFactory;
  18. import com.atlassian.uwc.converters.BaseConverter;
  19. import com.atlassian.uwc.ui.Page;
  20. /**
  21. * Used to parse xml documents. If no .xmlevent properties have been set up, all tags will be
  22. * parsed with the DefaultXmlParser.
  23. * Optional properties include: Xml Fragments Feature and Use HtmlTidy Feature
  24. * @see <a href="http://confluence.atlassian.com/display/CONFEXT/UWC+Xml+Framework">UWC Xml Framework Documentation</a>
  25. */
  26. public class XmlConverter extends BaseConverter {
  27. private static final String PROP_XMLFRAGMENTS_ROOT = "xml-fragments-root";
  28. private static final String DEFAULT_DOCTYPE = "strict";
  29. private static final String DEFAULT_USERAGENT = "Universal Wiki Converter";
  30. /**
  31. * logging object
  32. */
  33. Logger log = Logger.getLogger(this.getClass());
  34. /**
  35. * default xml declaration (used by Xml Fragments feature)
  36. */
  37. private static final String XML_DECLARATION = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n";
  38. /**
  39. * misc property key for Xml Fragments Feature
  40. */
  41. public final static String PROP_XMLFRAGMENTS = "xml-fragments";
  42. /**
  43. * misc property key for Use HtmlTidy Feature
  44. */
  45. public final static String PROP_USE_HTMLTIDY = "xml-use-htmltidy";
  46. public void convert(Page page) {
  47. log.debug("Xml Parser - Starting");
  48. XMLReader reader = getXmlReader();
  49. if (reader == null) return;
  50. //make sure incoming text is parsable
  51. String backup = page.getOriginalText();
  52. page.setOriginalText(enforceValidity(page.getOriginalText()));
  53. //prepare parser
  54. DefaultXmlEvents eventshandler = null;
  55. if (getProperties().containsKey("xmlevents")) { //get custom event handlers
  56. eventshandler = getEventsHandler();
  57. }
  58. DefaultXmlParser parser = new DefaultXmlParser(eventshandler, page); //we pass the page for things like labels
  59. parser.setProperties(getProperties());
  60. reader.setContentHandler(parser);
  61. reader.setErrorHandler(parser);
  62. //parse - this will change the page object's contents directly
  63. try {
  64. parse(page.getOriginalText(), reader, parser);
  65. } catch (RuntimeException e) {
  66. log.debug("Problem parsing xml. Reverting to backedup original text.");
  67. page.setOriginalText(backup);
  68. page.setConvertedText(backup);
  69. }
  70. log.debug("Xml Parser - Completed");
  71. }
  72. /**
  73. * @return the events handler, either a custom one, or the default if no custom one has been
  74. * configured
  75. */
  76. private DefaultXmlEvents getEventsHandler() {
  77. Class eventsClass;
  78. String xmleventsclass = this.getProperties().getProperty("xmlevents");
  79. try {
  80. eventsClass = Class.forName(xmleventsclass);
  81. DefaultXmlEvents events = (DefaultXmlEvents) eventsClass.newInstance();
  82. return events;
  83. } catch (Exception e) {
  84. log.error("Problem instantiating custom XmlEvents class: " + xmleventsclass +
  85. "Using DefaultXmlEvents.");
  86. }
  87. return new DefaultXmlEvents();
  88. }
  89. /**
  90. * @return the object that will be used to drive the parsing
  91. */
  92. private XMLReader getXmlReader() {
  93. try {
  94. return XMLReaderFactory.createXMLReader();
  95. } catch (SAXException e) {
  96. String message = "Could not load XmlReader. Skipping.";
  97. log.error(message);
  98. e.printStackTrace();
  99. addError(Feedback.CONVERTER_ERROR, message, true);
  100. return null;
  101. }
  102. }
  103. /**
  104. * parse the input using the given reader and parser
  105. * @param input
  106. * @param reader
  107. * @param parser
  108. * @return the resulting output
  109. */
  110. private String parse(String input, XMLReader reader, DefaultXmlParser parser) {
  111. InputSource source = new InputSource(new StringReader(input));
  112. System.setProperty( "http.agent", getUserAgent());
  113. try {
  114. reader.parse(source);
  115. } catch (Exception e) {
  116. String message = "Error while parsing xml. Skipping";
  117. log.error(message, e);
  118. addError(Feedback.CONVERTER_ERROR, message, true);
  119. throw new RuntimeException(e); //Skipping
  120. }
  121. return parser.getOutput(); //for junit tests purposes
  122. }
  123. private String getUserAgent() {
  124. Properties props = this.getProperties();
  125. if (!props.containsKey("user-agent"))
  126. return DEFAULT_USERAGENT;
  127. return props.getProperty("user-agent", DEFAULT_USERAGENT);
  128. }
  129. /**
  130. * uses optional features Xml Fragments Feature or Use Htmltidy Feature
  131. * to fix problematic xml documents. root node can be set with misc property:
  132. * xml-fragments-root
  133. * @param input original xml doc content
  134. * @return fixed xml
  135. */
  136. protected String enforceValidity(String input) {
  137. String root = "uwc-xml-outer-tag";
  138. if (getProperties().containsKey(PROP_XMLFRAGMENTS_ROOT)) {
  139. String rootCandidate = getProperties().getProperty(PROP_XMLFRAGMENTS_ROOT);
  140. if (rootCandidate != null && !"".equals(rootCandidate)) root = rootCandidate;
  141. log.debug("Using xml fragment root: " + root);
  142. }
  143. if (getProperties().containsKey(PROP_USE_HTMLTIDY) &&
  144. Boolean.parseBoolean(getProperties().getProperty(PROP_USE_HTMLTIDY))) {
  145. log.debug(PROP_USE_HTMLTIDY + " property was detected. Using htmltidy feature.");
  146. input = cleanWithJTidy(input);
  147. }
  148. if (getProperties().containsKey(PROP_XMLFRAGMENTS) &&
  149. Boolean.parseBoolean(getProperties().getProperty(PROP_XMLFRAGMENTS))) {
  150. if (getProperties().containsKey(PROP_USE_HTMLTIDY) &&
  151. Boolean.parseBoolean(getProperties().getProperty(PROP_USE_HTMLTIDY))) {
  152. log.debug(PROP_XMLFRAGMENTS + " property was detected, but cannot be used with use-htmltidy option. Skipping.");
  153. }
  154. else {
  155. log.debug(PROP_XMLFRAGMENTS + " property was detected. Document will be treated as containing xml fragments.");
  156. String enforced = "";
  157. if (!input.startsWith("<?xml ")) {
  158. enforced = XML_DECLARATION;
  159. }
  160. enforced += "<" + root + ">\n" +
  161. input +
  162. "\n</" + root + ">";
  163. input = enforced;
  164. }
  165. }
  166. // log.debug("Validated:\n" + input);
  167. return input;
  168. }
  169. /**
  170. * @param input
  171. * @return input that has been scrubbed by HtmlTidy
  172. */
  173. private String cleanWithJTidy(String input) {
  174. log.info("Cleaning HTML with JTidy: Starting. (This may take a while...)");
  175. input = preserveNewlines(input); //otherwise tidy will get rid of valid newlines
  176. Tidy tidy = new Tidy();
  177. tidy.setTidyMark(false);
  178. tidy.setDropEmptyParas(true);
  179. tidy.setXmlOut(true);
  180. tidy.setDropFontTags(false);
  181. tidy.setDocType(getDoctype());
  182. tidy.setConfigurationFromProps(getTidyProps());
  183. tidy.setCharEncoding(Configuration.UTF8); //XXX Support alternative encodings?
  184. InputStream in = null;
  185. String encoding = "utf-8"; //XXX Support alternative encodings?
  186. try {
  187. in = new ByteArrayInputStream(input.getBytes(encoding));
  188. ByteArrayOutputStream out = new ByteArrayOutputStream();
  189. tidy.parseDOM(in, out);
  190. log.info("Cleaning HTML with JTidy: Completed.");
  191. String output = out.toString(encoding);
  192. output = removeNewlines(output);//otherwise tidy will add newlines I don't want
  193. output = revertNewlines(output);//get back my original newlines
  194. return output;
  195. } catch (UnsupportedEncodingException e) {
  196. log.error("Could not use encoding: " + encoding);
  197. e.printStackTrace();
  198. }
  199. return input;
  200. }
  201. private Properties getTidyProps() {
  202. Properties props = getProperties();
  203. Properties tidyprops = new Properties();
  204. for (Enumeration iter = props.keys();iter.hasMoreElements();) {
  205. String prop = (String) iter.nextElement();
  206. if (prop.startsWith("xml-tidyopt-")) {
  207. String key = prop.replaceFirst("^xml-tidyopt-", "");
  208. String val = props.getProperty(prop, null);
  209. if (val != null) tidyprops.setProperty(key, val);
  210. }
  211. }
  212. return tidyprops;
  213. }
  214. private String getDoctype() {
  215. Properties props = getProperties();
  216. if (!props.containsKey("doctype"))
  217. return DEFAULT_DOCTYPE;
  218. return props.getProperty("doctype", DEFAULT_DOCTYPE);
  219. }
  220. Pattern nl = Pattern.compile("\n");
  221. public static final String NL_TOKEN = "~UWCXMLNLTOKEN~";
  222. Pattern nltokenPattern = Pattern.compile("\\Q" + NL_TOKEN + "\\E");
  223. Pattern ws = Pattern.compile("(?<=[><]) ");
  224. public static final String WS_TOKEN = "~UWCXMLWSTOKEN~";
  225. Pattern wstokenPattern = Pattern.compile("\\Q" + WS_TOKEN + "\\E");
  226. protected String preserveNewlines(String input) {
  227. Matcher nlFinder = nl.matcher(input);//save all existing newlines
  228. input = nlFinder.replaceAll(NL_TOKEN);
  229. Matcher wsFinder = ws.matcher(input);//save spaces after tags (sometimes tidy turns to nl)
  230. return wsFinder.replaceAll(WS_TOKEN);
  231. }
  232. Pattern tagNl = Pattern.compile("(?<=[>])\n");
  233. Pattern notTagNl = Pattern.compile("(?<=[^>])\n");
  234. protected String removeNewlines(String input) {
  235. Matcher nlFinder = tagNl.matcher(input);
  236. input = nlFinder.replaceAll(""); //remove newlines between tags <html>\n<head>
  237. nlFinder = notTagNl.matcher(input);
  238. input = nlFinder.replaceAll(" ");//replace with " " all other newlines <span\natt="...
  239. return input;
  240. }
  241. protected String revertNewlines(String input) {
  242. Matcher nltokenFinder = nltokenPattern.matcher(input);
  243. input = nltokenFinder.replaceAll("\n");
  244. Matcher wstokenFinder = wstokenPattern.matcher(input);
  245. return wstokenFinder.replaceAll(" ");
  246. }
  247. }