PageRenderTime 26ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/src/com/atlassian/uwc/converters/jotspot/ListPreprocessor.java

https://bitbucket.org/dodok1/uwc
Java | 312 lines | 206 code | 49 blank | 57 comment | 15 complexity | 0909b7da2ae6091f434c98f7eb8214fb MD5 | raw file
  1. package com.atlassian.uwc.converters.jotspot;
  2. import java.util.regex.Matcher;
  3. import java.util.regex.Pattern;
  4. import org.apache.log4j.Logger;
  5. import com.atlassian.uwc.converters.BaseConverter;
  6. import com.atlassian.uwc.ui.Page;
  7. /**
  8. * Makes sure nested lists are in a parsable format:
  9. * <br/>Converts to:<br/>
  10. * &lt;ul&gt;
  11. * &lt;li&gt;a&lt;/li&gt;
  12. * &lt;ul&gt;&lt;li&gt;1&lt;/li&gt;
  13. * &lt;/ul&gt;
  14. * &lt;/ul&gt;
  15. * <br/>
  16. * From:
  17. * <br/>
  18. * &lt;ul&gt;
  19. * &lt;li&gt;a&lt;ul&gt;&lt;li&gt;1&lt;/li&gt;
  20. * &lt;/ul&gt;&lt;/li&gt;
  21. * &lt;/ul&gt;
  22. *
  23. * @author Laura Kolker
  24. */
  25. public class ListPreprocessor extends BaseConverter {
  26. Logger log = Logger.getLogger(this.getClass());
  27. String openingTagAttributes = "[^>]*";
  28. String itemTag = "<li" + openingTagAttributes + ">";
  29. String listTag = "<[uo]l" + openingTagAttributes + ">";
  30. String nlDelim = "\n";
  31. String nlReplace = "\n";
  32. public void convert(Page page) {
  33. log.debug("List PreProcessor - starting");
  34. String input = page.getOriginalText();
  35. String converted = input;
  36. //make sure incoming lists are parsable
  37. converted = preProcessLists(input);
  38. log.debug("converted = " + converted);
  39. page.setConvertedText(converted);
  40. log.debug("List PreProcessor - complete");
  41. }
  42. /**
  43. * prepares list HTML for the list parser.
  44. * <br/>
  45. * Includes:
  46. * <ul>
  47. * <li/> adds newlines between ul, ol, and li elements
  48. * <li/> requires closing &lt;/li&gt; tags before opening nested lists
  49. * <li/> removes undesireable tags we're not converting within the context of a list item:
  50. * &lt;p&gt; &lt;b&gt; &lt;font&gt; &lt;br&gt;
  51. * These should either be converted already, or are not useful as part of a list item.
  52. * <li/> preserves non-list, non-bold uses of the * symbol
  53. * </ul>
  54. * @param input conversion file contents
  55. * @return conversion file contents with parsable lists
  56. */
  57. protected String preProcessLists(String input) {
  58. //Step 0 make sure nlDelim accurately reflects input
  59. determineNlDelim(input);
  60. //Step 1 add newlines before every ul and li.
  61. input = addNewlinesBeforeOpeners(input);
  62. //Step 1.5 - disallow multiple newlines
  63. input = disallowMultipleNewlines(input);
  64. //Step 2 disallow lines that start with li to not end with /li
  65. input = disallowLostClosingLI(input);
  66. //Step 2.5 disallow lines that are </li>\n</li>
  67. input = disallowExtraNewlineLI(input);
  68. //Step 3 removing dangling closers
  69. input = removeExtraClosingLI(input);
  70. //Step 4 final newlines
  71. input = addFinalNewlines(input);
  72. //Step 4.5 make sure there are no </ul>\n<ul> constructs (converter can't chunk those)
  73. input = addChunkableWhitespace(input);
  74. //Step 5 clean up extra whitespace
  75. input = cleanWhitespace(input);
  76. //Step 6 make safe any non-list Jotspot syntax
  77. input = handleNonListNonBoldStars(input);
  78. //Step 7 paragraph & font tags do not belong here!
  79. input = removeForbiddenTags(input);
  80. return input;
  81. }
  82. Pattern rnl = Pattern.compile("\r\n");
  83. private void determineNlDelim(String input) {
  84. Matcher rnlFinder = rnl.matcher(input);
  85. if (rnlFinder.find()) { //using Windows delimiters!
  86. nlDelim = "(?:\r\n)";
  87. nlReplace = "\r\n";
  88. //need to recompile some patterns
  89. }
  90. recompilePatternsWithNewlines();
  91. }
  92. private void recompilePatternsWithNewlines() {
  93. nonlist = "(^|" + nlDelim + "|(?:<br\\s*\\/>))\\*(?=[^\n*]*(" + nlDelim + "|$))";
  94. listTransformNonList = Pattern.compile(nonlist);
  95. extraWS = "" + nlDelim + "?( )+";
  96. listTransformLessWS = Pattern.compile(extraWS);
  97. endTagNL = "(<\\/(?:(?:li)|(?:[uo]l))>)(?!" + nlDelim + ")";
  98. listTransformAddEnd = Pattern.compile(endTagNL);
  99. danglingClosersAfterNL = "<\\/li>" + nlDelim + "<\\/li>";
  100. listTransformClosingDanglersAfterNL = Pattern.compile(danglingClosersAfterNL);
  101. noListEnd = "(" + nlDelim + ""+ itemTag + ")(.*?)(?<!(?:<\\/li>))(" + nlDelim + ")";
  102. listTransformNoEnd = Pattern.compile(noListEnd);
  103. multNL = "" + nlDelim + "+";
  104. listTransformMultNL = Pattern.compile(multNL);
  105. noNL = "(?<!^|(?:" + nlDelim + "))((?:" + itemTag + ")|(?:" + listTag + "))";
  106. listTransformNoNL = Pattern.compile(noNL);
  107. }
  108. String nonlist = "(^|" + nlDelim + "|(?:<br\\s*\\/>))\\*(?=[^\n*]*(" + nlDelim + "|$))";
  109. Pattern listTransformNonList = Pattern.compile(nonlist);
  110. private String handleNonListNonBoldStars(String input) {
  111. String replacement;
  112. Matcher transformer;
  113. replacement = "\\\\*";
  114. transformer = listTransformNonList.matcher(input);
  115. if (transformer.find()) {
  116. String pre = transformer.group(1);
  117. input = transformer.replaceAll(pre + replacement);
  118. }
  119. return input;
  120. }
  121. String extraWS = "" + nlDelim + "?( )+";
  122. Pattern listTransformLessWS = Pattern.compile(extraWS);
  123. private String cleanWhitespace(String input) {
  124. String replacement;
  125. Matcher transformer;
  126. replacement = "$1";
  127. transformer = listTransformLessWS.matcher(input);
  128. if (transformer.find()) {
  129. input = transformer.replaceAll(replacement);
  130. }
  131. return input;
  132. }
  133. String endTagNL = "(<\\/(?:(?:li)|(?:[uo]l))>)(?!" + nlDelim + ")";
  134. Pattern listTransformAddEnd = Pattern.compile(endTagNL);
  135. private String addFinalNewlines(String input) {
  136. String replacement;
  137. Matcher transformer;
  138. replacement = "$1" + nlReplace + "";
  139. transformer = listTransformAddEnd.matcher(input);
  140. if (transformer.find()) {
  141. input = transformer.replaceAll(replacement);
  142. }
  143. return input;
  144. }
  145. String danglingClosers = "(<\\/[ou]l>)<\\/li>";
  146. Pattern listTransformClosingDanglers = Pattern.compile(danglingClosers);
  147. private String removeExtraClosingLI(String input) {
  148. String replacement;
  149. Matcher transformer;
  150. replacement = "" + nlDelim + "$1";
  151. transformer = listTransformClosingDanglers.matcher(input);
  152. if (transformer.find()) {
  153. input = transformer.replaceAll(replacement);
  154. }
  155. return input;
  156. }
  157. String danglingClosersAfterNL = "<\\/li>" + nlDelim + "<\\/li>";
  158. Pattern listTransformClosingDanglersAfterNL = Pattern.compile(danglingClosersAfterNL);
  159. private String disallowExtraNewlineLI(String input) {
  160. String replacement;
  161. Matcher transformer;
  162. replacement = "</li>";
  163. transformer = listTransformClosingDanglersAfterNL.matcher(input);
  164. if (transformer.find()) {
  165. input = transformer.replaceAll(replacement);
  166. }
  167. return input;
  168. }
  169. String noListEnd = "(" + nlDelim + ""+ itemTag + ")(.*?)(?<!(?:<\\/li>))(" + nlDelim + ")";
  170. Pattern listTransformNoEnd = Pattern.compile(noListEnd);
  171. Pattern endListTag = Pattern.compile("<\\/li>");
  172. private String disallowLostClosingLI(String input) {
  173. String replacement;
  174. Matcher transformer;
  175. replacement = "$1$2<\\/li>$3";
  176. transformer = listTransformNoEnd.matcher(input);
  177. StringBuffer sb = new StringBuffer();
  178. while (transformer.find()) {
  179. String insides = transformer.group(2);
  180. //handles case where extra post text gets an accidental <li/> appended
  181. //see JavaRegexJunitTest.testTableEndProblem
  182. Matcher doubleCheckListEnd = endListTag.matcher(insides);
  183. if (!doubleCheckListEnd.find()) {
  184. transformer.appendReplacement(sb, replacement);
  185. }
  186. }
  187. transformer.appendTail(sb);
  188. input = sb.toString();
  189. return input;
  190. }
  191. String multNL = "" + nlDelim + "+";
  192. Pattern listTransformMultNL = Pattern.compile(multNL);
  193. private String disallowMultipleNewlines(String input) {
  194. String replacement;
  195. Matcher transformer;
  196. replacement = "" + nlReplace + "";
  197. transformer = listTransformMultNL.matcher(input);
  198. if (transformer.find()) {
  199. input = transformer.replaceAll(replacement);
  200. }
  201. return input;
  202. }
  203. String noNL = "(?<!^|(?:" + nlDelim + "))((?:" + itemTag + ")|(?:" + listTag + "))";
  204. Pattern listTransformNoNL = Pattern.compile(noNL);
  205. private String addNewlinesBeforeOpeners(String input) {
  206. String replacement = "" + nlReplace + "$1";
  207. Matcher transformer = listTransformNoNL.matcher(input);
  208. if (transformer.find()) {
  209. input = transformer.replaceAll(replacement);
  210. }
  211. return input;
  212. }
  213. Pattern listContents = Pattern.compile("<li>(.*?)</li>", Pattern.DOTALL);
  214. //forbidden tags are <b> <p> <br/> <font>
  215. Pattern forbiddenTags = Pattern.compile("<\\/?(?:p|b|(?:font)|(?:br))\\s*[^>]*>");
  216. private String removeForbiddenTags(String input) {
  217. Matcher listContentsFinder = listContents.matcher(input);
  218. StringBuffer sb = new StringBuffer();
  219. boolean found = false;
  220. while (listContentsFinder.find()) {
  221. found = true;
  222. String contents = listContentsFinder.group(1);
  223. Matcher forbiddenFinder = forbiddenTags.matcher(contents);
  224. String replacement = contents;
  225. if (forbiddenFinder.find()) {
  226. replacement = forbiddenFinder.replaceAll("");
  227. }
  228. replacement = "<li>" + replacement + "</li>";
  229. listContentsFinder.appendReplacement(sb, replacement);
  230. }
  231. listContentsFinder.appendTail(sb);
  232. if (found)
  233. input = sb.toString();
  234. return input;
  235. }
  236. String badChunk = "(<\\/[uo]l>)(" + nlDelim + "+" + listTag + ")";
  237. Pattern badChunkPattern = Pattern.compile(badChunk);
  238. /**
  239. * lists that flow directly into other lists with no non-NL between
  240. * will not be converted properly by the ListConverter.
  241. * As a simple patch (it's a bit of a kludge), I'm adding a space
  242. * between these sorts of lists, so that they can be converted.
  243. * @param input
  244. * @return
  245. */
  246. private String addChunkableWhitespace(String input) {
  247. Matcher badChunkFinder = badChunkPattern.matcher(input);
  248. StringBuffer sb = new StringBuffer();
  249. boolean found = false;
  250. while (badChunkFinder.find()) {
  251. found = true;
  252. String openingTag = badChunkFinder.group(1);
  253. String nlsAndCloseTag = badChunkFinder.group(2);
  254. String replacement = openingTag + " " + nlsAndCloseTag;
  255. badChunkFinder.appendReplacement(sb, replacement);
  256. }
  257. if (found) {
  258. badChunkFinder.appendTail(sb);
  259. input = sb.toString();
  260. }
  261. return input;
  262. }
  263. }