PageRenderTime 49ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/markdown/preprocessors.py

https://gitlab.com/Haritiana/Python-Markdown
Python | 346 lines | 329 code | 5 blank | 12 comment | 1 complexity | 4ba6a6e787a2fdb05d5a252da82bed62 MD5 | raw file
  1. """
  2. PRE-PROCESSORS
  3. =============================================================================
  4. Preprocessors work on source text before we start doing anything too
  5. complicated.
  6. """
  7. from __future__ import absolute_import
  8. from __future__ import unicode_literals
  9. from . import util
  10. from . import odict
  11. import re
  def build_preprocessors(md_instance, **kwargs):
      """
      Build the default set of preprocessors used by Markdown.

      Returns an ``odict.OrderedDict`` mapping a name to a preprocessor
      instance constructed with ``md_instance``. Entries are inserted in this
      order: ``normalize_whitespace``, ``html_block`` (only when
      ``md_instance.safeMode`` is not ``'escape'``), ``reference``.
      ``kwargs`` is accepted but not used here.
      """
      stages = [('normalize_whitespace', NormalizeWhitespace)]
      # The HTML block preprocessor is left out in 'escape' safe mode
      # (presumably so raw HTML stays in the text and is escaped later --
      # confirm against the code that consumes safeMode).
      if md_instance.safeMode != 'escape':
          stages.append(('html_block', HtmlBlockPreprocessor))
      stages.append(('reference', ReferencePreprocessor))

      registry = odict.OrderedDict()
      for name, factory in stages:
          registry[name] = factory(md_instance)
      return registry
  class Preprocessor(util.Processor):
      """
      Base class for preprocessors.

      Preprocessors are run after the text is broken into lines. Each one
      implements a ``run`` method that receives the list of lines of the
      document, modifies it as necessary, and returns either that same list
      or a new one.

      Preprocessors must extend markdown.Preprocessor.
      """

      def run(self, lines):
          """
          Process the document and return its (possibly modified) lines.

          ``lines`` is the document as a list of strings split by newlines.
          Every subclass should override this method; this base
          implementation does nothing and returns ``None``.
          """
          pass  # pragma: no cover
  class NormalizeWhitespace(Preprocessor):
      """Normalize whitespace for consistent parsing."""

      def run(self, lines):
          """
          Return ``lines`` re-split after whitespace normalization.

          In order: every ``util.STX`` / ``util.ETX`` occurrence is removed,
          ``\\r\\n`` and lone ``\\r`` become ``\\n``, two newlines are appended,
          tabs are expanded to ``self.markdown.tab_length`` columns, and lines
          consisting only of spaces are emptied.
          """
          text = '\n'.join(lines)
          # Drop the STX/ETX characters from the input (presumably because
          # the package reserves them as placeholder delimiters -- see util).
          for marker in (util.STX, util.ETX):
              text = text.replace(marker, "")
          # Unify line endings, then guarantee the text ends with blank lines.
          text = text.replace("\r\n", "\n").replace("\r", "\n")
          text += "\n\n"
          text = text.expandtabs(self.markdown.tab_length)
          # A run of spaces that fills a whole line (i.e. directly follows a
          # newline and is followed by one) is reduced to an empty line.
          text = re.sub(r'(?<=\n) +\n', '\n', text)
          return text.split('\n')
  class HtmlBlockPreprocessor(Preprocessor):
      """Remove html blocks from the text and store them for later retrieval."""

      # Templates for the closing counterpart of a tag name; tried in order.
      right_tag_patterns = ["</%s>", "%s>"]
      # Verbose-mode pattern for one attribute. The closing quotes must stay
      # on their own line: the pattern is interpolated into
      # ``left_tag_pattern`` and the trailing ``# attr`` comment would
      # otherwise swallow the rest of that expression.
      attrs_pattern = r"""
          \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value"
          |                                                       # OR
          \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)             # attr=value
          |                                                       # OR
          \s+(?P<attr2>[^>"'/= ]+)                                # attr
          """
      left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
          attrs_pattern
      attrs_re = re.compile(attrs_pattern, re.VERBOSE)
      left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
      # When true, elements carrying a ``markdown`` attribute are registered
      # with ``htmlStash.store_tag`` and their content is kept in the text
      # instead of being stashed whole.
      markdown_in_raw = False

      def _get_left_tag(self, block):
          """
          Parse the opening tag at the start of ``block``.

          Returns ``(tag, length, attrs)``: the tag name, the number of
          characters the opening tag occupies, and a dict of its attributes
          (attributes without a value map to ``""``). When the block does not
          match ``left_tag_re`` the name is taken as the lower-cased text
          between ``<`` and the first ``>`` and ``attrs`` is empty.
          """
          m = self.left_tag_re.match(block)
          if not m:
              tag = block[1:].split(">", 1)[0].lower()
              return tag, len(tag) + 2, {}

          attrs = {}
          raw_attrs = m.group('attrs')
          if raw_attrs:
              for ma in self.attrs_re.finditer(raw_attrs):
                  # Exactly one of the three alternatives matched.
                  if ma.group('attr'):
                      attrs[ma.group('attr').strip()] = \
                          ma.group('value') or ""
                  elif ma.group('attr1'):
                      attrs[ma.group('attr1').strip()] = \
                          ma.group('value1') or ""
                  elif ma.group('attr2'):
                      attrs[ma.group('attr2').strip()] = ""
          return m.group('tag'), len(m.group(0)), attrs

      def _recursive_tagfind(self, ltag, rtag, start_index, block):
          """
          Find the ``rtag`` that balances an already opened ``ltag``.

          Searching starts at ``start_index``. Every further ``ltag`` met
          before an ``rtag`` is resolved recursively first. Returns the index
          just past the balancing ``rtag``, or ``-1`` when there is none.
          """
          while True:
              i = block.find(rtag, start_index)
              if i == -1:
                  return -1
              j = block.find(ltag, start_index)
              # if no ltag, or rtag found before another ltag, return index
              if j > i or j == -1:
                  return i + len(rtag)
              # another ltag found before rtag, use end of ltag as starting
              # point and search again
              j = block.find('>', j)
              start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
              if start_index == -1:
                  # HTML potentially malformed- ltag has no corresponding
                  # rtag
                  return -1

      def _get_right_tag(self, left_tag, left_index, block):
          """
          Locate the closing tag for ``left_tag`` in ``block``.

          Returns ``(right_tag, end)`` where ``end`` is the index just past
          the closing tag. When no candidate from ``right_tag_patterns`` is
          found, falls back to the lower-cased tail of the stripped block
          (without its last character) and ``len(block)``.
          """
          for p in self.right_tag_patterns:
              tag = p % left_tag
              i = self._recursive_tagfind(
                  "<%s" % left_tag, tag, left_index, block
              )
              if i > 2:
                  return tag.lstrip("<").rstrip(">"), i
          return block.rstrip()[-left_index:-1].lower(), len(block)

      def _equal_tags(self, left_tag, right_tag):
          """Return True when ``right_tag`` closes ``left_tag``."""
          # Slice instead of indexing so an empty tag name cannot raise.
          if left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
              return True
          if right_tag == "/" + left_tag:
              return True
          # comment blocks use "--" on both sides
          return left_tag == "--" and right_tag == "--"

      def _is_oneliner(self, tag):
          """Return True for the ``hr`` tag names handled as one-liners."""
          return tag in ('hr', 'hr/')

      def _stringindex_to_listindex(self, stringindex, items):
          """
          Same effect as concatenating the strings in items,
          finding the character to which stringindex refers in that string,
          and returning the index of the item in which that character resides.

          ``items`` itself is left unmodified.
          """
          # The sentinel lets an index equal to the total length resolve to
          # one past the last real item; it goes on a copy so the caller's
          # list is not altered.
          padded = list(items)
          padded.append('dummy')
          i, count = 0, 0
          while count <= stringindex:
              count += len(padded[i])
              i += 1
          return i - 1

      def _nested_markdown_in_html(self, items):
          """
          Find and process html child elements of the given element block.

          ``items`` is edited in place (and also returned): child elements are
          replaced by, or prefixed with, the values returned from the html
          stash.
          """
          # NOTE: ``items`` is modified while being enumerated; the statement
          # order below is significant.
          for i, item in enumerate(items):
              if self.left_tag_re.match(item):
                  tail = items[i:]
                  joined = ''.join(tail)
                  left_tag, left_index, attrs = self._get_left_tag(joined)
                  right_tag, data_index = self._get_right_tag(
                      left_tag, left_index, joined)
                  right_listindex = \
                      self._stringindex_to_listindex(data_index, tail) + i
                  if 'markdown' in attrs:
                      items[i] = items[i][left_index:]  # remove opening tag
                      placeholder = self.markdown.htmlStash.store_tag(
                          left_tag, attrs, i + 1, right_listindex + 1)
                      items.insert(i, placeholder)
                      if len(items) - right_listindex <= 1:
                          # last nest, no tail
                          right_listindex -= 1
                      items[right_listindex] = items[right_listindex][
                          :-len(right_tag) - 2]  # remove closing tag
                  else:  # raw html
                      if len(items) - right_listindex <= 1:  # last element
                          right_listindex -= 1
                      if right_listindex <= i:
                          right_listindex = i + 1
                      placeholder = self.markdown.htmlStash.store('\n\n'.join(
                          items[i:right_listindex]))
                      del items[i:right_listindex]
                      items.insert(i, placeholder)
          return items

      def _stash_items(self, items, left_tag, left_index, right_tag, attrs,
                       new_blocks):
          """Stash the collected multi-block element and extend new_blocks."""
          stash = self.markdown.htmlStash
          if self.markdown_in_raw and 'markdown' in attrs:
              # Strip the outer opening and closing tags; the content stays
              # in the text.
              items[0] = items[0][left_index:]
              items[-1] = items[-1][:-len(right_tag) - 2]
              if items[-1]:  # not a newline/empty string
                  right_index = len(items) + 3
              else:
                  right_index = len(items) + 2
              new_blocks.append(
                  stash.store_tag(left_tag, attrs, 0, right_index))
              placeholderslen = len(stash.tag_data)
              new_blocks.extend(self._nested_markdown_in_html(items))
              # Adjust the outer element's right_index by the number of
              # entries the nested pass added to tag_data.
              nests = len(stash.tag_data) - placeholderslen
              stash.tag_data[-1 - nests]['right_index'] += nests - 2
          else:
              new_blocks.append(stash.store('\n\n'.join(items)))

      def run(self, lines):
          """
          Replace raw HTML blocks in ``lines`` with html-stash return values.

          The text is split into blocks on blank lines. A block that starts
          with ``<`` and is recognised as block-level (or as a ``<!``, ``<?``,
          ``<@`` or ``<%`` construct) is passed to ``htmlStash.store`` and the
          value returned takes its place; an element spanning several blocks
          is collected until its closing tag is seen. Returns the new list of
          lines.
          """
          # Blocks still to be processed, kept reversed so that taking the
          # next block and re-queueing a remainder are both O(1).
          pending = "\n".join(lines).split("\n\n")
          pending.reverse()
          new_blocks = []
          items = []       # blocks of the element currently being collected
          left_tag = ''
          right_tag = ''
          left_index = 0
          attrs = {}
          in_tag = False   # flag

          while pending:
              block = pending.pop()
              # strip at most two leading newlines
              for _ in range(2):
                  if block.startswith("\n"):
                      block = block[1:]

              if in_tag:
                  items.append(block)
                  right_tag, data_index = self._get_right_tag(
                      left_tag, 0, block)
                  if self._equal_tags(left_tag, right_tag):
                      # if find closing tag
                      if data_index < len(block):
                          # we have more text after right_tag
                          items[-1] = block[:data_index]
                          pending.append(block[data_index:])
                      in_tag = False
                      self._stash_items(items, left_tag, left_index,
                                        right_tag, attrs, new_blocks)
                      items = []
                  continue

              if not (block.startswith("<") and len(block.strip()) > 1):
                  new_blocks.append(block)
                  continue

              if block[1:4] == "!--":
                  # is a comment block
                  left_tag, left_index, attrs = "--", 2, {}
              else:
                  left_tag, left_index, attrs = self._get_left_tag(block)
              right_tag, data_index = self._get_right_tag(
                  left_tag, left_index, block)
              # keep checking conditions below and maybe just append

              if data_index < len(block) and (
                      util.isBlockLevel(left_tag) or left_tag == '--'):
                  # text follows the closing tag: handle it as the next block
                  pending.append(block[data_index:])
                  block = block[:data_index]

              if not (util.isBlockLevel(left_tag) or
                      block[1] in ["!", "?", "@", "%"]):
                  new_blocks.append(block)
                  continue

              if self._is_oneliner(left_tag):
                  new_blocks.append(block.strip())
                  continue

              if block.rstrip().endswith(">") \
                      and self._equal_tags(left_tag, right_tag):
                  if self.markdown_in_raw and 'markdown' in attrs:
                      block = block[left_index:-len(right_tag) - 2]
                      new_blocks.append(
                          self.markdown.htmlStash.store_tag(
                              left_tag, attrs, 0, 2))
                      new_blocks.append(block)
                  else:
                      new_blocks.append(
                          self.markdown.htmlStash.store(block.strip()))
              elif (not self._equal_tags(left_tag, right_tag)) and \
                      (util.isBlockLevel(left_tag) or left_tag == "--"):
                  # if is block level tag and is not complete
                  items.append(block.strip())
                  in_tag = True
              else:
                  new_blocks.append(
                      self.markdown.htmlStash.store(block.strip()))

          if items:
              # input ended while an element was still open
              self._stash_items(items, left_tag, left_index, right_tag,
                                attrs, new_blocks)
              new_blocks.append('\n')

          new_text = "\n\n".join(new_blocks)
          return new_text.split("\n")
  class ReferencePreprocessor(Preprocessor):
      """Remove reference definitions from text and store for later use."""

      # A title in double quotes, single quotes or parentheses.
      TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
      # "[id]: link" with an optional title on the same line.
      RE = re.compile(
          r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL
      )
      # A line that holds nothing but a title.
      TITLE_RE = re.compile(r'^%s$' % TITLE)

      def run(self, lines):
          """
          Collect reference definitions and return the remaining lines.

          Each line matching ``RE`` is removed and recorded as
          ``self.markdown.references[id] = (link, title)``; the id is stripped
          and lower-cased, and leading ``<`` / trailing ``>`` are stripped
          from the link. When the definition has no title, the following
          line is consumed as the title if it matches ``TITLE_RE``; otherwise
          the title stays ``None``. The input list is read but not modified.
          """
          new_text = []
          total = len(lines)
          index = 0
          while index < total:
              line = lines[index]
              index += 1
              m = self.RE.match(line)
              if not m:
                  new_text.append(line)
                  continue

              ref_id = m.group(1).strip().lower()
              link = m.group(2).lstrip('<').rstrip('>')
              # groups 5-7 are the three title forms inside the optional
              # wrapper group that RE puts around TITLE
              title = m.group(5) or m.group(6) or m.group(7)
              if not title and index < total:
                  # Check next line for title (there is none when the
                  # definition is the last line of the document)
                  tm = self.TITLE_RE.match(lines[index])
                  if tm:
                      index += 1
                      title = tm.group(2) or tm.group(3) or tm.group(4)
              self.markdown.references[ref_id] = (link, title)
          return new_text