PageRenderTime 846ms CodeModel.GetById 10ms RepoModel.GetById 0ms app.codeStats 0ms

/str_format.py

https://github.com/k-anon/wakarimasen
Python | 353 lines | 343 code | 8 blank | 2 comment | 12 complexity | 555ebce4067abbc699a4d1b51b11a041 MD5 | raw file
  1. import re
  2. from urllib import quote
  3. from util import local
  4. MAX_UNICODE = 1114111
  5. CONTROL_CHARS_RE = re.compile('[\x00-\x08\x0b\x0c\x0e-\x1f]')
  6. ENTITIES_CLEAN_RE = re.compile('&(#([0-9]+);|#x([0-9a-fA-F]+);|)')
  7. ENTITY_REPLACES = {
  8. '<': '&lt;',
  9. '>': '&gt;',
  10. '"': '&quot;',
  11. "'": '&#39;',
  12. ',': '&#44;', # "clean up commas for some reason I forgot"
  13. }
  14. def clean_string(string, cleanentities=False):
  15. if cleanentities:
  16. string = string.replace("&", "&amp;") # clean up &
  17. else:
  18. def repl(match):
  19. g = match.groups()
  20. if not g[0]: # change simple ampersands
  21. return '&amp;'
  22. ordinal = int(g[1] or int(g[2], 16))
  23. if forbidden_unicode(ordinal): # strip forbidden unicode chars
  24. return ''
  25. else: # and leave the rest as-is.
  26. return '&' + g[0]
  27. string = ENTITIES_CLEAN_RE.sub(repl, string)
  28. # replace <, >, ", ' and "," with html entities
  29. for old, new in ENTITY_REPLACES.iteritems():
  30. string = string.replace(old, new)
  31. # Kill linefeeds.
  32. string = string.replace('\r', '')
  33. # remove control chars
  34. string = CONTROL_CHARS_RE.sub('', string)
  35. return string
  36. ENTITIES_DECODE_RE = re.compile('(&#([0-9]*)([;&])|&#([x&])([0-9a-f]*)([;&]))', re.I)
  37. def decode_string(string, noentities=False):
  38. '''Returns unicode string'''
  39. def repl(match):
  40. g = match.groups()
  41. ordinal = int(g[1] or int(g[4], 16))
  42. if '&' in g: # nested entities, leave as-is.
  43. return g[0]
  44. elif ordinal in (35, 38): # don't convert & or #
  45. return g[0]
  46. elif forbidden_unicode(ordinal): # strip forbidden unicode chars
  47. return ''
  48. else: # convert all entities to unicode chars
  49. return unichr(ordinal)
  50. if not noentities:
  51. string = ENTITIES_DECODE_RE.sub(repl, string)
  52. # remove control chars
  53. string = CONTROL_CHARS_RE.sub('', string)
  54. return string
  55. def forbidden_unicode(num):
  56. return ((len(str(num)) > 7) or # too long numbers
  57. (num > MAX_UNICODE) or # outside unicode range
  58. (num < 32) or # control chars
  59. (num >= 0xd800 and num <= 0xdfff) or # surrogate code points
  60. (num >= 0x202a and num <= 0x202e)) # text direction
# The following top-level code builds markup translation dictionaries that
# convert an arbitrary formatting language to HTML (or, in theory, to any
# compatible language) and back again. The aim is to streamline, and
# potentially extend, the formatting routines.
# BBCODE_TABLE serves as an example markup translation definition.
# Note that the dictionary uses no regex syntax other than group references
# (\1, \2, ...). Constructs such as iterators (e.g., lists) and arbitrary
# arguments (e.g., XML-style attributes) are not supported yet.
# The left side of the table is the markup entered by the user, following
# the markup standard. The right side is the equivalent HTML, with the
# respective captured groups of generic text. (Picker text capturing is
# also planned.)
  73. BBCODE_TABLE \
  74. = { r'[b]\1[/b]' : r'<strong>\1</strong>',
  75. r'[i]\1\[/i]' : r'<em>\1</em>',
  76. r'[del]\1[/del]' : r'<del>\1</del>',
  77. # BAD IDEA but good for testing.
  78. r'[color="\1"]\2[/color]' \
  79. : r'<span style="color:\1">\2</span>'\
  80. }
  81. def __build_transl_dict(key, value, append):
  82. original_key = key
  83. # Escape metacharacters and match with \\1, \\2, etc.
  84. key = re.compile(r'\\\\\d+').sub(r'(.*?)', re.escape(key))
  85. # Effectively transpose each group to the relative location in the output
  86. # string (likely still in order).
  87. value = re.compile(key).sub(value, original_key)
  88. append[re.compile(key)] = value
  89. # Build markup translation dictionaries for converting to and from
  90. HTML_TRANSL = {}
  91. for (key, value) in BBCODE_TABLE.iteritems():
  92. # Clean the markup since the comment containing the code is cleaned, too.
  93. key = clean_string(decode_string(key))
  94. __build_transl_dict(key, value, HTML_TRANSL)
  95. CODE_TRANSL = {}
  96. for (key, value) in BBCODE_TABLE.iteritems():
  97. # The HTML code is raw, thus no need to decode/clean the key.
  98. __build_transl_dict(value, key, CODE_TRANSL)
  99. def percent_encode(string):
  100. return quote(string.encode('utf-8'))
  101. # The code above will be temporarily replaced by this wakabamark-only
  102. # version, in this branch only.
  103. #format_comment regexps (FC_*)
  104. FC_HIDE_POSTLINKS = [
  105. (re.compile('&gt;&gt;&gt;/?([0-9a-zA-Z]+)/?&gt;&gt;([0-9]+)'),
  106. r'&gt&gt&gt;/\1/&gt&gt;\2'),
  107. (re.compile('&gt;&gt;&gt;/([0-9a-zA-Z]+)/'), r'&gt&gt&gt;/\1/'),
  108. (re.compile('&gt;&gt;([0-9\-]+)'), r'&gtgt;\1')
  109. ]
  110. FC_BOARD_POST_LINK = re.compile('&gt&gt&gt;\/?([0-9a-zA-Z]+)\/?&gt&gt;([0-9]+)')
  111. FC_BOARD_LINK = re.compile('&gt&gt&gt;\/?([0-9a-zA-Z]+)\/?')
  112. FC_POST_LINK = re.compile('&gtgt;([0-9]+)')
  113. def format_comment(comment):
  114. # hide >>1 references from the quoting code
  115. for pattern, repl in FC_HIDE_POSTLINKS:
  116. comment = pattern.sub(repl, comment)
  117. def unhide_postlinks(string):
  118. return (string
  119. .replace("&gt&gt&gt;", "&gt;&gt;&gt;")
  120. .replace("&gt&gt;", "&gt;&gt;")
  121. .replace("&gtgt;", "&gt;&gt;"))
  122. def handler(line):
  123. '''fix up post link references'''
  124. # import this here to avoid circular imports. ugly, i know.
  125. import board
  126. def board_post_link(match):
  127. origtext = unhide_postlinks(match.group(0))
  128. try:
  129. newboard = board.Board(match.group(1))
  130. res = newboard.get_post(match.group(2))
  131. if res:
  132. return '<a href="%s" onclick="highlight(%s)">%s</a>' % (
  133. newboard.get_reply_link(res.num, res.parent),
  134. match.group(1), origtext)
  135. except board.BoardNotFound:
  136. pass
  137. return origtext
  138. line = FC_BOARD_POST_LINK.sub(board_post_link, line)
  139. def board_link(match):
  140. origtext = unhide_postlinks(match.group(0))
  141. try:
  142. newboard = board.Board(match.group(1))
  143. return '<a href="%s">%s</a>' % (
  144. newboard.make_path(page=0, url=True),
  145. origtext)
  146. except board.BoardNotFound:
  147. return origtext
  148. line = FC_BOARD_LINK.sub(board_link, line)
  149. def post_link(match):
  150. origtext = unhide_postlinks(match.group(0))
  151. res = local.board.get_post(match.group(1))
  152. if res:
  153. return '<a href="%s" onclick="highlight(%s)">%s</a>' % (
  154. local.board.get_reply_link(res.num, res.parent),
  155. res.num, origtext)
  156. else:
  157. return origtext
  158. line = FC_POST_LINK.sub(post_link, line)
  159. return line
  160. if local.board.options['ENABLE_WAKABAMARK']:
  161. comment = do_wakabamark(comment, handler)
  162. else:
  163. comment = "<p>" + simple_format(comment, handler) + "</p>"
  164. # fix <blockquote> styles for old stylesheets
  165. comment = comment.replace("<blockquote>", '<blockquote class="unkfunc">')
  166. # restore >>1 references hidden in code blocks
  167. comment = unhide_postlinks(comment)
  168. return comment
  169. #wakabamark regexps (WM_*)
  170. WM_REPLACEMENTS = [
  171. (re.compile(r'\*\*([^\s].*?)\*\*'), r'<strong>\1</strong>'),
  172. (re.compile(r'\*([^\s].*?)\*'), r'<em>\1</em>'),
  173. (re.compile(r'`([^\n]*?)`'), r'<code>\1</code>'),
  174. (re.compile(r'\[spoiler\]'), r'<span class="spoiler">'),
  175. (re.compile(r'\[/spoiler\]'), r'</span><!--/spoiler-->'),
  176. ]
  177. WM_CODEBLOCK = [re.compile(r'^( |\t)'), '<pre><code>', '', '\n',
  178. '</pre></code>', []]
  179. WM_OLIST = [re.compile(r'^(\d+\.)\s*'), '<ol>', '<li>', '</li>', '</ol>',
  180. []]
  181. WM_ULIST = [re.compile(r'^[\*\+\-]\s*'), '<ul>', '<li>', '</li>', '</ul>',
  182. []]
  183. WM_BLOCKQUOTE = [re.compile(r'^&gt;\s*'), '<blockquote>', '', '<br />',
  184. '</blockquote>', []]
  185. URL_PATTERN = re.compile(
  186. '(https?://[^\s<>"]*?)((?:\s|<|>|"|\.|\)|\]|!|\?|,|&#44;|&quot;)*'
  187. '(?:[\s<>"]|$))', re.I | re.S)
  188. URL_SUB = r'<a href="\1">\1</a>\2'
  189. def do_wakabamark(comment, handler):
  190. lines = []
  191. orig_lines = comment.split('\n')
  192. orig_lines.append('')
  193. # State variable: Did we previously see an empty line?
  194. empty_line_before = False
  195. for line in orig_lines:
  196. # Do spans outside codeblocks.
  197. if not WM_CODEBLOCK[0].match(line):
  198. line = URL_PATTERN.sub(URL_SUB, line)
  199. for pattern, repl in WM_REPLACEMENTS:
  200. line = pattern.sub(repl, line)
  201. # Parse with handler now.
  202. if handler:
  203. line = handler(line)
  204. # Go through each block type and format.
  205. match = False
  206. for format_type in (WM_CODEBLOCK, WM_OLIST, WM_ULIST, WM_BLOCKQUOTE):
  207. (ptn, open_el, open_item_el, close_item_el, close_el, lst) \
  208. = format_type
  209. if ptn.match(line):
  210. match = True
  211. if format_type != WM_BLOCKQUOTE:
  212. line = ptn.sub('', line)
  213. lst.append(open_item_el + line + close_item_el)
  214. elif lst:
  215. # The stack of lines in a format block may now be "popped."
  216. lines.append(open_el + ''.join(lst) + close_el)
  217. lst = format_type[5] = []
  218. if not match and line:
  219. PARAGRAPH_RE = re.compile(r'</p>$')
  220. if lines and not empty_line_before \
  221. and PARAGRAPH_RE.search(lines[-1]):
  222. lines[-1] = PARAGRAPH_RE.sub('<br />' + line + '</p>',
  223. lines[-1])
  224. else:
  225. lines.append('<p>' + line + '</p>')
  226. empty_line_before = not line
  227. return ''.join(lines)
  228. GREENTEXT_PATTERN = re.compile("^(&gt;[^_]*)$")
  229. GREENTEXT_SUB = r'<span class="unkfunc">\1</span>'
  230. def simple_format(comment, handler):
  231. lines = []
  232. for line in comment.split("\n"):
  233. # make URLs into links
  234. line = URL_PATTERN.sub(URL_SUB, line)
  235. # colour quoted sections if working in old-style mode.
  236. if not local.board.options['ENABLE_WAKABAMARK']:
  237. line = GREENTEXT_PATTERN.sub(GREENTEXT_SUB, line)
  238. if handler:
  239. line = handler(line)
  240. lines.append(line)
  241. return '<br />'.join(lines)
  242. #tag_killa regexps (TK_*)
  243. TK_REPLACEMENTS = [
  244. # Strip Oekaki postfix.
  245. (re.compile('<p(?: class="oekinfo">|>\s*<small>)\s*<strong>(?:Oekaki post|'
  246. 'Edited in Oekaki)</strong>\s*\(Time\:.*?</p>', re.I), ''),
  247. (re.compile('<br\s?/?>'), '\n'),
  248. (re.compile('<p>'), ''),
  249. (re.compile('</p>'), '\n\n'),
  250. (re.compile('<code>([^\n]*?)</code>'), r'`\1`'),
  251. (re.compile('</blockquote>'), '\n\n'),
  252. (re.compile('<span class="spoiler">'), '[spoiler]'),
  253. (re.compile('</span><!--/spoiler-->'), '[/spoiler]'),
  254. ]
  255. TK_CODEBLOCK = re.compile('<\s*?code>(.*?)</\s*?code>', re.S)
  256. TK_ULIST = re.compile('<ul>(.*?)</ul>', re.S)
  257. TK_OLIST = re.compile('<ol>(.*?)</ol>', re.S)
  258. TK_REPLACEMENTS_2 = [
  259. (re.compile('</?em>'), '*'),
  260. (re.compile('</?strong>'), '**'),
  261. (re.compile('<.*?>'), ''),
  262. (re.compile(r'\s+$'), '')
  263. ]
  264. def tag_killa(string):
  265. '''subroutine for stripping HTML tags and supplanting them with corresponding wakabamark'''
  266. for pattern, repl in TK_REPLACEMENTS:
  267. string = pattern.sub(repl, string)
  268. def codeblock(match):
  269. return '\n'.join([' ' + x for x in match.group(1).split("\n")]) \
  270. + '\n'
  271. string = TK_CODEBLOCK.sub(codeblock, string)
  272. def ulist(match):
  273. return match.group(1).replace("<li>", "* ").replace("</li>", "\n") \
  274. + '\n'
  275. string = TK_ULIST.sub(ulist, string)
  276. def olist(match):
  277. def replace_li(entry, count):
  278. return entry.replace("<li>", "%s. " % count)
  279. strings = match.group(1).split("</li>")
  280. return '\n'.join([replace_li(string, count) \
  281. for string, count in map(None, strings, xrange(len(strings)))]) \
  282. + "\n"
  283. string = TK_OLIST.sub(olist, string)
  284. for pattern, repl in TK_REPLACEMENTS_2:
  285. string = pattern.sub(repl, string)
  286. return decode_string(string)