/str_format.py
Python | 353 lines | 343 code | 8 blank | 2 comment | 12 complexity | 555ebce4067abbc699a4d1b51b11a041 MD5 | raw file
- import re
- from urllib import quote
- from util import local
# Highest valid Unicode code point (U+10FFFF).
MAX_UNICODE = 1114111
# ASCII control characters, excluding tab (0x09), LF (0x0a) and CR (0x0d).
CONTROL_CHARS_RE = re.compile('[\x00-\x08\x0b\x0c\x0e-\x1f]')
# Matches '&', optionally followed by a numeric entity body.
# Groups: (entity body incl. trailing ';', decimal digits, hex digits);
# the empty alternative makes group 1 == '' for a bare ampersand.
ENTITIES_CLEAN_RE = re.compile('&(#([0-9]+);|#x([0-9a-fA-F]+);|)')
# Characters escaped by clean_string() mapped to their HTML entity forms.
# Fixed: an HTML-decoding pass had flattened the entity values back into
# the literal characters (e.g. '<': '<'), turning every replacement into a
# no-op and disabling output escaping entirely. Values restored to match
# the upstream Wakaba clean_string behaviour.
ENTITY_REPLACES = {
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
    ',': '&#44;', # "clean up commas for some reason I forgot"
}
def clean_string(string, cleanentities=False):
    '''Escape *string* for safe inclusion in HTML.

    With cleanentities=True, every '&' is escaped blindly. Otherwise,
    well-formed numeric entities are kept (minus forbidden code points)
    and only bare ampersands are escaped. Angle brackets, quotes and
    commas are replaced via ENTITY_REPLACES; carriage returns and other
    control characters are removed.

    Fixed: the ampersand escapes had been flattened to no-ops
    (replace("&", "&") / return '&') by an HTML-decoding pass; restored
    to '&amp;' as in the upstream Wakaba implementation.
    '''
    if cleanentities:
        string = string.replace("&", "&amp;") # clean up &
    else:
        def repl(match):
            g = match.groups()
            if not g[0]: # change simple ampersands
                return '&amp;'
            # g[1] is the decimal form, g[2] the hexadecimal form.
            ordinal = int(g[1]) if g[1] else int(g[2], 16)
            if forbidden_unicode(ordinal): # strip forbidden unicode chars
                return ''
            else: # and leave the rest as-is.
                return '&' + g[0]
        string = ENTITIES_CLEAN_RE.sub(repl, string)
    # replace <, >, ", ' and "," with html entities
    # (items() instead of the Python 2-only iteritems(); the dict is tiny)
    for old, new in ENTITY_REPLACES.items():
        string = string.replace(old, new)
    # Kill linefeeds.
    string = string.replace('\r', '')
    # remove control chars
    string = CONTROL_CHARS_RE.sub('', string)
    return string
# Numeric entity forms, possibly terminated by '&' when entities nest.
# Groups: (whole, dec digits, dec terminator, 'x' or '&', hex digits,
# hex terminator).
ENTITIES_DECODE_RE = re.compile('(&#([0-9]*)([;&])|&#([x&])([0-9a-f]*)([;&]))', re.I)
def decode_string(string, noentities=False):
    '''Returns unicode string.

    Decodes numeric HTML entities back to characters, leaving nested or
    malformed entities and &#35;/&#38; ('#'/'&') untouched and stripping
    forbidden code points. Control characters are always removed; with
    noentities=True only the control-character strip is performed.
    '''
    try:
        _unichr = unichr  # Python 2
    except NameError:
        _unichr = chr     # Python 3: chr() covers the full Unicode range
    def repl(match):
        g = match.groups()
        if '&' in g: # nested entities, leave as-is.
            return g[0]
        # Compute the ordinal only after the nested-entity check, and only
        # when digits are present: the previous int(g[1] or int(g[4], 16))
        # raised TypeError on malformed input such as '&#;' or '&#&'
        # (empty decimal digits with no hex alternative).
        if g[1]:
            ordinal = int(g[1])
        elif g[4]:
            ordinal = int(g[4], 16)
        else:
            return g[0] # malformed entity with no digits: leave as-is
        if ordinal in (35, 38): # don't convert & or #
            return g[0]
        elif forbidden_unicode(ordinal): # strip forbidden unicode chars
            return ''
        else: # convert all entities to unicode chars
            return _unichr(ordinal)
    if not noentities:
        string = ENTITIES_DECODE_RE.sub(repl, string)
    # remove control chars
    string = CONTROL_CHARS_RE.sub('', string)
    return string
def forbidden_unicode(num):
    '''Return True when code point *num* must not appear in output.

    Rejects absurdly long ordinals, values outside the Unicode range,
    ASCII control characters, UTF-16 surrogates (U+D800..U+DFFF) and the
    bidirectional text-direction overrides (U+202A..U+202E).
    '''
    if len(str(num)) > 7:           # too long numbers
        return True
    if num > MAX_UNICODE or num < 32:  # outside unicode range / control chars
        return True
    if 0xd800 <= num <= 0xdfff:     # surrogate code points
        return True
    return 0x202a <= num <= 0x202e  # text direction
- # The following top-level code is used to build markup translation
- # dictionaries to convert from an arbitrary formatting language to HTML
- # or theoretically any compatible language and back again. This is used
- # to streamline and potentially extend formatting routines.
- # BBCODE_TABLE is used as an example markup translation definition.
- # Note no regex is used in the dictionary other than group names. Constructs
- # like iterators (e.g., lists) and arbitrary arguments (e.g., XML-style
- # attributes) will be supported hopefully tomorrow.
- # The left side of this table is the markup inputted by the user following
- # the markup standard. The right-side is equivalent HTML, with the respective
# captured groups of generic text. (Pickier text capturing will also be
# added.)
# The left side of this table is the markup inputted by the user following
# the markup standard. The right side is the equivalent HTML, with the
# respective captured groups of generic text.
BBCODE_TABLE = {
    r'[b]\1[/b]': r'<strong>\1</strong>',
    # Fixed: a stray backslash before "[/i]" made the italic closing tag
    # unmatchable as plain markup; every other entry uses bare brackets.
    r'[i]\1[/i]': r'<em>\1</em>',
    r'[del]\1[/del]': r'<del>\1</del>',
    # BAD IDEA but good for testing.
    r'[color="\1"]\2[/color]': r'<span style="color:\1">\2</span>',
}
def __build_transl_dict(key, value, append):
    r'''Register a markup translation rule in *append*.

    *key* is a literal markup template whose placeholders are the literal
    two-character sequences '\1', '\2', ...; *value* is the output template
    using regex backreference syntax. The key is compiled into a regex
    (placeholders become non-greedy capture groups) and mapped to a
    substitution template whose group numbers follow the positional order
    of the placeholders found in *key*.
    '''
    original_key = key
    # Escape metacharacters and match with \\1, \\2, etc.
    # (after re.escape, each literal "\N" placeholder appears as "\\N",
    # which this pattern rewrites into a non-greedy capture group)
    key = re.compile(r'\\\\\d+').sub(r'(.*?)', re.escape(key))
    # Effectively transpose each group to the relative location in the output
    # string (likely still in order).
    # (matching the compiled key against the original key captures the
    # literal "\N" tokens, so group references in *value* are renumbered
    # to positional order)
    value = re.compile(key).sub(value, original_key)
    append[re.compile(key)] = value
# Build markup translation dictionaries for converting to and from HTML.
# (dict.items() instead of the Python 2-only iteritems(): the table is
# tiny and items() behaves identically on both Python 2 and 3.)
HTML_TRANSL = {}
for (key, value) in BBCODE_TABLE.items():
    # Clean the markup since the comment containing the code is cleaned, too.
    key = clean_string(decode_string(key))
    __build_transl_dict(key, value, HTML_TRANSL)
CODE_TRANSL = {}
for (key, value) in BBCODE_TABLE.items():
    # The HTML code is raw, thus no need to decode/clean the key.
    __build_transl_dict(value, key, CODE_TRANSL)
def percent_encode(string):
    '''Return *string* percent-encoded as UTF-8 for safe use in URLs.'''
    utf8_bytes = string.encode('utf-8')
    return quote(utf8_bytes)
- # The code above will be temporarily replaced by this wakabamark-only
- # version, in this branch only.
- #format_comment regexps (FC_*)
- FC_HIDE_POSTLINKS = [
- (re.compile('>>>/?([0-9a-zA-Z]+)/?>>([0-9]+)'),
- r'>>>/\1/>>\2'),
- (re.compile('>>>/([0-9a-zA-Z]+)/'), r'>>>/\1/'),
- (re.compile('>>([0-9\-]+)'), r'>gt;\1')
- ]
- FC_BOARD_POST_LINK = re.compile('>>>\/?([0-9a-zA-Z]+)\/?>>([0-9]+)')
- FC_BOARD_LINK = re.compile('>>>\/?([0-9a-zA-Z]+)\/?')
- FC_POST_LINK = re.compile('>gt;([0-9]+)')
def format_comment(comment):
    '''Render a cleaned comment string to HTML.

    Applies wakabamark (or the simple old-style formatter, depending on
    the board's ENABLE_WAKABAMARK option) and turns post/board references
    into links via the nested handler(). *comment* is presumably already
    HTML-escaped by clean_string() -- confirm at the call sites.
    '''
    # hide >>1 references from the quoting code
    for pattern, repl in FC_HIDE_POSTLINKS:
        comment = pattern.sub(repl, comment)
    def unhide_postlinks(string):
        # Undo the FC_HIDE_POSTLINKS placeholder rewriting.
        # NOTE(review): the first two replacements appear to be no-ops as
        # written; the literals look HTML-entity-mangled (likely '&gt;'
        # forms originally) -- confirm against the original source.
        return (string
            .replace(">>>", ">>>")
            .replace(">>", ">>")
            .replace(">gt;", ">>"))
    def handler(line):
        '''fix up post link references'''
        # import this here to avoid circular imports. ugly, i know.
        import board
        def board_post_link(match):
            # >>>/board/>>N -- link to a specific post on another board.
            origtext = unhide_postlinks(match.group(0))
            try:
                newboard = board.Board(match.group(1))
                res = newboard.get_post(match.group(2))
                if res:
                    # NOTE(review): highlight() receives the board name
                    # (group 1) here, while post_link below passes the post
                    # number -- possibly a bug; confirm intended behaviour.
                    return '<a href="%s" onclick="highlight(%s)">%s</a>' % (
                        newboard.get_reply_link(res.num, res.parent),
                        match.group(1), origtext)
            except board.BoardNotFound:
                pass
            # Unknown board or post: leave the reference as plain text.
            return origtext
        line = FC_BOARD_POST_LINK.sub(board_post_link, line)
        def board_link(match):
            # >>>/board/ -- link to another board's front page.
            origtext = unhide_postlinks(match.group(0))
            try:
                newboard = board.Board(match.group(1))
                return '<a href="%s">%s</a>' % (
                    newboard.make_path(page=0, url=True),
                    origtext)
            except board.BoardNotFound:
                return origtext
        line = FC_BOARD_LINK.sub(board_link, line)
        def post_link(match):
            # >>N -- link to a post on the current board (local.board).
            origtext = unhide_postlinks(match.group(0))
            res = local.board.get_post(match.group(1))
            if res:
                return '<a href="%s" onclick="highlight(%s)">%s</a>' % (
                    local.board.get_reply_link(res.num, res.parent),
                    res.num, origtext)
            else:
                return origtext
        line = FC_POST_LINK.sub(post_link, line)
        return line
    if local.board.options['ENABLE_WAKABAMARK']:
        comment = do_wakabamark(comment, handler)
    else:
        comment = "<p>" + simple_format(comment, handler) + "</p>"
    # fix <blockquote> styles for old stylesheets
    comment = comment.replace("<blockquote>", '<blockquote class="unkfunc">')
    # restore >>1 references hidden in code blocks
    comment = unhide_postlinks(comment)
    return comment
#wakabamark regexps (WM_*)
# Inline span markup -> HTML, applied line by line outside code blocks.
WM_REPLACEMENTS = [
    (re.compile(r'\*\*([^\s].*?)\*\*'), r'<strong>\1</strong>'),
    (re.compile(r'\*([^\s].*?)\*'), r'<em>\1</em>'),
    (re.compile(r'`([^\n]*?)`'), r'<code>\1</code>'),
    (re.compile(r'\[spoiler\]'), r'<span class="spoiler">'),
    (re.compile(r'\[/spoiler\]'), r'</span><!--/spoiler-->'),
]
# Block formats: [line pattern, opening tag, per-item opening tag,
# per-item closing tag, closing tag, accumulator of pending items].
# NOTE(review): the code block pattern matches a SINGLE leading space or
# tab; markdown-style code blocks usually need four spaces -- the literal
# may have lost whitespace in transit, confirm against the original.
# Fixed: the closing tags were emitted as '</pre></code>' (wrong nesting
# order for the '<pre><code>' opener).
WM_CODEBLOCK = [re.compile(r'^( |\t)'), '<pre><code>', '', '\n',
                '</code></pre>', []]
WM_OLIST = [re.compile(r'^(\d+\.)\s*'), '<ol>', '<li>', '</li>', '</ol>',
            []]
WM_ULIST = [re.compile(r'^[\*\+\-]\s*'), '<ul>', '<li>', '</li>', '</ul>',
            []]
# NOTE(review): the blockquote marker is a bare '>' -- on cleaned input it
# was probably '&gt;' originally; possible entity mangling, confirm.
WM_BLOCKQUOTE = [re.compile(r'^>\s*'), '<blockquote>', '', '<br />',
                 '</blockquote>', []]
# NOTE(review): the duplicate ',|,' and trailing '"' alternatives were
# likely ',|&#44;|&quot;' before entity mangling -- confirm upstream.
URL_PATTERN = re.compile(
    '(https?://[^\s<>"]*?)((?:\s|<|>|"|\.|\)|\]|!|\?|,|,|")*'
    '(?:[\s<>"]|$))', re.I | re.S)
URL_SUB = r'<a href="\1">\1</a>\2'
def do_wakabamark(comment, handler):
    '''Convert wakabamark in *comment* to HTML.

    Processes the text line by line: inline spans and URL linkification
    outside code blocks, then code/list/blockquote block grouping, then
    paragraph wrapping with <br /> continuation. *handler*, if not None,
    is called on every processed line (used for post-link rewriting).

    Fixes: block-closing tags now nest correctly (see WM_CODEBLOCK), and
    paragraph continuation no longer feeds user text to re.sub() as a
    replacement template, where backslashes were misinterpreted as group
    references (raising re.error or corrupting output).
    '''
    lines = []
    orig_lines = comment.split('\n')
    # Sentinel empty line guarantees any open block is flushed at the end.
    orig_lines.append('')
    # State variable: Did we previously see an empty line?
    empty_line_before = False
    for line in orig_lines:
        # Do spans outside codeblocks.
        if not WM_CODEBLOCK[0].match(line):
            line = URL_PATTERN.sub(URL_SUB, line)
            for pattern, repl in WM_REPLACEMENTS:
                line = pattern.sub(repl, line)
        # Parse with handler now.
        if handler:
            line = handler(line)
        # Go through each block type and format.
        match = False
        for format_type in (WM_CODEBLOCK, WM_OLIST, WM_ULIST, WM_BLOCKQUOTE):
            (ptn, open_el, open_item_el, close_item_el, close_el, lst) \
                = format_type
            if ptn.match(line):
                match = True
                if format_type != WM_BLOCKQUOTE:
                    # blockquotes keep their marker; other blocks strip it
                    line = ptn.sub('', line)
                lst.append(open_item_el + line + close_item_el)
            elif lst:
                # The stack of lines in a format block may now be "popped."
                lines.append(open_el + ''.join(lst) + close_el)
                lst = format_type[5] = []
        if not match and line:
            if lines and not empty_line_before \
                    and lines[-1].endswith('</p>'):
                # Continue the previous paragraph; plain string surgery so
                # backslashes in user text cannot act as group references.
                lines[-1] = lines[-1][:-len('</p>')] + '<br />' + line \
                    + '</p>'
            else:
                lines.append('<p>' + line + '</p>')
        empty_line_before = not line
    return ''.join(lines)
# Old-style quote highlighting: a whole line starting with '>' ([^_]
# appears to guard against other markup). NOTE(review): this runs on
# cleaned text, so the marker was likely '&gt;' originally -- the literal
# may be entity-mangled; confirm against the original source.
GREENTEXT_PATTERN = re.compile("^(>[^_]*)$")
GREENTEXT_SUB = r'<span class="unkfunc">\1</span>'
def simple_format(comment, handler):
    '''Format *comment* line by line: linkify URLs, colour quoted lines
    when wakabamark is disabled, run *handler* on each line, and join the
    results with <br /> tags.'''
    formatted = []
    for raw_line in comment.split("\n"):
        # make URLs into links
        processed = URL_PATTERN.sub(URL_SUB, raw_line)
        # colour quoted sections if working in old-style mode.
        if not local.board.options['ENABLE_WAKABAMARK']:
            processed = GREENTEXT_PATTERN.sub(GREENTEXT_SUB, processed)
        if handler:
            processed = handler(processed)
        formatted.append(processed)
    return '<br />'.join(formatted)
#tag_killa regexps (TK_*)
# First pass: strip the oekaki info footer and paragraph/line-break
# markup, and restore inline code and spoiler markup.
TK_REPLACEMENTS = [
    # Strip Oekaki postfix.
    (re.compile('<p(?: class="oekinfo">|>\s*<small>)\s*<strong>(?:Oekaki post|'
        'Edited in Oekaki)</strong>\s*\(Time\:.*?</p>', re.I), ''),
    (re.compile('<br\s?/?>'), '\n'),
    (re.compile('<p>'), ''),
    (re.compile('</p>'), '\n\n'),
    (re.compile('<code>([^\n]*?)</code>'), r'`\1`'),
    (re.compile('</blockquote>'), '\n\n'),
    (re.compile('<span class="spoiler">'), '[spoiler]'),
    (re.compile('</span><!--/spoiler-->'), '[/spoiler]'),
]
# Block constructs handled by dedicated callbacks in tag_killa().
TK_CODEBLOCK = re.compile('<\s*?code>(.*?)</\s*?code>', re.S)
TK_ULIST = re.compile('<ul>(.*?)</ul>', re.S)
TK_OLIST = re.compile('<ol>(.*?)</ol>', re.S)
# Final pass: restore emphasis markup, then drop any remaining tags and
# trailing whitespace.
TK_REPLACEMENTS_2 = [
    (re.compile('</?em>'), '*'),
    (re.compile('</?strong>'), '**'),
    (re.compile('<.*?>'), ''),
    (re.compile(r'\s+$'), '')
]
def tag_killa(string):
    '''subroutine for stripping HTML tags and supplanting them with corresponding wakabamark

    Runs the TK_* replacement passes, converts code blocks and lists back
    to their wakabamark forms, and returns the result passed through
    decode_string().

    Fixes: ordered-list items were renumbered starting at 0 (now 1), using
    the Python 2-only map(None, ...) zip idiom (now enumerate); the list
    comprehension no longer shadows the *string* parameter.
    '''
    for pattern, repl in TK_REPLACEMENTS:
        string = pattern.sub(repl, string)
    def codeblock(match):
        # Re-indent each code line so WM_CODEBLOCK detects it again.
        # NOTE(review): wakabamark code blocks conventionally use a
        # four-space indent; this single space may have been collapsed in
        # transit -- keep consistent with WM_CODEBLOCK's pattern.
        return '\n'.join([' ' + x for x in match.group(1).split("\n")]) \
            + '\n'
    string = TK_CODEBLOCK.sub(codeblock, string)
    def ulist(match):
        # Each <li>...</li> becomes a "* ..." line.
        return match.group(1).replace("<li>", "* ").replace("</li>", "\n") \
            + '\n'
    string = TK_ULIST.sub(ulist, string)
    def olist(match):
        # Each <li>...</li> becomes an "N. ..." line, numbered from 1.
        # (the trailing split remainder has no <li>, so it passes through)
        entries = match.group(1).split("</li>")
        return '\n'.join([entry.replace("<li>", "%s. " % count)
                          for count, entry in enumerate(entries, 1)]) \
            + "\n"
    string = TK_OLIST.sub(olist, string)
    for pattern, repl in TK_REPLACEMENTS_2:
        string = pattern.sub(repl, string)
    return decode_string(string)