/r2/r2/lib/contrib/markdown.py
Python | 690 lines | 663 code | 9 blank | 18 comment | 7 complexity | 3f53a4ebd3e155c875a1834142de750f MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-2.1
- #!/usr/bin/python
- import re, md5, sys, string
"""markdown.py: A Markdown-styled-text to HTML converter in Python.

Usage:
    ./markdown.py textfile.markdown

Calling:
    import markdown
    somehtml = markdown.markdown(sometext)
"""

__version__ = "1.0.1-2"  # port of 1.0.1
__license__ = "GNU GPL 2"
__author__ = [
    "John Gruber <http://daringfireball.net/>",
    "Tollef Fog Heen <tfheen@err.no>",
    "Aaron Swartz <me@aaronsw.com>",
]
def htmlquote(text):
    """Encodes `text` for raw use in HTML.

    Replaces the five characters that are significant in HTML text and
    attribute values (``&``, ``<``, ``>``, ``'`` and ``"``) with character
    references and returns the resulting string.
    """
    # The ampersand must be done first; otherwise the ampersands introduced
    # by the later replacements would themselves be escaped again.
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")
    text = text.replace("'", "&#39;")
    text = text.replace('"', "&quot;")
    return text
def mangle_text(text):
    """Return the hexadecimal MD5 digest of `text` concatenated with `g.SECRET`.

    `g` is imported from pylons at call time rather than at module import.
    """
    from pylons import g
    salted = text + g.SECRET
    return md5.new(salted).hexdigest()
def semirandom(seed):
    """Map `seed` to a repeatable float derived from an MD5 digest.

    The byte values of the MD5 digest of `seed + g.SECRET` are summed and
    divided by 255*16 (the largest sum a 16-byte digest can reach), so the
    same seed and secret always give the same value.
    """
    from pylons import g
    digest = md5.new(seed + g.SECRET).digest()
    total = sum(ord(byte) for byte in digest)
    return total / (255*16.)
class _Markdown:
    """Markdown-to-HTML converter; `parse` is the entry point.

    The class attributes hold the compiled regular expressions used by the
    individual passes, plus `escapetable`, which maps every escapable
    character to the placeholder returned by `mangle_text`.  Per-document
    state (`urls`, `titles`, `html_blocks`, `list_level`) is reset at the
    start of every `parse` call.
    """

    emptyelt = " />"    # text that closes empty elements: "<hr" + emptyelt
    tabwidth = 4

    # Characters that can be backslash-escaped.  Each one is mapped to a
    # placeholder so later passes do not treat it as markup; the mapping is
    # reversed by _UnescapeSpecialChars at the end of parse().
    escapechars = '\\`*_{}[]()>#+-.!'
    escapetable = {}
    for char in escapechars:
        escapetable[char] = mangle_text(char)

    r_multiline = re.compile("\n{2,}")
    r_stripspace = re.compile(r"^[ \t]+$", re.MULTILINE)

    def parse(self, text):
        """Convert the Markdown document `text` to HTML and return it.

        Line endings are normalised to "\\n", two newlines are appended, tabs
        are expanded and whitespace-only lines are emptied before the block
        level passes run.  Placeholders are turned back into literal
        characters as the last step.
        """
        self.urls = {}
        self.titles = {}
        self.html_blocks = {}
        self.list_level = 0

        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")
        text += "\n\n"
        text = self._Detab(text)
        text = self.r_stripspace.sub("", text)
        text = self._HashHTMLBlocks(text)
        text = self._StripLinkDefinitions(text)
        text = self._RunBlockGamut(text)
        text = self._UnescapeSpecialChars(text)
        return text

    r_StripLinkDefinitions = re.compile(r"""
        ^[ ]{0,%d}\[(.+)\]:     # id = $1
          [ \t]*\n?[ \t]*
        <?(\S+?)>?              # url = $2
          [ \t]*\n?[ \t]*
        (?:
            (?<=\s)             # lookbehind for whitespace
            [\"\(]              # " is backlashed so it colorizes our code right
            (.+?)               # title = $3
            [\"\)]
            [ \t]*
        )?                      # title is optional
        (?:\n+|\Z)
        """ % (tabwidth-1), re.MULTILINE|re.VERBOSE)

    def _StripLinkDefinitions(self, text):
        """Remove link definitions from `text`, recording them on the instance.

        For every ``[id]: url "title"`` definition the URL is stored in
        `self.urls` and the optional title in `self.titles`, both keyed by
        the lower-cased id.  Returns `text` without the definitions.
        """
        def replacefunc(matchobj):
            (link_id, url, title) = matchobj.groups()
            #@@ case sensitivity?
            self.urls[link_id.lower()] = self._EncodeAmpsAndAngles(url)
            if title is not None:
                # Stored unescaped: the handlers that emit a title pass it
                # through htmlquote(), which takes care of double quotes.
                self.titles[link_id.lower()] = title
            return ""

        return self.r_StripLinkDefinitions.sub(replacefunc, text)

    blocktagsb = r"p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|math"
    blocktagsa = blocktagsb + "|ins|del"

    r_HashHTMLBlocks1 = re.compile(r"""
        (               # save in $1
            ^           # start of line (with /m)
            <(%s)       # start tag = $2
            \b          # word break
            (.*\n)*?    # any number of lines, minimally matching
            </\2>       # the matching end tag
            [ \t]*      # trailing spaces/tabs
            (?=\n+|$)   # followed by a newline or end of document
        )
        """ % blocktagsa, re.MULTILINE | re.VERBOSE)

    r_HashHTMLBlocks2 = re.compile(r"""
        (               # save in $1
            ^           # start of line (with /m)
            <(%s)       # start tag = $2
            \b          # word break
            (.*\n)*?    # any number of lines, minimally matching
            .*</\2>     # the matching end tag
            [ \t]*      # trailing spaces/tabs
            (?=\n+|\Z)  # followed by a newline or end of document
        )
        """ % blocktagsb, re.MULTILINE | re.VERBOSE)

    r_HashHR = re.compile(r"""
        (?:
            (?<=\n\n)   # Starting after a blank line
            |           # or
            \A\n?       # the beginning of the doc
        )
        (               # save in $1
            [ ]{0,%d}
            <(hr)       # start tag = $2
            \b          # word break
            ([^<>])*?   #
            /?>         # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z)# followed by a blank line or end of document
        )
        """ % (tabwidth-1), re.VERBOSE)

    r_HashComment = re.compile(r"""
        (?:
            (?<=\n\n)   # Starting after a blank line
            |           # or
            \A\n?       # the beginning of the doc
        )
        (               # save in $1
            [ ]{0,%d}
            (?:
                <!
                (--.*?--\s*)+
                >
            )
            [ \t]*
            (?=\n{2,}|\Z)# followed by a blank line or end of document
        )
        """ % (tabwidth-1), re.VERBOSE)

    def _HashHTMLBlocks(self, text):
        """Replace block-level HTML in `text` with placeholder keys.

        Each matched block (block tags, ``<hr>`` and HTML comments) is stored
        in `self.html_blocks` under the key returned by `mangle_text` and is
        replaced in the text by that key surrounded by blank lines, so that
        `_FormParagraphs` can put the block back without wrapping it.
        """
        def handler(m):
            block = m.group(1)
            key = block
            try:
                key = key.encode('utf8')
            except UnicodeDecodeError:
                # NOTE(review): presumably raised for Python 2 byte strings
                # that hold non-ASCII bytes -- confirm.  The key is then
                # built from the ASCII characters only.
                key = ''.join(k for k in key if ord(k) < 128)
            key = mangle_text(key)
            self.html_blocks[key] = block
            return "\n\n%s\n\n" % key

        for pattern in (self.r_HashHTMLBlocks1, self.r_HashHTMLBlocks2,
                        self.r_HashHR, self.r_HashComment):
            text = pattern.sub(handler, text)
        return text

    #@@@ wrong!
    r_hr1 = re.compile(r'^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$', re.M)
    r_hr2 = re.compile(r'^[ ]{0,2}([ ]?-[ ]?){3,}[ \t]*$', re.M)
    r_hr3 = re.compile(r'^[ ]{0,2}([ ]?_[ ]?){3,}[ \t]*$', re.M)

    def _RunBlockGamut(self, text):
        """Apply the block-level passes to `text` and return the result.

        Order: headers, horizontal rules, lists, code blocks, block quotes,
        HTML-block hashing and finally paragraph formation.
        """
        text = self._DoHeaders(text)
        for rule in (self.r_hr1, self.r_hr2, self.r_hr3):
            text = rule.sub("\n<hr%s\n" % self.emptyelt, text)
        text = self._DoLists(text)
        text = self._DoCodeBlocks(text)
        text = self._DoBlockQuotes(text)
        # We did this in parse() to escape the source; now it's stuff _we_
        # made, so we don't wrap it in <p>s.
        text = self._HashHTMLBlocks(text)
        text = self._FormParagraphs(text)
        return text

    r_NewLine = re.compile(" {2,}\n")

    def _RunSpanGamut(self, text):
        """Apply the inline passes to `text` and return the result.

        Order: code spans, special-character escaping, images, anchors,
        automatic links, ampersand/angle encoding, emphasis, and finally the
        conversion of two-or-more trailing spaces into a ``<br />``.
        """
        text = self._DoCodeSpans(text)
        text = self._EscapeSpecialChars(text)
        text = self._DoImages(text)
        text = self._DoAnchors(text)
        text = self._DoAutoLinks(text)
        text = self._EncodeAmpsAndAngles(text)
        text = self._DoItalicsAndBold(text)
        text = self.r_NewLine.sub(" <br%s\n" % self.emptyelt, text)
        return text

    def _EscapeStarsAndUnderscores(self, text):
        """Return `text` with `*` and `_` replaced by their placeholders.

        Keeps those characters from being picked up by the emphasis pass.
        """
        text = text.replace("*", self.escapetable["*"])
        text = text.replace("_", self.escapetable["_"])
        return text

    def _EscapeSpecialChars(self, text):
        """Protect special characters in `text` from later inline passes.

        Inside HTML tags `*` and `_` are replaced by placeholders; in
        ordinary text, backslash escapes are replaced by placeholders.
        """
        pieces = []
        for kind, content in self._TokenizeHTML(text):
            if kind == "tag":
                pieces.append(self._EscapeStarsAndUnderscores(content))
            else:
                pieces.append(self._EncodeBackslashEscapes(content))
        return "".join(pieces)

    r_DoAnchors1 = re.compile(
        r""" (                  # wrap whole match in $1
                \[
                  (.*?)         # link text = $2
                  # [for bracket nesting, see below]
                \]
                [ ]?            # one optional space
                (?:\n[ ]*)?     # one optional newline followed by spaces
                \[
                  (.*?)         # id = $3
                \]
             )
        """, re.S|re.VERBOSE)

    r_DoAnchors2 = re.compile(
        r""" (                  # wrap whole match in $1
                \[
                  (.*?)         # link text = $2
                \]
                \(              # literal paren
                  [ \t]*
                  <?(.+?)>?     # href = $3
                  [ \t]*
                  (             # $4
                    ([\'\"])    # quote char = $5
                    (.*?)       # Title = $6
                    \5          # matching quote
                  )?            # title is optional
                \)
             )
        """, re.S|re.VERBOSE)

    def _DoAnchors(self, text):
        """Turn inline links ``[text](url "title")`` in `text` into ``<a>`` tags.

        Links get ``rel="nofollow"`` unless the generated ``href`` text
        matches ``lesswrong|overcomingbias``.  URL, title and link text are
        passed through `htmlquote`.  Reference-style links (``[text][id]``)
        are not converted: the substitution that would use `handler1` is
        commented out below.
        """
        # We here don't do the same as the perl version, as python's regex
        # engine gives us no way to match brackets.
        def handler1(m):
            whole_match = m.group(1)
            link_text = m.group(2)
            link_id = m.group(3).lower()
            if not link_id:
                link_id = link_text.lower()
            title = self.titles.get(link_id, None)

            if link_id not in self.urls:
                return whole_match

            url = self._EscapeStarsAndUnderscores(self.urls[link_id])
            res = '<a href="%s"' % htmlquote(url)
            if not re.search('lesswrong|overcomingbias', res):
                res += ' rel="nofollow"'
            if title:
                title = self._EscapeStarsAndUnderscores(title)
                res += ' title="%s"' % htmlquote(title)
            res += ">%s</a>" % htmlquote(link_text)
            return res

        def handler2(m):
            link_text = m.group(2)
            url = self._EscapeStarsAndUnderscores(m.group(3))
            title = m.group(6)
            res = '''<a href="%s"''' % htmlquote(url)
            if not re.search('lesswrong|overcomingbias', res):
                res += ' rel="nofollow"'
            if title:
                # htmlquote() escapes double quotes, so the title needs no
                # separate quote replacement beforehand.
                title = self._EscapeStarsAndUnderscores(title)
                res += ' title="%s"' % htmlquote(title)
            res += ">%s</a>" % htmlquote(link_text)
            return res

        #text = self.r_DoAnchors1.sub(handler1, text)
        text = self.r_DoAnchors2.sub(handler2, text)
        return text

    r_DoImages1 = re.compile(
        r""" (                  # wrap whole match in $1
                !\[
                  (.*?)         # alt text = $2
                \]
                [ ]?            # one optional space
                (?:\n[ ]*)?     # one optional newline followed by spaces
                \[
                  (.*?)         # id = $3
                \]
             )
        """, re.VERBOSE|re.S)

    r_DoImages2 = re.compile(
        r""" (                  # wrap whole match in $1
                !\[
                  (.*?)         # alt text = $2
                \]
                \(              # literal paren
                  [ \t]*
                  <?(\S+?)>?    # src url = $3
                  [ \t]*
                  (             # $4
                    ([\'\"])    # quote char = $5
                    (.*?)       # title = $6
                    \5          # matching quote
                    [ \t]*
                  )?            # title is optional
                \)
             )
        """, re.VERBOSE|re.S)

    def _DoImages(self, text):
        """Turn reference-style and inline image syntax into ``<img>`` tags.

        ``![alt][id]`` is resolved through `self.urls` / `self.titles` and is
        left untouched when the id is unknown; ``![alt](url "title")`` is
        converted directly.  URL, alt text and title are passed through
        `htmlquote`, which also escapes double quotes.
        """
        def handler1(m):
            whole_match = m.group(1)
            alt_text = m.group(2)
            link_id = m.group(3).lower()
            if not link_id:
                link_id = alt_text.lower()

            if link_id not in self.urls:
                return whole_match

            url = self._EscapeStarsAndUnderscores(self.urls[link_id])
            res = '''<img src="%s" alt="%s"''' % (htmlquote(url), htmlquote(alt_text))
            if link_id in self.titles:
                title = self._EscapeStarsAndUnderscores(self.titles[link_id])
                res += ' title="%s"' % htmlquote(title)
            res += self.emptyelt
            return res

        def handler2(m):
            alt_text = m.group(2)
            url = self._EscapeStarsAndUnderscores(m.group(3))
            # A missing title is treated as "", so a title attribute
            # (possibly empty) is always emitted for inline images.
            title = self._EscapeStarsAndUnderscores(m.group(6) or '')
            res = '<img src="%s" alt="%s"' % (htmlquote(url), htmlquote(alt_text))
            res += ' title="%s"' % htmlquote(title)
            res += self.emptyelt
            return res

        text = self.r_DoImages1.sub(handler1, text)
        text = self.r_DoImages2.sub(handler2, text)
        return text

    r_DoHeaders = re.compile(r"^(\#{1,6})[ \t]*(.+?)[ \t]*\#*\n+", re.VERBOSE|re.M)

    def _DoHeaders(self, text):
        """Convert underlined and ``#``-prefixed headers in `text` to ``<hN>``.

        A non-blank line followed by a line made up only of ``=`` (or ``-``)
        and then a blank line becomes ``<h1>`` (or ``<h2>``); lines starting
        with one to six ``#`` become ``<h1>`` to ``<h6>``.
        """
        def findheader(text, c, n):
            textl = text.split('\n')
            for i in range(len(textl)):
                # The list shrinks whenever a header is rewritten, and an
                # underline needs a real line both before and after it, so
                # the first index and anything at or past the last are skipped.
                if i < 1 or i + 1 >= len(textl):
                    continue
                stripped = textl[i].strip()
                count = stripped.count(c)
                is_underline = count > 0 and count == len(stripped)
                if (is_underline and textl[i+1].strip() == ''
                        and textl[i-1].strip() != ''):
                    # Drop the underline, rewrite the title line, then drop
                    # the blank line that followed the underline.
                    textl = textl[:i] + textl[i+1:]
                    textl[i-1] = '<h'+n+'>'+self._RunSpanGamut(textl[i-1])+'</h'+n+'>'
                    textl = textl[:i] + textl[i+1:]
            return '\n'.join(textl)

        def handler(m):
            level = len(m.group(1))
            header = self._RunSpanGamut(m.group(2))
            return "<h%s>%s</h%s>\n\n" % (level, header, level)

        text = findheader(text, '=', '1')
        text = findheader(text, '-', '2')
        text = self.r_DoHeaders.sub(handler, text)
        return text

    rt_l = r"""
        (
          (
            [ ]{0,%d}
            ([*+-]|\d+[.])
            [ \t]+
          )
          (?:.+?)
          (
            \Z
          |
            \n{2,}
            (?=\S)
            (?![ \t]* ([*+-]|\d+[.])[ \t]+)
          )
        )
        """ % (tabwidth - 1)
    r_DoLists = re.compile('^'+rt_l, re.M | re.VERBOSE | re.S)
    r_DoListsTop = re.compile(
        r'(?:\A\n?|(?<=\n\n))'+rt_l, re.M | re.VERBOSE | re.S)

    def _DoLists(self, text):
        """Convert Markdown lists in `text` into ``<ul>`` / ``<ol>`` markup.

        While `self.list_level` is non-zero (inside `_ProcessListItems`) a
        list may start on any line; at the top level it must start at the
        beginning of the text or after a blank line.
        """
        def handler(m):
            list_type = "ol"
            if m.group(3) in ["*", "-", "+"]:
                list_type = "ul"
            listn = m.group(1)
            listn = self.r_multiline.sub("\n\n\n", listn)
            res = self._ProcessListItems(listn)
            return "<%s>\n%s</%s>\n" % (list_type, res, list_type)

        if self.list_level:
            pattern = self.r_DoLists
        else:
            pattern = self.r_DoListsTop
        return pattern.sub(handler, text)

    r_multiend = re.compile(r"\n{2,}\Z")
    r_ProcessListItems = re.compile(r"""
        (\n)?                       # leading line = $1
        (^[ \t]*)                   # leading whitespace = $2
        ([*+-]|\d+[.]) [ \t]+       # list marker = $3
        ((?:.+?)                    # list item text = $4
        (\n{1,2}))
        (?= \n* (\Z | \2 ([*+-]|\d+[.]) [ \t]+))
        """, re.VERBOSE | re.M | re.S)

    def _ProcessListItems(self, text):
        """Convert the items of one list into ``<li>`` elements.

        `self.list_level` is incremented for the duration of the call.  Items
        that are preceded by a blank line or contain one are run through the
        block gamut; other items only get nested-list and span processing.
        """
        self.list_level += 1
        text = self.r_multiend.sub("\n", text)

        def handler(m):
            item = m.group(4)
            leading_line = m.group(1)
            if leading_line or self.r_multiline.search(item):
                item = self._RunBlockGamut(self._Outdent(item))
            else:
                item = self._DoLists(self._Outdent(item))
                if item.endswith("\n"):
                    item = item[:-1]    # chomp
                item = self._RunSpanGamut(item)
            return "<li>%s</li>\n" % item

        text = self.r_ProcessListItems.sub(handler, text)
        self.list_level -= 1
        return text

    r_DoCodeBlocks = re.compile(r"""
        (?:\n\n|\A)
        (                           # $1 = the code block
          (?:
            (?:[ ]{%d} | \t)        # Lines must start with a tab or equiv
            .*\n+
          )+
        )
        ((?=^[ ]{0,%d}\S)|\Z)       # Lookahead for non-space/end of doc
        """ % (tabwidth, tabwidth), re.M | re.VERBOSE)

    def _DoCodeBlocks(self, text):
        """Wrap indented code blocks in `text` in ``<pre><code>``.

        The block is outdented and encoded with `_EncodeCode`, tabs are
        expanded, and leading newlines and trailing whitespace are removed.
        """
        def handler(m):
            codeblock = m.group(1)
            codeblock = self._EncodeCode(self._Outdent(codeblock))
            codeblock = self._Detab(codeblock)
            codeblock = codeblock.lstrip("\n")
            codeblock = codeblock.rstrip()
            return "\n\n<pre><code>%s\n</code></pre>\n\n" % codeblock

        return self.r_DoCodeBlocks.sub(handler, text)

    r_DoCodeSpans = re.compile(r"""
        (`+)            # $1 = Opening run of `
        (.+?)           # $2 = The code block
        (?<!`)
        \1              # Matching closer
        (?!`)
        """, re.I|re.VERBOSE)

    def _DoCodeSpans(self, text):
        """Convert backtick-delimited spans in `text` to ``<code>`` elements.

        The span content is stripped of surrounding whitespace and encoded
        with `_EncodeCode`.
        """
        def handler(m):
            code = self._EncodeCode(m.group(2).strip())
            return "<code>%s</code>" % code

        return self.r_DoCodeSpans.sub(handler, text)

    def _EncodeCode(self, text):
        """Make `text` safe to emit inside ``<code>``.

        ``&``, ``<`` and ``>`` become character references, and the
        characters ``* _ { } [ ] \\`` are replaced by their placeholders so
        later passes leave them alone.
        """
        # The ampersand goes first so the references added below are not
        # themselves re-encoded.
        text = text.replace("&", "&amp;")
        text = text.replace("<", "&lt;")
        text = text.replace(">", "&gt;")
        for c in "*_{}[]\\":
            text = text.replace(c, self.escapetable[c])
        return text

    r_DoBold = re.compile(r"(\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1", re.VERBOSE | re.S)
    r_DoItalics = re.compile(r"(\*|_) (?=\S) (.+?) (?<=\S) \1", re.VERBOSE | re.S)

    def _DoItalicsAndBold(self, text):
        """Convert ``**x**`` / ``__x__`` to ``<strong>`` and ``*x*`` / ``_x_`` to ``<em>``."""
        # Bold must run first; otherwise the italics pattern would consume
        # one delimiter of each doubled pair.
        text = self.r_DoBold.sub(r"<strong>\2</strong>", text)
        text = self.r_DoItalics.sub(r"<em>\2</em>", text)
        return text

    r_start = re.compile(r"^", re.M)
    ####r_DoBlockQuotes1 = re.compile(r"^[ \t]*>[ \t]?", re.M)
    r_DoBlockQuotes1 = re.compile(r"^[ \t]*>[ \t]?", re.M)
    r_DoBlockQuotes2 = re.compile(r"^[ \t]+$", re.M)
    r_DoBlockQuotes3 = re.compile(r"""
        (                       # Wrap whole match in $1
          (
            ^[ \t]*>[ \t]?      # '>' at the start of a line
            .+\n                # rest of the first line
            (.+\n)*             # subsequent consecutive lines
            \n*                 # blanks
          )+
        )""", re.M | re.VERBOSE)
    r_protectpre = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
    r_propre = re.compile(r'^ ', re.M)

    def _DoBlockQuotes(self, text):
        """Convert ``>``-quoted passages in `text` into ``<blockquote>`` blocks.

        The quote markers are removed, the content is run through the block
        gamut recursively and every resulting line is indented; the
        indentation is removed again inside ``<pre>`` sections.
        """
        def prehandler(m):
            return self.r_propre.sub('', m.group(1))

        def handler(m):
            bq = m.group(1)
            bq = self.r_DoBlockQuotes1.sub("", bq)
            bq = self.r_DoBlockQuotes2.sub("", bq)
            bq = self._RunBlockGamut(bq)
            bq = self.r_start.sub(" ", bq)
            bq = self.r_protectpre.sub(prehandler, bq)
            return "<blockquote>\n%s\n</blockquote>\n\n" % bq

        return self.r_DoBlockQuotes3.sub(handler, text)

    r_tabbed = re.compile(r"^([ \t]*)")

    def _FormParagraphs(self, text):
        """Split `text` on blank lines and wrap each chunk in ``<p>``.

        A chunk whose stripped text is a key of `self.html_blocks` is
        replaced by the stored HTML block instead of being wrapped.
        """
        text = text.strip("\n")
        grafs = self.r_multiline.split(text)
        for index, graf in enumerate(grafs):
            t = graf.strip() #@@?
            if t in self.html_blocks:
                grafs[index] = self.html_blocks[t]
            else:
                t = self._RunSpanGamut(t)
                t = self.r_tabbed.sub(r"<p>", t)
                grafs[index] = t + "</p>"
        return "\n\n".join(grafs)

    r_EncodeAmps = re.compile(r"&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)")
    r_EncodeAngles = re.compile(r"<(?![a-z/?\$!])")

    def _EncodeAmpsAndAngles(self, text):
        """Encode stray ``&`` and ``<`` characters in `text`.

        An ampersand that does not already start a character reference
        becomes ``&amp;``; a ``<`` that is not followed by a lower-case
        letter, ``/``, ``?``, ``$`` or ``!`` becomes ``&lt;``.
        """
        text = self.r_EncodeAmps.sub("&amp;", text)
        text = self.r_EncodeAngles.sub("&lt;", text)
        return text

    def _EncodeBackslashEscapes(self, text):
        """Replace every backslash-escaped special character with its placeholder."""
        for char in self.escapechars:
            text = text.replace("\\" + char, self.escapetable[char])
        return text

    r_link = re.compile(r"<((https?|ftp):[^\'\">\s]+)>", re.I)
    r_email = re.compile(r"""
        <
        (?:mailto:)?
        (
            [-.\w]+
            \@
            [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
        )
        >""", re.VERBOSE|re.I)

    def _DoAutoLinks(self, text):
        """Convert ``<http://...>`` and ``<user@host>`` into links.

        URL links get ``rel="nofollow"``; e-mail addresses are unescaped and
        then obfuscated by `_EncodeEmailAddress`.
        """
        text = self.r_link.sub(r'<a href="\1" rel="nofollow">\1</a>', text)

        def handler(m):
            address = m.group(1)
            return self._EncodeEmailAddress(self._UnescapeSpecialChars(address))

        return self.r_email.sub(handler, text)

    r_EncodeEmailAddress = re.compile(r">.+?:")

    def _EncodeEmailAddress(self, text):
        """Return an obfuscated ``mailto:`` link for the address `text`.

        Every character except ``:`` is written as a hexadecimal reference,
        a decimal reference or (never for ``@``) as itself, chosen from the
        value `semirandom` returns for the output built so far.
        """
        encode = [
            lambda x: "&#%s;" % ord(x),
            lambda x: "&#x%X;" % ord(x),
            lambda x: x
        ]
        text = "mailto:" + text
        addr = ""
        for c in text:
            if c == ':':
                addr += c
                continue

            # The string built so far is the seed, so the choice for each
            # character depends on everything emitted before it.
            r = semirandom(addr)
            if r < 0.45:
                addr += encode[1](c)
            elif r > 0.9 and c != '@':
                addr += encode[2](c)
            else:
                addr += encode[0](c)
        text = '<a href="%s">%s</a>' % (addr, addr)
        # Drop the encoded "mailto:" prefix from the visible link text.
        text = self.r_EncodeEmailAddress.sub('>', text)
        return text

    def _UnescapeSpecialChars(self, text):
        """Replace every placeholder in `text` with the character it stands for."""
        for char, placeholder in self.escapetable.items():
            text = text.replace(placeholder, char)
        return text

    tokenize_depth = 6
    tokenize_nested_tags = '|'.join([r'(?:<[a-z/!$](?:[^<>]'] * tokenize_depth) + (')*>)' * tokenize_depth)
    r_TokenizeHTML = re.compile(
        r"""(?: <! ( -- .*? -- \s* )+ > ) |  # comment
            (?: <\? .*? \?> ) |              # processing instruction
            %s                               # nested tags
        """ % tokenize_nested_tags, re.I|re.VERBOSE)

    def _TokenizeHTML(self, text):
        """Split `text` into a list of ``["tag", s]`` and ``["text", s]`` tokens.

        Comments, processing instructions and tags (nested up to
        `tokenize_depth` levels) become "tag" tokens; everything between
        them becomes "text" tokens.  Tokens are lists of two items.
        """
        pos = 0
        tokens = []
        for matchobj in self.r_TokenizeHTML.finditer(text):
            tag_start = matchobj.start()
            if pos < tag_start:
                tokens.append(["text", text[pos:tag_start]])
            tokens.append(["tag", matchobj.group(0)])
            pos = matchobj.end()
        if pos < len(text):
            tokens.append(["text", text[pos:]])
        return tokens

    r_Outdent = re.compile(r"""^(\t|[ ]{1,%d})""" % tabwidth, re.M)

    def _Outdent(self, text):
        """Remove one level of indentation (a tab or up to `tabwidth` spaces) from every line."""
        return self.r_Outdent.sub("", text)

    def _Detab(self, text):
        """Expand tabs in `text` to `tabwidth`-column tab stops."""
        return text.expandtabs(self.tabwidth)
def Markdown(*args, **kw):
    """Convert Markdown text to HTML.

    Creates a new `_Markdown` instance for the call and forwards all
    arguments to its `parse` method, returning what `parse` returns.
    """
    converter = _Markdown()
    return converter.parse(*args, **kw)


# Lower-case alias, so the converter can be called as markdown.markdown(text).
markdown = Markdown
if __name__ == '__main__':
    # Command-line use: convert the file named by the first argument, or
    # standard input when no argument is given, and print the HTML.
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        try:
            # Close the file even when reading fails.
            document = source.read()
        finally:
            source.close()
    else:
        document = sys.stdin.read()
    print(Markdown(document))