/external/markdown/markdown/inlinepatterns.py
Python | 371 lines | 336 code | 14 blank | 21 comment | 11 complexity | 0f6e93b03128ac2a38288e72f28ebff1 MD5 | raw file
1"""
2INLINE PATTERNS
3=============================================================================
4
5Inline patterns such as *emphasis* are handled by means of auxiliary
6objects, one per pattern. Pattern objects must be instances of classes
7that extend markdown.Pattern. Each pattern object uses a single regular
8expression and needs support the following methods:
9
10 pattern.getCompiledRegExp() # returns a regular expression
11
12 pattern.handleMatch(m) # takes a match object and returns
13 # an ElementTree element or just plain text
14
15All of python markdown's built-in patterns subclass from Pattern,
16but you can add additional patterns that don't.
17
18Also note that all the regular expressions used by inline must
19capture the whole block. For this reason, they all start with
20'^(.*)' and end with '(.*)!'. In case with built-in expression
21Pattern takes care of adding the "^(.*)" and "(.*)!".
22
23Finally, the order in which regular expressions are applied is very
24important - e.g. if we first replace http://.../ links with <a> tags
25and _then_ try to replace inline html, we would end up with a mess.
26So, we apply the expressions in the following order:
27
28* escape and backticks have to go before everything else, so
29 that we can preempt any markdown patterns by escaping them.
30
31* then we handle auto-links (must be done before inline html)
32
33* then we handle inline HTML. At this point we will simply
34 replace all inline HTML strings with a placeholder and add
35 the actual HTML to a hash.
36
37* then inline images (must be done before links)
38
39* then bracketed links, first regular then reference-style
40
41* finally we apply strong and emphasis
42"""
43
44import markdown
45import re
46from urlparse import urlparse, urlunparse
47import sys
48if sys.version >= "3.0":
49 from html import entities as htmlentitydefs
50else:
51 import htmlentitydefs
52
53"""
54The actual regular expressions for patterns
55-----------------------------------------------------------------------------
56"""
57
# NOTE: Pattern compiles each expression as "^(.*?)%s(.*?)$", so in a match
# object group 1 is always the text before the pattern and the first group
# written in an expression below is seen by handleMatch() as group 2.

NOBRACKET = r'[^\]\[]*'
# Bracketed text; the repeated sub-expressions allow brackets to be nested
# up to six levels deep inside the outer pair.
BRK = ( r'\[('
        + (NOBRACKET + r'(\[')*6
        + (NOBRACKET+ r'\])*')*6
        + NOBRACKET + r')\]' )
NOIMG = r'(?<!\!)'  # not preceded by "!" (which would make it an image)

BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")``
ESCAPE_RE = r'\\(.)'                             # \<
EMPHASIS_RE = r'(\*)([^\*]+)\2'                  # *emphasis*
STRONG_RE = r'(\*{2}|_{2})(.+?)\2'               # **strong**
STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2'            # ***strong***

if markdown.SMART_EMPHASIS:
    # Underscores inside a word are not treated as emphasis markers.
    EMPHASIS_2_RE = r'(?<!\w)(_)(\S.+?)\2(?!\w)' # _emphasis_
else:
    EMPHASIS_2_RE = r'(_)(.+?)\2'                # _emphasis_

# In a match (after Pattern's wrapping): group 2 is the link text, group 9
# the url and group 11 the quoted title.
LINK_RE = NOIMG + BRK + \
r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12)?\)'''
# [text](url) or [text](<url>)

IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
REFERENCE_RE = NOIMG + BRK+ r'\s*\[([^\]]*)\]'            # [Google][3]
# Raw string: '\s' and '\[' are regex escapes, not string escapes.
IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s*\[([^\]]*)\]'     # ![alt text][2]
NOT_STRONG_RE = r'((^| )(\*|_)( |$))'            # stand-alone * or _
AUTOLINK_RE = r'<((?:f|ht)tps?://[^>]*)>'        # <http://www.123.com>
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'             # <me@example.com>

HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'  # <...>
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'                # &amp;
# A hard line break requires *two* trailing spaces; matching a single space
# would turn every ordinary wrapped line into a <br />.
LINE_BREAK_RE = r'  \n'                          # two spaces at end of line
LINE_BREAK_2_RE = r'  $'                         # two spaces at end of text
92
93
def dequote(string):
    """Remove one pair of matching quotes from around a string.

    The string is returned without its first and last character when it
    is at least two characters long and both ends are the same quote
    character (either ``"`` or ``'``).  Anything else - including a lone
    quote character, which has no partner to match - is returned
    unchanged.

    """
    if (len(string) >= 2
            and string[0] == string[-1]
            and string[0] in ('"', "'")):
        return string[1:-1]
    return string
101
# Raw string so that '\{' and '\}' reach the regex engine untouched instead
# of being (invalid) string escapes.
ATTR_RE = re.compile(r"\{@([^\}]*)=([^\}]*)}") # {@id=123}


def handleAttributes(text, parent):
    """Set values of an element based on attribute definitions ({@id=123}).

    Every ``{@name=value}`` definition found in `text` is applied to
    `parent` through ``parent.set(name, value)``, with newlines in the
    value replaced by spaces.

    Keyword arguments:

    * text: The string to scan for attribute definitions.
    * parent: The object receiving the attributes; only its ``set(key,
      value)`` method is used.

    Returns `text` with all attribute definitions removed.

    """
    def attributeCallback(match):
        parent.set(match.group(1), match.group(2).replace('\n', ' '))
        # re.sub() expects the callback to return the replacement string;
        # the definition itself is dropped from the text.
        return ''
    return ATTR_RE.sub(attributeCallback, text)
109
110
111"""
112The pattern classes
113-----------------------------------------------------------------------------
114"""
115
class Pattern:
    """Base class that inline patterns subclass. """

    def __init__(self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * markdown_instance: Optional object stored as `self.markdown`.
          When it is omitted (None) the attribute is not created at all.

        """
        self.pattern = pattern
        # Wrap the expression so that a match always spans the whole text:
        # group 1 is everything before the pattern, the last group is
        # everything after it, and the pattern's own groups start at 2.
        self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, re.DOTALL)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        # Compare against None explicitly so that an instance which happens
        # to evaluate as false is still stored.
        if markdown_instance is not None:
            self.markdown = markdown_instance

    def getCompiledRegExp(self):
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        Subclasses should override this method.  This base implementation
        does nothing and returns None.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass

    def type(self):
        """ Return class name, to define pattern type """
        return self.__class__.__name__

BasePattern = Pattern # for backward compatibility
157
class SimpleTextPattern(Pattern):
    """ Hand back the text captured in group(2) of a match as a string. """

    def handleMatch(self, m):
        """Return group(2) of `m`, or None for a bare placeholder prefix.

        None is returned when the captured text is exactly
        `markdown.INLINE_PLACEHOLDER_PREFIX` (presumably so the caller
        leaves the text alone - confirm against the inline processor).

        """
        captured = m.group(2)
        if captured != markdown.INLINE_PLACEHOLDER_PREFIX:
            return captured
        return None
165
class SimpleTagPattern(Pattern):
    """
    Wrap the text of group(3) of a match in a new element.

    The element's type is the `tag` given at construction time.

    """

    def __init__(self, pattern, tag):
        """Remember `tag` and compile `pattern` via the base class."""
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        """Return a `self.tag` element whose text is group(3) of `m`."""
        element = markdown.etree.Element(self.tag)
        element.text = m.group(3)
        return element
180
181
class SubstituteTagPattern(SimpleTagPattern):
    """ Replace a match with an empty element of type `tag`. """

    def handleMatch(self, m):
        """Return a fresh `self.tag` element; the match text is ignored."""
        empty_element = markdown.etree.Element(self.tag)
        return empty_element
186
187
class BacktickPattern(Pattern):
    """ Turn backtick-quoted text into a `<code>` element. """

    def __init__(self, pattern):
        """Compile `pattern` via the base class; the tag is always "code"."""
        Pattern.__init__(self, pattern)
        self.tag = "code"

    def handleMatch(self, m):
        """Return a code element holding group(3) of `m`.

        The text is stripped of surrounding whitespace and wrapped in
        `markdown.AtomicString`.

        """
        code_text = m.group(3).strip()
        code_el = markdown.etree.Element(self.tag)
        code_el.text = markdown.AtomicString(code_text)
        return code_el
198
199
class DoubleTagPattern(SimpleTagPattern):
    """Build two nested elements around the matched text.

    `tag` must be two tag names joined by a comma ("outer,inner").
    Useful for strong emphasis etc.

    """

    def handleMatch(self, m):
        """Return the outer element; the inner one carries group(3)."""
        outer_name, inner_name = self.tag.split(",")
        outer = markdown.etree.Element(outer_name)
        inner = markdown.etree.SubElement(outer, inner_name)
        inner.text = m.group(3)
        return outer
212
213
class HtmlPattern(Pattern):
    """ Store raw inline html and return a placeholder. """

    def handleMatch(self, m):
        """Stash group(2) of `m` and return the placeholder for it.

        The raw html is handed to `self.markdown.htmlStash.store()`, so
        the pattern must have been given a markdown instance.

        """
        rawhtml = m.group(2)
        place_holder = self.markdown.htmlStash.store(rawhtml)
        return place_holder
221
222
class LinkPattern(Pattern):
    """ Return a link element from the given match. """

    def handleMatch(self, m):
        """Return an `<a>` element built from a match of LINK_RE.

        Group 2 of `m` is the link text, group 9 the url (optionally
        wrapped in angle brackets) and group 11 the title including its
        quotes.  A missing url yields an empty `href`.

        """
        el = markdown.etree.Element("a")
        el.text = m.group(2)
        title = m.group(11)
        href = m.group(9)

        if href:
            # Only strip the angle brackets when both are present; a url
            # that merely starts with "<" must not lose its last character.
            if href.startswith("<") and href.endswith(">"):
                href = href[1:-1]
            el.set("href", self.sanitize_url(href.strip()))
        else:
            el.set("href", "")

        if title:
            el.set("title", dequote(title))
        return el

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Rather than specifically blacklisting `javascript:alert("XSS")` and all
        its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
        safe url formats. Most urls contain a network location, however some
        are known not to (i.e.: mailto links). Script urls do not contain a
        location. Additionally, for `javascript:...`, the scheme would be
        "javascript" but some aliases will appear to `urlparse()` to have no
        scheme. On top of that relative links (i.e.: "foo/bar.html") have no
        scheme. Therefore we must check "path", "parameters", "query" and
        "fragment" for any literal colons. We don't check "scheme" for colons
        because it *should* never have any and "netloc" must allow the form:
        `username:password@host:port`.

        Returns an empty string when `self.markdown.safeMode` is on and the
        url is not considered safe; otherwise returns the url as rebuilt by
        `urlunparse()`.

        """
        locless_schemes = ('', 'mailto', 'news')
        url = urlparse(url)
        scheme, netloc = url[0], url[1]

        safe_url = netloc != '' or scheme in locless_schemes
        # url[2:] is (path, params, query, fragment).
        if any(":" in part for part in url[2:]):
            safe_url = False

        if self.markdown.safeMode and not safe_url:
            return ''
        return urlunparse(url)
274
class ImagePattern(LinkPattern):
    """ Build an `<img>` element from an inline image match. """

    def handleMatch(self, m):
        """Return an img element with `src`, optional `title` and `alt`.

        Group 9 of `m` is split on whitespace: the first piece is the
        source (angle brackets removed, then sanitized), any remaining
        pieces are joined with single spaces and dequoted to form the
        title.  Group 2 is the alt text; when `markdown.ENABLE_ATTRIBUTES`
        is set, attribute definitions in it are applied to the element
        and removed from the alt text.

        """
        img = markdown.etree.Element("img")
        pieces = m.group(9).split()

        if not pieces:
            img.set('src', "")
        else:
            source = pieces[0]
            if source[0] == "<" and source[-1] == ">":
                source = source[1:-1]
            img.set('src', self.sanitize_url(source))

        if len(pieces) > 1:
            img.set('title', dequote(" ".join(pieces[1:])))

        alt_text = m.group(2)
        if markdown.ENABLE_ATTRIBUTES:
            alt_text = handleAttributes(alt_text, img)

        img.set('alt', alt_text)
        return img
297
class ReferencePattern(LinkPattern):
    """ Resolve a reference-style link against the stored references. """

    def handleMatch(self, m):
        """Return the element for the referenced link, or None.

        The lookup key is the lower-cased reference id (group 9 of `m`);
        when that is empty, as in "[Google][]", the lower-cased link text
        (group 2) is used instead.  Keys missing from
        `self.markdown.references` are ignored by returning None.

        """
        text = m.group(2)
        explicit_id = m.group(9)
        if explicit_id:
            ref_key = explicit_id.lower()
        else:
            ref_key = text.lower()

        references = self.markdown.references
        if ref_key not in references:  # ignore undefined refs
            return None

        href, title = references[ref_key]
        return self.makeTag(href, title, text)

    def makeTag(self, href, title, text):
        """Return an `<a>` element with a sanitized href and optional title."""
        link = markdown.etree.Element('a')
        link.set('href', self.sanitize_url(href))
        if title:
            link.set('title', title)
        link.text = text
        return link
324
325
class ImageReferencePattern(ReferencePattern):
    """ Resolve a reference-style image against the stored references. """

    def makeTag(self, href, title, text):
        """Return an `<img>` element; `text` becomes its alt attribute."""
        image = markdown.etree.Element("img")
        image.set("src", self.sanitize_url(href))
        if title:
            image.set("title", title)
        image.set("alt", text)
        return image
335
336
class AutolinkPattern(Pattern):
    """ Build a link Element from an autolink (`<http://example/com>`). """

    def handleMatch(self, m):
        """Return an `<a>` whose href and (atomic) text are group(2)."""
        target = m.group(2)
        link = markdown.etree.Element("a")
        link.set('href', target)
        link.text = markdown.AtomicString(target)
        return link
344
class AutomailPattern(Pattern):
    """
    Build a mailto link Element from an automail link (`<foo@example.com>`).
    """

    def handleMatch(self, m):
        """Return an `<a>` element with an entity-encoded address.

        A leading "mailto:" in group(2) of `m` is dropped.  Each character
        of the visible text is written as a named entity when
        `htmlentitydefs` knows one and as a numeric entity otherwise; the
        href is "mailto:" plus the address with every character written
        as a numeric entity.  `markdown.AMP_SUBSTITUTE` stands in for the
        ampersand of each entity.

        """
        anchor = markdown.etree.Element('a')

        scheme_prefix = "mailto:"
        address = m.group(2)
        if address.startswith(scheme_prefix):
            address = address[len(scheme_prefix):]

        def as_entity(code):
            """Return the entity for a code point, named if one exists."""
            name = htmlentitydefs.codepoint2name.get(code)
            if not name:
                return "%s#%d;" % (markdown.AMP_SUBSTITUTE, code)
            return "%s%s;" % (markdown.AMP_SUBSTITUTE, name)

        visible = ''.join([as_entity(ord(ch)) for ch in address])
        anchor.text = markdown.AtomicString(visible)

        target = "mailto:" + address
        encoded_target = "".join(
            [markdown.AMP_SUBSTITUTE + ('#%d;' % ord(ch)) for ch in target])
        anchor.set('href', encoded_target)
        return anchor
371