PageRenderTime 50ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/libs/external_libs/Creoleparser-0.5.0/creoleparser/elements.py

http://django-hotclub.googlecode.com/
Python | 1057 lines | 965 code | 20 blank | 72 comment | 10 complexity | ca3bee72b5982745f4380cf456ddb34c MD5 | raw file
Possible License(s): BSD-3-Clause, GPL-2.0, BSD-2-Clause, Apache-2.0, AGPL-3.0
  1. # elements.py
  2. #
  3. # Copyright (c) 2007 Stephen Day
  4. #
  5. # This module is part of Creoleparser and is released under
  6. # the MIT License: http://www.opensource.org/licenses/mit-license.php
  7. #
  8. import re
  9. import urlparse
  10. import genshi.builder as bldr
  11. from genshi.core import Stream
  12. from genshi.filters import HTMLSanitizer
  13. from core import escape_char, esc_neg_look, fragmentize
# Module-level sanitizer shared by all elements; used to reject unsafe
# URIs (e.g. javascript:) before they become href/src attribute values.
sanitizer = HTMLSanitizer()

__docformat__ = 'restructuredtext en'
class WikiElement(object):
    """Baseclass for all wiki elements."""

    append_newline = False
    """Determines if newlines are appended to Element(s) during processing.
    Should only affect readability of source xml.
    """

    def __init__(self, tag, token, child_tags):
        """Constructor for WikiElement objects.

        Subclasses may have other keyword arguments.

        :parameters:
          tag
            The xhtml tag associated with the element.
          token
            The character string (or strings) that identifies the element
            in wiki markup.
          child_tags
            A list of wiki_elements that will be searched for in the body of the
            element. The order of these elements matters, because if an element is
            found before the element that encloses it, the enclosing element will
            never be found. In cases where this imposes limits (e.g, ``strong`` and
            ``em`` should be allowed to nest each other), place the conflicting
            elements in a sublist. The parser will then find which comes first.
        """
        self.tag = tag
        self.token = token
        self.child_tags = child_tags

    def _build(self, mo, element_store):
        """Returns a genshi Element that has ``self.tag`` as the
        outermost tag.

        This method is called exclusively by ``_process``.

        :parameters:
          mo
            match object, usually the one returned by
            self.regexp.search(s)
        """
        return bldr.tag.__getattr__(self.tag)(fragmentize(mo.group(1),
                                                          self.child_tags,
                                                          element_store))

    def re_string(self):
        """The regular expression pattern that is compiled into ``self.regexp``.

        The regular expression must consume the entire wiki element,
        including the tokens. For block elements, the newline on the last
        line must be consumed also. group(1) should normally be the
        entire string inside the tokens. If not, a custom ``_build``
        method will be needed.
        """
        pass

    def _process(self, mo, text, wiki_elements, element_store):
        """Returns genshi Fragments (Elements and text)

        This is mainly for block level markup. See InlineElement
        for the other method.
        """
        frags = []
        # Call again for leading text and extend the result list.
        # ``mo`` is the first match of this element in ``text``, so the
        # leading text cannot contain it; drop it from the search list
        # (hence ``wiki_elements[1:]``).
        if mo.start():
            frags.extend(fragmentize(text[:mo.start()], wiki_elements[1:],
                                     element_store))
        # append the found wiki element to the result list
        frags.append(self._build(mo, element_store))
        # make the source output easier to read
        if self.append_newline:
            frags.append('\n')
        # call again for trailing text and extend the result list
        if mo.end() < len(text):
            frags.extend(fragmentize(text[mo.end():], wiki_elements,
                                     element_store))
        return frags

    def __repr__(self):
        return "<WikiElement " + str(self.tag) + ">"
  85. class InlineElement(WikiElement):
  86. r"""For finding generic inline elements like ``strong`` and ``em``.
  87. >>> em = InlineElement('em','//',[])
  88. >>> mo1 = em.regexp.search('a //word// in a line')
  89. >>> mo2 = em.regexp.search('a //word in a line\n or two\n')
  90. >>> mo1.group(0),mo1.group(1)
  91. ('//word//', 'word')
  92. >>> mo2.group(0),mo2.group(1)
  93. ('//word in a line\n or two', 'word in a line\n or two')
  94. Use a list for the ``token`` argument to have different start
  95. and end strings. These must be closed.
  96. >>> foo = InlineElement('foo',['<<','>>'],[])
  97. >>> mo = foo.regexp.search('blaa <<here it is >>\n')
  98. >>> mo.group(1)
  99. 'here it is '
  100. """
  101. def __init__(self, tag, token, child_tags=[]):
  102. super(InlineElement,self).__init__(tag,token , child_tags)
  103. self.regexp = re.compile(self.re_string(),re.DOTALL)
  104. def re_string(self):
  105. if isinstance(self.token,str):
  106. content = '(.+?)'
  107. end = '(' + esc_neg_look + re.escape(self.token) + r'|$)'
  108. return esc_neg_look + re.escape(self.token) + content + end
  109. else:
  110. content = '(.+?)'
  111. return esc_neg_look + re.escape(self.token[0]) + content + esc_neg_look + re.escape(self.token[1])
  112. def _process(self, mo, text, wiki_elements, element_store):
  113. """Returns genshi Fragments (Elements and text)"""
  114. processed = self._build(mo,element_store)
  115. store_id = str(id(processed))
  116. element_store[store_id] = processed
  117. text = ''.join([text[:mo.start()],'<<<',store_id,'>>>',
  118. text[mo.end():]])
  119. frags = fragmentize(text,wiki_elements,element_store)
  120. return frags
# Pattern for a macro's name: a letter run, optionally followed by
# alphanumeric runs separated by single hyphens or periods.
macro_name = r'([a-zA-Z]+([-.]?[a-zA-Z0-9]+)*)'
"""allows any number of non-repeating hyphens or periods.
Underscore is not included because hyphen is"""
  124. class Macro(WikiElement):
  125. r"""Finds and processes inline macro elements."""
  126. def __init__(self, tag, token, child_tags,func):
  127. super(Macro,self).__init__(tag,token , child_tags)
  128. self.func = func
  129. self.regexp = re.compile(self.re_string())
  130. def _process(self, mo, text, wiki_elements,element_store):
  131. """Returns genshi Fragments (Elements and text)"""
  132. processed = self._build(mo,element_store)
  133. if isinstance(processed, basestring):
  134. text = ''.join([text[:mo.start()],processed,
  135. text[mo.end():]])
  136. else:
  137. store_id = str(id(processed))
  138. element_store[store_id] = processed
  139. text = ''.join([text[:mo.start()],'<<<',store_id,'>>>',
  140. text[mo.end():]])
  141. frags = fragmentize(text,wiki_elements,element_store)
  142. return frags
  143. def re_string(self):
  144. content = '(.*?)'
  145. return esc_neg_look + re.escape(self.token[0]) + r'(' + macro_name + \
  146. content + ')' + esc_neg_look + re.escape(self.token[1])
  147. def _build(self,mo,element_store):
  148. if self.func:
  149. value = self.func(mo.group(2),mo.group(4),None,False)
  150. else:
  151. value = None
  152. if value is None:
  153. return bldr.tag(self.token[0] + mo.group(1) + self.token[1])
  154. elif isinstance(value,basestring):
  155. return value
  156. elif isinstance(value, (bldr.Element, Stream)):
  157. return [value]
  158. else:
  159. raise "Marcos can only return strings and Genshi Streams"
  160. class BodiedMacro(Macro):
  161. """Finds and processes macros with bodies.
  162. Does not span across top level block markup
  163. (see BodiedBlockMacro's for that)."""
  164. def __init__(self, tag, token, child_tags,func):
  165. super(BodiedMacro,self).__init__(tag,token , child_tags,func)
  166. self.func = func
  167. self.regexp = re.compile(self.re_string(),re.DOTALL)
  168. def re_string(self):
  169. content = r'([ \S]*?)'
  170. #macro_name = r'([a-zA-Z]+([-.]?[a-zA-Z0-9]+)*)'
  171. body = '(.+?)'
  172. return esc_neg_look + re.escape(self.token[0]) + r'(' + macro_name + \
  173. content + ')'+ esc_neg_look + re.escape(self.token[1]) + \
  174. body + esc_neg_look + re.escape(self.token[0]) + \
  175. r'/\2' + re.escape(self.token[1])
  176. def _build(self,mo,element_store):
  177. if self.func:
  178. value = self.func(mo.group(2),mo.group(4),mo.group(5),False)
  179. else:
  180. value = None
  181. if value is None:
  182. return bldr.tag(self.token[0] + mo.group(1) + self.token[1]
  183. + mo.group(5) + self.token[0] + '/'
  184. + mo.group(1) + self.token[1])
  185. elif isinstance(value, basestring):
  186. return value
  187. elif isinstance(value, (bldr.Element, Stream)):
  188. return [value]
  189. else:
  190. raise "macros can only return strings and genshi Streams"
  191. class BlockMacro(WikiElement):
  192. """Finds a block macros.
  193. Macro must be on a line alone without leading spaces. Resulting
  194. output with not be enclosed in paragraph marks or consumed by
  195. other markup (except pre blocks and BodiedBlockMacro's)
  196. """
  197. def __init__(self, tag, token, child_tags,func):
  198. super(BlockMacro,self).__init__(tag,token , child_tags)
  199. self.func = func
  200. self.regexp = re.compile(self.re_string(),re.MULTILINE)
  201. def _process(self, mo, text, wiki_elements,element_store):
  202. """Returns genshi Fragments (Elements and text)
  203. This is mainly for block level markup. See InlineElement
  204. for the other method.
  205. """
  206. processed = self._build(mo,element_store)
  207. if isinstance(processed, basestring):
  208. #print '_process', repr(processed)
  209. text = ''.join([text[:mo.start()],processed,
  210. text[mo.end():]])
  211. frags = fragmentize(text,wiki_elements,element_store)
  212. else:
  213. frags = []
  214. # call again for leading text and extend the result list
  215. if mo.start():
  216. frags.extend(fragmentize(text[:mo.start()],wiki_elements[1:],
  217. element_store))
  218. # append the found wiki element to the result list
  219. frags.append(processed)
  220. # make the source output easier to read
  221. if self.append_newline:
  222. frags.append('\n')
  223. # call again for trailing text and extend the result list
  224. if mo.end() < len(text):
  225. frags.extend(fragmentize(text[mo.end():],wiki_elements,
  226. element_store))
  227. return frags
  228. def re_string(self):
  229. arg_string = '((?!.*>>.*>>).*?)'
  230. #macro_name = r'([a-zA-Z]+([-.]?[a-zA-Z0-9]+)*)'
  231. # allows any number of non-repeating hyphens or periods
  232. # underscore is not included because hyphen is
  233. start = r'(^\s*?\n|\A)' + re.escape(self.token[0])
  234. end = re.escape(self.token[1]) + r'\s*?\n(\s*?\n|$)'
  235. return start + '(' + macro_name + arg_string + ')' + end
  236. def _build(self,mo,element_store):
  237. #print 'block_macro', mo.groups()
  238. if self.func:
  239. value = self.func(mo.group(3),mo.group(5),None,True)
  240. else:
  241. value = None
  242. if value is None:
  243. return bldr.tag(self.token[0] + mo.group(2) + self.token[1])
  244. elif isinstance(value,basestring):
  245. return ''.join([value.rstrip(),'\n'])
  246. elif isinstance(value, (bldr.Element, Stream)):
  247. return [value]
  248. else:
  249. raise "Marcos can only return strings and Genshi Streams"
  250. class BodiedBlockMacro(BlockMacro):
  251. """Finds and processes block macros with bodies.
  252. The opening and closing tokens must be are each on a line alone without
  253. leading spaces. These macros can enclose other block level markup
  254. including pre blocks and other BodiedBlockMacro's."""
  255. def __init__(self, tag, token, child_tags,func):
  256. super(BodiedBlockMacro,self).__init__(tag,token , child_tags,func)
  257. self.func = func
  258. self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)
  259. def re_string(self):
  260. arg_string = r'((?![^\n]*>>[^\n]*>>)[ \S]*?)'
  261. start = '^' + re.escape(self.token[0])
  262. #macro_name = r'([a-zA-Z]+([-.]?[a-zA-Z0-9]+)*)'
  263. body = r'(.*?\n)'
  264. end = re.escape(self.token[0]) + \
  265. r'/\2' + re.escape(self.token[1]) + r'\s*?\n'
  266. return start + '(' + macro_name + arg_string + ')' + re.escape(self.token[1]) + \
  267. r'\s*?\n' + body + end
  268. def _build(self,mo,element_store):
  269. #print 'block_bodied_macro', mo.groups()
  270. if self.func:
  271. value = self.func(mo.group(2),mo.group(4),mo.group(5),True)
  272. else:
  273. value = None
  274. if value is None:
  275. return bldr.tag(self.token[0] + mo.group(1) + self.token[1]
  276. + mo.group(5) + self.token[0] + '/'
  277. + mo.group(1) + self.token[1])
  278. elif isinstance(value, basestring):
  279. return value
  280. elif isinstance(value, (bldr.Element, Stream)):
  281. return [value]
  282. else:
  283. raise "macros can only return strings and genshi Streams"
class RawLink(InlineElement):
    """Used to find raw urls in wiki text and build xml from them.

    >>> raw_link = RawLink(tag='a')
    >>> mo = raw_link.regexp.search(" a http://www.google.com url ")
    >>> raw_link.href(mo)
    'http://www.google.com'
    >>> raw_link._build(mo,{}).generate().render()
    '<a href="http://www.google.com">http://www.google.com</a>'
    """

    # Only these protocols are rendered as clickable links; other matched
    # protocols (ftp) fall through to plain text in ``_build``.
    linking_protocols = ['http','https']

    def __init__(self, tag):
        super(RawLink,self).__init__(tag=tag, token=None, child_tags=None)
        self.regexp = re.compile(self.re_string())

    def re_string(self):
        # group(1): optional escape character preceding the url
        escape = '(' + re.escape(escape_char) + ')?'
        # group(2): the whole url, group(3): its protocol
        protocol = '((https?|ftp)://'
        rest_of_url = r'\S+?)'
        #allow one punctuation character or '**' or '//'
        look_ahead = r'(?=([,.?!:;"\']|\*\*|//)?(\s|$))'
        return escape + protocol + rest_of_url + look_ahead

    def _build(self,mo,element_store):
        # An escaped url, or one with a non-linking protocol, is emitted
        # as plain text rather than as an anchor element.
        if (not mo.group(1)) and (mo.group(3) in self.linking_protocols):
            return bldr.tag.__getattr__(self.tag)(self.alias(mo,element_store),
                                                  href=self.href(mo))
        else:
            return self.href(mo)

    def href(self,mo):
        """Returns the string for the href attribute of the Element."""
        # Reject unsafe URIs (per genshi's HTMLSanitizer) outright.
        if sanitizer.is_safe_uri(mo.group(2)):
            return mo.group(2)
        else:
            return "unsafe_uri_detected"

    def alias(self,mo,element_store):
        """Returns the string for the content of the Element."""
        return self.href(mo)
class URLLink(WikiElement):
    """Used to find url type links inside a link.

    The scope of these is within link markup only (i.e., [[url]])

    >>> url_link = URLLink('a','',[],'|')
    >>> mo = url_link.regexp.search(" http://www.google.com| here ")
    >>> url_link.href(mo)
    'http://www.google.com'
    >>> url_link._build(mo,{}).generate().render()
    '<a href="http://www.google.com">here</a>'
    """

    def __init__(self, tag,token,child_tags,delimiter):
        # ``delimiter`` separates the url from its optional alias text.
        super(URLLink,self).__init__(tag, token, child_tags)
        self.delimiter = delimiter
        self.regexp = re.compile(self.re_string())

    def re_string(self):
        # group(1): the url (group(2) is its scheme or a leading '/')
        protocol = r'^\s*((\w+?://|/)'
        rest_of_url = r'\S*?)\s*'
        # group(3): delimiter + alias, group(4): the alias text alone
        alias = r'(' + re.escape(self.delimiter) + r' *(.*?))? *$'
        return protocol + rest_of_url + alias

    def _build(self,mo,element_store):
        return bldr.tag.__getattr__(self.tag)(self.alias(mo,element_store),
                                              href=self.href(mo))

    def href(self,mo):
        """Returns the string for the href attribute of the Element."""
        # Reject unsafe URIs (per genshi's HTMLSanitizer) outright.
        if sanitizer.is_safe_uri(mo.group(1)):
            return mo.group(1)
        else:
            return "unsafe_uri_detected"

    def alias(self,mo,element_store):
        """Returns the string for the content of the Element."""
        # With no alias given, the url itself is used as the link text.
        if not mo.group(4):
            return self.href(mo)
        else:
            return fragmentize(mo.group(4),self.child_tags,element_store)
class InterWikiLink(WikiElement):
    """Used to match interwiki links inside a link.

    The search scope for these is only inside links.

    >>> interwiki_link = InterWikiLink('a','',[],
    ... delimiter1=':', delimiter2 = '|',
    ... base_urls=dict(somewiki='http://somewiki.org/',
    ... bigwiki='http://bigwiki.net/'),
    ... links_funcs={},default_space_char='_',
    ... space_chars={})
    >>> mo = interwiki_link.regexp.search(" somewiki:Home Page|steve ")
    >>> interwiki_link.href(mo)
    'http://somewiki.org/Home_Page'
    >>> interwiki_link.alias(mo,{})
    ['steve']
    """

    def __init__(self, tag, token, child_tags,delimiter1,
                 delimiter2,base_urls,links_funcs,default_space_char,space_chars):
        super(InterWikiLink,self).__init__(tag, token, child_tags)
        # delimiter1 separates the wiki id from the page name;
        # delimiter2 separates the link target from its optional alias.
        self.delimiter1 = delimiter1
        self.delimiter2 = delimiter2
        self.regexp = re.compile(self.re_string())
        # wiki id -> base url that the page name is joined onto
        self.base_urls = base_urls
        # wiki id -> callable that transforms the page name
        self.links_funcs = links_funcs
        # character substituted for spaces in page names
        self.default_space_char = default_space_char
        # per-wiki overrides for the space character
        self.space_chars = space_chars

    def re_string(self):
        # group(1): wiki id, group(2): page name,
        # group(4): delimiter2 + alias, group(5): the alias text alone
        wiki_id = r'(\w+)'
        optional_spaces = ' *'
        page_name = r'(\S+?( \S+?)*)' #allows any number of single spaces
        alias = r'(' + re.escape(self.delimiter2) + r' *(.*?))? *$'
        return wiki_id + optional_spaces + re.escape(self.delimiter1) + \
               optional_spaces + page_name + optional_spaces + \
               alias

    def page_name(self,mo):
        # Spaces in the page name become the wiki's space character
        # (falling back to ``default_space_char``).
        space_char = self.space_chars.get(mo.group(1),self.default_space_char)
        return mo.group(2).replace(' ',space_char)

    def href(self,mo):
        """Returns the url, or None if the wiki id is unknown."""
        linktype = mo.group(1)
        base_url = self.base_urls.get(linktype)
        link_func = self.links_funcs.get(linktype)
        if not (link_func or base_url):
            return None
        else:
            href = self.page_name(mo)
            if link_func:
                href = link_func(href)
            if base_url:
                href = urlparse.urljoin(base_url, href)
            return href

    def _build(self,mo,element_store):
        # Unknown wiki id: leave the raw link markup in the output.
        if not self.href(mo):
            return '[[' + mo.group(0) + ']]'
        return bldr.tag.__getattr__(self.tag)(self.alias(mo,element_store),
                                              href=self.href(mo))

    def alias(self,mo,element_store):
        """Returns the string for the content of the Element."""
        # With no alias given, show "wikiid:pagename" as the link text.
        if not mo.group(5):
            return ''.join([mo.group(1),self.delimiter1,mo.group(2)])
        else:
            return fragmentize(mo.group(5),self.child_tags,element_store)
class WikiLink(WikiElement):
    """Used to match wiki links inside a link.

    The search scope for these is only inside links.

    >>> wiki_link = WikiLink('a','',[],'|',base_url='http://somewiki.org/',
    ... space_char='_',class_func=None, path_func=None)
    >>> mo = wiki_link.regexp.search(" Home Page |Home")
    >>> wiki_link.href(mo)
    'http://somewiki.org/Home_Page'
    >>> wiki_link.alias(mo,{})
    ['Home']
    """

    def __init__(self, tag, token, child_tags,delimiter,
                 base_url,space_char,class_func,path_func):
        super(WikiLink,self).__init__(tag, token, child_tags)
        # ``delimiter`` separates the page name from its optional alias.
        self.delimiter = delimiter
        self.base_url = base_url
        # character substituted for spaces when forming the href
        self.space_char = space_char
        # optional hook: maps a page name to a css class
        self.class_func = class_func
        # optional hook: maps a page name to a url path
        self.path_func = path_func
        self.regexp = re.compile(self.re_string())

    def re_string(self):
        # group(1): page name, group(3): delimiter + alias,
        # group(4): the alias text alone
        optional_spaces = ' *'
        page_name = r'(\S+?( +\S+?)*?)' #allows any number of single spaces
        alias = r'(' + re.escape(self.delimiter) + r' *(.*?))? *$'
        return optional_spaces + page_name + optional_spaces + \
               alias

    def page_name(self,mo):
        # Spaces in the wiki page name become ``space_char`` in the url.
        return mo.group(1).replace(' ',self.space_char)

    def href(self,mo):
        if self.path_func:
            the_path = self.path_func(self.page_name(mo))
        else:
            the_path = self.page_name(mo)
        return urlparse.urljoin(self.base_url, the_path)

    def _build(self,mo,element_store):
        if self.class_func:
            the_class = self.class_func(self.page_name(mo))
        else:
            the_class = None
        return bldr.tag.__getattr__(self.tag)(self.alias(mo,element_store),
                                              href=self.href(mo),
                                              class_=the_class)

    def alias(self,mo,element_store):
        """Returns the string for the content of the Element."""
        # With no alias given, the raw page name is used as link text.
        if not mo.group(3):
            return mo.group(1)
        else:
            return fragmentize(mo.group(4),self.child_tags,element_store)
class BlockElement(WikiElement):
    """Block elements inherit from this class

    Wiki elements wanting ``append_newline = True`` should use this
    as the base also.
    """
    # Block level output gets a trailing newline for readable source xml.
    append_newline = True
class List(BlockElement):
    """Finds list (ordered, unordered, and definition) wiki elements.

    group(1) of the match object includes all lines from the list
    including newline characters.
    """

    def __init__(self, tag, token,child_tags,stop_tokens):
        # ``stop_tokens``: characters that, starting a line, end this list
        # (e.g. another list type's token).
        super(List,self).__init__(tag, token, child_tags)
        self.stop_tokens = stop_tokens
        self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)

    def re_string(self):
        """This re_string is for finding generic block elements like
        lists (ordered, unordered, and definition) that start with a
        single token.
        """
        leading_whitespace = r'^([ \t]*'
        # exactly one list token (e.g. '*' but not '**')
        only_one_token = re.escape(self.token)+'[^'+ re.escape(self.token) + ']'
        rest_of_list = r'.*?\n)'
        # exactly one stop token, not doubled (backref \3 guards doubling)
        only_one_stop_token = '([' + re.escape(self.stop_tokens) + r'])(?!\3)'
        look_ahead = '(?=([ \t]*' + only_one_stop_token + '|$))'
        return leading_whitespace + only_one_token + rest_of_list + \
               look_ahead
class ListItem(WikiElement):
    r"""Matches the current list item.

    Everything up to the next same-level list item is matched.

    >>> list_item = ListItem('li',[],'#*')
    >>> mo = list_item.regexp.search("*one\n**one.1\n**one.2\n*two\n")
    >>> mo.group(2)
    'one\n**one.1\n**one.2'
    >>> mo.group(0)
    '*one\n**one.1\n**one.2'
    """

    append_newline = False

    def __init__(self, tag, child_tags, list_tokens):
        """Constructor for list items.

        :parameters:
          list_tokens
            A string that includes the tokens used for lists
        """
        super(ListItem,self).__init__(tag, token=None,
                                      child_tags=child_tags)
        self.list_tokens = list_tokens
        self.regexp = re.compile(self.re_string(),re.DOTALL)

    def re_string(self):
        whitespace = r'[ \t]*'
        # group(1): the token run marking this item's nesting level
        item_start = '([*#]+)'
        # group(2): the item's content, which may include nested lists
        rest_of_item = r'(.*?)\n?'
        # a later line with the same token depth (backref \1) ends the item
        start_of_same_level_item = r'\1(?![*#])'
        look_ahead = r'(?=(\n' + whitespace + start_of_same_level_item + '|$))'
        return whitespace + item_start + whitespace + \
               rest_of_item + look_ahead

    def _build(self,mo,element_store):
        return bldr.tag.__getattr__(self.tag)(fragmentize(mo.group(2),
                                                          self.child_tags,
                                                          element_store))
class NestedList(WikiElement):
    r"""Finds a list in the current list item.

    >>> nested_ul = NestedList('ul','*',[])
    >>> mo = nested_ul.regexp.search('one\n**one.1\n**one.2\n')
    >>> mo.group(1)
    '**one.1\n**one.2\n'
    >>> mo.group(0) == mo.group(1)
    True
    """

    def __init__(self, tag, token,child_tags):
        super(NestedList,self).__init__(tag, token, child_tags)
        self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)

    def re_string(self):
        look_behind = r'(?<=\n)' # have to avoid finding a list on the first line
        # group(1): the whole nested list, newlines included (DOTALL)
        whitespace = r'(\s*'
        rest_of_list = '.*$)'
        return look_behind + '^' + whitespace + re.escape(self.token) + \
               rest_of_list
  541. class DefinitionTerm(BlockElement):
  542. r"""Processes definition terms.
  543. >>> term = DefinitionTerm('dt',';',[],stop_token=':')
  544. >>> mo1,mo2 = term.regexp.finditer(";term1\n:def1\n;term2:def2\n")
  545. >>> mo1.group(1), mo2.group(1)
  546. ('term1', 'term2')
  547. >>> mo1.group(0), mo2.group(0)
  548. (';term1\n', ';term2')
  549. group(1) of the match object is the term line or up to the first ':'
  550. """
  551. def __init__(self, tag, token,child_tags,stop_token):
  552. super(DefinitionTerm,self).__init__(tag, token, child_tags)
  553. self.stop_token = stop_token
  554. self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)
  555. def re_string(self):
  556. leading_whitespace = r'^([ \t]*'
  557. #only_one_token = re.escape(self.token)+'[^'+ re.escape(self.token) + ']'
  558. rest_of_list = r'.*?\n)'
  559. #only_one_stop_token = '([' + re.escape(self.stop_tokens) + r'])(?!\3)'
  560. #look_ahead = r'(?=([ \t]*' + only_one_stop_token + '|$))'
  561. return r'^[ \t]*' + re.escape(self.token) + r'[ \t]*(.*?' + \
  562. re.escape(self.stop_token) + '?)\s*(\n|(?=(' + \
  563. esc_neg_look + re.escape(self.stop_token) + r'|$)))'
  564. class DefinitionDef(BlockElement):
  565. r"""Processes definitions.
  566. >>> definition = DefinitionDef('dd',':',[])
  567. >>> mo1,mo2 = definition.regexp.finditer(":def1a\ndef1b\n:def2\n")
  568. >>> mo1.group(1), mo2.group(1)
  569. ('def1a\ndef1b', 'def2')
  570. >>> mo1.group(0), mo2.group(0)
  571. (':def1a\ndef1b\n', ':def2\n')
  572. group(1) of the match object includes all lines from the defintion
  573. up to the next definition.
  574. """
  575. def __init__(self, tag, token,child_tags):
  576. super(DefinitionDef,self).__init__(tag, token, child_tags)
  577. self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)
  578. def re_string(self):
  579. leading_whitespace = r'^([ \t]*'
  580. rest_of_list = r'.*?\n)'
  581. look_ahead = r'(?=([ \t]*' + re.escape(self.token) + r')|$)'
  582. return r'^[ \t]*' + re.escape(self.token) + r'?[ \t]*(.+?)\s*\n(?=([ \t]*' + \
  583. re.escape(self.token) + r')|$)'
class Paragraph(BlockElement):
    """This should be the last outer level wiki element to be "searched".

    Anything that is left over will be placed in paragraphs.
    """

    def __init__(self, tag, child_tags):
        super(Paragraph,self).__init__(tag,token=None, child_tags=child_tags)
        self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)

    def re_string(self):
        # With DOTALL, ``.*`` spans lines: everything up to the final
        # newline becomes one paragraph.
        return r'^(.*)\n'
class Heading(BlockElement):
    r"""Finds heading wiki elements.

    >>> h1 = Heading('h1','=',[])
    >>> mo = h1.regexp.search('before\n = An important thing = \n after')
    >>> mo.group(1)
    'An important thing'
    >>> mo.group(0)
    ' = An important thing = \n'
    """

    def __init__(self, tag, token, child_tags):
        super(Heading,self).__init__(tag,token , child_tags)
        self.regexp = re.compile(self.re_string(),re.MULTILINE)

    def re_string(self):
        whitespace = r'[ \t]*'
        # don't match a longer token run (e.g. '==' when looking for '=')
        neg_look_ahead = '(?!' + re.escape(self.token[0]) + ')'
        # group(1): the heading text, whitespace-trimmed
        content = '(.*?)'
        # optional trailing tokens are matched but excluded from group(1)
        trailing_markup = '(' + re.escape(self.token[0]) + r'+[ \t]*)?\n'
        return '^' + whitespace + re.escape(self.token) + neg_look_ahead + \
               whitespace + content + whitespace + trailing_markup
  612. class Table(BlockElement):
  613. r"""Find tables.
  614. >>> table = Table('table','|',[])
  615. >>> mo = table.regexp.search("before\n | one | two |\n|one|two \n hi")
  616. >>> mo.group(1)
  617. ' | one | two |\n|one|two \n'
  618. >>> mo.group(0) == mo.group(1)
  619. True
  620. """
  621. def __init__(self, tag, token, child_tags=[]):
  622. super(Table,self).__init__(tag,token , child_tags)
  623. self.regexp = re.compile(self.re_string(),re.MULTILINE)
  624. def re_string(self):
  625. whitespace = r'[ \t]*'
  626. rest_of_line = r'.*?\n'
  627. return '^((' + whitespace + re.escape(self.token) + \
  628. rest_of_line + ')+)'
  629. class TableRow(BlockElement):
  630. r"""Finds rows in a table.
  631. >>> row = TableRow('tr','|',[])
  632. >>> mo = row.regexp.search(' | one | two |\n|one|two \n')
  633. >>> mo.group(1)
  634. '| one | two '
  635. >>> mo.group(0)
  636. ' | one | two |\n'
  637. """
  638. def __init__(self, tag, token, child_tags=[]):
  639. super(TableRow,self).__init__(tag,token , child_tags)
  640. self.regexp = re.compile(self.re_string(),re.MULTILINE)
  641. def re_string(self):
  642. whitespace = r'[ \t]*'
  643. content = '(' + re.escape(self.token) + '.*?)'
  644. trailing_token = re.escape(self.token) + '?'
  645. return '^' + whitespace + content + trailing_token + \
  646. whitespace + r'\n'
  647. class TableCell(WikiElement):
  648. r"""Finds cells in a table row.
  649. >>> cell = TableCell('td','|',[])
  650. >>> mo = cell.regexp.search('| one | two ')
  651. >>> mo.group(1)
  652. 'one'
  653. >>> mo.group(0)
  654. '| one '
  655. """
  656. def __init__(self, tag, token, child_tags=[]):
  657. super(TableCell,self).__init__(tag,token , child_tags)
  658. self.regexp = re.compile(self.re_string())
  659. def re_string(self):
  660. whitespace = r'[ \t]*'
  661. content = '(.*?)'
  662. look_ahead = '((?=' + esc_neg_look + re.escape(self.token[0]) + ')|$)'
  663. return esc_neg_look + re.escape(self.token) + whitespace + \
  664. content + whitespace + look_ahead
  665. class Link(InlineElement):
  666. """Finds and builds links."""
  667. def __init__(self, tag, token, child_tags):
  668. super(Link,self).__init__(tag,token , child_tags)
  669. self.regexp = re.compile(self.re_string())
  670. def _build(self,mo,element_store):
  671. link = fragmentize(mo.group(1),self.child_tags,element_store)
  672. if link:
  673. return bldr.tag(link)
  674. else:
  675. return token[0] + mo.group(0) + token[-1]
class Image(InlineElement):
    """Processes image elements.

    >>> img = Image('img',('{{','}}'),[], delimiter='|')
    >>> mo = img.regexp.search('{{ picture.jpg | An image of a house }}')
    >>> img._build(mo,{}).generate().render()
    '<img src="picture.jpg" alt="An image of a house"/>'
    """

    def __init__(self, tag, token, child_tags,delimiter):
        super(Image,self).__init__(tag,token , child_tags)
        self.regexp = re.compile(self.re_string())
        # ``delimiter`` separates the image source from optional alt text.
        self.delimiter = delimiter
        # Matches a single non-whitespace src token.
        self.src_regexp = re.compile(r'^\s*(\S+)\s*$')

    def _build(self,mo,element_store):
        # Split into [src] or [src, alt] on the first delimiter only.
        body = mo.group(1).split(self.delimiter,1)
        src_mo = self.src_regexp.search(body[0])
        if not src_mo:
            return bldr.tag.span('Bad Image src')
        # Reject unsafe URIs (per genshi's HTMLSanitizer) outright.
        if sanitizer.is_safe_uri(src_mo.group(1)):
            link = src_mo.group(1)
        else:
            link = "unsafe_uri_detected"
        if len(body) == 1:
            # No alt text given -- fall back to the src itself.
            alias = link
        else:
            alias = body[1].strip()
        return bldr.tag.__getattr__(self.tag)(src=link ,alt=alias)
  702. class NoWikiElement(InlineElement):
  703. """Inline no-wiki.
  704. When two or more end tokens are found together, only last marks
  705. the end of the element.
  706. This element must be on a single line.
  707. """
  708. def __init__(self, tag, token, child_tags=[]):
  709. super(NoWikiElement,self).__init__(tag,token , child_tags)
  710. self.regexp = re.compile(self.re_string(),re.DOTALL)
  711. def _build(self,mo,element_store):
  712. if self.tag:
  713. return bldr.tag.__getattr__(self.tag)(
  714. fragmentize(mo.group(1), self.child_tags,
  715. element_store,
  716. remove_escapes=False))
  717. else:
  718. return bldr.tag(fragmentize(mo.group(1),self.child_tags,
  719. element_store,
  720. remove_escapes=False))
  721. def re_string(self):
  722. if isinstance(self.token,str):
  723. content = '(.+?' + re.escape(self.token[-1]) + '*)'
  724. return esc_neg_look + re.escape(self.token) + \
  725. content + re.escape(self.token)
  726. else:
  727. content = '(.+?' + re.escape(self.token[1][-1]) + '*)'
  728. return esc_neg_look + re.escape(self.token[0]) + \
  729. content + re.escape(self.token[1])
  730. class PreBlock(BlockElement):
  731. """A preformatted block.
  732. If a closing token is found on a line with a space as the first
  733. character, the space will be removed from the output.
  734. """
  735. def __init__(self, tag, token, child_tags=[]):
  736. super(PreBlock,self).__init__(tag,token , child_tags)
  737. self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)
  738. self.regexp2 = re.compile(self.re_string2(),re.MULTILINE)
  739. def re_string(self):
  740. if isinstance(self.token,str):
  741. return '^' + re.escape(self.token) + r'\s*?\n(.*?\n)' + \
  742. re.escape(self.token) + r'\s*?\n'
  743. else:
  744. start = '^' + re.escape(self.token[0]) + r'\s*?\n'
  745. content = r'(.+?\n)'
  746. end = re.escape(self.token[1]) + r'\s*?\n'
  747. return start + content + end
  748. def re_string2(self):
  749. """Finds a closing token with a space at the start of the line."""
  750. if isinstance(self.token,str):
  751. return r'^ (\s*?' + re.escape(self.token) + r'\s*?\n)'
  752. else:
  753. return r'^ (\s*?' + re.escape(self.token[1]) + r'\s*?\n)'
  754. def _build(self,mo,element_store):
  755. match = self.regexp2.sub(r'\1',mo.group(1))
  756. return bldr.tag.__getattr__(self.tag)(
  757. fragmentize(match,self.child_tags,
  758. element_store,remove_escapes=False))
  759. class LoneElement(BlockElement):
  760. """Element on a line by itself with no content (e.g., <hr/>)"""
  761. def __init__(self, tag, token, child_tags):
  762. super(LoneElement,self).__init__(tag,token , child_tags)
  763. self.regexp = re.compile(self.re_string(),re.DOTALL+re.MULTILINE)
  764. def re_string(self):
  765. return r'^(\s*?' + re.escape(self.token) + r'\s*?\n)'
  766. def _build(self,mo,element_store):
  767. return bldr.tag.__getattr__(self.tag)()
  768. class LonePlaceHolder(BlockElement):
  769. """A place holder on a line by itself or with other place holders.
  770. This is used to avoid these being enclosed in a paragraph.
  771. """
  772. append_newline = False
  773. def __init__(self, tag, token, child_tags):
  774. super(LonePlaceHolder,self).__init__(tag,token , child_tags)
  775. self.regexp = re.compile(self.re_string(),re.MULTILINE)
  776. def re_string(self):
  777. place_holder = re.escape(self.token[0]) + r'\S*?' + re.escape(self.token[1])
  778. return r'^\s*?(' + place_holder + r'\s*$)+\s*?\n'
  779. def _build(self,mo,element_store):
  780. return bldr.tag(fragmentize(mo.group(0),[],element_store))
  781. class BlankLine(WikiElement):
  782. """Blank lines divide elements but don't add any output."""
  783. def __init__(self):
  784. super(BlankLine,self).__init__(tag=None,token='' , child_tags=[])
  785. self.regexp = re.compile(self.re_string(),re.MULTILINE)
  786. def re_string(self):
  787. return r'^(\s*\n)+'
  788. def _build(self,mo,element_store):
  789. return None
  790. class LineBreak(InlineElement):
  791. """An inline line break."""
  792. #append_newline = True
  793. def __init__(self,tag, token, child_tags=[]):
  794. super(LineBreak,self).__init__(tag,token , child_tags)
  795. self.regexp = re.compile(self.re_string(),re.DOTALL)
  796. def re_string(self):
  797. return esc_neg_look + re.escape(self.token)
  798. def _build(self,mo,element_store):
  799. return bldr.tag.__getattr__(self.tag)()
  800. def _test():
  801. import doctest
  802. doctest.testmod()
  803. if __name__ == "__main__":
  804. _test()