PageRenderTime 53ms CodeModel.GetById 14ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/textile.py

http://github.com/Arachnid/bloggart
Python | 1068 lines | 989 code | 45 blank | 34 comment | 39 complexity | f0c13fac662e9533facb60537a4b111b MD5 | raw file
  1. #!/usr/bin/env python
  2. """
  3. PyTextile
  4. A Humane Web Text Generator
  5. """
  6. __version__ = '2.1.3'
  7. __date__ = '2009/02/07'
  8. __copyright__ = """
  9. Copyright (c) 2009, Jason Samsa, http://jsamsa.com/
  10. Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/
  11. Copyright (c) 2003, Mark Pilgrim, http://diveintomark.org/
  12. Original PHP Version:
  13. Copyright (c) 2003-2004, Dean Allen <dean@textism.com>
  14. All rights reserved.
  15. Thanks to Carlo Zottmann <carlo@g-blog.net> for refactoring
  16. Textile's procedural code into a class framework
  17. Additions and fixes Copyright (c) 2006 Alex Shiels http://thresholdstate.com/
  18. """
  19. __license__ = """
  20. L I C E N S E
  21. =============
  22. Redistribution and use in source and binary forms, with or without
  23. modification, are permitted provided that the following conditions are met:
  24. * Redistributions of source code must retain the above copyright notice,
  25. this list of conditions and the following disclaimer.
  26. * Redistributions in binary form must reproduce the above copyright notice,
  27. this list of conditions and the following disclaimer in the documentation
  28. and/or other materials provided with the distribution.
  29. * Neither the name Textile nor the names of its contributors may be used to
  30. endorse or promote products derived from this software without specific
  31. prior written permission.
  32. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  33. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  34. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  35. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  36. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  37. CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  38. SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  39. INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  40. CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  41. ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  42. POSSIBILITY OF SUCH DAMAGE.
  43. """
  44. import re
  45. import uuid
  46. from urlparse import urlparse
  47. import sgmllib
  48. def _normalize_newlines(string):
  49. import re
  50. out = re.sub(r'\r\n', '\n', string)
  51. out = re.sub(r'\n{3,}', '\n\n', out)
  52. out = re.sub(r'\n\s*\n', '\n\n', out)
  53. out = re.sub(r'"$', '" ', out)
  54. return out
  55. # PyTextile can optionally sanitize the generated XHTML,
  56. # which is good for weblog comments. This code is from
  57. # Mark Pilgrim's feedparser.
  58. class _BaseHTMLProcessor(sgmllib.SGMLParser):
  59. elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
  60. 'img', 'input', 'isindex', 'link', 'meta', 'param']
  61. def __init__(self):
  62. sgmllib.SGMLParser.__init__(self)
  63. def reset(self):
  64. self.pieces = []
  65. sgmllib.SGMLParser.reset(self)
  66. def normalize_attrs(self, attrs):
  67. # utility method to be called by descendants
  68. attrs = [(k.lower(), sgmllib.charref.sub(lambda m: unichr(int(m.groups()[0])), v).strip()) for k, v in attrs]
  69. attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
  70. return attrs
  71. def unknown_starttag(self, tag, attrs):
  72. # called for each start tag
  73. # attrs is a list of (attr, value) tuples
  74. # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
  75. strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
  76. if tag in self.elements_no_end_tag:
  77. self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
  78. else:
  79. self.pieces.append("<%(tag)s%(strattrs)s>" % locals())
  80. def unknown_endtag(self, tag):
  81. # called for each end tag, e.g. for </pre>, tag will be "pre"
  82. # Reconstruct the original end tag.
  83. if tag not in self.elements_no_end_tag:
  84. self.pieces.append("</%(tag)s>" % locals())
  85. def handle_charref(self, ref):
  86. # called for each character reference, e.g. for "&#160;", ref will be "160"
  87. # Reconstruct the original character reference.
  88. self.pieces.append("&#%(ref)s;" % locals())
  89. def handle_entityref(self, ref):
  90. # called for each entity reference, e.g. for "&copy;", ref will be "copy"
  91. # Reconstruct the original entity reference.
  92. self.pieces.append("&%(ref)s;" % locals())
  93. def handle_data(self, text):
  94. # called for each block of plain text, i.e. outside of any tag and
  95. # not containing any character or entity references
  96. # Store the original text verbatim.
  97. self.pieces.append(text)
  98. def handle_comment(self, text):
  99. # called for each HTML comment, e.g. <!-- insert Javascript code here -->
  100. # Reconstruct the original comment.
  101. self.pieces.append("<!--%(text)s-->" % locals())
  102. def handle_pi(self, text):
  103. # called for each processing instruction, e.g. <?instruction>
  104. # Reconstruct original processing instruction.
  105. self.pieces.append("<?%(text)s>" % locals())
  106. def handle_decl(self, text):
  107. # called for the DOCTYPE, if present, e.g.
  108. # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
  109. # "http://www.w3.org/TR/html4/loose.dtd">
  110. # Reconstruct original DOCTYPE
  111. self.pieces.append("<!%(text)s>" % locals())
  112. def output(self):
  113. """Return processed HTML as a single string"""
  114. return "".join(self.pieces)
  115. class _HTMLSanitizer(_BaseHTMLProcessor):
  116. acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
  117. 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
  118. 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
  119. 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
  120. 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
  121. 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
  122. 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
  123. 'thead', 'tr', 'tt', 'u', 'ul', 'var']
  124. acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
  125. 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
  126. 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
  127. 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
  128. 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
  129. 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
  130. 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
  131. 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
  132. 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
  133. 'usemap', 'valign', 'value', 'vspace', 'width']
  134. unacceptable_elements_with_end_tag = ['script', 'applet']
  135. # This if for MathML.
  136. mathml_elements = ['math', 'mi', 'mn', 'mo', 'mrow', 'msup']
  137. mathml_attributes = ['mode', 'xmlns']
  138. acceptable_elements = acceptable_elements + mathml_elements
  139. acceptable_attributes = acceptable_attributes + mathml_attributes
  140. def reset(self):
  141. _BaseHTMLProcessor.reset(self)
  142. self.unacceptablestack = 0
  143. def unknown_starttag(self, tag, attrs):
  144. if not tag in self.acceptable_elements:
  145. if tag in self.unacceptable_elements_with_end_tag:
  146. self.unacceptablestack += 1
  147. return
  148. attrs = self.normalize_attrs(attrs)
  149. attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
  150. _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
  151. def unknown_endtag(self, tag):
  152. if not tag in self.acceptable_elements:
  153. if tag in self.unacceptable_elements_with_end_tag:
  154. self.unacceptablestack -= 1
  155. return
  156. _BaseHTMLProcessor.unknown_endtag(self, tag)
  157. def handle_pi(self, text):
  158. pass
  159. def handle_decl(self, text):
  160. pass
  161. def handle_data(self, text):
  162. if not self.unacceptablestack:
  163. _BaseHTMLProcessor.handle_data(self, text)
  164. # PyTextile can optionally validate the generated
  165. # XHTML code using either mxTidy or uTidyLib.
  166. try:
  167. # This is mxTidy.
  168. from mx.Tidy import Tidy
  169. def _tidy1(text):
  170. """mxTidy's XHTML validator.
  171. This function is a wrapper to mxTidy's validator.
  172. """
  173. nerrors, nwarnings, text, errortext = Tidy.tidy(text, output_xhtml=1, numeric_entities=1, wrap=0)
  174. return _in_tag(text, 'body')
  175. _tidy = _tidy1
  176. except ImportError:
  177. try:
  178. # This is uTidyLib.
  179. import tidy
  180. def _tidy2(text):
  181. """uTidyLib's XHTML validator.
  182. This function is a wrapper to uTidyLib's validator.
  183. """
  184. text = tidy.parseString(text, output_xhtml=1, add_xml_decl=0, indent=0, tidy_mark=0)
  185. return _in_tag(str(text), 'body')
  186. _tidy = _tidy2
  187. except ImportError:
  188. _tidy = None
class Textile(object):
    """Textile-to-XHTML converter; ``textile()`` is the entry point."""

    # Regex fragments for block/phrase modifiers.  They are interpolated into
    # larger patterns by the methods below.
    hlgn = r'(?:\<(?!>)|(?<!<)\>|\<\>|\=|[()]+(?! ))'   # horizontal alignment / padding markers (see hAlign, pba)
    vlgn = r'[\-^~]'                                     # vertical alignment markers (see vAlign)
    clas = r'(?:\([^)]+\))'                              # (class) or (class#id)
    lnge = r'(?:\[[^\]]+\])'                             # [lang]
    styl = r'(?:\{[^}]+\})'                              # {css}
    cspn = r'(?:\\\d+)'                                  # \N  column span
    rspn = r'(?:\/\d+)'                                  # /N  row span
    a = r'(?:%s|%s)*' % (hlgn, vlgn)                     # any run of alignment markers
    s = r'(?:%s|%s)*' % (cspn, rspn)                     # any run of span markers
    c = r'(?:%s)*' % '|'.join([clas, styl, lnge, hlgn])  # any run of class/style/lang/alignment

    pnct = r'[-!"#$%&()*+,/:;<=>?@\'\[\\\]\.^_`{|}~]'
    # urlch = r'[\w"$\-_.+!*\'(),";/?:@=&%#{}|\\^~\[\]`]'
    urlch = '[\w"$\-_.+*\'(),";\/?:@=&%#{}|\\^~\[\]`]'

    # Schemes that relURL lets through in restricted mode.
    url_schemes = ('http','https','ftp','mailto')

    # Block signatures recognised by block(); joined with '|' into a regex.
    btag = ('bq', 'bc', 'notextile', 'pre', 'h[1-6]', 'fn\d+', 'p')

    # When true, graf() and fLink() skip image processing.
    noimage = False
    # Prefix that relURL prepends to relative URLs.
    hu = ''

    # Named character references substituted into the glyph replacement
    # templates in glyphs().
    glyph_defaults = (
        ('txt_quote_single_open',  '&#8216;'),
        ('txt_quote_single_close', '&#8217;'),
        ('txt_quote_double_open',  '&#8220;'),
        ('txt_quote_double_close', '&#8221;'),
        ('txt_apostrophe',         '&#8217;'),
        ('txt_prime',              '&#8242;'),
        ('txt_prime_double',       '&#8243;'),
        ('txt_ellipsis',           '&#8230;'),
        ('txt_emdash',             '&#8212;'),
        ('txt_endash',             '&#8211;'),
        ('txt_dimension',          '&#215;'),
        ('txt_trademark',          '&#8482;'),
        ('txt_registered',         '&#174;'),
        ('txt_copyright',          '&#169;'),
    )
  223. def __init__(self, restricted=False, lite=False):
  224. """docstring for __init__"""
  225. self.restricted = restricted
  226. self.lite = lite
  227. self.fn = {}
  228. self.urlrefs = {}
  229. self.shelf = {}
  230. self.rel = ''
  231. def textile(self, text, rel=None, encoding='utf8', output='utf8', validate=False, sanitize=False, head_offset='ignored'):
  232. """
  233. >>> import textile
  234. >>> textile.textile('some textile')
  235. '\\t<p>some textile</p>'
  236. """
  237. text = _normalize_newlines(text)
  238. if rel:
  239. self.rel = ' rel="%s"' % rel
  240. text = self.getRefs(text)
  241. if not self.lite:
  242. text = self.block(text)
  243. text = self.retrieve(text)
  244. # Convert to desired output.
  245. if isinstance(text, str):
  246. text = unicode(text, encoding)
  247. text = text.encode(output, 'xmlcharrefreplace')
  248. # Sanitize?
  249. if sanitize:
  250. p = _HTMLSanitizer()
  251. p.feed(text)
  252. text = p.output()
  253. # Validate output.
  254. if _tidy and validate:
  255. text = _tidy(text)
  256. return text
  257. def pba(self, input, element=None):
  258. """
  259. >>> t = Textile()
  260. >>> t.pba(r'\3')
  261. ''
  262. >>> t.pba(r'\\3', element='td')
  263. ' colspan="3"'
  264. >>> t.pba(r'/4', element='td')
  265. ' rowspan="4"'
  266. >>> t.pba(r'\\3/4', element='td')
  267. ' colspan="3" rowspan="4"'
  268. >>> t.vAlign('^')
  269. 'top'
  270. >>> t.pba('^', element='td')
  271. ' style="vertical-align:top;"'
  272. >>> t.pba('{line-height:18px}')
  273. ' style="line-height:18px;"'
  274. >>> t.pba('(foo-bar)')
  275. ' class="foo-bar"'
  276. >>> t.pba('(#myid)')
  277. ' id="myid"'
  278. >>> t.pba('(foo-bar#myid)')
  279. ' class="foo-bar" id="myid"'
  280. >>> t.pba('((((')
  281. ' style="padding-left:4em;"'
  282. >>> t.pba(')))')
  283. ' style="padding-right:3em;"'
  284. >>> t.pba('[fr]')
  285. ' lang="fr"'
  286. """
  287. style = []
  288. aclass = ''
  289. lang = ''
  290. colspan = ''
  291. rowspan = ''
  292. id = ''
  293. atts = ''
  294. if not input: return ''
  295. matched = input
  296. if element == 'td':
  297. m = re.search(r'\\(\d+)', matched)
  298. if m:
  299. colspan = m.group(1)
  300. m = re.search(r'/(\d+)', matched)
  301. if m:
  302. rowspan = m.group(1)
  303. if element == 'td' or element == 'tr':
  304. m = re.search(r'(%s)' % self.vlgn, matched)
  305. if m: style.append("vertical-align:%s;" % self.vAlign(m.group(1)))
  306. m = re.search(r'\{([^}]*)\}', matched)
  307. if m:
  308. style.append(m.group(1).rstrip(';') + ';')
  309. matched = matched.replace(m.group(0), '')
  310. m = re.search(r'\[([^\]]+)\]', matched, re.U)
  311. if m:
  312. lang = m.group(1)
  313. matched = matched.replace(m.group(0), '')
  314. m = re.search(r'\(([^()]+)\)', matched, re.U)
  315. if m:
  316. aclass = m.group(1)
  317. matched = matched.replace(m.group(0), '')
  318. m = re.search(r'([(]+)', matched)
  319. if m:
  320. style.append("padding-left:%sem;" % len(m.group(1)))
  321. matched = matched.replace(m.group(0), '')
  322. m = re.search(r'([)]+)', matched)
  323. if m:
  324. style.append("padding-right:%sem;" % len(m.group(1)))
  325. matched = matched.replace(m.group(0), '')
  326. m = re.search(r'(%s)' % self.hlgn, matched)
  327. if m:
  328. style.append("text-align:%s;" % self.hAlign(m.group(1)))
  329. m = re.search(r'^(.*)#(.*)$', aclass)
  330. if m:
  331. id = m.group(2)
  332. aclass = m.group(1)
  333. if self.restricted:
  334. if lang: return ' lang="%s"'
  335. else: return ''
  336. result = []
  337. if style: result.append(' style="%s"' % "".join(style))
  338. if aclass: result.append(' class="%s"' % aclass)
  339. if lang: result.append(' lang="%s"' % lang)
  340. if id: result.append(' id="%s"' % id)
  341. if colspan: result.append(' colspan="%s"' % colspan)
  342. if rowspan: result.append(' rowspan="%s"' % rowspan)
  343. return ''.join(result)
  344. def hasRawText(self, text):
  345. """
  346. checks whether the text has text not already enclosed by a block tag
  347. >>> t = Textile()
  348. >>> t.hasRawText('<p>foo bar biz baz</p>')
  349. False
  350. >>> t.hasRawText(' why yes, yes it does')
  351. True
  352. """
  353. r = re.compile(r'<(p|blockquote|div|form|table|ul|ol|pre|h\d)[^>]*?>.*</\1>', re.S).sub('', text.strip()).strip()
  354. r = re.compile(r'<(hr|br)[^>]*?/>').sub('', r)
  355. return '' != r
  356. def table(self, text):
  357. r"""
  358. >>> t = Textile()
  359. >>> t.table('|one|two|three|\n|a|b|c|')
  360. '\t<table>\n\t\t<tr>\n\t\t\t<td>one</td>\n\t\t\t<td>two</td>\n\t\t\t<td>three</td>\n\t\t</tr>\n\t\t<tr>\n\t\t\t<td>a</td>\n\t\t\t<td>b</td>\n\t\t\t<td>c</td>\n\t\t</tr>\n\t</table>\n\n'
  361. """
  362. text = text + "\n\n"
  363. pattern = re.compile(r'^(?:table(_?%(s)s%(a)s%(c)s)\. ?\n)?^(%(a)s%(c)s\.? ?\|.*\|)\n\n' % {'s':self.s, 'a':self.a, 'c':self.c}, re.S|re.M|re.U)
  364. return pattern.sub(self.fTable, text)
  365. def fTable(self, match):
  366. tatts = self.pba(match.group(1), 'table')
  367. rows = []
  368. for row in [ x for x in match.group(2).split('\n') if x]:
  369. rmtch = re.search(r'^(%s%s\. )(.*)' % (self.a, self.c), row.lstrip())
  370. if rmtch:
  371. ratts = self.pba(rmtch.group(1), 'tr')
  372. row = rmtch.group(2)
  373. else: ratts = ''
  374. cells = []
  375. for cell in row.split('|'):
  376. ctyp = 'd'
  377. if re.search(r'^_', cell): ctyp = "h"
  378. cmtch = re.search(r'^(_?%s%s%s\. )(.*)' % (self.s, self.a, self.c), cell)
  379. if cmtch:
  380. catts = self.pba(cmtch.group(1), 'td')
  381. cell = cmtch.group(2)
  382. else: catts = ''
  383. cell = self.graf(self.span(cell))
  384. if cell.strip() != '':
  385. cells.append('\t\t\t<t%s%s>%s</t%s>' % (ctyp, catts, cell, ctyp))
  386. rows.append("\t\t<tr%s>\n%s\n\t\t</tr>" % (ratts, '\n'.join(cells)))
  387. cells = []
  388. catts = None
  389. return "\t<table%s>\n%s\n\t</table>\n\n" % (tatts, '\n'.join(rows))
  390. def lists(self, text):
  391. """
  392. >>> t = Textile()
  393. >>> t.lists("* one\\n* two\\n* three")
  394. '\\t<ul>\\n\\t\\t<li>one</li>\\n\\t\\t<li>two</li>\\n\\t\\t<li>three</li>\\n\\t</ul>'
  395. """
  396. pattern = re.compile(r'^([#*]+%s .*)$(?![^#*])' % self.c, re.U|re.M|re.S)
  397. return pattern.sub(self.fList, text)
  398. def fList(self, match):
  399. text = match.group(0).split("\n")
  400. result = []
  401. lists = []
  402. for i, line in enumerate(text):
  403. try:
  404. nextline = text[i+1]
  405. except IndexError:
  406. nextline = ''
  407. m = re.search(r"^([#*]+)(%s%s) (.*)$" % (self.a, self.c), line, re.S)
  408. if m:
  409. tl, atts, content = m.groups()
  410. nl = ''
  411. nm = re.search(r'^([#*]+)\s.*', nextline)
  412. if nm:
  413. nl = nm.group(1)
  414. if tl not in lists:
  415. lists.append(tl)
  416. atts = self.pba(atts)
  417. line = "\t<%sl%s>\n\t\t<li>%s" % (self.lT(tl), atts, self.graf(content))
  418. else:
  419. line = "\t\t<li>" + self.graf(content)
  420. if len(nl) <= len(tl): line = line + "</li>"
  421. for k in reversed(lists):
  422. if len(k) > len(nl):
  423. line = line + "\n\t</%sl>" % self.lT(k)
  424. if len(k) > 1:
  425. line = line + "</li>"
  426. lists.remove(k)
  427. result.append(line)
  428. return "\n".join(result)
  429. def lT(self, input):
  430. if re.search(r'^#+', input):
  431. return 'o'
  432. else:
  433. return 'u'
  434. def doPBr(self, in_):
  435. return re.compile(r'<(p)([^>]*?)>(.*)(</\1>)', re.S).sub(self.doBr, in_)
  436. def doBr(self, match):
  437. content = re.sub(r'(.+)(?:(?<!<br>)|(?<!<br />))\n(?![#*\s|])', '\\1<br />', match.group(3))
  438. return '<%s%s>%s%s' % (match.group(1), match.group(2), content, match.group(4))
  439. def block(self, text):
  440. """
  441. >>> t = Textile()
  442. >>> t.block('h1. foobar baby')
  443. '\\t<h1>foobar baby</h1>'
  444. """
  445. tre = '|'.join(self.btag)
  446. text = text.split('\n\n')
  447. tag = 'p'
  448. atts = cite = graf = ext = ''
  449. out = []
  450. anon = False
  451. for line in text:
  452. pattern = r'^(%s)(%s%s)\.(\.?)(?::(\S+))? (.*)$' % (tre, self.a, self.c)
  453. match = re.search(pattern, line, re.S)
  454. if match:
  455. if ext:
  456. out.append(out.pop() + c1)
  457. tag,atts,ext,cite,graf = match.groups()
  458. o1, o2, content, c2, c1 = self.fBlock(tag, atts, ext, cite, graf)
  459. # leave off c1 if this block is extended, we'll close it at the start of the next block
  460. if ext:
  461. line = "%s%s%s%s" % (o1, o2, content, c2)
  462. else:
  463. line = "%s%s%s%s%s" % (o1, o2, content, c2, c1)
  464. else:
  465. anon = True
  466. if ext or not re.search(r'^\s', line):
  467. o1, o2, content, c2, c1 = self.fBlock(tag, atts, ext, cite, line)
  468. # skip $o1/$c1 because this is part of a continuing extended block
  469. if tag == 'p' and not self.hasRawText(content):
  470. line = content
  471. else:
  472. line = "%s%s%s" % (o2, content, c2)
  473. else:
  474. line = self.graf(line)
  475. line = self.doPBr(line)
  476. line = re.sub(r'<br>', '<br />', line)
  477. if ext and anon:
  478. out.append(out.pop() + "\n" + line)
  479. else:
  480. out.append(line)
  481. if not ext:
  482. tag = 'p'
  483. atts = ''
  484. cite = ''
  485. graf = ''
  486. if ext:
  487. out.append(out.pop() + c1)
  488. return '\n\n'.join(out)
  489. def fBlock(self, tag, atts, ext, cite, content):
  490. """
  491. >>> t = Textile()
  492. >>> t.fBlock("bq", "", None, "", "Hello BlockQuote")
  493. ('\\t<blockquote>\\n', '\\t\\t<p>', 'Hello BlockQuote', '</p>', '\\n\\t</blockquote>')
  494. >>> t.fBlock("bq", "", None, "http://google.com", "Hello BlockQuote")
  495. ('\\t<blockquote cite="http://google.com">\\n', '\\t\\t<p>', 'Hello BlockQuote', '</p>', '\\n\\t</blockquote>')
  496. >>> t.fBlock("bc", "", None, "", 'printf "Hello, World";') # doctest: +ELLIPSIS
  497. ('<pre>', '<code>', ..., '</code>', '</pre>')
  498. >>> t.fBlock("h1", "", None, "", "foobar")
  499. ('', '\\t<h1>', 'foobar', '</h1>', '')
  500. """
  501. atts = self.pba(atts)
  502. o1 = o2 = c2 = c1 = ''
  503. m = re.search(r'fn(\d+)', tag)
  504. if m:
  505. tag = 'p'
  506. if m.group(1) in self.fn:
  507. fnid = self.fn[m.group(1)]
  508. else:
  509. fnid = m.group(1)
  510. atts = atts + ' id="fn%s"' % fnid
  511. if atts.find('class=') < 0:
  512. atts = atts + ' class="footnote"'
  513. content = ('<sup>%s</sup>' % m.group(1)) + content
  514. if tag == 'bq':
  515. cite = self.checkRefs(cite)
  516. if cite:
  517. cite = ' cite="%s"' % cite
  518. else:
  519. cite = ''
  520. o1 = "\t<blockquote%s%s>\n" % (cite, atts)
  521. o2 = "\t\t<p%s>" % atts
  522. c2 = "</p>"
  523. c1 = "\n\t</blockquote>"
  524. elif tag == 'bc':
  525. o1 = "<pre%s>" % atts
  526. o2 = "<code%s>" % atts
  527. c2 = "</code>"
  528. c1 = "</pre>"
  529. content = self.shelve(self.encode_html(content.rstrip("\n") + "\n"))
  530. elif tag == 'notextile':
  531. content = self.shelve(content)
  532. o1 = o2 = ''
  533. c1 = c2 = ''
  534. elif tag == 'pre':
  535. content = self.shelve(self.encode_html(content.rstrip("\n") + "\n"))
  536. o1 = "<pre%s>" % atts
  537. o2 = c2 = ''
  538. c1 = '</pre>'
  539. else:
  540. o2 = "\t<%s%s>" % (tag, atts)
  541. c2 = "</%s>" % tag
  542. content = self.graf(content)
  543. return o1, o2, content, c2, c1
  544. def footnoteRef(self, text):
  545. """
  546. >>> t = Textile()
  547. >>> t.footnoteRef('foo[1] ') # doctest: +ELLIPSIS
  548. 'foo<sup class="footnote"><a href="#fn...">1</a></sup> '
  549. """
  550. return re.sub(r'\b\[([0-9]+)\](\s)?', self.footnoteID, text)
  551. def footnoteID(self, match):
  552. id, t = match.groups()
  553. if id not in self.fn:
  554. self.fn[id] = str(uuid.uuid4())
  555. fnid = self.fn[id]
  556. if not t: t = ''
  557. return '<sup class="footnote"><a href="#fn%s">%s</a></sup>%s' % (fnid, id, t)
  558. def glyphs(self, text):
  559. """
  560. >>> t = Textile()
  561. >>> t.glyphs("apostrophe's")
  562. 'apostrophe&#8217;s'
  563. >>> t.glyphs("back in '88")
  564. 'back in &#8217;88'
  565. >>> t.glyphs('foo ...')
  566. 'foo &#8230;'
  567. >>> t.glyphs('--')
  568. '&#8212;'
  569. >>> t.glyphs('FooBar[tm]')
  570. 'FooBar&#8482;'
  571. >>> t.glyphs("<p><cite>Cat's Cradle</cite> by Vonnegut</p>")
  572. '<p><cite>Cat&#8217;s Cradle</cite> by Vonnegut</p>'
  573. """
  574. # fix: hackish
  575. text = re.sub(r'"\z', '\" ', text)
  576. glyph_search = (
  577. re.compile(r"(\w)\'(\w)"), # apostrophe's
  578. re.compile(r'(\s)\'(\d+\w?)\b(?!\')'), # back in '88
  579. re.compile(r'(\S)\'(?=\s|'+self.pnct+'|<|$)'), # single closing
  580. re.compile(r'\'/'), # single opening
  581. re.compile(r'(\S)\"(?=\s|'+self.pnct+'|<|$)'), # double closing
  582. re.compile(r'"'), # double opening
  583. re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), # 3+ uppercase acronym
  584. re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), # 3+ uppercase
  585. re.compile(r'\b(\s{0,1})?\.{3}'), # ellipsis
  586. re.compile(r'(\s?)--(\s?)'), # em dash
  587. re.compile(r'\s-(?:\s|$)'), # en dash
  588. re.compile(r'(\d+)( ?)x( ?)(?=\d+)'), # dimension sign
  589. re.compile(r'\b ?[([]TM[])]', re.I), # trademark
  590. re.compile(r'\b ?[([]R[])]', re.I), # registered
  591. re.compile(r'\b ?[([]C[])]', re.I), # copyright
  592. )
  593. glyph_replace = [x % dict(self.glyph_defaults) for x in (
  594. r'\1%(txt_apostrophe)s\2', # apostrophe's
  595. r'\1%(txt_apostrophe)s\2', # back in '88
  596. r'\1%(txt_quote_single_close)s', # single closing
  597. r'%(txt_quote_single_open)s', # single opening
  598. r'\1%(txt_quote_double_close)s', # double closing
  599. r'%(txt_quote_double_open)s', # double opening
  600. r'<acronym title="\2">\1</acronym>', # 3+ uppercase acronym
  601. r'<span class="caps">\1</span>', # 3+ uppercase
  602. r'\1%(txt_ellipsis)s', # ellipsis
  603. r'\1%(txt_emdash)s\2', # em dash
  604. r' %(txt_endash)s ', # en dash
  605. r'\1\2%(txt_dimension)s\3', # dimension sign
  606. r'%(txt_trademark)s', # trademark
  607. r'%(txt_registered)s', # registered
  608. r'%(txt_copyright)s', # copyright
  609. )]
  610. result = []
  611. for line in re.compile(r'(<.*?>)', re.U).split(text):
  612. if not re.search(r'<.*>', line):
  613. for s, r in zip(glyph_search, glyph_replace):
  614. line = s.sub(r, line)
  615. result.append(line)
  616. return ''.join(result)
  617. def iAlign(self, input):
  618. d = {'<':'left', '=':'center', '>':'right'}
  619. return d.get(input, '')
  620. def vAlign(self, input):
  621. d = {'^':'top', '-':'middle', '~':'bottom'}
  622. return d.get(input, '')
  623. def hAlign(self, input):
  624. d = {'<':'left', '=':'center', '>':'right', '<>': 'justify'}
  625. return d.get(input, '')
  626. def getRefs(self, text):
  627. """
  628. what is this for?
  629. """
  630. pattern = re.compile(r'(?:(?<=^)|(?<=\s))\[(.+)\]((?:http:\/\/|\/)\S+)(?=\s|$)', re.U)
  631. text = pattern.sub(self.refs, text)
  632. return text
  633. def refs(self, match):
  634. flag, url = match.groups()
  635. self.urlrefs[flag] = url
  636. return ''
  637. def checkRefs(self, url):
  638. return self.urlrefs.get(url, url)
  639. def relURL(self, url):
  640. o = urlparse(url)
  641. (scheme,netloc,path,params,query,fragment) = o[0:6]
  642. if (not scheme or scheme == 'http') and not netloc and re.search(r'^\w', path):
  643. url = self.hu + url
  644. if self.restricted and scheme and scheme not in self.url_schemes:
  645. return '#'
  646. return url
  647. def shelve(self, text):
  648. id = str(uuid.uuid4())
  649. self.shelf[id] = text
  650. return id
  651. def retrieve(self, text):
  652. """
  653. >>> t = Textile()
  654. >>> id = t.shelve("foobar")
  655. >>> t.retrieve(id)
  656. 'foobar'
  657. """
  658. while True:
  659. old = text
  660. for k,v in self.shelf.items():
  661. text = text.replace(k,v)
  662. if text == old: break
  663. return text
  664. def encode_html(self, text, quotes=True):
  665. a = (
  666. ('&', '&#38;'),
  667. ('<', '&#60;'),
  668. ('>', '&#62;')
  669. )
  670. if quotes:
  671. a = a + (
  672. ("'", '&#39;'),
  673. ('"', '&#34;')
  674. )
  675. for k,v in a:
  676. text = text.replace(k,v)
  677. return text
  678. def graf(self, text):
  679. if not self.lite:
  680. text = self.noTextile(text)
  681. text = self.code(text)
  682. text = self.links(text)
  683. if not self.noimage:
  684. text = self.image(text)
  685. if not self.lite:
  686. text = self.lists(text)
  687. text = self.table(text)
  688. text = self.span(text)
  689. text = self.footnoteRef(text)
  690. text = self.glyphs(text)
  691. return text.rstrip('\n')
  692. def links(self, text):
  693. """
  694. >>> t = Textile()
  695. >>> t.links('fooobar "Google":http://google.com/foobar/ and hello world "flickr":http://flickr.com/photos/jsamsa/ ') # doctest: +ELLIPSIS
  696. 'fooobar ... and hello world ...'
  697. """
  698. punct = '!"#$%&\'*+,-./:;=?@\\^_`|~'
  699. pattern = r'''
  700. ([\s\[{(]|[%s])? # $pre
  701. " # start
  702. (%s) # $atts
  703. ([^"]+?) # $text
  704. \s?
  705. (?:\(([^)]+?)\)(?="))? # $title
  706. ":
  707. (\S+?) # $url
  708. (\/)? # $slash
  709. ([^\w\/;]*?) # $post
  710. (?=<|\s|$)
  711. ''' % (re.escape(punct), self.c)
  712. text = re.compile(pattern, re.X).sub(self.fLink, text)
  713. return text
  714. def fLink(self, match):
  715. pre, atts, text, title, url, slash, post = match.groups()
  716. if pre == None:
  717. pre = ''
  718. url = self.checkRefs(url)
  719. atts = self.pba(atts)
  720. if title: atts = atts + ' title="%s"' % self.encode_html(title)
  721. if not self.noimage:
  722. text = self.image(text)
  723. text = self.span(text)
  724. text = self.glyphs(text)
  725. url = self.relURL(url)
  726. if slash: url = url + slash
  727. out = '<a href="%s"%s%s>%s</a>' % (self.encode_html(url), atts, self.rel, text)
  728. out = self.shelve(out)
  729. return ''.join([pre, out, post])
  730. def span(self, text):
  731. """
  732. >>> t = Textile()
  733. >>> t.span(r"hello %(bob)span *strong* and **bold**% goodbye")
  734. 'hello <span class="bob">span <strong>strong</strong> and <b>bold</b></span> goodbye'
  735. """
  736. qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^')
  737. pnct = ".,\"'?!;:"
  738. for qtag in qtags:
  739. pattern = re.compile(r"""
  740. (?:^|(?<=[\s>%(pnct)s])|([\]}]))
  741. (%(qtag)s)(?!%(qtag)s)
  742. (%(c)s)
  743. (?::(\S+))?
  744. ([^\s%(qtag)s]+|\S[^%(qtag)s\n]*[^\s%(qtag)s\n])
  745. ([%(pnct)s]*)
  746. %(qtag)s
  747. (?:$|([\]}])|(?=%(selfpnct)s{1,2}|\s))
  748. """ % {'qtag':qtag,'c':self.c,'pnct':pnct,'selfpnct':self.pnct}, re.X)
  749. text = pattern.sub(self.fSpan, text)
  750. return text
  751. def fSpan(self, match):
  752. _, tag, atts, cite, content, end, _ = match.groups()
  753. qtags = {
  754. '*': 'strong',
  755. '**': 'b',
  756. '??': 'cite',
  757. '_' : 'em',
  758. '__': 'i',
  759. '-' : 'del',
  760. '%' : 'span',
  761. '+' : 'ins',
  762. '~' : 'sub',
  763. '^' : 'sup'
  764. }
  765. tag = qtags[tag]
  766. atts = self.pba(atts)
  767. if cite:
  768. atts = atts + 'cite="%s"' % cite
  769. out = "<%s%s>%s%s</%s>" % (tag, atts, content, end, tag)
  770. return out;
  771. def image(self, text):
  772. """
  773. >>> t = Textile()
  774. >>> t.image('!/imgs/myphoto.jpg!:http://jsamsa.com')
  775. '<a href="http://jsamsa.com"><img src="/imgs/myphoto.jpg" alt="" /></a>'
  776. """
  777. pattern = re.compile(r"""
  778. (?:[\[{])? # pre
  779. \! # opening !
  780. (\<|\=|\>)?? # optional alignment atts
  781. (%s) # optional style,class atts
  782. (?:\. )? # optional dot-space
  783. ([^\s(!]+) # presume this is the src
  784. \s? # optional space
  785. (?:\(([^\)]+)\))? # optional title
  786. \! # closing
  787. (?::(\S+))? # optional href
  788. (?:[\]}]|(?=\s|$)) # lookahead: space or end of string
  789. """ % self.c, re.U|re.X)
  790. return pattern.sub(self.fImage, text)
  791. def fImage(self, match):
  792. # (None, '', '/imgs/myphoto.jpg', None, None)
  793. algn, atts, url, title, href = match.groups()
  794. atts = self.pba(atts)
  795. if algn:
  796. atts = atts + ' align="%s"' % self.iAlign(algn)
  797. if title:
  798. atts = atts + ' title="%s" alt="%s"' % (title, title)
  799. else:
  800. atts = atts + ' alt=""'
  801. # TODO how to do this in python?
  802. # size = @getimagesize(url)
  803. # if (size) atts .= " size[3]"
  804. if href:
  805. href = self.checkRefs(href)
  806. url = self.checkRefs(url)
  807. url = self.relURL(url)
  808. out = []
  809. if href: out.append('<a href="%s">' % href)
  810. out.append('<img src="%s"%s />' % (url, atts))
  811. if href: out.append('</a>')
  812. return ''.join(out)
  813. def code(self, text):
  814. text = self.doSpecial(text, '<code>', '</code>', self.fCode)
  815. text = self.doSpecial(text, '@', '@', self.fCode)
  816. text = self.doSpecial(text, '<pre>', '</pre>', self.fPre)
  817. return text
  818. def fCode(self, match):
  819. before, text, after = match.groups()
  820. if after == None: after = ''
  821. # text needs to be escaped
  822. if not self.restricted:
  823. text = self.encode_html(text)
  824. return ''.join([before, self.shelve('<code>%s</code>' % text), after])
  825. def fPre(self, match):
  826. before, text, after = match.groups()
  827. if after == None: after = ''
  828. # text needs to be escapedd
  829. if not self.restricted:
  830. text = self.encode_html(text)
  831. return ''.join([before, '<pre>', self.shelve(text), '</pre>', after])
  832. def doSpecial(self, text, start, end, method=None):
  833. if method == None:
  834. method = self.fSpecial
  835. pattern = re.compile(r'(^|\s|[\[({>])%s(.*?)%s(\s|$|[\])}])?' % (re.escape(start), re.escape(end)), re.M|re.S)
  836. return pattern.sub(method, text)
  837. def fSpecial(self, match):
  838. """
  839. special blocks like notextile or code
  840. """
  841. before, text, after = match.groups()
  842. if after == None: after = ''
  843. return ''.join([before, self.shelve(self.encode_html(text)), after])
  844. def noTextile(self, text):
  845. text = self.doSpecial(text, '<notextile>', '</notextile>', self.fTextile)
  846. return self.doSpecial(text, '==', '==', self.fTextile)
  847. def fTextile(self, match):
  848. before, notextile, after = match.groups()
  849. if after == None: after = ''
  850. return ''.join([before, self.shelve(notextile), after])
  851. def textile(text, **args):
  852. """
  853. this function takes additional parameters:
  854. encoding - input encoding (default: 'utf-8')
  855. output - output encoding (default: 'utf-8')
  856. validate - perform mxTidy or uTidyLib validation (default: False)
  857. sanitize - sanitize output good for weblog comments (default: False)
  858. head_offset - ignored
  859. """
  860. return Textile().textile(text, **args)
  861. def _test():
  862. import doctest
  863. doctest.testmod()
  864. if __name__ == "__main__":
  865. import sys
  866. if len(sys.argv) == 2:
  867. f = open(sys.argv[1])
  868. text = ''.join(f.readlines())
  869. print Textile().textile(text)
  870. else:
  871. _test()