PageRenderTime 73ms CodeModel.GetById 33ms RepoModel.GetById 0ms app.codeStats 1ms

/experiments/old-bluecloth.rb

https://bitbucket.org/ged/bluecloth
Ruby | 1222 lines | 903 code | 125 blank | 194 comment | 7 complexity | 6ec56c31ab60a312dbcc56d510213ab4 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. #!/usr/bin/ruby
  2. #
  3. # Bluecloth is a Ruby implementation of Markdown, a text-to-HTML conversion
  4. # tool.
  5. #
  6. # == Synopsis
  7. #
  8. # doc = BlueCloth.new "
  9. # ## Test document ##
  10. #
  11. # Just a simple test.
  12. # "
  13. #
  14. # puts doc.to_html
  15. #
  16. # == Authors
  17. #
  18. # * Michael Granger <ged@FaerieMUD.org>
  19. #
  20. # == Contributors
  21. #
  22. # * Martin Chase <stillflame@FaerieMUD.org> - Peer review, helpful suggestions
  23. # * Florian Gross <flgr@ccan.de> - Filter options, suggestions
  24. #
  25. # == Copyright
  26. #
  27. # Original version:
  28. # Copyright (c) 2004, 2005, John Gruber
  29. # <http://daringfireball.net/>
  30. # All rights reserved.
  31. #
  32. # Redistribution and use in source and binary forms, with or without
  33. # modification, are permitted provided that the following conditions are
  34. # met:
  35. #
  36. # * Redistributions of source code must retain the above copyright notice,
  37. # this list of conditions and the following disclaimer.
  38. #
  39. # * Redistributions in binary form must reproduce the above copyright
  40. # notice, this list of conditions and the following disclaimer in the
  41. # documentation and/or other materials provided with the distribution.
  42. #
  43. # * Neither the name "Markdown" nor the names of its contributors may
  44. # be used to endorse or promote products derived from this software
  45. # without specific prior written permission.
  46. #
  47. # This software is provided by the copyright holders and contributors "as
  48. # is" and any express or implied warranties, including, but not limited
  49. # to, the implied warranties of merchantability and fitness for a
  50. # particular purpose are disclaimed. In no event shall the copyright owner
  51. # or contributors be liable for any direct, indirect, incidental, special,
  52. # exemplary, or consequential damages (including, but not limited to,
  53. # procurement of substitute goods or services; loss of use, data, or
  54. # profits; or business interruption) however caused and on any theory of
  55. # liability, whether in contract, strict liability, or tort (including
  56. # negligence or otherwise) arising in any way out of the use of this
  57. # software, even if advised of the possibility of such damage.
  58. #
  59. # Ruby port:
  60. # Copyright (c) 2004, 2005 The FaerieMUD Consortium.
  61. #
  62. # You may use, modify, and/or redistribute this software under the same terms
  63. # as Ruby itself. A copy of Ruby's license should be included in this package;
  64. # if not, it can be obtained online at:
  65. # http://www.ruby-lang.org/en/LICENSE.txt.
  66. #
  67. # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
  68. # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
  69. # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
  70. #
  71. # == To-do
  72. #
  73. # * Refactor some of the larger uglier methods that have to do their own
  74. # brute-force scanning because of lack of Perl features in Ruby's Regexp
  75. # class. Alternately, could add a dependency on 'pcre' and use most Perl
  76. # regexps.
  77. #
  78. # * Put the StringScanner in the render state for thread-safety.
  79. #
  80. # == Version
  81. #
  82. # $Id$
  83. #
  84. require 'digest/md5'
  85. require 'logger'
  86. require 'strscan'
  87. ### BlueCloth is a Ruby implementation of Markdown, a text-to-HTML conversion
  88. ### tool.
  89. class OldBlueCloth < String
  ### Error raised when a chunk of Markdown source cannot be parsed.
  class FormatError < RuntimeError

    ### Build the error for the offending source fragment +str+. When a
    ### +specific+ explanation is given it is appended to the message after
    ### a colon.
    def initialize( str, specific=nil )
      return super( "Bad markdown format near %p" % str ) unless specific
      super( "Bad markdown format near %p: %s" % [ str, specific ] )
    end
  end
  # Release Version
  VERSION = '1.1.0'

  # SVN Revision
  SVNREV = %q$Rev$

  # SVN Id tag
  SVNID = %q$Id$

  # Scratch space carried through a single render: link URLs and titles
  # keyed by link id, hidden HTML blocks keyed by placeholder, the current
  # list nesting depth, and a log slot. Used instead of the globals of the
  # Perl version.
  RenderState = Struct.new( "RenderState", :urls, :titles, :html_blocks, :list_level, :log )

  # Tab width for #detab! if none is specified
  # :TODO: Make this DEFAULT_TAB_WIDTH and make tab width a per-instance setting instead.
  TAB_WIDTH = 4
  LESS_THAN_TAB_WIDTH = TAB_WIDTH - 1

  # The tag-closing string -- set to '>' for HTML
  EMPTY_ELEMENT_SUFFIX = " />"

  # Lookup table for every character that can be backslash-escaped in
  # Markdown. Each entry holds the character's MD5 placeholder (:md5), a
  # pattern matching that placeholder (:md5re), and a pattern matching the
  # backslash-escaped character itself (:re).
  ESCAPE_TABLE = '\\`*_{}[]()>#+-.!'.split( // ).inject( {} ) {|table, char|
    digest = Digest::MD5.hexdigest( char )
    table[ char ] = {
      :md5   => digest,
      :md5re => Regexp.new( digest ),
      :re    => Regexp.new( '\\\\' + Regexp.escape(char) ),
    }
    table
  }
  129. #################################################################
  130. ### I N S T A N C E M E T H O D S
  131. #################################################################
  ### Create a new BlueCloth string holding the Markdown source +content+.
  ###
  ### Each entry of +restrictions+ (nested arrays are flattened) names a
  ### boolean setting to switch on, e.g. :filter_html; the matching
  ### "<name>=" writer is called with +true+.
  def initialize( content="", *restrictions )
    # $deferr was removed in Ruby 1.9; on current interpreters it evaluates
    # to nil, which makes Logger discard every message. $stderr is the
    # stream it used to alias.
    @log = Logger.new( $stderr )
    @log.level =
      if $DEBUG then Logger::DEBUG
      elsif $VERBOSE then Logger::INFO
      else Logger::WARN
      end

    @scanner = nil

    # Add any restrictions, and set the line-folding attribute to reflect
    # what happens by default.
    @filter_html = nil
    @filter_styles = nil
    restrictions.flatten.each {|r| __send__( "#{r}=", true ) }
    @fold_lines = true

    super( content )

    @log.debug "String is: %p" % self
  end
  ######
  public
  ######

  # Switches controlling what is emitted for untrusted input. (Untrusted
  # input should still be sanitised when it is submitted.)
  attr_accessor :filter_html
  attr_accessor :filter_styles

  # RedCloth-compatibility accessor. Line-folding is part of Markdown syntax,
  # so nothing reads this value.
  attr_accessor :fold_lines
  ### Render the Markdown-formatted text in this string object as HTML and
  ### return it as a new String; the receiver is not modified. The +lite+
  ### parameter exists only for compatibility with RedCloth and is currently
  ### ignored.
  def to_html( lite=false )

    # Create a StringScanner we can reuse for various lexing tasks
    @scanner = StringScanner.new( '' )

    # Make a structure to carry around stuff that gets placeholdered out of
    # the source.
    rs = RenderState.new( {}, {}, {}, 0 )

    # Make a copy of the string with normalized line endings, tabs turned to
    # spaces, and a couple of guaranteed newlines at the end.
    #
    # The work is done in place on a #dup of self: since Ruby 3.0
    # String#gsub returns a plain String even when called on a subclass, so
    # chaining #detab off its result raises NoMethodError.
    text = dup
    text.gsub!( /\r\n?/, "\n" )
    text.detab!
    text += "\n\n"
    @log.debug "Normalized line-endings: %p" % text

    # Filter HTML if we're asked to do so
    if self.filter_html
      text.gsub!( "<", "&lt;" )
      text.gsub!( ">", "&gt;" )
      @log.debug "Filtered HTML: %p" % text
    end

    # Simplify blank lines
    text.gsub!( /^ +$/, '' )
    @log.debug "Tabs -> spaces/blank lines stripped: %p" % text

    # Replace HTML blocks with placeholders
    text = hide_html_blocks( text, rs )
    @log.debug "Hid HTML blocks: %p" % text
    @log.debug "Render state: %p" % rs

    # Strip link definitions, store in render state
    text = strip_link_definitions( text, rs )
    @log.debug "Stripped link definitions: %p" % text
    @log.debug "Render state: %p" % rs

    # Escape meta-characters
    text = escape_special_chars( text )
    @log.debug "Escaped special characters: %p" % text

    # Transform block-level constructs
    text = apply_block_transforms( text, rs )
    @log.debug "After block-level transforms: %p" % text

    # Now swap back in all the escaped characters
    text = unescape_special_chars( text )
    @log.debug "After unescaping special characters: %p" % text

    return text
  end
  ### Return a copy of this string in which tabs have been expanded to
  ### spaces, with tab stops every +tabwidth+ columns. The receiver is left
  ### untouched; the actual expansion is delegated to #detab!.
  def detab( tabwidth=TAB_WIDTH )
    dup.tap {|expanded| expanded.detab!( tabwidth ) }
  end
  ### Convert tabs to spaces in place, with tab stops every +tabwidth+
  ### columns, and return self (whether or not any tab was found).
  def detab!( tabwidth=TAB_WIDTH )
    # The negative limit keeps trailing empty fields, so trailing newlines
    # survive the split/join round trip instead of being silently dropped.
    expanded = split( /\n/, -1 ).map {|line|
      line.gsub( /(.*?)\t/ ) {
        # Pad the text since the previous tab stop out to the next one
        prefix = Regexp.last_match( 1 )
        prefix + ' ' * (tabwidth - prefix.length % tabwidth)
      }
    }.join( "\n" )

    replace( expanded )
  end
  215. #######
  216. #private
  217. #######
  ### Run the block-level transforms over +str+ in a fixed order, using the
  ### render state +rs+, and return the result: headers, horizontal rules,
  ### lists, code blocks, block quotes, HTML-block hiding, then paragraphs.
  def apply_block_transforms( str, rs )
    # Port: This was called '_runBlockGamut' in the original

    @log.debug "Applying block transforms to:\n %p" % str

    steps = [
      :transform_headers,
      :transform_hrules,
      :transform_lists,
      :transform_code_blocks,
      :transform_block_quotes,
      :hide_html_blocks,
      :form_paragraphs,
    ]
    text = steps.inject( str ) {|current, step| __send__( step, current, rs ) }

    @log.debug "Done with block transforms:\n %p" % text
    text
  end
  ### Run the span-level transforms over +str+ with the render state +rs+
  ### and return the result: code spans, special-character escaping,
  ### images, anchors, auto-links, HTML encoding, emphasis, and finally
  ### hard line breaks.
  def apply_span_transforms( str, rs )
    @log.debug "Applying span transforms to:\n %p" % str

    text = transform_code_spans( str, rs )
    text = escape_special_chars( text )
    [ :transform_images, :transform_anchors, :transform_auto_links ].each {|step|
      text = __send__( step, text, rs )
    }
    text = transform_italic_and_bold( encode_html(text), rs )

    # Hard breaks: two or more trailing spaces become a <br>
    text.gsub!( / {2,}\n/, "<br#{EMPTY_ELEMENT_SUFFIX}\n" )

    @log.debug "Done with span transforms:\n %p" % text
    text
  end
  # The list of tags which are considered block-level constructs and an
  # alternation pattern suitable for use in regexps made from the list.
  # The "loose" list is the strict list without 'ins' and 'del'.
  StrictBlockTags = %w[ p div h[1-6] blockquote pre table dl ol ul script noscript
    form fieldset iframe math ins del ]
  StrictTagPattern = StrictBlockTags.join('|')

  LooseBlockTags = StrictBlockTags - %w[ins del]
  LooseTagPattern = LooseBlockTags.join('|')

  # Nested blocks:
  #   <div>
  #     <div>
  #     tags for inner block must be indented.
  #     </div>
  #   </div>
  #
  # Strict form: the closing tag must start at the beginning of a line.
  StrictBlockRegex = %r{
    ^                           # Start of line
    <(#{StrictTagPattern})      # Start tag: \1
    \b                          # word break
    (.*\n)*?                    # Any number of lines, minimal match
    </\1>                       # Matching end tag
    [ ]*                        # trailing spaces
    $                           # End of line or document
  }ix

  # More-liberal block-matching: the closing tag may be preceded by other
  # text on its line.
  LooseBlockRegex = %r{
    ^                           # Start of line
    <(#{LooseTagPattern})       # start tag: \1
    \b                          # word break
    (.*\n)*?                    # Any number of lines, minimal match
    .*</\1>                     # Anything + Matching end tag
    [ ]*                        # trailing spaces
    $                           # End of line or document
  }ix

  # Special case for <hr />. $1 is the text leading up to the tag (kept by
  # the caller), $2 is the tag itself.
  HruleBlockRegex = %r{
    (                           # $1
      \A\n?                     # Start of doc + optional \n
      |                         # or
      .*\n\n                    # anything + blank line
    )
    (                           # save in $2
      [ ]{0,#{LESS_THAN_TAB_WIDTH}}   # Any spaces
      <hr                       # Tag open
      \b                        # Word break
      ([^<>])*?                 # Attributes
      /?>                       # Tag close
      $                         # followed by a blank line or end of document
    )
  }ix

  # Special case for standalone HTML comments. Same $1/$2 layout as
  # HruleBlockRegex.
  CommentBlockRegex = %r{
    (                           # $1
      \A\n?                     # Start of doc + optional \n
      |                         # or
      .*\n\n                    # anything + blank line
    )
    (                           # save in $2
      [ ]{0,#{LESS_THAN_TAB_WIDTH}}   # Any spaces
      (?:
        <!
        (--.*?--\s*)+
        >
      )
      $                         # followed by a blank line or end of document
    )
  }ix
  ### Return a copy of +str+ in which every HTML block that starts in the
  ### left margin, every standalone <hr> tag and every standalone HTML
  ### comment has been replaced by an MD5 token padded with blank lines.
  ### The original markup is stored in rs.html_blocks under its token.
  def hide_html_blocks( str, rs )
    @log.debug "Hiding HTML blocks in %p" % str

    # Stash a chunk of raw HTML and hand back its padded placeholder
    stash = lambda {|html|
      digest = Digest::MD5.hexdigest( html )
      rs.html_blocks[ digest ] = html
      @log.debug "Replacing %p with %p" % [ html, digest ]
      "\n\n#{digest}\n\n"
    }

    hidden = str.dup

    @log.debug "Finding blocks with the strict regex..."
    hidden.gsub!( StrictBlockRegex, &stash )

    @log.debug "Finding blocks with the loose regex..."
    hidden.gsub!( LooseBlockRegex, &stash )

    # For hrules and comments only the tag ($2) is hidden; the text that
    # led up to it ($1) is put back unchanged.
    @log.debug "Finding hrules..."
    hidden.gsub!( HruleBlockRegex ) {
      lead, tag = $1, $2
      lead + stash.call( tag )
    }

    @log.debug "Finding comments..."
    hidden.gsub!( CommentBlockRegex ) {
      lead, comment = $1, $2
      lead + stash.call( comment )
    }

    hidden
  end
  # Link defs are in the form: ^[id]: url "optional title"
  #
  # The id may be indented by less than one tab width; the url and the
  # title may each be placed on the line after the preceding part. The
  # trailing newline(s) are part of the match.
  LinkRegex = %r{
    ^[ ]{0,#{LESS_THAN_TAB_WIDTH}}\[(.+)\]:   # id = $1
    [ ]*
    \n?                         # maybe *one* newline
    [ ]*
    <?(\S+?)>?                  # url = $2
    [ ]*
    \n?                         # maybe one newline
    [ ]*
    (?:
      # Titles are delimited by "quotes" or (parens).
      ["(]
      (.+?)                     # title = $3
      [")]                      # Matching ) or "
      [ ]*
    )?                          # title is optional
    (?:\n+|\Z)
  }x
  ### Return a copy of +str+ with every link definition removed. Each
  ### definition's HTML-encoded URL is stored in rs.urls, and its title (if
  ### any, with double quotes turned into &quot;) in rs.titles, both keyed
  ### by the downcased link id.
  def strip_link_definitions( str, rs )
    str.gsub( LinkRegex ) {
      found = Regexp.last_match
      key = found[1].downcase

      rs.urls[ key ] = encode_html( found[2] )
      rs.titles[ key ] = found[3].gsub( /"/, "&quot;" ) if found[3]

      ""
    }
  end
  ### Return a copy of +str+ with Markdown-significant characters hidden
  ### behind MD5 placeholders: '*' and '_' inside HTML tags, and
  ### backslash-escaped characters in ordinary text. Raises TypeError if
  ### the tokenizer yields a token type other than :tag or :text.
  def escape_special_chars( str )
    @log.debug " Escaping special characters"
    escaped = ''

    # tokenize_html splits the source into tags and text and calls back
    # once per chunk.
    tokenize_html( str ) {|kind, chunk|
      @log.debug " Adding %p token %p" % [ kind, chunk ]

      escaped +=
        case kind
        when :tag
          # Within tags, encode * and _ so they aren't taken for emphasis
          chunk.
            gsub( /\*/, ESCAPE_TABLE['*'][:md5] ).
            gsub( /_/, ESCAPE_TABLE['_'][:md5] )
        when :text
          # Encode backslashed stuff in regular text
          encode_backslash_escapes( chunk )
        else
          raise TypeError, "Unknown token type %p" % kind
        end
    }

    @log.debug " Text with escapes is now: %p" % escaped
    escaped
  end
  ### Return a copy of +str+ in which every MD5 placeholder from
  ### ESCAPE_TABLE has been swapped back for the character it stands for.
  ### The argument itself is not modified, so frozen strings are accepted.
  def unescape_special_chars( str )
    ESCAPE_TABLE.inject( str.dup ) {|text, (char, hash)|
      @log.debug "Unescaping escaped %p with %p" % [ char, hash[:md5re] ]
      # Block form: the replacement is taken literally, so the backslash
      # entry needs no special handling.
      text.gsub( hash[:md5re] ) { char }
    }
  end
  ### Return a copy of the given +str+ in which each backslash-escaped
  ### special character (backslash included) is replaced by the MD5
  ### placeholder of the character it escapes.
  def encode_backslash_escapes( str )
    # Doubled backslashes go first so that the second one can't be taken
    # for the start of another escape.
    seed = str.gsub( /\\\\/, ESCAPE_TABLE['\\'][:md5] )

    ESCAPE_TABLE.
      reject {|char, _| char == '\\' }.
      inject( seed ) {|text, (_, esc)| text.gsub( esc[:re], esc[:md5] ) }
  end
  ### Return a copy of +str+ in which every Markdown horizontal rule (a
  ### line of three or more '-', '*' or '_', optionally space-separated)
  ### is replaced by an <hr> tag on its own line. +rs+ is not used.
  def transform_hrules( str, rs )
    @log.debug " Transforming horizontal rules"
    rule = "\n<hr#{EMPTY_ELEMENT_SUFFIX}\n"
    str.gsub( /^[ ]{0,2}( ?[\-\*_] ?){3,} *$/, rule )
  end
  # Patterns to match and transform lists
  ListMarkerOl = %r{\d+\.}
  ListMarkerUl = %r{[*+-]}
  ListMarkerAny = Regexp.union( ListMarkerOl, ListMarkerUl )

  # Part of list-pattern common to both first-level and n-level lists.
  #
  # This is a double-quoted String that is interpolated into the regexps
  # below, so every regexp escape must carry a doubled backslash. A single
  # backslash is consumed by the string literal: "\S" is just "S", which
  # would turn the "followed by non-space" lookahead into "followed by a
  # capital S" and let lists swallow the paragraphs after them.
  ListBodyPattern = %Q{
    (?:
      [ ]{0,#{LESS_THAN_TAB_WIDTH}}   # Indent < tab width
      (#{ListMarkerAny})              # $3 (see below): unordered or ordered
      [ ]+                            # At least one space
    )
    (?m:.+?)                          # item content (include newlines)
    (?:
        \\z                           # Either EOF
      |                               # or
        \\n{2,}                       # Blank line...
        (?=\\S)                       # ...followed by non-space
        (?![ ]*                       # ...but not another item
          (#{ListMarkerAny})
        [ ]+)
    )
  }

  # Regexp to match first-level lists
  OuterListRegexp = %r{
    (                                 # $1
      \A\n?                           # Start of doc + optional \n
      |                               # or
      .*\n\n                          # anything + blank line
    )
    (#{ListBodyPattern})              # $2
  }x

  # Regexp to match n-level lists
  InnerListRegexp = %r{
    (^)                               # $1
    (#{ListBodyPattern})              # $2
  }x
  ### Return a copy of +str+ in which each Markdown list has been wrapped
  ### in <ul> or <ol>, chosen from the list's first marker, with its items
  ### rendered by #transform_list_items.
  def transform_lists( str, rs )
    @log.debug " Transforming lists at %p" % (str[0,100] + '...')

    # Inside a list (list_level > 0) the n-level pattern is used; otherwise
    # the first-level one.
    pattern = rs.list_level.zero? ? OuterListRegexp : InnerListRegexp

    str.gsub( pattern ) {
      found = Regexp.last_match
      lead, body, marker = found[1], found[2], found[3]
      @log.debug " Found list bullet %p after %p: %p" %
        [ marker, lead, body ]

      tag = ListMarkerUl.match( marker ) ? "ul" : "ol"

      # Normalise every run of blank lines inside the list to exactly two
      items = transform_list_items( body.gsub(/\n{2,}/, "\n\n\n"), rs )

      %{%s<%s>\n%s</%s>\n} % [ lead, tag, items, tag ]
    }
  end
  # Pattern for transforming list items. An item runs from its marker up to
  # (but not including) the next marker at the same indentation ($2), or
  # the end of the list.
  ListItemRegexp = %r{
    (\n)?                             # leading line = $1
    (^[ ]*)                           # leading whitespace = $2
    (#{ListMarkerAny}) [ ]+           # list marker = $3
    ((?m:.+?)                         # list item text = $4
    (\n{1,2}))
    (?= \n* (\z | \2 (#{ListMarkerAny}) [ ]+))
  }x
  ### Return a copy of the list source +str+ with each item wrapped in
  ### <li> tags. Items preceded by a leading line or containing a blank
  ### line get the full block transforms; other items are checked for
  ### sub-lists and then given span transforms only. rs.list_level is
  ### raised for the duration of the call.
  def transform_list_items( str, rs )
    @log.debug " Transforming list items"

    # Increment the marker for parsing sublists
    rs.list_level += 1

    # Trim trailing blank lines before splitting into items
    str.sub( /\n{2,}\z/, "\n" ).gsub( ListItemRegexp ) {|line|
      @log.debug " Found item line %p" % line
      found = Regexp.last_match
      raw = found[4]

      content =
        if found[1] or /\n{2,}/.match( raw )
          @log.debug " Found leading line or item has a blank"
          apply_block_transforms( outdent(raw), rs )
        else
          # Recursion for sub-lists
          @log.debug " Recursing for sublist"
          apply_span_transforms( transform_lists(outdent(raw), rs).chomp, rs )
        end

      %{<li>%s</li>\n} % content
    }
  ensure
    # Decrement the list-level counter
    rs.list_level -= 1
  end
  # Pattern for matching codeblocks: a run of lines, each indented by a tab
  # or a full tab width of spaces, that follows a blank line or starts the
  # document.
  #
  # NOTE: the final group is a consuming capture ($2), not a true
  # lookahead; #transform_code_blocks re-appends it after the generated
  # markup.
  CodeBlockRegexp = %r{
    (?:\n\n|\A)
    (                                   # $1 = the code block
      (?:
        (?:[ ]{#{TAB_WIDTH}} | \t)      # a tab or tab-width of spaces
        .*\n+
      )+
    )
    (^[ ]{0,#{TAB_WIDTH - 1}}\S|\Z)     # Lookahead for non-space at
                                        # line-start, or end of doc
  }x
  ### Return a copy of +str+ in which each indented Markdown code block is
  ### replaced by a <pre><code> element holding its outdented, encoded
  ### content.
  def transform_code_blocks( str, rs )
    @log.debug " Transforming code blocks"

    str.gsub( CodeBlockRegexp ) {
      found = Regexp.last_match
      code = found[1]
      # The pattern also consumes the start of whatever follows the block,
      # so that text has to be put back after the markup.
      tail = found[2]

      body = encode_code( outdent(code), rs ).rstrip
      %{\n\n<pre><code>%s\n</code></pre>\n\n%s} % [ body, tail ]
    }
  end
  # Pattern for matching Markdown blockquote blocks: one or more groups,
  # each a '>'-prefixed line plus the non-blank lines that follow it and
  # any trailing blank lines.
  BlockQuoteRegexp = %r{
    (?:
      ^[ ]*>[ ]?          # '>' at the start of a line
      .+\n                # rest of the first line
      (?:.+\n)*           # subsequent consecutive lines
      \n*                 # blanks
    )+
  }x

  # A <pre>...</pre> chunk starting at the beginning of a line, together
  # with its leading whitespace.
  PreChunk = %r{ ( ^ \s* <pre> .+? </pre> ) }xm
  ### Return a copy of +str+ in which each Markdown blockquote is replaced
  ### by a <blockquote> element. One level of '>' quoting is removed, the
  ### content is run through the block transforms, and the result is
  ### indented by one tab width (except inside <pre> chunks, whose content
  ### keeps its original indentation).
  def transform_block_quotes( str, rs )
    @log.debug " Transforming block quotes"
    indent = " " * TAB_WIDTH

    str.gsub( BlockQuoteRegexp ) {|quote|
      @log.debug "Making blockquote from %p" % quote

      inner = quote.
        gsub( /^ *> ?/, '' ).   # Trim one level of quoting
        gsub( /^ +$/, '' )      # Trim whitespace-only lines

      body = apply_block_transforms( inner, rs ).gsub( /^/, indent )
      body = body.gsub( PreChunk ) {|pre| pre.gsub( /^#{indent}/, '' ) }

      quoted = %{<blockquote>\n%s\n</blockquote>\n\n} % body
      @log.debug "Blockquoted chunk is: %p" % quoted
      quoted
    }
  end
  559. AutoAnchorURLRegexp = /<((https?|ftp):[^'">\s]+)>/
  560. AutoAnchorEmailRegexp = %r{
  561. <
  562. (
  563. [-.\w]+
  564. \@
  565. [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
  566. )
  567. >
  568. }xi
  ### Return a copy of +str+ in which <URL> and <email@address> shorthand
  ### has been turned into anchors. Email addresses are unescaped and then
  ### obfuscated by #encode_email_address. +rs+ is not used.
  def transform_auto_links( str, rs )
    @log.debug " Transforming auto-links"

    with_urls = str.gsub( AutoAnchorURLRegexp, %{<a href="\\1">\\1</a>} )
    with_urls.gsub( AutoAnchorEmailRegexp ) {
      encode_email_address( unescape_special_chars(Regexp.last_match(1)) )
    }
  end
  # Encoder functions used to obfuscate an email address. Each takes a
  # byte value and returns, in order: a zero-padded decimal entity, an
  # upper-case hexadecimal entity, or the plain character.
  Encoders = [
    lambda {|byte| format( "&#%03d;", byte ) },
    lambda {|byte| format( "&#x%X;", byte ) },
    lambda {|byte| byte.chr },
  ]
  ### Return an anchor for the email address +addr+ whose href ("mailto:"
  ### plus the address) and label (the address alone) are written as a
  ### random mix of decimal entities, hex entities and plain characters, to
  ### make the address harder to harvest. The output therefore differs
  ### from call to call.
  def encode_email_address( addr )
    encoded = ( "mailto:" + addr ).each_byte.map {|b|
      # Compare characters rather than ?: / ?@ literals: since Ruby 1.9
      # those literals are Strings and never equal a byte value.
      case b.chr
      when ':'
        # Kept literal so the "mailto:" prefix can be stripped below
        ':'
      when '@'
        # Always an entity, never the plain character
        Encoders[ rand(2) ][ b ]
      else
        # Roughly 10% plain, 45% hex, 45% decimal
        r = rand( 100 )
        if r > 90 then Encoders[2][ b ]
        elsif r < 45 then Encoders[1][ b ]
        else Encoders[0][ b ]
        end
      end
    }.join

    %{<a href="%s">%s</a>} % [ encoded, encoded.sub(/.+?:/, '') ]
  end
  # Regex for matching Setext-style headers: a title line underlined with
  # a run of '=' or '-'. Because the group is repeated, $2 holds only the
  # last underline character.
  SetextHeaderRegexp = %r{
    (.+)                # The title text ($1)
    \n
    ([\-=])+            # Match a line of = or -. Save only one in $2.
    [ ]*\n+
  }x

  # Regexp for matching ATX-style headers: one to six leading '#'s, the
  # title, and an optional run of closing '#'s.
  AtxHeaderRegexp = %r{
    ^(\#{1,6})          # $1 = string of #'s
    [ ]*
    (.+?)               # $2 = Header text
    [ ]*
    \#*                 # optional closing #'s (not counted)
    \n+
  }x
  ### Return a copy of +str+ in which Setext-style and ATX-style Markdown
  ### headers have been turned into <h1>..<h6> elements. Header text is
  ### run through the span transforms with the render state +rs+.
  def transform_headers( str, rs )
    @log.debug " Transforming headers"

    # Setext-style headers:
    #   Header 1
    #   ========
    #
    #   Header 2
    #   --------
    #
    with_setext = str.gsub( SetextHeaderRegexp ) {
      @log.debug "Found setext-style header"
      title, underline = $1, $2
      title = apply_span_transforms( title, rs )

      case underline
      when '=' then %[<h1>#{title}</h1>\n\n]
      when '-' then %[<h2>#{title}</h2>\n\n]
      else title
      end
    }

    # ATX-style headers: the number of leading #'s gives the level
    with_setext.gsub( AtxHeaderRegexp ) {
      @log.debug "Found ATX-style header"
      hashes, title = $1, $2
      title = apply_span_transforms( title, rs )
      level = hashes.length

      %{<h%d>%s</h%d>\n\n} % [ level, title, level ]
    }
  end
  ### Split +str+ on blank lines and return the pieces joined by a blank
  ### line, where each piece that is an HTML-block placeholder is replaced
  ### by the stored HTML from rs.html_blocks and every other piece is
  ### span-transformed and wrapped in <p> tags.
  def form_paragraphs( str, rs )
    @log.debug " Forming paragraphs"

    # Drop leading and trailing newlines, then cut on blank lines
    trimmed = str.sub( /\A\n+/, '' ).sub( /\n+\z/, '' )

    pieces = trimmed.split( /\n{2,}/ ).map {|graf|
      # Placeholders are swapped back for their HTML; anything else
      # becomes a paragraph.
      rs.html_blocks.fetch( graf ) {
        apply_span_transforms( graf, rs ).sub( /^[ ]*/, '<p>' ) + '</p>'
      }
    }
    rval = pieces.join( "\n\n" )

    @log.debug " Formed paragraphs: %p" % rval
    rval
  end
  # Pattern to match the linkid part of an anchor tag for reference-style
  # links, i.e. the "[id]" that follows "[link text]". The id may be empty.
  RefLinkIdRegex = %r{
    [ ]?                # Optional leading space
    (?:\n[ ]*)?         # Optional newline + spaces
    \[
      (.*?)             # Id = $1
    \]
  }x

  # Pattern to match the "(url "title")" part of an inline link. Group 2 is
  # only the quote character; the title itself is group 3.
  InlineLinkRegex = %r{
    \(                  # Literal paren
      [ ]*              # Zero or more spaces
      <?(.+?)>?         # URI = $1
      [ ]*              # Zero or more spaces
      (?:               #
        ([\"\'])        # Opening quote char = $2
        (.*?)           # Title = $3
        \2              # Matching quote char
      )?                # Title is optional
    \)
  }x
  ### Apply Markdown anchor transforms to a copy of the specified +str+ with
  ### the given render state +rs+ and return it.
  ###
  ### Two link forms are recognised after a bracketed link text:
  ### reference-style ("[text][id]", or "[text][]" which uses the text as
  ### the id), resolved through rs.urls and rs.titles, and inline-style
  ### ("[text](url "title")"). Bracketed text that is followed by neither
  ### form, or whose id is not in rs.urls, is copied through unchanged.
  def transform_anchors( str, rs )
    @log.debug " Transforming anchors"
    # The scanner works on a private copy of the input
    @scanner.string = str.dup
    text = ''

    # Scan the whole string
    until @scanner.eos?

      if @scanner.scan( /\[/ )
        link = ''; linkid = ''
        depth = 1
        # Position just after the opening bracket; startpos-1 is the
        # bracket itself and is used below to recover the raw source.
        startpos = @scanner.pos
        @log.debug " Found a bracket-open at %d" % startpos

        # Scan the rest of the tag, allowing unlimited nested []s. If
        # the scanner runs out of text before the opening bracket is
        # closed, append the text and return (wasn't a valid anchor).
        while depth.nonzero?
          linktext = @scanner.scan_until( /\]|\[/ )

          if linktext
            @log.debug " Found a bracket at depth %d: %p" % [ depth, linktext ]
            link += linktext

            # Decrement depth for each closing bracket
            depth += ( linktext[-1, 1] == ']' ? -1 : 1 )
            @log.debug " Depth is now #{depth}"

          # If there's no more brackets, it must not be an anchor, so
          # just abort.
          else
            @log.debug " Missing closing brace, assuming non-link."
            link += @scanner.rest
            @scanner.terminate
            return text + '[' + link
          end
        end
        link.slice!( -1 ) # Trim final ']'
        @log.debug " Found leading link %p" % link

        # Look for a reference-style second part
        if @scanner.scan( RefLinkIdRegex )
          linkid = @scanner[1]
          # An empty id ("[text][]") means the link text is the id
          linkid = link.dup if linkid.empty?
          # Ids are stored downcased in the render state
          linkid.downcase!
          @log.debug " Found a linkid: %p" % linkid

          # If there's a matching link in the link table, build an
          # anchor tag for it.
          if rs.urls.key?( linkid )
            @log.debug " Found link key in the link table: %p" % rs.urls[linkid]
            url = escape_md( rs.urls[linkid] )

            text += %{<a href="#{url}"}
            if rs.titles.key?(linkid)
              text += %{ title="%s"} % escape_md( rs.titles[linkid] )
            end
            text += %{>#{link}</a>}

          # If the link referred to doesn't exist, just append the raw
          # source to the result
          else
            @log.debug " Linkid %p not found in link table" % linkid
            @log.debug " Appending original string instead: "
            @log.debug "%p" % @scanner.string[ startpos-1 .. @scanner.pos-1 ]
            text += @scanner.string[ startpos-1 .. @scanner.pos-1 ]
          end

        # ...or for an inline style second part
        elsif @scanner.scan( InlineLinkRegex )
          url = @scanner[1]
          title = @scanner[3]
          @log.debug " Found an inline link to %p" % url

          text += %{<a href="%s"} % escape_md( url )
          if title
            # Keep the title from closing the attribute early
            title.gsub!( /"/, "&quot;" )
            text += %{ title="%s"} % escape_md( title )
          end
          text += %{>#{link}</a>}

        # No linkid part: just append the first part as-is.
        else
          @log.debug "No linkid, so no anchor. Appending literal text."
          text += @scanner.string[ startpos-1 .. @scanner.pos-1 ]
        end # if linkid

      # Plain text
      else
        @log.debug " Scanning to the next link from %p" % @scanner.rest
        text += @scanner.scan( /[^\[]+/ )
      end

    end # until @scanner.eos?

    return text
  end
  # Pattern to match strong emphasis in Markdown text: text wrapped in a
  # matching pair of '**' or '__', with no whitespace just inside the
  # delimiters. $2 is the emphasised text.
  BoldRegexp = %r{ (\*\*|__) (\S|\S.*?\S) \1 }x

  # Pattern to match normal emphasis in Markdown text: as above, but with
  # a single '*' or '_'.
  ItalicRegexp = %r{ (\*|_) (\S|\S.*?\S) \1 }x
  ### Return a copy of +str+ with Markdown emphasis turned into HTML:
  ### strong emphasis first (<strong>), then normal emphasis (<em>).
  ### +rs+ is not used.
  def transform_italic_and_bold( str, rs )
    @log.debug " Transforming italic and bold"

    # Bold must be handled first, or its doubled delimiters would be
    # consumed as italics.
    strong_done = str.gsub( BoldRegexp, %{<strong>\\2</strong>} )
    strong_done.gsub( ItalicRegexp, %{<em>\\2</em>} )
  end
  ### Return +str+ with backtick-delimited spans turned into <code>
  ### elements. A span opened by a run of N backticks is closed by the
  ### next run of N backticks; its content is stripped of surrounding
  ### whitespace and passed through #encode_code. When +str+ contains no
  ### backtick at all, +str+ itself is returned.
  ###
  ### Raises FormatError if an opening run of backticks is never closed.
  def transform_code_spans( str, rs )
    @log.debug " Transforming code spans"

    # Set up the string scanner and just return the string unless there's at
    # least one backtick.
    @scanner.string = str.dup
    unless @scanner.exist?( /`/ )
      @scanner.terminate
      @log.debug "No backticks found for code span in %p" % str
      return str
    end

    @log.debug "Transforming code spans in %p" % str

    # Build the transformed text anew
    output = ''

    # Scan to the end of the string
    until @scanner.eos?

      # Scan up to (not into) the next backtick. The pattern is a bare
      # lookahead so that none of an opening run is consumed here; a
      # leading ".?" would swallow the first backtick of a multi-backtick
      # fence whenever the fence is the very next thing in the input.
      leading = @scanner.scan_until( /(?=`)/ )

      # If there's no more backticks, just append the rest of the string
      # and move the scan pointer to the end
      if leading.nil?
        output += @scanner.rest
        @scanner.terminate
        next
      end

      output += leading
      @log.debug "Found backtick at %d after '...%s'" % [ @scanner.pos, output[-10, 10] ]

      # Make a pattern to find the end of the span
      fence = @scanner.scan( /`+/ )
      width = fence.length
      closer = Regexp.new( fence )
      @log.debug "Scanning for end of code span with %p" % closer

      # Scan until the end of the closing backtick sequence. Chop the
      # backticks off the resultant string, strip leading and trailing
      # whitespace, and encode any entities contained in it.
      span = @scanner.scan_until( closer )
      if span.nil?
        raise FormatError.new( @scanner.rest[0,20],
          "No %p found before end" % fence )
      end

      @log.debug "Found close of code span at %d: %p" % [ @scanner.pos - width, span ]
      span.slice!( -width, width )
      output += "<code>%s</code>" % encode_code( span.strip, rs )
    end

    output
  end
  # Next, handle inline images: ![alt text](url "optional title")
  # Don't forget: encode * and _
  #
  # NOTE: no /s modifier here. In Ruby /s does not mean "dot matches
  # newline" as it does in Perl; it fixes the regexp's encoding to
  # Windows-31J, which makes matching any UTF-8 text containing non-ASCII
  # characters raise Encoding::CompatibilityError.
  InlineImageRegexp = %r{
    (                     # Whole match = $1
      !\[ (.*?) \]        # alt text = $2
      \([ ]*
      <?(\S+?)>?          # source url = $3
      [ ]*
      (?:                 #
        (["'])            # quote char = $4
        (.*?)             # title = $5
        \4                # matching quote
        [ ]*
      )?                  # title is optional
      \)
    )
  }x

  # Reference-style images: ![alt text][id]
  ReferenceImageRegexp = %r{
    (                     # Whole match = $1
      !\[ (.*?) \]        # Alt text = $2
      [ ]?                # Optional space
      (?:\n[ ]*)?         # One optional newline + spaces
      \[ (.*?) \]         # id = $3
    )
  }x
  ### Return a copy of +str+ in which Markdown image markup has been
  ### turned into <img> tags. Reference-style images (![alt][id]) are
  ### resolved through rs.urls / rs.titles first and left as-is when the id
  ### is unknown; inline images (![alt](url "title")) are handled second.
  def transform_images( str, rs )
    @log.debug " Transforming images (%p)" % [str]

    # Handle reference-style labeled images: ![alt text][id]
    referenced = str.gsub( ReferenceImageRegexp ) {|match|
      found = Regexp.last_match
      whole, linkid = found[1], found[3].downcase
      @log.debug "Matched %p" % match

      alt = found[2].gsub( /"/, '&quot;' )

      # for shortcut links like ![this][].
      linkid = alt.downcase if linkid.empty?

      result =
        if rs.urls.key?( linkid )
          url = escape_md( rs.urls[linkid] )
          @log.debug "Found url '%s' for linkid '%s' " % [ url, linkid ]

          # Build the tag
          tag = %{<img src="%s" alt="%s"} % [ url, alt ]
          if rs.titles.key?( linkid )
            tag += %{ title="%s"} % escape_md( rs.titles[linkid] )
          end
          tag + EMPTY_ELEMENT_SUFFIX
        else
          whole
        end

      @log.debug "Replacing %p with %p" % [ match, result ]
      result
    }

    # Inline image style
    referenced.gsub( InlineImageRegexp ) {|match|
      @log.debug "Found inline image %p" % match
      found = Regexp.last_match
      url = escape_md( found[3] )
      alt = found[2].gsub( /"/, '&quot;' )

      # Build the tag
      result = %{<img src="%s" alt="%s"} % [ url, alt ]
      if found[5]
        result += %{ title="%s"} % escape_md( found[5].gsub(/"/, '&quot;') )
      end
      result += EMPTY_ELEMENT_SUFFIX

      @log.debug "Replacing %p with %p" % [ match, result ]
      result
    }
  end
  # Regexp to match special characters in a code block: it captures a single
  # asterisk, underscore, curly brace, square bracket, or backslash. The x
  # flag makes the spaces around each alternative insignificant.
  CodeEscapeRegexp = %r{( \* | _ | \{ | \} | \[ | \] | \\ )}x
  ### Return an escaped copy of +str+: '&', '<' and '>' become HTML entities
  ### (in that order, so the entities themselves are not re-encoded), and
  ### every character matched by CodeEscapeRegexp is then swapped for the
  ### :md5 value of its ESCAPE_TABLE entry. +str+ is left unmodified. +rs+
  ### is accepted but not used.
  def encode_code( str, rs )
    html_entities = [
      [ %r{&}, '&amp;' ],
      [ %r{<}, '&lt;' ],
      [ %r{>}, '&gt;' ]
    ]

    # Apply the entity substitutions one after another, ampersand first
    html_safe = html_entities.inject( str ) {|text, (pattern, entity)|
      text.gsub( pattern, entity )
    }

    html_safe.gsub( CodeEscapeRegexp ) {|special| ESCAPE_TABLE[special][:md5] }
  end
  916. #################################################################
  917. ### U T I L I T Y F U N C T I O N S
  918. #################################################################
  ### Return a copy of +str+ in which every asterisk, and then every
  ### underscore, has been replaced by the :md5 value of that character's
  ### ESCAPE_TABLE entry. +str+ itself is not modified.
  def escape_md( str )
    asterisk_token   = ESCAPE_TABLE['*'][:md5]
    underscore_token = ESCAPE_TABLE['_'][:md5]

    without_asterisks = str.gsub( /\*/, asterisk_token )
    without_asterisks.gsub( /_/, underscore_token )
  end
  # Matching constructs for tokenizing X/HTML

  # A comment declaration: "<!", one or more "-- ... --" groups (each
  # optionally followed by whitespace), then ">". The m flag lets the
  # comment text span lines.
  HTMLCommentRegexp = %r{ <! ( -- .*? -- \s* )+ > }mx

  # An XML processing instruction: everything from "<?" up to the first "?>".
  XMLProcInstRegexp = %r{ <\? .*? \?> }mx

  # Either of the two constructs above.
  MetaTag = Regexp.union( HTMLCommentRegexp, XMLProcInstRegexp )

  # The start of a tag: "<" followed by a letter, "/", "!" or "$", then any
  # run of characters that are not angle brackets. The closing ">" is NOT
  # part of this match.
  HTMLTagOpenRegexp = %r{ < [a-z/!$] [^<>]* }imx

  # A single closing angle bracket.
  HTMLTagCloseRegexp = %r{ > }x

  # Either a tag start or a closing angle bracket.
  HTMLTagPart = Regexp.union( HTMLTagOpenRegexp, HTMLTagCloseRegexp )
  ### Break the HTML source in +str+ into a series of tokens and return
  ### them. The tokens are just 2-element Array tuples with a type (:tag or
  ### :text) and the actual content. If this function is called with a
  ### block, the type and text parts of each token will be yielded to it one
  ### at a time as they are extracted.
  ###
  ### The instance's @scanner is pointed at a copy of +str+, so +str+ itself
  ### is left untouched. Raises a RuntimeError if the text runs out in the
  ### middle of a tag.
  def tokenize_html( str )
    tokens = []
    @scanner.string = str.dup

    until @scanner.eos?
      @log.debug "Scanning from %p" % @scanner.rest

      # Match comments and PIs without nesting
      if (( token = @scanner.scan(MetaTag) ))
        type = :tag

      # Do nested matching for HTML tags
      elsif (( token = @scanner.scan(HTMLTagOpenRegexp) ))
        type = :tag
        tagstart = @scanner.pos
        @log.debug " Found the start of a plain tag at %d" % tagstart

        # The opener that was just scanned counts as the first level. Scan
        # the rest of the tag, allowing unlimited nested <>s. If the
        # scanner runs out of text before the tag is closed, raise an
        # error.
        depth = 1
        while depth.nonzero?

          # Scan either an opener or a closer
          chunk = @scanner.scan( HTMLTagPart )
          unless chunk
            raise "Malformed tag at character %d: %p" %
              [ tagstart, token + @scanner.rest ]
          end

          @log.debug " Found another part of the tag at depth %d: %p" % [ depth, chunk ]
          token += chunk

          # If the last character of the token so far is a closing
          # angle bracket, decrement the depth. Otherwise increment
          # it for a nested tag.
          depth += ( token[-1, 1] == '>' ? -1 : 1 )
          @log.debug " Depth is now #{depth}"
        end

      # Match text segments
      else
        @log.debug " Looking for a chunk of text"
        type = :text

        # Scan forward through the next run of non-'<' characters; any
        # non-tag '<' in front of that run is swept into the token with it.
        token = @scanner.scan_until( /[^<]+/m )

        # Nothing but '<' characters left (e.g. the text ends in a stray
        # '<'): scan_until found no match and did not advance, which would
        # otherwise keep this loop spinning forever. Emit the remainder as
        # the final text token.
        if token.nil?
          token = @scanner.rest
          @scanner.terminate
        end
      end

      @log.debug " type: %p, token: %p" % [ type, token ]

      # If a block is given, feed it one token at a time. Add the token to
      # the token list to be returned regardless.
      yield( type, token ) if block_given?
      tokens << [ type, token ]
    end

    return tokens
  end
  ### Return a copy of +str+ in which ampersands that are not already the
  ### start of an entity are encoded as &amp;, and any '<' that is not
  ### followed by a letter, '/', '?', '$' or '!' is encoded as &lt;. Closing
  ### angle brackets are left alone.
  def encode_html( str )
    bare_ampersand   = /&(?!#?[x]?(?:[0-9a-f]+|\w+);)/i
    stray_open_angle = %r{<(?![a-z/?\$!])}i

    ampersands_encoded = str.gsub( bare_ampersand, "&amp;" )
    ampersands_encoded.gsub( stray_open_angle, "&lt;" )
  end
  ### Remove one level of line-leading indentation from a copy of +str+ and
  ### return it. One level is either a single tab or between one and
  ### TAB_WIDTH spaces at the start of a line.
  def outdent( str )
    one_indent_level = /^(\t|[ ]{1,#{TAB_WIDTH}})/
    str.gsub( one_indent_level, '' )
  end
  999. end # class BlueCloth