PageRenderTime 52ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/engines/adva_cms/vendor/gems/BlueCloth-1.0.0/lib/bluecloth.rb

https://github.com/DoktahWorm/adva_cms
Ruby | 1144 lines | 866 code | 119 blank | 159 comment | 5 complexity | 9ff26cb076f660d4760fae8b104119f5 MD5 | raw file
Possible License(s): BSD-3-Clause, MIT, GPL-2.0
  1. #!/usr/bin/ruby
  2. #
  3. # Bluecloth is a Ruby implementation of Markdown, a text-to-HTML conversion
  4. # tool.
  5. #
  6. # == Synopsis
  7. #
  8. # doc = BlueCloth::new "
  9. # ## Test document ##
  10. #
  11. # Just a simple test.
  12. # "
  13. #
  14. # puts doc.to_html
  15. #
  16. # == Authors
  17. #
  18. # * Michael Granger <ged@FaerieMUD.org>
  19. #
  20. # == Contributors
  21. #
  22. # * Martin Chase <stillflame@FaerieMUD.org> - Peer review, helpful suggestions
  23. # * Florian Gross <flgr@ccan.de> - Filter options, suggestions
  24. #
  25. # == Copyright
  26. #
  27. # Original version:
  28. # Copyright (c) 2003-2004 John Gruber
  29. # <http://daringfireball.net/>
  30. # All rights reserved.
  31. #
  32. # Ruby port:
  33. # Copyright (c) 2004 The FaerieMUD Consortium.
  34. #
  35. # BlueCloth is free software; you can redistribute it and/or modify it under the
  36. # terms of the GNU General Public License as published by the Free Software
  37. # Foundation; either version 2 of the License, or (at your option) any later
  38. # version.
  39. #
  40. # BlueCloth is distributed in the hope that it will be useful, but WITHOUT ANY
  41. # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  42. # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  43. #
  44. # == To-do
  45. #
  46. # * Refactor some of the larger uglier methods that have to do their own
  47. # brute-force scanning because of lack of Perl features in Ruby's Regexp
  48. # class. Alternately, could add a dependency on 'pcre' and use most Perl
  49. # regexps.
  50. #
  51. # * Put the StringScanner in the render state for thread-safety.
  52. #
  53. # == Version
  54. #
  55. # $Id: bluecloth.rb 69 2004-08-25 05:27:15Z ged $
  56. #
  57. require 'digest/md5'
  58. require 'logger'
  59. require 'strscan'
  60. ### BlueCloth is a Ruby implementation of Markdown, a text-to-HTML conversion
  61. ### tool.
  62. class BlueCloth < String
  ### Error raised when the Markdown source is malformed (for example an
  ### unterminated code span).
  class FormatError < RuntimeError

    ### Build the exception message from +str+, the source text near the
    ### problem, and an optional +specific+ description of what went wrong.
    def initialize( str, specific=nil )
      return super( "Bad markdown format near %p: %s" % [ str, specific ] ) if specific
      super( "Bad markdown format near %p" % str )
    end

  end
  # Release version string
  Version = '0.0.3'

  # Subversion keyword expansions: revision, Id tag and repository URL
  SvnRev = %q$Rev: 69 $
  SvnId = %q$Id: bluecloth.rb 69 2004-08-25 05:27:15Z ged $
  SvnUrl = %q$URL: svn+ssh://svn.faeriemud.org/usr/local/svn/BlueCloth/trunk/lib/bluecloth.rb $

  # Per-render scratch state: link URLs and titles keyed by link id, and the
  # HTML blocks that have been swapped out for placeholders.
  RenderState = Struct::new( "RenderState", :urls, :titles, :html_blocks, :log )

  # Default tab stop used by #detab and by the indentation-sensitive patterns
  TabWidth = 4

  # Suffix that closes an empty element ('/>' is the XHTML form; '>' would
  # give plain HTML)
  EmptyElementSuffix = "/>"

  # Maps each escapable Markdown character to:
  #   :md5   - the MD5 hex digest used as its placeholder
  #   :md5re - a Regexp that finds that placeholder
  #   :re    - a Regexp that finds the backslash-escaped character
  EscapeTable = '\\`*_{}[]()#.!'.split( // ).inject( {} ) {|table, char|
    digest = Digest::MD5::hexdigest( char )
    table[ char ] = {
      :md5   => digest,
      :md5re => Regexp::new( digest ),
      :re    => Regexp::new( '\\\\' + Regexp::escape(char) ),
    }
    table
  }
  102. #################################################################
  103. ### I N S T A N C E M E T H O D S
  104. #################################################################
  ### Create a new BlueCloth string holding the Markdown source +content+.
  ### Each entry in +restrictions+ (which may be nested arrays) names an
  ### attribute that is switched on by calling its writer with +true+, e.g.
  ### <tt>:filter_html</tt>.
  def initialize( content="", *restrictions )
    # Log to standard error. The previous target, $deferr, was removed in
    # Ruby 1.9; on later interpreters it evaluates to nil and the logger is
    # left without an output device.
    @log = Logger::new( $stderr )
    @log.level = $DEBUG ?
      Logger::DEBUG :
      ($VERBOSE ? Logger::INFO : Logger::WARN)
    @scanner = nil

    # Add any restrictions, and set the line-folding attribute to reflect
    # what happens by default. @fold_lines is assigned after the restrictions
    # are applied, so it always ends up true.
    @filter_html = nil
    @filter_styles = nil
    restrictions.flatten.each {|r| __send__("#{r}=", true) }
    @fold_lines = true

    super( content )

    @log.debug "String is: %p" % self
  end
  ######
  public
  ######
  # Filters for controlling what gets output for untrusted input. (But really,
  # you're filtering bad stuff out of untrusted input at submission-time via
  # untainting, aren't you?)
  # #to_html consults filter_html; nothing in the visible code reads
  # filter_styles.
  attr_accessor :filter_html, :filter_styles
  # RedCloth-compatibility accessor. Line-folding is part of Markdown syntax,
  # so this isn't used by anything.
  attr_accessor :fold_lines
  ### Render the Markdown-formatted text in this string object as HTML and
  ### return it as a new String. The +lite+ parameter exists only for
  ### compatibility with RedCloth and is currently ignored.
  def to_html( lite=false )
    # Create a StringScanner we can reuse for various lexing tasks
    @scanner = StringScanner::new( '' )

    # Make a structure to carry around stuff that gets placeholdered out of
    # the source.
    rs = RenderState::new( {}, {}, {} )

    # Make a copy of the string with normalized line endings, tabs turned to
    # spaces, and a couple of guaranteed newlines at the end. Since Ruby 3.0
    # String#gsub returns a plain String even when called on a subclass, so
    # the normalized text is put back into a copy of self (which keeps this
    # class) before #detab is called on it.
    text = dup.replace( gsub( /\r\n?/, "\n" ) ).detab
    text += "\n\n"
    @log.debug "Normalized line-endings: %p" % text

    # Filter HTML if we're asked to do so
    if filter_html
      text.gsub!( "<", "&lt;" )
      text.gsub!( ">", "&gt;" )
      @log.debug "Filtered HTML: %p" % text
    end

    # Simplify blank lines
    text.gsub!( /^ +$/, '' )
    @log.debug "Tabs -> spaces/blank lines stripped: %p" % text

    # Replace HTML blocks with placeholders
    text = hide_html_blocks( text, rs )
    @log.debug "Hid HTML blocks: %p" % text
    @log.debug "Render state: %p" % rs

    # Strip link definitions, store in render state
    text = strip_link_definitions( text, rs )
    @log.debug "Stripped link definitions: %p" % text
    @log.debug "Render state: %p" % rs

    # Escape meta-characters
    text = escape_special_chars( text )
    @log.debug "Escaped special characters: %p" % text

    # Transform block-level constructs
    text = apply_block_transforms( text, rs )
    @log.debug "After block-level transforms: %p" % text

    # Now swap back in all the escaped characters
    text = unescape_special_chars( text )
    @log.debug "After unescaping special characters: %p" % text

    return text
  end
  ### Return a copy of this string in which tabs have been expanded to
  ### spaces, using tab stops every +tabwidth+ columns. The receiver is left
  ### untouched; the expansion itself is done by #detab!.
  def detab( tabwidth=TabWidth )
    expanded = dup
    expanded.detab!( tabwidth )
    expanded
  end
  ### Expand tabs to spaces in place, padding each tab out to the next
  ### multiple of +tabwidth+ columns, and return self. Self is returned
  ### whether or not any tab was found. The text is split on newlines and
  ### re-joined, so trailing newlines are dropped.
  def detab!( tabwidth=TabWidth )
    expanded_lines = split( /\n/ ).map {|line|
      line.gsub( /(.*?)\t/ ) {
        leading = $1
        leading + ' ' * (tabwidth - leading.length % tabwidth)
      }
    }
    replace( expanded_lines.join("\n") )
  end
  188. #######
  189. #private
  190. #######
  ### Run every block-level transform, in order, over +str+ with the render
  ### state +rs+ and return the resulting text. Each step receives the output
  ### of the previous one.
  def apply_block_transforms( str, rs )
    # Port: This was called '_runBlockGamut' in the original
    @log.debug "Applying block transforms to:\n %p" % str

    steps = [
      :transform_headers,
      :transform_hrules,
      :transform_lists,
      :transform_code_blocks,
      :transform_block_quotes,
      :transform_auto_links,
      :hide_html_blocks,
      :form_paragraphs,
    ]
    text = steps.inject( str ) {|current, step| __send__( step, current, rs ) }

    @log.debug "Done with block transforms:\n %p" % text
    text
  end
  ### Run the span-level transforms over +str+ with the render state +rs+ and
  ### return the result: code spans, HTML encoding, images, anchors, then
  ### emphasis, and finally hard line breaks.
  def apply_span_transforms( str, rs )
    @log.debug "Applying span transforms to:\n %p" % str

    coded    = transform_code_spans( str, rs )
    encoded  = encode_html( coded )
    imaged   = transform_images( encoded, rs )
    anchored = transform_anchors( imaged, rs )
    html     = transform_italic_and_bold( anchored, rs )

    # Hard breaks: two or more spaces before a newline
    html.gsub!( / {2,}\n/, "<br#{EmptyElementSuffix}\n" )

    @log.debug "Done with span transforms:\n %p" % html
    html
  end
  # The list of tags which are considered block-level constructs and an
  # alternation pattern suitable for use in regexps made from the list.
  # Entries are regexp fragments rather than literal names (note h[1-6]).
  StrictBlockTags = %w[ p div h[1-6] blockquote pre table dl ol ul script noscript
  form fieldset iframe math ins del ]
  StrictTagPattern = StrictBlockTags.join('|')
  # The loose list is the strict list without ins and del.
  LooseBlockTags = StrictBlockTags - %w[ins del]
  LooseTagPattern = LooseBlockTags.join('|')
  # Nested blocks:
  # <div>
  # <div>
  # tags for inner block must be indented.
  # </div>
  # </div>
  #
  # The closing tag must start a line. NOTE: the in-pattern comment labels
  # the tag name as \2, but it is the first capture group and is
  # back-referenced as \1.
  StrictBlockRegex = %r{
  ^ # Start of line
  <(#{StrictTagPattern}) # Start tag: \2
  \b # word break
  (.*\n)*? # Any number of lines, minimal match
  </\1> # Matching end tag
  [ ]* # trailing spaces
  $ # End of line or document
  }ix
  # More-liberal block-matching: the closing tag may be preceded by other
  # text on its line. As above, the tag name is capture group 1.
  LooseBlockRegex = %r{
  ^ # Start of line
  <(#{LooseTagPattern}) # start tag: \2
  \b # word break
  (.*\n)*? # Any number of lines, minimal match
  .*</\1> # Anything + Matching end tag
  [ ]* # trailing spaces
  $ # End of line or document
  }ix
  # Special case for <hr />. $1 is the text leading up to the rule (start of
  # document or anything followed by a blank line) and $2 is the rule itself.
  HruleBlockRegex = %r{
  ( # $1
  \A\n? # Start of doc + optional \n
  | # or
  .*\n\n # anything + blank line
  )
  ( # save in $2
  [ ]* # Any spaces
  <hr # Tag open
  \b # Word break
  ([^<>])*? # Attributes
  /?> # Tag close
  $ # followed by a blank line or end of document
  )
  }ix
  ### Return a copy of +str+ in which every HTML block that starts in the
  ### left margin has been replaced by a placeholder: the MD5 hex digest of
  ### the block, surrounded by blank lines. The original markup is stored in
  ### <tt>rs.html_blocks</tt> under that digest.
  def hide_html_blocks( str, rs )
    @log.debug "Hiding HTML blocks in %p" % str

    # Stores a block in the render state and returns its placeholder
    stash = lambda {|html|
      digest = Digest::MD5::hexdigest( html )
      rs.html_blocks[ digest ] = html
      @log.debug "Replacing %p with %p" % [ html, digest ]
      "\n\n#{digest}\n\n"
    }

    hidden = str.dup

    @log.debug "Finding blocks with the strict regex..."
    hidden.gsub!( StrictBlockRegex ) {|block| stash.call( block ) }

    @log.debug "Finding blocks with the loose regex..."
    hidden.gsub!( LooseBlockRegex ) {|block| stash.call( block ) }

    @log.debug "Finding hrules..."
    hidden.gsub!( HruleBlockRegex ) {
      lead, rule = $1, $2
      lead + stash.call( rule )
    }

    hidden
  end
  # Link defs are in the form: ^[id]: url "optional title"
  # Captures: $1 = id, $2 = url (optional surrounding angle brackets are not
  # captured), $3 = title or nil. The title's closing delimiter may be either
  # a quote or a paren, whichever character opened it.
  LinkRegex = %r{
  ^[ ]*\[(.+)\]: # id = $1
  [ ]*
  \n? # maybe *one* newline
  [ ]*
  <?(\S+?)>? # url = $2
  [ ]*
  \n? # maybe one newline
  [ ]*
  (?:
  # Titles are delimited by "quotes" or (parens).
  ["(]
  (.+?) # title = $3
  [")] # Matching ) or "
  [ ]*
  )? # title is optional
  (?:\n+|\Z)
  }x
  ### Return a copy of +str+ with all link definitions removed. Each
  ### definition's URL (HTML-encoded) is recorded in <tt>rs.urls</tt>, and its
  ### title, if any, in <tt>rs.titles</tt> with double quotes turned into
  ### &quot;. Both are keyed by the lower-cased link id.
  def strip_link_definitions( str, rs )
    str.gsub( LinkRegex ) {
      id, url, title = $1, $2, $3
      key = id.downcase

      rs.urls[ key ] = encode_html( url )
      rs.titles[ key ] = title.gsub( /"/, "&quot;" ) if title

      ""
    }
  end
  ### Return a copy of +str+ with Markdown meta-characters escaped: inside
  ### HTML tags, '*' and '_' are replaced with their MD5 placeholders; in
  ### ordinary text, backslash-escaped characters are replaced via
  ### #encode_backslash_escapes. Raises TypeError if the tokenizer yields a
  ### token type other than :tag or :text.
  def escape_special_chars( str )
    @log.debug " Escaping special characters"
    text = ''

    # The original Markdown source has something called '$tags_to_skip'
    # declared here, but it's never used, so I don't define it.

    # The block parameters are named so they do not shadow the +str+
    # argument.
    tokenize_html( str ) {|type, chunk|
      @log.debug " Adding %p token %p" % [ type, chunk ]
      case type

      # Within tags, encode * and _
      when :tag
        text += chunk.
          gsub( /\*/, EscapeTable['*'][:md5] ).
          gsub( /_/, EscapeTable['_'][:md5] )

      # Encode backslashed stuff in regular text
      when :text
        text += encode_backslash_escapes( chunk )

      else
        raise TypeError, "Unknown token type %p" % type
      end
    }

    @log.debug " Text with escapes is now: %p" % text
    return text
  end
  ### Return a copy of +str+ in which every MD5 placeholder from EscapeTable
  ### has been swapped back for the character it stands for. The argument
  ### itself is not modified, so frozen strings are accepted.
  def unescape_special_chars( str )
    text = str.dup
    EscapeTable.each {|char, hash|
      @log.debug "Unescaping escaped %p with %p" % [ char, hash[:md5re] ]
      # Block form: the replacement is inserted literally, so the backslash
      # entry cannot be misread as the start of a back-reference.
      text.gsub!( hash[:md5re] ) { char }
    }
    return text
  end
  ### Return a copy of +str+ in which each backslash-escaped special character
  ### (backslash plus the character) is replaced by the character's MD5
  ### placeholder. Doubled backslashes are handled first so they cannot be
  ### mistaken for an escape of whatever follows them.
  def encode_backslash_escapes( str )
    doubled = str.gsub( /\\\\/, EscapeTable['\\'][:md5] )

    EscapeTable.inject( doubled ) {|text, (char, esc)|
      char == '\\' ? text : text.gsub( esc[:re], esc[:md5] )
    }
  end
  ### Return a copy of +str+ in which every line made up of three or more
  ### '-', '*' or '_' characters (each optionally padded by a single space)
  ### is replaced by an <hr> element. +rs+ is unused.
  def transform_hrules( str, rs )
    @log.debug " Transforming horizontal rules"
    rule = "\n<hr#{EmptyElementSuffix}\n"
    str.gsub( /^( ?[\-\*_] ?){3,}$/ ) { rule }
  end
  # Patterns to match and transform lists
  # Ordered marker: digits followed by a period
  ListMarkerOl = %r{\d+\.}
  # Unordered marker: one of * + -
  ListMarkerUl = %r{[*+-]}
  ListMarkerAny = Regexp::union( ListMarkerOl, ListMarkerUl )
  # Matches a whole list: a marker indented by less than TabWidth spaces,
  # then content running to end of input or to a blank line that is followed
  # by non-space text which is not another list item. $1 is the first
  # item's marker.
  ListRegexp = %r{
  (?:
  ^[ ]{0,#{TabWidth - 1}} # Indent < tab width
  (#{ListMarkerAny}) # unordered or ordered ($1)
  [ ]+ # At least one space
  )
  (?m:.+?) # item content (include newlines)
  (?:
  \z # Either EOF
  | # or
  \n{2,} # Blank line...
  (?=\S) # ...followed by non-space
  (?![ ]* # ...but not another item
  (#{ListMarkerAny})
  [ ]+)
  )
  }x
  ### Return a copy of +str+ in which each Markdown list is replaced by a
  ### <ul> or <ol> element. The element type is chosen from the first item's
  ### marker, and the items are rendered by #transform_list_items.
  def transform_lists( str, rs )
    @log.debug " Transforming lists at %p" % (str[0,100] + '...')

    str.gsub( ListRegexp ) {|list|
      @log.debug " Found list %p" % list
      marker = $1
      tag = marker =~ ListMarkerUl ? "ul" : "ol"

      # Widen every run of blank lines to exactly three newlines
      spaced = list.gsub( /\n{2,}/, "\n\n\n" )

      "<#{tag}>\n#{transform_list_items( spaced, rs )}</#{tag}>\n"
    }
  end
  # Pattern for transforming list items. Captures: $1 = a newline preceding
  # the item (if any), $2 = the item's leading whitespace, $3 = its marker,
  # $4 = the item text including its trailing newline(s). The lookahead
  # requires the item to be followed by end of input or by another marker at
  # the same indent (\2).
  ListItemRegexp = %r{
  (\n)? # leading line = $1
  (^[ ]*) # leading whitespace = $2
  (#{ListMarkerAny}) [ ]+ # list marker = $3
  ((?m:.+?) # list item text = $4
  (\n{1,2}))
  (?= \n* (\z | \2 (#{ListMarkerAny}) [ ]+))
  }x
  ### Return a copy of the list source +str+ with each item wrapped in <li>
  ### tags. An item that is preceded by a blank line or contains one gets the
  ### full block transforms; any other item is checked for sub-lists and then
  ### given the span transforms.
  def transform_list_items( str, rs )
    @log.debug " Transforming list items"

    # Trim trailing blank lines
    trimmed = str.sub( /\n{2,}\z/, "\n" )

    trimmed.gsub( ListItemRegexp ) {|line|
      @log.debug " Found item line %p" % line
      blank_before, body = $1, $4

      rendered =
        if blank_before || body =~ /\n{2,}/
          @log.debug " Found leading line or item has a blank"
          apply_block_transforms( outdent(body), rs )
        else
          # Recursion for sub-lists
          @log.debug " Recursing for sublist"
          apply_span_transforms( transform_lists(outdent(body), rs).chomp, rs )
        end

      "<li>#{rendered}</li>\n"
    }
  end
  # Pattern for matching codeblocks: one or more lines indented by a tab or
  # by TabWidth spaces, preceded by a blank line or the start of the text.
  # $1 is the block. The final group is a consuming capture ($2), not a true
  # lookahead; #transform_code_blocks puts its text back after the block.
  CodeBlockRegexp = %r{
  (?:\n\n|\A)
  ( # $1 = the code block
  (?:
  (?:[ ]{#{TabWidth}} | \t) # a tab or tab-width of spaces
  .*\n+
  )+
  )
  (^[ ]{0,#{TabWidth - 1}}\S|\Z) # Lookahead for non-space at
  # line-start, or end of doc
  }x
  ### Return a copy of +str+ in which each indented code block is replaced by
  ### a <pre><code> element. The block is outdented one level, encoded with
  ### #encode_code and stripped of trailing whitespace. The text the pattern
  ### consumed after the block is put back behind the element.
  def transform_code_blocks( str, rs )
    @log.debug " Transforming code blocks"

    str.gsub( CodeBlockRegexp ) {
      code, tail = $1, $2
      body = encode_code( outdent(code), rs ).rstrip

      # Generate the codeblock
      "\n\n<pre><code>#{body}\n</code></pre>\n\n#{tail}"
    }
  end
  # Pattern for matching Markdown blockquote blocks: one or more runs, each
  # starting with a '>' line, continuing over the consecutive non-blank lines
  # after it, and ending with any blank lines.
  BlockQuoteRegexp = %r{
  (?:
  ^[ ]*>[ ]? # '>' at the start of a line
  .+\n # rest of the first line
  (?:.+\n)* # subsequent consecutive lines
  \n* # blanks
  )+
  }x
  # Matches a <pre>...</pre> chunk, including any whitespace before it on its
  # first line; used to undo blockquote indentation inside <pre> elements.
  PreChunk = %r{ ( ^ \s* <pre> .+? </pre> ) }xm
  ### Return a copy of +str+ in which each Markdown blockquote is replaced by
  ### a <blockquote> element. One level of '>' quoting is stripped, the
  ### contents are given the block transforms, and the result is indented by
  ### TabWidth spaces, except inside <pre> chunks, where the indent is removed
  ### again.
  def transform_block_quotes( str, rs )
    @log.debug " Transforming block quotes"

    str.gsub( BlockQuoteRegexp ) {|quote|
      @log.debug "Making blockquote from %p" % quote

      inner = quote.
        gsub( /^ *> ?/, '' ). # Trim one level of quoting
        gsub( /^ +$/, '' )    # Trim whitespace-only lines

      indent = " " * TabWidth
      indented = apply_block_transforms( inner, rs ).gsub( /^/, indent )
      body = indented.gsub( PreChunk ) {|pre| pre.gsub( /^#{indent}/, '' ) }

      quoted = "<blockquote>\n#{body}\n</blockquote>\n\n"
      @log.debug "Blockquoted chunk is: %p" % quoted
      quoted
    }
  end
  487. AutoAnchorURLRegexp = /<((https?|ftp):[^'">\s]+)>/
  488. AutoAnchorEmailRegexp = %r{
  489. <
  490. (
  491. [-.\w]+
  492. \@
  493. [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
  494. )
  495. >
  496. }xi
  ### Return a copy of +str+ in which <URL> auto-links have become anchors
  ### and <address@host> auto-links have become obfuscated mailto: anchors.
  ### +rs+ is unused.
  def transform_auto_links( str, rs )
    @log.debug " Transforming auto-links"

    with_urls = str.gsub( AutoAnchorURLRegexp ) {
      url = $1
      %{<a href="#{url}">#{url}</a>}
    }

    with_urls.gsub( AutoAnchorEmailRegexp ) {
      encode_email_address( unescape_special_chars($1) )
    }
  end
  # Encoder functions used to obfuscate an email address. Each takes a byte
  # value and returns, in order: a zero-padded decimal entity, an upper-case
  # hexadecimal entity, or the character itself.
  Encoders = [
    lambda {|byte| format( "&#%03d;", byte ) },
    lambda {|byte| format( "&#x%X;", byte ) },
    lambda {|byte| byte.chr },
  ]
  ### Return an anchor tag for the email address +addr+ in which the
  ### "mailto:" URL and the link text are obfuscated with a random mix of
  ### decimal entities, hexadecimal entities and raw characters. The ':' is
  ### always left literal and the '@' is always entity-encoded. The output
  ### differs from call to call because it depends on Kernel#rand.
  def encode_email_address( addr )
    rval = ''
    ("mailto:" + addr).each_byte {|b|
      # Compare the byte as a one-character string. The ?: and ?@ literals
      # are Integers on Ruby 1.8 but Strings from 1.9 on, so matching them
      # against the Integer byte never succeeds on newer interpreters.
      case b.chr
      when ':'
        rval += ":"
      when '@'
        rval += Encoders[ rand(2) ][ b ]
      else
        r = rand(100)
        rval += (
          r > 90 ? Encoders[2][ b ] :
          r < 45 ? Encoders[1][ b ] :
                   Encoders[0][ b ]
        )
      end
    }

    # The link text is the encoded URL minus everything up to the first
    # literal ':' (the "mailto:" scheme).
    return %{<a href="%s">%s</a>} % [ rval, rval.sub(/.+?:/, '') ]
  end
  # Regex for matching Setext-style headers: a title line followed by a line
  # of '=' or '-' characters. Because the underline group is repeated, $2
  # holds only its last character.
  SetextHeaderRegexp = %r{
  (.+) # The title text ($1)
  \n
  ([\-=])+ # Match a line of = or -. Save only one in $2.
  [ ]*\n+
  }x
  # Regexp for matching ATX-style headers: one to six '#' characters, the
  # header text, and optional closing '#' characters.
  AtxHeaderRegexp = %r{
  ^(\#{1,6}) # $1 = string of #'s
  [ ]*
  (.+?) # $2 = Header text
  [ ]*
  \#* # optional closing #'s (not counted)
  \n+
  }x
  ### Apply Markdown header transforms to a copy of the given +str+ and render
  ### state +rs+ and return the result. Setext-style headers are handled
  ### first, then ATX-style ones; header text is given the span transforms.
  def transform_headers( str, rs )
    @log.debug " Transforming headers"

    # Setext-style headers:
    #   Header 1
    #   ========
    #
    #   Header 2
    #   --------
    #
    setext_done = str.gsub( SetextHeaderRegexp ) {
      @log.debug "Found setext-style header"
      title, underline = $1, $2
      html = apply_span_transforms( title, rs )

      # '=' underlines make an h1 and '-' underlines an h2; anything else
      # leaves just the transformed title.
      tag = { '=' => 'h1', '-' => 'h2' }[ underline ]
      tag ? "<#{tag}>#{html}</#{tag}>\n\n" : html
    }

    # ATX-style headers: the number of leading #'s gives the level
    setext_done.gsub( AtxHeaderRegexp ) {
      @log.debug "Found ATX-style header"
      hashes, title = $1, $2
      html = apply_span_transforms( title, rs )
      level = hashes.length

      "<h#{level}>#{html}</h#{level}>\n\n"
    }
  end
  ### Split a copy of +str+ into chunks at blank lines and return the chunks
  ### re-joined with blank lines. A chunk that is a placeholder key in
  ### <tt>rs.html_blocks</tt> is replaced by the stored HTML; every other
  ### chunk is given the span transforms and wrapped in <p> tags.
  def form_paragraphs( str, rs )
    @log.debug " Forming paragraphs"

    trimmed = str.sub( /\A\n+/, '' ).sub( /\n+\z/, '' )

    rendered = trimmed.split( /\n{2,}/ ).map {|chunk|
      # Unhashify HTML blocks if this is a placeholder; otherwise wrap the
      # chunk in <p> tags
      rs.html_blocks.fetch( chunk ) {
        apply_span_transforms( chunk, rs ).sub( /^[ ]*/, '<p>' ) + '</p>'
      }
    }
    rval = rendered.join( "\n\n" )

    @log.debug " Formed paragraphs: %p" % rval
    rval
  end
  # Pattern to match the linkid part of an anchor tag for reference-style
  # links. $1 is the id, which may be empty.
  RefLinkIdRegex = %r{
  [ ]? # Optional leading space
  (?:\n[ ]*)? # Optional newline + spaces
  \[
  (.*?) # Id = $1
  \]
  }x
  # Pattern to match the parenthesized part of an inline link:
  # (url "optional title"). $1 is the URI, $2 the quote character and $3 the
  # title; $2 and $3 are nil when no title is given.
  InlineLinkRegex = %r{
  \( # Literal paren
  [ ]* # Zero or more spaces
  <?(.+?)>? # URI = $1
  [ ]* # Zero or more spaces
  (?: #
  ([\"\']) # Opening quote char = $2
  (.*?) # Title = $3
  \2 # Matching quote char
  )? # Title is optional
  \)
  }x
  ### Apply Markdown anchor transforms to a copy of the specified +str+ with
  ### the given render state +rs+ and return it.
  ###
  ### Handles reference-style links ([text][id], looked up in
  ### <tt>rs.urls</tt> and <tt>rs.titles</tt>) and inline links
  ### ([text](url "title")). Bracketed text followed by neither form, or by an
  ### id that has no entry in <tt>rs.urls</tt>, is copied through unchanged.
  ### Scanning uses the shared @scanner, whose string is replaced here.
  def transform_anchors( str, rs )
    @log.debug " Transforming anchors"
    @scanner.string = str.dup
    text = ''

    # Scan the whole string
    until @scanner.empty?

      if @scanner.scan( /\[/ )
        link = ''; linkid = ''
        depth = 1
        # Position just past the opening bracket; startpos-1 is the bracket
        startpos = @scanner.pos
        @log.debug " Found a bracket-open at %d" % startpos

        # Scan the rest of the tag, allowing unlimited nested []s. If
        # the scanner runs out of text before the opening bracket is
        # closed, append the text and return (wasn't a valid anchor).
        while depth.nonzero?
          linktext = @scanner.scan_until( /\]|\[/ )

          if linktext
            @log.debug " Found a bracket at depth %d: %p" % [ depth, linktext ]
            link += linktext

            # Decrement depth for each closing bracket
            depth += ( linktext[-1, 1] == ']' ? -1 : 1 )
            @log.debug " Depth is now #{depth}"

          # If there's no more brackets, it must not be an anchor, so
          # just abort.
          else
            @log.debug " Missing closing brace, assuming non-link."
            link += @scanner.rest
            @scanner.terminate
            # Early exit: everything collected so far plus the unclosed
            # bracket text is the result
            return text + '[' + link
          end
        end
        link.slice!( -1 ) # Trim final ']'
        @log.debug " Found leading link %p" % link

        # Look for a reference-style second part
        if @scanner.scan( RefLinkIdRegex )
          linkid = @scanner[1]
          # An empty id ([text][]) means the link text doubles as the id
          linkid = link.dup if linkid.empty?
          linkid.downcase!
          @log.debug " Found a linkid: %p" % linkid

          # If there's a matching link in the link table, build an
          # anchor tag for it.
          if rs.urls.key?( linkid )
            @log.debug " Found link key in the link table: %p" % rs.urls[linkid]
            url = escape_md( rs.urls[linkid] )

            text += %{<a href="#{url}"}
            if rs.titles.key?(linkid)
              text += %{ title="%s"} % escape_md( rs.titles[linkid] )
            end
            text += %{>#{link}</a>}

          # If the link referred to doesn't exist, just append the raw
          # source to the result
          else
            @log.debug " Linkid %p not found in link table" % linkid
            @log.debug " Appending original string instead: "
            @log.debug "%p" % @scanner.string[ startpos-1 .. @scanner.pos-1 ]
            text += @scanner.string[ startpos-1 .. @scanner.pos-1 ]
          end

        # ...or for an inline style second part
        elsif @scanner.scan( InlineLinkRegex )
          url = @scanner[1]
          title = @scanner[3]
          @log.debug " Found an inline link to %p" % url

          text += %{<a href="%s"} % escape_md( url )
          if title
            title.gsub!( /"/, "&quot;" )
            text += %{ title="%s"} % escape_md( title )
          end
          text += %{>#{link}</a>}

        # No linkid part: just append the first part as-is.
        else
          @log.debug "No linkid, so no anchor. Appending literal text."
          text += @scanner.string[ startpos-1 .. @scanner.pos-1 ]
        end # if linkid

      # Plain text
      else
        @log.debug " Scanning to the next link from %p" % @scanner.rest
        text += @scanner.scan( /[^\[]+/ )
      end

    end # until @scanner.empty?

    return text
  end
  # Pattern to match strong emphasis in Markdown text: text wrapped in a
  # matching pair of ** or __, starting and ending with a non-space character.
  # $1 is the delimiter and $2 the emphasized text.
  BoldRegexp = %r{ (\*\*|__) (\S|\S.+?\S) \1 }x
  # Pattern to match normal emphasis in Markdown text: the same shape with a
  # single * or _ as the delimiter.
  ItalicRegexp = %r{ (\*|_) (\S|\S.+?\S) \1 }x
  ### Return a copy of +str+ with strong emphasis wrapped in <strong> tags
  ### and normal emphasis in <em> tags. Bold is handled first so that its
  ### double delimiters are not consumed as italics. +rs+ is unused.
  def transform_italic_and_bold( str, rs )
    @log.debug " Transforming italic and bold"

    bolded = str.gsub( BoldRegexp ) { "<strong>#{$2}</strong>" }
    bolded.gsub( ItalicRegexp ) { "<em>#{$2}</em>" }
  end
  720. ### Transform backticked spans into <code> spans.
  721. def transform_code_spans( str, rs )
  722. @log.debug " Transforming code spans"
  723. # Set up the string scanner and just return the string unless there's at
  724. # least one backtick.
  725. @scanner.string = str.dup
  726. unless @scanner.exist?( /`/ )
  727. @scanner.terminate
  728. @log.debug "No backticks found for code span in %p" % str
  729. return str
  730. end
  731. @log.debug "Transforming code spans in %p" % str
  732. # Build the transformed text anew
  733. text = ''
  734. # Scan to the end of the string
  735. until @scanner.empty?
  736. # Scan up to an opening backtick
  737. if pre = @scanner.scan_until( /.?(?=`)/m )
  738. text += pre
  739. @log.debug "Found backtick at %d after '...%s'" % [ @scanner.pos, text[-10, 10] ]
  740. # Make a pattern to find the end of the span
  741. opener = @scanner.scan( /`+/ )
  742. len = opener.length
  743. closer = Regexp::new( opener )
  744. @log.debug "Scanning for end of code span with %p" % closer
  745. # Scan until the end of the closing backtick sequence. Chop the
  746. # backticks off the resultant string, strip leading and trailing
  747. # whitespace, and encode any enitites contained in it.
  748. codespan = @scanner.scan_until( closer ) or
  749. raise FormatError::new( @scanner.rest[0,20],
  750. "No %p found before end" % opener )
  751. @log.debug "Found close of code span at %d: %p" % [ @scanner.pos - len, codespan ]
  752. codespan.slice!( -len, len )
  753. text += "<code>%s</code>" %
  754. encode_code( codespan.strip, rs )
  755. # If there's no more backticks, just append the rest of the string
  756. # and move the scan pointer to the end
  757. else
  758. text += @scanner.rest
  759. @scanner.terminate
  760. end
  761. end
  762. return text
  763. end
  # Next, handle inline images: ![alt text](url "optional title")
  # Don't forget: encode * and _
  #
  # Captures: $1 = whole match, $2 = alt text, $3 = source url, $4 = quote
  # character, $5 = title ($4 and $5 are nil when no title is given).
  #
  # Only the 'x' flag is used. The Perl original's 's' flag means something
  # different in Ruby: it fixes the regexp encoding to Windows-31J, and
  # matching such a pattern against non-ASCII UTF-8 text raises
  # Encoding::CompatibilityError.
  InlineImageRegexp = %r{
    (                   # Whole match = $1
      !\[ (.*?) \]      # alt text = $2
      \([ ]*
      <?(\S+?)>?        # source url = $3
      [ ]*
      (?:               #
        (["'])          # quote char = $4
        (.*?)           # title = $5
        \4              # matching quote
        [ ]*
      )?                # title is optional
      \)
    )
  }x

  # Reference-style images: ![alt text][id]
  #
  # Captures: $1 = whole match, $2 = alt text, $3 = id (may be empty).
  ReferenceImageRegexp = %r{
    (                   # Whole match = $1
      !\[ (.*?) \]      # Alt text = $2
      [ ]?              # Optional space
      (?:\n[ ]*)?       # One optional newline + spaces
      \[ (.*?) \]       # id = $3
    )
  }x
  ### Return a copy of +str+ in which image markup has been turned into <img>
  ### tags. Reference-style images (![alt][id]) are resolved against
  ### <tt>rs.urls</tt> and <tt>rs.titles</tt> by lower-cased id, and are left
  ### untouched when the id is unknown; an empty id falls back to the
  ### lower-cased alt text. Inline images (![alt](url "title")) are converted
  ### afterwards. Double quotes in alt text and inline titles become &quot;.
  def transform_images( str, rs )
    # No format arguments here: the message has no directive, and a surplus
    # argument makes String#% raise ArgumentError when $DEBUG is set.
    @log.debug " Transforming images"

    # Handle reference-style labeled images: ![alt text][id]
    referenced = str.gsub( ReferenceImageRegexp ) {|match|
      whole, alt, linkid = $1, $2, $3.downcase
      @log.debug "Matched %p" % match
      alt.gsub!( /"/, '&quot;' )

      # for shortcut links like ![this][].
      linkid = alt.downcase if linkid.empty?

      if rs.urls.key?( linkid )
        url = escape_md( rs.urls[linkid] )
        @log.debug "Found url '%s' for linkid '%s' " % [ url, linkid ]

        # Build the tag
        result = %{<img src="%s" alt="%s"} % [ url, alt ]
        if rs.titles.key?( linkid )
          result += %{ title="%s"} % escape_md( rs.titles[linkid] )
        end
        result += EmptyElementSuffix

      else
        result = whole
      end

      @log.debug "Replacing %p with %p" % [ match, result ]
      result
    }

    # Inline image style
    referenced.gsub( InlineImageRegexp ) {|match|
      @log.debug "Found inline image %p" % match
      whole, alt, title = $1, $2, $5
      url = escape_md( $3 )
      alt.gsub!( /"/, '&quot;' )

      # Build the tag
      result = %{<img src="%s" alt="%s"} % [ url, alt ]
      unless title.nil?
        title.gsub!( /"/, '&quot;' )
        result += %{ title="%s"} % escape_md( title )
      end
      result += EmptyElementSuffix

      @log.debug "Replacing %p with %p" % [ match, result ]
      result
    }
  end
  # Regexp to match special characters in a code block: * _ { } [ ] and
  # backslash. #encode_code looks each match up in EscapeTable.
  CodeEscapeRegexp = %r{( \* | _ | \{ | \} | \[ | \] | \\ )}x
  ### Return a copy of +str+ made safe for use inside a code element: '&',
  ### '<' and '>' become HTML entities, and the characters matched by
  ### CodeEscapeRegexp are replaced with their MD5 placeholders from
  ### EscapeTable. +rs+ is unused.
  def encode_code( str, rs )
    entities = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;' }
    html_safe = str.gsub( /[&<>]/ ) {|char| entities[char] }

    html_safe.gsub( CodeEscapeRegexp ) {|char| EscapeTable[char][:md5] }
  end
  844. #################################################################
  845. ### U T I L I T Y F U N C T I O N S
  846. #################################################################
  ### Return a copy of +str+ in which every '*' and '_' has been replaced
  ### with its MD5 placeholder from EscapeTable, so that later emphasis
  ### processing leaves it alone.
  def escape_md( str )
    str.gsub( /[*_]/ ) {|char| EscapeTable[char][:md5] }
  end
  # Matching constructs for tokenizing X/HTML
  # HTML comments (<!-- ... -->) and XML processing instructions (<? ... ?>)
  HTMLCommentRegexp = %r{ <! ( -- .*? -- \s* )+ > }mx
  XMLProcInstRegexp = %r{ <\? .*? \?> }mx
  MetaTag = Regexp::union( HTMLCommentRegexp, XMLProcInstRegexp )
  # The start of a tag up to (not including) the next '<' or '>', and the
  # closing '>' on its own; #tokenize_html combines them to follow nested
  # angle brackets.
  HTMLTagOpenRegexp = %r{ < [a-z/!$] [^<>]* }imx
  HTMLTagCloseRegexp = %r{ > }x
  HTMLTagPart = Regexp::union( HTMLTagOpenRegexp, HTMLTagCloseRegexp )
  ### Break the HTML source in +str+ into a series of tokens and return
  ### them. The tokens are just 2-element Array tuples with a type and the
  ### actual content. If this function is called with a block, the type and
  ### text parts of each token will be yielded to it one at a time as they are
  ### extracted.
  ###
  ### The type is :tag for comments, processing instructions and tags, and
  ### :text for everything else. Raises RuntimeError if a tag is opened but
  ### never closed. Scanning uses the shared @scanner, whose string is
  ### replaced here.
  def tokenize_html( str )
    depth = 0
    tokens = []
    @scanner.string = str.dup
    type, token = nil, nil

    until @scanner.empty?
      @log.debug "Scanning from %p" % @scanner.rest

      # Match comments and PIs without nesting
      if (( token = @scanner.scan(MetaTag) ))
        type = :tag

      # Do nested matching for HTML tags
      elsif (( token = @scanner.scan(HTMLTagOpenRegexp) ))
        # Position just after the opening part of the tag; only used in the
        # error message below
        tagstart = @scanner.pos
        @log.debug " Found the start of a plain tag at %d" % tagstart

        # Start the token with the opening angle
        depth = 1
        type = :tag

        # Scan the rest of the tag, allowing unlimited nested <>s. If
        # the scanner runs out of text before the tag is closed, raise
        # an error.
        while depth.nonzero?

          # Scan either an opener or a closer
          chunk = @scanner.scan( HTMLTagPart ) or
            raise "Malformed tag at character %d: %p" %
              [ tagstart, token + @scanner.rest ]

          @log.debug " Found another part of the tag at depth %d: %p" % [ depth, chunk ]

          token += chunk

          # If the last character of the token so far is a closing
          # angle bracket, decrement the depth. Otherwise increment
          # it for a nested tag.
          depth += ( token[-1, 1] == '>' ? -1 : 1 )
          @log.debug " Depth is now #{depth}"
        end

      # Match text segments
      else
        @log.debug " Looking for a chunk of text"
        type = :text

        # Scan forward, always matching at least one character to move
        # the pointer beyond any non-tag '<'.
        token = @scanner.scan_until( /[^<]+/m )
      end

      @log.debug " type: %p, token: %p" % [ type, token ]

      # If a block is given, feed it one token at a time. Add the token to
      # the token list to be returned regardless.
      if block_given?
        yield( type, token )
      end
      tokens << [ type, token ]
    end

    return tokens
  end
  ### Return a copy of +str+ in which each ampersand that does not start an
  ### entity is replaced with &amp;, and each '<' that does not look like the
  ### start of a tag (not followed by a letter, '/', '?', '$' or '!') is
  ### replaced with &lt;.
  def encode_html( str )
    bare_ampersand = /&(?!#?[x]?(?:[0-9a-f]+|\w+);)/i
    bare_less_than = %r{<(?![a-z/?\$!])}i

    str.gsub( bare_ampersand, "&amp;" ).gsub( bare_less_than, "&lt;" )
  end
  ### Remove one level of line-leading indentation (a single tab, or from
  ### one up to TabWidth spaces) from every line of a copy of +str+ and
  ### return the copy.
  def outdent( str )
    one_level = /^(\t|[ ]{1,#{TabWidth}})/
    str.gsub( one_level, '' )
  end
  927. end # class BlueCloth