PageRenderTime 69ms CodeModel.GetById 38ms RepoModel.GetById 1ms app.codeStats 0ms

/bin/bluecloth

https://github.com/viniciusteles/selenium_poetry
Ruby | 1226 lines | 935 code | 120 blank | 171 comment | 5 complexity | a1d67cd4ee82554b60941ab72296e58b MD5 | raw file
  1. #!/usr/bin/ruby
  2. #
  3. # = bluecloth
  4. #
  5. # Format one or more text files with the markdown formatter.
  6. #
  7. # = Synopsis
  8. #
  9. # bluecloth [OPTIONS] [FILES]
  10. #
  11. #
  12. #
  13. #!/usr/bin/ruby
  14. #
  15. # Bluecloth is a Ruby implementation of Markdown, a text-to-HTML conversion
  16. # tool.
  17. #
  18. # == Synopsis
  19. #
  20. # doc = BlueCloth::new "
  21. # ## Test document ##
  22. #
  23. # Just a simple test.
  24. # "
  25. #
  26. # puts doc.to_html
  27. #
  28. # == Authors
  29. #
  30. # * Michael Granger <ged@FaerieMUD.org>
  31. #
  32. # == Contributors
  33. #
  34. # * Martin Chase <stillflame@FaerieMUD.org> - Peer review, helpful suggestions
  35. # * Florian Gross <flgr@ccan.de> - Filter options, suggestions
  36. #
  37. # == Copyright
  38. #
  39. # Original version:
  40. # Copyright (c) 2003-2004 John Gruber
  41. # <http://daringfireball.net/>
  42. # All rights reserved.
  43. #
  44. # Ruby port:
  45. # Copyright (c) 2004 The FaerieMUD Consortium.
  46. #
  47. # BlueCloth is free software; you can redistribute it and/or modify it under the
  48. # terms of the GNU General Public License as published by the Free Software
  49. # Foundation; either version 2 of the License, or (at your option) any later
  50. # version.
  51. #
  52. # BlueCloth is distributed in the hope that it will be useful, but WITHOUT ANY
  53. # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  54. # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  55. #
  56. # == To-do
  57. #
  58. # * Refactor some of the larger uglier methods that have to do their own
  59. # brute-force scanning because of lack of Perl features in Ruby's Regexp
  60. # class. Alternately, could add a dependency on 'pcre' and use most Perl
  61. # regexps.
  62. #
  63. # * Put the StringScanner in the render state for thread-safety.
  64. #
  65. # == Version
  66. #
  67. # $Id: bluecloth.rb 69 2004-08-25 05:27:15Z ged $
  68. #
  69. require 'digest/md5'
  70. require 'logger'
  71. require 'strscan'
  72. ### BlueCloth is a Ruby implementation of Markdown, a text-to-HTML conversion
  73. ### tool.
  74. class BlueCloth < String
  ### Error raised when the Markdown source cannot be formatted.
  class FormatError < RuntimeError

    ### Build the exception message from the offending source fragment
    ### +str+; when a +specific+ complaint is supplied it is appended after
    ### the fragment.
    def initialize( str, specific=nil )
      msg = specific ?
        "Bad markdown format near %p: %s" % [ str, specific ] :
        "Bad markdown format near %p" % str

      super( msg )
    end
  end
  88. # Release Version
  89. Version = '0.0.3'
  90. # SVN Revision
  91. SvnRev = %q$Rev: 69 $
  92. # SVN Id tag
  93. SvnId = %q$Id: bluecloth.rb 69 2004-08-25 05:27:15Z ged $
  94. # SVN URL
  95. SvnUrl = %q$URL: svn+ssh://svn.faeriemud.org/usr/local/svn/BlueCloth/trunk/lib/bluecloth.rb $
  96. # Rendering state struct. Keeps track of URLs, titles, and HTML blocks
  97. # midway through a render. I prefer this to the globals of the Perl version
  98. # because globals make me break out in hives. Or something.
  99. RenderState = Struct::new( "RenderState", :urls, :titles, :html_blocks, :log )
  100. # Tab width for #detab! if none is specified
  101. TabWidth = 4
  102. # The tag-closing string -- set to '>' for HTML
  103. EmptyElementSuffix = "/>";
  # Lookup table for the characters that can be backslash-escaped. Each
  # character maps to a Hash holding its MD5 placeholder (:md5), a Regexp
  # that finds the placeholder (:md5re), and a Regexp that finds the
  # backslash-escaped character itself (:re).
  EscapeTable = {}
  '\\`*_{}[]()#.!'.each_char do |char|
    digest = Digest::MD5.hexdigest( char )

    EscapeTable[ char ] = {
      :md5   => digest,
      :md5re => Regexp.new( digest ),
      :re    => Regexp.new( '\\\\' + Regexp.escape(char) ),
    }
  end
  114. #################################################################
  115. ### I N S T A N C E M E T H O D S
  116. #################################################################
  ### Create a new BlueCloth string holding +content+. Each entry in
  ### +restrictions+ (nested arrays are flattened) names an attribute whose
  ### writer is called with +true+, e.g. :filter_html or :filter_styles.
  def initialize( content="", *restrictions )
    # Log to $stderr. The original used $deferr, which was removed from Ruby
    # in 1.9; there it evaluates to nil and the logger discards everything.
    @log = Logger::new( $stderr )
    @log.level = $DEBUG ?
      Logger::DEBUG :
      ($VERBOSE ? Logger::INFO : Logger::WARN)
    @scanner = nil

    # Add any restrictions, and set the line-folding attribute to reflect
    # what happens by default.
    @filter_html = nil
    @filter_styles = nil
    restrictions.flatten.each {|r| __send__("#{r}=", true) }
    @fold_lines = true

    super( content )

    @log.debug "String is: %p" % self
  end
  133. ######
  134. public
  135. ######
  136. # Filters for controlling what gets output for untrusted input. (But really,
  137. # you're filtering bad stuff out of untrusted input at submission-time via
  138. # untainting, aren't you?)
  139. attr_accessor :filter_html, :filter_styles
  140. # RedCloth-compatibility accessor. Line-folding is part of Markdown syntax,
  141. # so this isn't used by anything.
  142. attr_accessor :fold_lines
  ### Render the Markdown-formatted text in this string object as HTML and
  ### return it as a new string; the receiver itself is not modified. The
  ### +lite+ parameter is for compatibility with RedCloth, and is currently
  ### unused, though that may change in the future.
  def to_html( lite=false )

    # Create a StringScanner we can reuse for various lexing tasks
    @scanner = StringScanner::new( '' )

    # Make a structure to carry around stuff that gets placeholdered out of
    # the source (urls, titles and html_blocks; the log member stays unset).
    rs = RenderState::new( {}, {}, {} )

    # Make a copy of the string with normalized line endings, tabs turned to
    # spaces, and a couple of guaranteed newlines at the end.
    #
    # Since Ruby 3.0, String#gsub returns a plain String even when called on
    # a subclass instance, so the normalized text is loaded back into a
    # duplicate of the receiver before #detab (defined on this class) is
    # called on it.
    text = dup.replace( gsub(/\r\n?/, "\n") ).detab
    text += "\n\n"
    @log.debug "Normalized line-endings: %p" % text

    # Filter HTML if we're asked to do so
    if self.filter_html
      text.gsub!( "<", "&lt;" )
      text.gsub!( ">", "&gt;" )
      @log.debug "Filtered HTML: %p" % text
    end

    # Simplify blank lines
    text.gsub!( /^ +$/, '' )
    @log.debug "Tabs -> spaces/blank lines stripped: %p" % text

    # Replace HTML blocks with placeholders
    text = hide_html_blocks( text, rs )
    @log.debug "Hid HTML blocks: %p" % text
    @log.debug "Render state: %p" % rs

    # Strip link definitions, store in render state
    text = strip_link_definitions( text, rs )
    @log.debug "Stripped link definitions: %p" % text
    @log.debug "Render state: %p" % rs

    # Escape meta-characters
    text = escape_special_chars( text )
    @log.debug "Escaped special characters: %p" % text

    # Transform block-level constructs
    text = apply_block_transforms( text, rs )
    @log.debug "After block-level transforms: %p" % text

    # Now swap back in all the escaped characters
    text = unescape_special_chars( text )
    @log.debug "After unescaping special characters: %p" % text

    return text
  end
  ### Return a copy of the receiver in which tabs have been expanded to
  ### spaces by #detab!, using tab stops every +tabwidth+ columns. The
  ### receiver is left untouched.
  def detab( tabwidth=TabWidth )
    dup.tap {|copy| copy.detab!( tabwidth ) }
  end
  ### Expand tabs to spaces in place, padding each tab out to the next
  ### multiple of +tabwidth+ columns, and return self. The text is split on
  ### newlines and re-joined, so trailing newlines are not preserved.
  def detab!( tabwidth=TabWidth )
    expanded = split( /\n/ ).map do |line|
      line.gsub( /(.*?)\t/ ) do
        prefix = Regexp.last_match( 1 )
        prefix + ' ' * (tabwidth - prefix.length % tabwidth)
      end
    end

    replace( expanded.join("\n") )
  end
  200. #######
  201. #private
  202. #######
  ### Run every block-level transform, in order, over +str+ with the render
  ### state +rs+ and return the final text.
  def apply_block_transforms( str, rs )
    # Port: This was called '_runBlockGamut' in the original

    @log.debug "Applying block transforms to:\n %p" % str

    # Each stage takes (text, rs) and returns the transformed text; the
    # order is significant.
    stages = [
      :transform_headers,
      :transform_hrules,
      :transform_lists,
      :transform_code_blocks,
      :transform_block_quotes,
      :transform_auto_links,
      :hide_html_blocks,
      :form_paragraphs,
    ]
    text = stages.inject( str ) {|current, stage| __send__( stage, current, rs ) }

    @log.debug "Done with block transforms:\n %p" % text
    return text
  end
  ### Run the span-level transforms over +str+ with the render state +rs+
  ### and return the result: code spans, HTML encoding, images, anchors and
  ### emphasis, followed by hard line breaks.
  def apply_span_transforms( str, rs )
    @log.debug "Applying span transforms to:\n %p" % str

    coded    = transform_code_spans( str, rs )
    encoded  = encode_html( coded )
    imaged   = transform_images( encoded, rs )
    anchored = transform_anchors( imaged, rs )
    spans    = transform_italic_and_bold( anchored, rs )

    # Hard breaks
    spans.gsub!( / {2,}\n/, "<br#{EmptyElementSuffix}\n" )

    @log.debug "Done with span transforms:\n %p" % spans
    return spans
  end
  233. # The list of tags which are considered block-level constructs and an
  234. # alternation pattern suitable for use in regexps made from the list
  235. StrictBlockTags = %w[ p div h[1-6] blockquote pre table dl ol ul script noscript
  236. form fieldset iframe math ins del ]
  237. StrictTagPattern = StrictBlockTags.join('|')
  238. LooseBlockTags = StrictBlockTags - %w[ins del]
  239. LooseTagPattern = LooseBlockTags.join('|')
  240. # Nested blocks:
  241. # <div>
  242. # <div>
  243. # tags for inner block must be indented.
  244. # </div>
  245. # </div>
  246. StrictBlockRegex = %r{
  247. ^ # Start of line
  248. <(#{StrictTagPattern}) # Start tag: \2
  249. \b # word break
  250. (.*\n)*? # Any number of lines, minimal match
  251. </\1> # Matching end tag
  252. [ ]* # trailing spaces
  253. $ # End of line or document
  254. }ix
  255. # More-liberal block-matching
  256. LooseBlockRegex = %r{
  257. ^ # Start of line
  258. <(#{LooseTagPattern}) # start tag: \2
  259. \b # word break
  260. (.*\n)*? # Any number of lines, minimal match
  261. .*</\1> # Anything + Matching end tag
  262. [ ]* # trailing spaces
  263. $ # End of line or document
  264. }ix
  265. # Special case for <hr />.
  266. HruleBlockRegex = %r{
  267. ( # $1
  268. \A\n? # Start of doc + optional \n
  269. | # or
  270. .*\n\n # anything + blank line
  271. )
  272. ( # save in $2
  273. [ ]* # Any spaces
  274. <hr # Tag open
  275. \b # Word break
  276. ([^<>])*? # Attributes
  277. /?> # Tag close
  278. $ # followed by a blank line or end of document
  279. )
  280. }ix
  ### Return a copy of +str+ in which every block of HTML that starts in the
  ### left margin has been swapped for an MD5 token. The original block is
  ### stored under that token in +rs.html_blocks+.
  def hide_html_blocks( str, rs )
    @log.debug "Hiding HTML blocks in %p" % str

    # Stores a block in the render state and returns its placeholder,
    # padded with blank lines.
    stash = lambda do |block|
      digest = Digest::MD5.hexdigest( block )
      rs.html_blocks[ digest ] = block
      @log.debug "Replacing %p with %p" % [ block, digest ]
      "\n\n#{digest}\n\n"
    end

    hidden = str.dup

    @log.debug "Finding blocks with the strict regex..."
    hidden.gsub!( StrictBlockRegex ) {|block| stash.call( block ) }

    @log.debug "Finding blocks with the loose regex..."
    hidden.gsub!( LooseBlockRegex ) {|block| stash.call( block ) }

    @log.debug "Finding hrules..."
    hidden.gsub!( HruleBlockRegex ) do
      # Keep the leading context ($1) and hide only the <hr> itself ($2)
      lead, rule = $1, $2
      lead + stash.call( rule )
    end

    return hidden
  end
  301. # Link defs are in the form: ^[id]: url "optional title"
  302. LinkRegex = %r{
  303. ^[ ]*\[(.+)\]: # id = $1
  304. [ ]*
  305. \n? # maybe *one* newline
  306. [ ]*
  307. <?(\S+?)>? # url = $2
  308. [ ]*
  309. \n? # maybe one newline
  310. [ ]*
  311. (?:
  312. # Titles are delimited by "quotes" or (parens).
  313. ["(]
  314. (.+?) # title = $3
  315. [")] # Matching ) or "
  316. [ ]*
  317. )? # title is optional
  318. (?:\n+|\Z)
  319. }x
  ### Return a copy of +str+ with all link definitions removed. Each
  ### definition's HTML-encoded URL is stored in +rs.urls+ and its title (if
  ### any, with double quotes turned into &quot;) in +rs.titles+, both keyed
  ### by the downcased link id.
  def strip_link_definitions( str, rs )
    str.gsub( LinkRegex ) do
      # Grab the captures before any other regexp call can clobber them
      found = Regexp.last_match
      key, url, title = found[1].downcase, found[2], found[3]

      rs.urls[ key ] = encode_html( url )
      rs.titles[ key ] = title.gsub( /"/, "&quot;" ) if title

      ""
    end
  end
  ### Return a copy of +str+ with special characters swapped for their MD5
  ### placeholders: '*' and '_' inside HTML tags, backslash escapes in
  ### ordinary text. Raises TypeError if the tokenizer yields an unknown
  ### token type.
  def escape_special_chars( str )
    @log.debug " Escaping special characters"
    escaped = ''

    # The original Markdown source has something called '$tags_to_skip'
    # declared here, but it's never used, so I don't define it.

    tokenize_html( str ) do |kind, chunk|
      @log.debug " Adding %p token %p" % [ kind, chunk ]

      escaped +=
        case kind
        when :tag
          # Within tags, encode * and _
          chunk.
            gsub( /\*/, EscapeTable['*'][:md5] ).
            gsub( /_/, EscapeTable['_'][:md5] )
        when :text
          # Encode backslashed stuff in regular text
          encode_backslash_escapes( chunk )
        else
          raise TypeError, "Unknown token type %p" % kind
        end
    end

    @log.debug " Text with escapes is now: %p" % escaped
    return escaped
  end
  ### Return a copy of +str+ in which every MD5 placeholder from
  ### EscapeTable has been swapped back for the character it stands for.
  ### The argument itself is not modified, so frozen strings are accepted.
  def unescape_special_chars( str )
    # Work on a duplicate: the original ran gsub! directly on the caller's
    # string, contradicting its documented "copy" contract.
    text = str.dup

    EscapeTable.each do |char, entry|
      @log.debug "Unescaping escaped %p with %p" % [ char, entry[:md5re] ]
      # Block form inserts the character literally (relevant for '\\',
      # which is special inside a replacement string).
      text.gsub!( entry[:md5re] ) { char }
    end

    return text
  end
  ### Return a copy of +str+ in which every backslash-escaped special
  ### character has been replaced by its MD5 placeholder.
  def encode_backslash_escapes( str )
    # Double backslashes go first so that they can't be mistaken for the
    # start of some other escape.
    seed = str.gsub( /\\\\/, EscapeTable['\\'][:md5] )

    others = EscapeTable.reject {|char, _| char == '\\' }
    others.inject( seed ) {|text, (_, esc)| text.gsub( esc[:re], esc[:md5] ) }
  end
  ### Return a copy of +str+ in which each Markdown horizontal rule (a line
  ### of three or more '-', '*' or '_', optionally spaced) has become an
  ### <hr> tag. +rs+ is accepted for signature parity and not used.
  def transform_hrules( str, rs )
    @log.debug " Transforming horizontal rules"

    rule_line = /^( ?[\-\*_] ?){3,}$/
    hrule_tag = "\n<hr#{EmptyElementSuffix}\n"

    str.gsub( rule_line, hrule_tag )
  end
  382. # Patterns to match and transform lists
  383. ListMarkerOl = %r{\d+\.}
  384. ListMarkerUl = %r{[*+-]}
  385. ListMarkerAny = Regexp::union( ListMarkerOl, ListMarkerUl )
  386. ListRegexp = %r{
  387. (?:
  388. ^[ ]{0,#{TabWidth - 1}} # Indent < tab width
  389. (#{ListMarkerAny}) # unordered or ordered ($1)
  390. [ ]+ # At least one space
  391. )
  392. (?m:.+?) # item content (include newlines)
  393. (?:
  394. \z # Either EOF
  395. | # or
  396. \n{2,} # Blank line...
  397. (?=\S) # ...followed by non-space
  398. (?![ ]* # ...but not another item
  399. (#{ListMarkerAny})
  400. [ ]+)
  401. )
  402. }x
  ### Return a copy of +str+ in which each Markdown list has been wrapped
  ### in <ul> or <ol> tags (chosen from the first item's marker), with the
  ### items rendered by #transform_list_items.
  def transform_lists( str, rs )
    @log.debug " Transforming lists at %p" % (str[0,100] + '...')

    str.gsub( ListRegexp ) do |list|
      marker = Regexp.last_match( 1 )
      @log.debug " Found list %p" % list

      tag = (marker =~ ListMarkerUl) ? "ul" : "ol"
      # Widen every blank-line gap to exactly two blank lines
      spaced = list.gsub( /\n{2,}/, "\n\n\n" )

      "<#{tag}>\n#{transform_list_items( spaced, rs )}</#{tag}>\n"
    end
  end
  419. # Pattern for transforming list items
  420. ListItemRegexp = %r{
  421. (\n)? # leading line = $1
  422. (^[ ]*) # leading whitespace = $2
  423. (#{ListMarkerAny}) [ ]+ # list marker = $3
  424. ((?m:.+?) # list item text = $4
  425. (\n{1,2}))
  426. (?= \n* (\z | \2 (#{ListMarkerAny}) [ ]+))
  427. }x
  ### Return a copy of the list source +str+ with each item wrapped in <li>
  ### tags. Items that follow a blank line or contain one get the full
  ### block transforms; all others are checked for sub-lists and then given
  ### the span transforms.
  def transform_list_items( str, rs )
    @log.debug " Transforming list items"

    # Trim trailing blank lines
    trimmed = str.sub( /\n{2,}\z/, "\n" )

    trimmed.gsub( ListItemRegexp ) do |line|
      # Read the captures first; the helpers below run their own regexps
      found = Regexp.last_match
      leading_line, item = found[1], found[4]
      @log.debug " Found item line %p" % line

      rendered =
        if leading_line || item =~ /\n{2,}/
          @log.debug " Found leading line or item has a blank"
          apply_block_transforms( outdent(item), rs )
        else
          # Recursion for sub-lists
          @log.debug " Recursing for sublist"
          apply_span_transforms( transform_lists(outdent(item), rs).chomp, rs )
        end

      "<li>#{rendered}</li>\n"
    end
  end
  448. # Pattern for matching codeblocks
  449. CodeBlockRegexp = %r{
  450. (?:\n\n|\A)
  451. ( # $1 = the code block
  452. (?:
  453. (?:[ ]{#{TabWidth}} | \t) # a tab or tab-width of spaces
  454. .*\n+
  455. )+
  456. )
  457. (^[ ]{0,#{TabWidth - 1}}\S|\Z) # Lookahead for non-space at
  458. # line-start, or end of doc
  459. }x
  ### Return a copy of +str+ in which each indented Markdown code block
  ### has been outdented, encoded, and wrapped in <pre><code> tags.
  def transform_code_blocks( str, rs )
    @log.debug " Transforming code blocks"

    str.gsub( CodeBlockRegexp ) do
      # $1 is the indented block; $2 is whatever the pattern consumed
      # after it, which has to be put back.
      code, trailer = $1, $2
      body = encode_code( outdent(code), rs ).rstrip

      "\n\n<pre><code>#{body}\n</code></pre>\n\n#{trailer}"
    end
  end
  472. # Pattern for matching Markdown blockquote blocks
  473. BlockQuoteRegexp = %r{
  474. (?:
  475. ^[ ]*>[ ]? # '>' at the start of a line
  476. .+\n # rest of the first line
  477. (?:.+\n)* # subsequent consecutive lines
  478. \n* # blanks
  479. )+
  480. }x
  481. PreChunk = %r{ ( ^ \s* <pre> .+? </pre> ) }xm
  ### Return a copy of +str+ in which each Markdown blockquote has been
  ### stripped of one level of '>' quoting, run through the block
  ### transforms, indented, and wrapped in <blockquote> tags.
  def transform_block_quotes( str, rs )
    @log.debug " Transforming block quotes"
    indent = " " * TabWidth

    str.gsub( BlockQuoteRegexp ) do |quote|
      @log.debug "Making blockquote from %p" % quote

      unquoted = quote.
        gsub( /^ *> ?/, '' ).   # Trim one level of quoting
        gsub( /^ +$/, '' )      # Trim whitespace-only lines

      # Indent the rendered content, then take the indent back out of any
      # <pre> chunk so preformatted text keeps its own layout.
      inner = apply_block_transforms( unquoted, rs ).
        gsub( /^/, indent ).
        gsub( PreChunk ) {|pre| pre.gsub(/^#{indent}/o, '') }

      quoted = "<blockquote>\n#{inner}\n</blockquote>\n\n"
      @log.debug "Blockquoted chunk is: %p" % quoted
      quoted
    end
  end
  499. AutoAnchorURLRegexp = /<((https?|ftp):[^'">\s]+)>/
  500. AutoAnchorEmailRegexp = %r{
  501. <
  502. (
  503. [-.\w]+
  504. \@
  505. [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
  506. )
  507. >
  508. }xi
  ### Return a copy of +str+ in which <url> and <email@address> auto-links
  ### have become anchor tags. Email addresses are unescaped and then
  ### obfuscated with #encode_email_address.
  def transform_auto_links( str, rs )
    @log.debug " Transforming auto-links"

    linked = str.gsub( AutoAnchorURLRegexp, %{<a href="\\1">\\1</a>} )

    linked.gsub( AutoAnchorEmailRegexp ) do
      encode_email_address( unescape_special_chars($1) )
    end
  end
  # The three ways a byte of an email address can be written out: as a
  # zero-padded decimal entity, as a hexadecimal entity, or as the plain
  # character. Each lambda takes the byte value.
  Encoders = [
    lambda {|byte| format( "&#%03d;", byte ) },
    lambda {|byte| format( "&#x%X;", byte ) },
    lambda {|byte| byte.chr },
  ]
  ### Return an anchor tag for the email +addr+ whose "mailto:" href and
  ### link text are written with a random mix of decimal entities, hex
  ### entities and plain characters. The ':' is always left literal and the
  ### '@' is always entity-encoded.
  def encode_email_address( addr )
    # Compare against byte values. The original used the ?: and ?@
    # character literals, which are one-character Strings since Ruby 1.9
    # and so never equal the Integers yielded by #each_byte.
    colon, at_sign = 0x3A, 0x40

    rval = ''
    ("mailto:" + addr).each_byte do |b|
      rval +=
        case b
        when colon
          # Must stay literal: the link text below is cut at the first ':'
          ":"
        when at_sign
          Encoders[ rand(2) ][ b ]
        else
          r = rand(100)
          r > 90 ? Encoders[2][ b ] :
            r < 45 ? Encoders[1][ b ] :
            Encoders[0][ b ]
        end
    end

    # The link text is the encoded address without the "mailto:" prefix
    return %{<a href="%s">%s</a>} % [ rval, rval.sub(/.+?:/, '') ]
  end
  546. # Regex for matching Setext-style headers
  547. SetextHeaderRegexp = %r{
  548. (.+) # The title text ($1)
  549. \n
  550. ([\-=])+ # Match a line of = or -. Save only one in $2.
  551. [ ]*\n+
  552. }x
  553. # Regexp for matching ATX-style headers
  554. AtxHeaderRegexp = %r{
  555. ^(\#{1,6}) # $1 = string of #'s
  556. [ ]*
  557. (.+?) # $2 = Header text
  558. [ ]*
  559. \#* # optional closing #'s (not counted)
  560. \n+
  561. }x
  ### Return a copy of +str+ in which Setext-style and ATX-style Markdown
  ### headers have become <h1>..<h6> tags. Header text is run through the
  ### span transforms with the render state +rs+.
  def transform_headers( str, rs )
    @log.debug " Transforming headers"

    # Setext-style headers:
    #   Header 1
    #   ========
    #
    #   Header 2
    #   --------
    #
    setext_done = str.gsub( SetextHeaderRegexp ) do
      title, underline = $1, $2
      @log.debug "Found setext-style header"
      title = apply_span_transforms( title, rs )

      tag = { '=' => 'h1', '-' => 'h2' }[ underline ]
      tag ? "<#{tag}>#{title}</#{tag}>\n\n" : title
    end

    # ATX-style headers: one to six leading #'s give the level
    setext_done.gsub( AtxHeaderRegexp ) do
      hashes, title = $1, $2
      @log.debug "Found ATX-style header"
      title = apply_span_transforms( title, rs )
      level = hashes.length

      "<h#{level}>#{title}</h#{level}>\n\n"
    end
  end
  ### Split +str+ on blank lines and return the chunks re-joined with blank
  ### lines between them. A chunk that is an HTML-block placeholder is
  ### swapped for the block stored in +rs.html_blocks+; every other chunk is
  ### span-transformed and wrapped in <p> tags.
  def form_paragraphs( str, rs )
    @log.debug " Forming paragraphs"

    # Drop leading and trailing blank lines before splitting
    trimmed = str.sub( /\A\n+/, '' ).sub( /\n+\z/, '' )

    rendered = trimmed.split( /\n{2,}/ ).map do |graf|
      # Unhashify HTML blocks if this is a placeholder; otherwise wrap
      # in <p> tags
      rs.html_blocks.fetch( graf ) do
        apply_span_transforms( graf, rs ).sub( /^[ ]*/, '<p>' ) + '</p>'
      end
    end
    rval = rendered.join( "\n\n" )

    @log.debug " Formed paragraphs: %p" % rval
    return rval
  end
  616. # Pattern to match the linkid part of an anchor tag for reference-style
  617. # links.
  618. RefLinkIdRegex = %r{
  619. [ ]? # Optional leading space
  620. (?:\n[ ]*)? # Optional newline + spaces
  621. \[
  622. (.*?) # Id = $1
  623. \]
  624. }x
  625. InlineLinkRegex = %r{
  626. \( # Literal paren
  627. [ ]* # Zero or more spaces
  628. <?(.+?)>? # URI = $1
  629. [ ]* # Zero or more spaces
  630. (?: #
  631. ([\"\']) # Opening quote char = $2
  632. (.*?) # Title = $3
  633. \2 # Matching quote char
  634. )? # Title is optional
  635. \)
  636. }x
  ### Apply Markdown anchor transforms to a copy of the specified +str+ with
  ### the given render state +rs+ and return it.
  ###
  ### The text is walked with the shared @scanner. Each '[' opens a
  ### candidate link text, which may contain nested brackets. It becomes an
  ### <a> tag when followed by a reference id found in +rs.urls+ or by an
  ### inline (url "title") part; otherwise the source text is copied
  ### through unchanged.
  def transform_anchors( str, rs )
    @log.debug " Transforming anchors"
    @scanner.string = str.dup
    text = ''

    # Scan the whole string
    until @scanner.empty?

      if @scanner.scan( /\[/ )
        link = ''; linkid = ''
        depth = 1
        # Position just past the opening '['; startpos-1 is the '[' itself
        startpos = @scanner.pos
        @log.debug " Found a bracket-open at %d" % startpos

        # Scan the rest of the tag, allowing unlimited nested []s. If
        # the scanner runs out of text before the opening bracket is
        # closed, append the text and return (wasn't a valid anchor).
        while depth.nonzero?
          linktext = @scanner.scan_until( /\]|\[/ )

          if linktext
            @log.debug " Found a bracket at depth %d: %p" % [ depth, linktext ]
            link += linktext

            # Decrement depth for each closing bracket
            depth += ( linktext[-1, 1] == ']' ? -1 : 1 )
            @log.debug " Depth is now #{depth}"

          # If there's no more brackets, it must not be an anchor, so
          # just abort.
          else
            @log.debug " Missing closing brace, assuming non-link."
            link += @scanner.rest
            @scanner.terminate
            # Early exit: everything from the unmatched '[' onward is
            # returned as literal text.
            return text + '[' + link
          end
        end
        link.slice!( -1 ) # Trim final ']'
        @log.debug " Found leading link %p" % link

        # Look for a reference-style second part
        if @scanner.scan( RefLinkIdRegex )
          linkid = @scanner[1]
          # An empty id ("[text][]") means the link text is the id
          linkid = link.dup if linkid.empty?
          linkid.downcase!
          @log.debug " Found a linkid: %p" % linkid

          # If there's a matching link in the link table, build an
          # anchor tag for it.
          if rs.urls.key?( linkid )
            @log.debug " Found link key in the link table: %p" % rs.urls[linkid]
            url = escape_md( rs.urls[linkid] )

            text += %{<a href="#{url}"}
            if rs.titles.key?(linkid)
              text += %{ title="%s"} % escape_md( rs.titles[linkid] )
            end
            text += %{>#{link}</a>}

          # If the link referred to doesn't exist, just append the raw
          # source to the result
          else
            @log.debug " Linkid %p not found in link table" % linkid
            @log.debug " Appending original string instead: "
            @log.debug "%p" % @scanner.string[ startpos-1 .. @scanner.pos-1 ]
            text += @scanner.string[ startpos-1 .. @scanner.pos-1 ]
          end

        # ...or for an inline style second part
        elsif @scanner.scan( InlineLinkRegex )
          url = @scanner[1]
          title = @scanner[3]
          @log.debug " Found an inline link to %p" % url

          text += %{<a href="%s"} % escape_md( url )
          if title
            # Double quotes would end the title attribute early
            title.gsub!( /"/, "&quot;" )
            text += %{ title="%s"} % escape_md( title )
          end
          text += %{>#{link}</a>}

        # No linkid part: just append the first part as-is.
        else
          @log.debug "No linkid, so no anchor. Appending literal text."
          text += @scanner.string[ startpos-1 .. @scanner.pos-1 ]
        end # if linkid

      # Plain text
      else
        @log.debug " Scanning to the next link from %p" % @scanner.rest
        # Copy everything up to (not including) the next '['
        text += @scanner.scan( /[^\[]+/ )
      end

    end # until @scanner.empty?

    return text
  end
  720. # Pattern to match strong emphasis in Markdown text
  721. BoldRegexp = %r{ (\*\*|__) (\S|\S.+?\S) \1 }x
  722. # Pattern to match normal emphasis in Markdown text
  723. ItalicRegexp = %r{ (\*|_) (\S|\S.+?\S) \1 }x
  ### Return a copy of +str+ in which strong emphasis (** or __) has become
  ### <strong> tags and normal emphasis (* or _) has become <em> tags.
  ### +rs+ is accepted for signature parity and not used.
  def transform_italic_and_bold( str, rs )
    @log.debug " Transforming italic and bold"

    # Bold has to go first: its delimiters contain the italic ones
    bolded = str.gsub( BoldRegexp, %{<strong>\\2</strong>} )
    bolded.gsub( ItalicRegexp, %{<em>\\2</em>} )
  end
  732. ### Transform backticked spans into <code> spans.
  733. def transform_code_spans( str, rs )
  734. @log.debug " Transforming code spans"
  735. # Set up the string scanner and just return the string unless there's at
  736. # least one backtick.
  737. @scanner.string = str.dup
  738. unless @scanner.exist?( /`/ )
  739. @scanner.terminate
  740. @log.debug "No backticks found for code span in %p" % str
  741. return str
  742. end
  743. @log.debug "Transforming code spans in %p" % str
  744. # Build the transformed text anew
  745. text = ''
  746. # Scan to the end of the string
  747. until @scanner.empty?
  748. # Scan up to an opening backtick
  749. if pre = @scanner.scan_until( /.?(?=`)/m )
  750. text += pre
  751. @log.debug "Found backtick at %d after '...%s'" % [ @scanner.pos, text[-10, 10] ]
  752. # Make a pattern to find the end of the span
  753. opener = @scanner.scan( /`+/ )
  754. len = opener.length
  755. closer = Regexp::new( opener )
  756. @log.debug "Scanning for end of code span with %p" % closer
  757. # Scan until the end of the closing backtick sequence. Chop the
  758. # backticks off the resultant string, strip leading and trailing
  759. # whitespace, and encode any enitites contained in it.
  760. codespan = @scanner.scan_until( closer ) or
  761. raise FormatError::new( @scanner.rest[0,20],
  762. "No %p found before end" % opener )
  763. @log.debug "Found close of code span at %d: %p" % [ @scanner.pos - len, codespan ]
  764. codespan.slice!( -len, len )
  765. text += "<code>%s</code>" %
  766. encode_code( codespan.strip, rs )
  767. # If there's no more backticks, just append the rest of the string
  768. # and move the scan pointer to the end
  769. else
  770. text += @scanner.rest
  771. @scanner.terminate
  772. end
  773. end
  774. return text
  775. end
  # Next, handle inline images: ![alt text](url "optional title")
  # Don't forget: encode * and _
  #
  # The Perl original's /s flag is deliberately not carried over. In Ruby
  # /s does not mean "dot matches newline": it pins the regexp to the
  # Windows-31J encoding, and matching such a regexp against non-ASCII
  # UTF-8 text raises Encoding::CompatibilityError. Dropping the flag leaves
  # matching of ASCII input unchanged.
  InlineImageRegexp = %r{
    (                   # Whole match = $1
      !\[ (.*?) \]      # alt text = $2
      \([ ]*
      <?(\S+?)>?        # source url = $3
      [ ]*
      (?:               #
        (["'])          # quote char = $4
        (.*?)           # title = $5
        \4              # matching quote
        [ ]*
      )?                # title is optional
      \)
    )
  }x #"
  # Reference-style images: ![alt text][id]
  #
  # No /s flag here: in Ruby that flag pins the regexp to the Windows-31J
  # encoding (it is not Perl's "dot matches newline"), which makes matching
  # non-ASCII UTF-8 text raise Encoding::CompatibilityError.
  ReferenceImageRegexp = %r{
    (                   # Whole match = $1
      !\[ (.*?) \]      # Alt text = $2
      [ ]?              # Optional space
      (?:\n[ ]*)?       # One optional newline + spaces
      \[ (.*?) \]       # id = $3
    )
  }x
  ### Return a copy of +str+ with Markdown image markup turned into <img>
  ### tags. Reference-style images (![alt][id]) are resolved against
  ### +rs.urls+ and +rs.titles+ and left untouched when the id is unknown;
  ### inline images (![alt](url "title")) are always converted.
  def transform_images( str, rs )
    # The original applied '% str' to a message with no format directive,
    # which raises ArgumentError when $DEBUG is set (the CLI's --debug).
    @log.debug " Transforming images in %p" % str

    # Handle reference-style labeled images: ![alt text][id]
    str.
      gsub( ReferenceImageRegexp ) {|match|
        whole, alt, linkid = $1, $2, $3.downcase
        @log.debug "Matched %p" % match
        alt.gsub!( /"/, '&quot;' )

        # for shortcut links like ![this][].
        linkid = alt.downcase if linkid.empty?

        if rs.urls.key?( linkid )
          url = escape_md( rs.urls[linkid] )
          @log.debug "Found url '%s' for linkid '%s' " % [ url, linkid ]

          # Build the tag
          result = %{<img src="%s" alt="%s"} % [ url, alt ]
          if rs.titles.key?( linkid )
            result += %{ title="%s"} % escape_md( rs.titles[linkid] )
          end
          result += EmptyElementSuffix

        else
          # Unknown id: leave the source text exactly as it was
          result = whole
        end

        @log.debug "Replacing %p with %p" % [ match, result ]
        result
      }.

      # Inline image style
      gsub( InlineImageRegexp ) {|match|
        @log.debug "Found inline image %p" % match
        whole, alt, title = $1, $2, $5
        url = escape_md( $3 )
        alt.gsub!( /"/, '&quot;' )

        # Build the tag
        result = %{<img src="%s" alt="%s"} % [ url, alt ]
        unless title.nil?
          title.gsub!( /"/, '&quot;' )
          result += %{ title="%s"} % escape_md( title )
        end
        result += EmptyElementSuffix

        @log.debug "Replacing %p with %p" % [ match, result ]
        result
      }
  end
  846. # Regexp to match special characters in a code block
  847. CodeEscapeRegexp = %r{( \* | _ | \{ | \} | \[ | \] | \\ )}x
  ### Return a copy of +str+, the contents of a code block or span, with
  ### '&', '<' and '>' written as HTML entities and Markdown's special
  ### characters replaced by their MD5 placeholders. +rs+ is accepted for
  ### signature parity and not used.
  def encode_code( str, rs )
    entities = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;' }

    str.
      gsub( /[&<>]/ ) {|char| entities[char] }.
      gsub( CodeEscapeRegexp ) {|char| EscapeTable[char][:md5] }
  end
  856. #################################################################
  857. ### U T I L I T Y F U N C T I O N S
  858. #################################################################
  ### Return a copy of +str+ in which '*' and '_' have been replaced by
  ### their MD5 placeholders from EscapeTable, so that later emphasis
  ### processing leaves them alone.
  def escape_md( str )
    str.gsub( /[*_]/ ) {|char| EscapeTable[char][:md5] }
  end
  866. # Matching constructs for tokenizing X/HTML
  867. HTMLCommentRegexp = %r{ <! ( -- .*? -- \s* )+ > }mx
  868. XMLProcInstRegexp = %r{ <\? .*? \?> }mx
  869. MetaTag = Regexp::union( HTMLCommentRegexp, XMLProcInstRegexp )
  870. HTMLTagOpenRegexp = %r{ < [a-z/!$] [^<>]* }imx
  871. HTMLTagCloseRegexp = %r{ > }x
  872. HTMLTagPart = Regexp::union( HTMLTagOpenRegexp, HTMLTagCloseRegexp )
  ### Break the HTML source in +str+ into a series of tokens and return
  ### them. The tokens are just 2-element Array tuples with a type (:tag or
  ### :text) and the actual content. If this function is called with a
  ### block, the type and text parts of each token will be yielded to it
  ### one at a time as they are extracted. Raises a RuntimeError if a tag is
  ### opened but never closed.
  def tokenize_html( str )
    depth = 0
    tokens = []
    @scanner.string = str.dup
    type, token = nil, nil

    until @scanner.eos?
      @log.debug "Scanning from %p" % @scanner.rest

      # Match comments and PIs without nesting
      if (( token = @scanner.scan(MetaTag) ))
        type = :tag

      # Do nested matching for HTML tags
      elsif (( token = @scanner.scan(HTMLTagOpenRegexp) ))
        tagstart = @scanner.pos
        @log.debug " Found the start of a plain tag at %d" % tagstart

        # Start the token with the opening angle
        depth = 1
        type = :tag

        # Scan the rest of the tag, allowing unlimited nested <>s. If
        # the scanner runs out of text before the tag is closed, raise
        # an error.
        while depth.nonzero?

          # Scan either an opener or a closer
          chunk = @scanner.scan( HTMLTagPart ) or
            raise "Malformed tag at character %d: %p" %
              [ tagstart, token + @scanner.rest ]

          @log.debug " Found another part of the tag at depth %d: %p" % [ depth, chunk ]

          token += chunk

          # If the last character of the token so far is a closing
          # angle bracket, decrement the depth. Otherwise increment
          # it for a nested tag.
          depth += ( token[-1, 1] == '>' ? -1 : 1 )
          @log.debug " Depth is now #{depth}"
        end

      # Match text segments
      else
        @log.debug " Looking for a chunk of text"
        type = :text

        # Scan forward, always matching at least one character to move
        # the pointer beyond any non-tag '<'.
        token = @scanner.scan_until( /[^<]+/m )

        # When nothing but '<' characters remain the pattern cannot match
        # and the scanner would never advance, looping forever on nil
        # tokens. Take the remainder as the final text token instead.
        if token.nil?
          token = @scanner.rest
          @scanner.terminate
        end
      end

      @log.debug " type: %p, token: %p" % [ type, token ]

      # If a block is given, feed it one token at a time. Add the token to
      # the token list to be returned regardless.
      if block_given?
        yield( type, token )
      end
      tokens << [ type, token ]
    end

    return tokens
  end
  ### Return a copy of +str+ in which every ampersand that does not start
  ### an entity is written as &amp; and every '<' that cannot start a tag,
  ### comment or processing instruction is written as &lt;.
  def encode_html( str )
    bare_ampersand = /&(?!#?[x]?(?:[0-9a-f]+|\w+);)/i
    stray_angle    = %r{<(?![a-z/?\$!])}i

    amp_safe = str.gsub( bare_ampersand, "&amp;" )
    amp_safe.gsub( stray_angle, "&lt;" )
  end
  ### Return a copy of +str+ with one level of indentation removed from
  ### the start of every line: a single tab, or up to TabWidth spaces.
  def outdent( str )
    one_indent = /^(\t|[ ]{1,#{TabWidth}})/
    str.gsub( one_indent, '' )
  end
  939. end # class BlueCloth
  940. BEGIN {
  941. require 'optparse'
  942. }
  943. DocumentWrapper = %{
  944. <html>
  945. <head><title>%s</title></head>
  946. <body>
  947. %s
  948. </body>
  949. </html>
  950. }
  ### Command-line entry point: parse the options in ARGV, then render
  ### each named file (or standard input for '-', which is also the default
  ### when no files are given) and write the HTML to standard output. Any
  ### StandardError is reported on standard error and ends the process with
  ### exit status 255.
  def main
    fragment = false
    destination = '.'

    ARGV.options do |oparser|
      oparser.banner = "Usage: #$0 [OPTIONS] FILES"

      # Debug mode
      oparser.on( "--debug", "-d", TrueClass, "Turn debugging output on" ) {
        $DEBUG = true
      }

      # 'Fragment' mode
      oparser.on( "--fragment", "-f", TrueClass,
        "Output HTML fragments instead of whole documents" ) {
        fragment = true
      }

      # Output destination
      #oparser.on( "--output=DESTINATION", "-o DESTINATION", String,
      #  "Write output to DESTINATION instead of the current directory" ) {|arg|
      #  destination = arg
      #}

      oparser.parse!
    end

    # Filter mode if no arguments
    ARGV.push( "-" ) if ARGV.empty?

    ARGV.each do |file|
      # A nil separator makes readlines return the whole input as one chunk
      contents = (file == '-') ? $stdin.readlines(nil) : File::readlines( file, nil )

      bc = BlueCloth::new( contents.join )

      # Fragments go out bare; otherwise wrap them in a document titled
      # after the file.
      $stdout.puts( fragment ? bc.to_html : DocumentWrapper % [ file, bc.to_html ] )
    end

  rescue => err
    $stderr.puts "Aborting: Fatal error: %s" % err.message
    exit 255
  end
  991. main