/vendor/gems/BlueCloth-1.0.0/lib/bluecloth.rb
Ruby | 1144 lines | 866 code | 119 blank | 159 comment | 5 complexity | dd9cae93838b21613a66bb01d431a68f MD5 | raw file
Possible License(s): GPL-2.0
- #!/usr/bin/ruby
- #
- # Bluecloth is a Ruby implementation of Markdown, a text-to-HTML conversion
- # tool.
- #
- # == Synopsis
- #
- # doc = BlueCloth::new "
- # ## Test document ##
- #
- # Just a simple test.
- # "
- #
- # puts doc.to_html
- #
- # == Authors
- #
- # * Michael Granger <ged@FaerieMUD.org>
- #
- # == Contributors
- #
- # * Martin Chase <stillflame@FaerieMUD.org> - Peer review, helpful suggestions
- # * Florian Gross <flgr@ccan.de> - Filter options, suggestions
- #
- # == Copyright
- #
- # Original version:
- # Copyright (c) 2003-2004 John Gruber
- # <http://daringfireball.net/>
- # All rights reserved.
- #
- # Ruby port:
- # Copyright (c) 2004 The FaerieMUD Consortium.
- #
- # BlueCloth is free software; you can redistribute it and/or modify it under the
- # terms of the GNU General Public License as published by the Free Software
- # Foundation; either version 2 of the License, or (at your option) any later
- # version.
- #
- # BlueCloth is distributed in the hope that it will be useful, but WITHOUT ANY
- # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
- #
- # == To-do
- #
- # * Refactor some of the larger uglier methods that have to do their own
- # brute-force scanning because of lack of Perl features in Ruby's Regexp
- # class. Alternately, could add a dependency on 'pcre' and use most Perl
- # regexps.
- #
- # * Put the StringScanner in the render state for thread-safety.
- #
- # == Version
- #
- # $Id: bluecloth.rb 69 2004-08-25 05:27:15Z ged $
- #
-
- require 'digest/md5'
- require 'logger'
- require 'strscan'
-
-
- ### BlueCloth is a Ruby implementation of Markdown, a text-to-HTML conversion
- ### tool.
- class BlueCloth < String
-
### Error raised when the Markdown source cannot be parsed.
class FormatError < RuntimeError

  ### Build the error for the offending source fragment +str+. When a
  ### +specific+ explanation is supplied it is appended to the message.
  def initialize( str, specific=nil )
    message =
      if specific
        "Bad markdown format near %p: %s" % [ str, specific ]
      else
        "Bad markdown format near %p" % str
      end

    super( message )
  end
end
-
-
# Release Version
Version = '0.0.3'

# SVN Revision (the $...$ delimiters are the keyword-expansion markers)
SvnRev = %q$Rev: 69 $

# SVN Id tag
SvnId = %q$Id: bluecloth.rb 69 2004-08-25 05:27:15Z ged $

# SVN URL
SvnUrl = %q$URL: svn+ssh://svn.faeriemud.org/usr/local/svn/BlueCloth/trunk/lib/bluecloth.rb $


# Per-render state: link URLs, link titles and hidden HTML blocks collected
# midway through a render, plus a log slot. Passed explicitly between the
# transform methods instead of living in globals.
RenderState = Struct::new( "RenderState", :urls, :titles, :html_blocks, :log )

# Tab width for #detab! if none is specified
TabWidth = 4

# The string that closes an empty element -- set to '>' for HTML
EmptyElementSuffix = "/>"

# Table of escapable characters. Each entry maps the character to its MD5
# placeholder (:md5), a pattern matching that placeholder (:md5re) and a
# pattern matching the backslash-escaped character in source text (:re).
EscapeTable = {}
'\\`*_{}[]()#.!'.each_char do |char|
  digest = Digest::MD5.hexdigest( char )

  EscapeTable[ char ] = {
    :md5   => digest,
    :md5re => Regexp.new( digest ),
    :re    => Regexp.new( '\\\\' + Regexp.escape(char) ),
  }
end
-
-
- #################################################################
- ### I N S T A N C E M E T H O D S
- #################################################################
-
### Create a new BlueCloth string holding +content+. Each entry of
### +restrictions+ (e.g. :filter_html, :filter_styles) names an accessor
### that is switched on by calling its setter with +true+.
def initialize( content="", *restrictions )
  # $deferr no longer exists in Ruby 1.9+; there it evaluates to nil, which
  # makes the Logger discard every message. Log to $stderr instead.
  @log = Logger::new( $stderr )
  @log.level =
    if $DEBUG      then Logger::DEBUG
    elsif $VERBOSE then Logger::INFO
    else                Logger::WARN
    end
  @scanner = nil

  # Add any restrictions, and set the line-folding attribute to reflect
  # what happens by default. @fold_lines is assigned last, so a :fold_lines
  # restriction has no lasting effect.
  @filter_html = nil
  @filter_styles = nil
  restrictions.flatten.each {|r| __send__("#{r}=", true) }
  @fold_lines = true

  super( content )

  @log.debug "String is: %p" % self
end
-
-
######
public
######

# Switches controlling what gets output for untrusted input. (Filtering bad
# input at submission time is still the caller's job.)
attr_accessor :filter_html
attr_accessor :filter_styles

# RedCloth-compatibility accessor. Line-folding is part of Markdown syntax,
# so nothing in this class reads it.
attr_accessor :fold_lines
-
-
### Render the Markdown-formatted text in this string object as HTML and
### return it as a new String. The +lite+ parameter exists only for
### compatibility with RedCloth and is currently unused.
def to_html( lite=false )

  # Create a StringScanner we can reuse for various lexing tasks
  @scanner = StringScanner::new( '' )

  # Make a structure to carry around stuff that gets placeholdered out of
  # the source.
  rs = RenderState::new( {}, {}, {} )

  # Make a copy of the string with normalized line endings, tabs turned to
  # spaces, and a couple of guaranteed newlines at the end. The copy is
  # made with #dup and edited in place: since Ruby 3.0 String#gsub on a
  # subclass returns a plain String, which has no #detab.
  text = self.dup
  text.gsub!( /\r\n?/, "\n" )
  text.detab!
  text += "\n\n"
  @log.debug "Normalized line-endings: %p" % text

  # Filter HTML if we're asked to do so: neutralize every angle bracket
  if self.filter_html
    text.gsub!( "<", "&lt;" )
    text.gsub!( ">", "&gt;" )
    @log.debug "Filtered HTML: %p" % text
  end

  # Simplify blank lines
  text.gsub!( /^ +$/, '' )
  @log.debug "Tabs -> spaces/blank lines stripped: %p" % text

  # Replace HTML blocks with placeholders
  text = hide_html_blocks( text, rs )
  @log.debug "Hid HTML blocks: %p" % text
  @log.debug "Render state: %p" % rs

  # Strip link definitions, store in render state
  text = strip_link_definitions( text, rs )
  @log.debug "Stripped link definitions: %p" % text
  @log.debug "Render state: %p" % rs

  # Escape meta-characters
  text = escape_special_chars( text )
  @log.debug "Escaped special characters: %p" % text

  # Transform block-level constructs
  text = apply_block_transforms( text, rs )
  @log.debug "After block-level transforms: %p" % text

  # Now swap back in all the escaped characters
  text = unescape_special_chars( text )
  @log.debug "After unescaping special characters: %p" % text

  return text
end
-
-
### Return a copy of the receiver with tabs expanded to spaces, using tab
### stops every +tabwidth+ columns.
def detab( tabwidth=TabWidth )
  expanded = self.dup
  expanded.detab!( tabwidth )
  expanded
end


### Expand tabs to spaces in place and return the receiver. Lines are
### obtained with split( /\n/ ), so trailing empty lines are dropped.
def detab!( tabwidth=TabWidth )
  lines = self.split( /\n/ ).map do |line|
    line.gsub( /(.*?)\t/ ) do
      # Text since the previous tab stop decides how far the next stop is
      segment = Regexp.last_match( 1 )
      segment + ' ' * (tabwidth - segment.length % tabwidth)
    end
  end

  self.replace( lines.join("\n") )
end
-
-
- #######
- #private
- #######
-
### Run every block-level transform, in order, over +str+ using the render
### state +rs+, and return the resulting text.
def apply_block_transforms( str, rs )
  # Port: This was called '_runBlockGamut' in the original

  @log.debug "Applying block transforms to:\n %p" % str

  # Order matters: each step consumes the output of the one before it, and
  # paragraphs are formed last.
  pipeline = [
    :transform_headers,
    :transform_hrules,
    :transform_lists,
    :transform_code_blocks,
    :transform_block_quotes,
    :transform_auto_links,
    :hide_html_blocks,
    :form_paragraphs,
  ]
  text = pipeline.inject( str ) {|current, step| __send__( step, current, rs ) }

  @log.debug "Done with block transforms:\n %p" % text
  text
end
-
-
### Run the span-level transforms over +str+ with the render state +rs+
### and return the result.
def apply_span_transforms( str, rs )
  @log.debug "Applying span transforms to:\n %p" % str

  spanned = transform_code_spans( str, rs )
  spanned = encode_html( spanned )
  [ :transform_images, :transform_anchors, :transform_italic_and_bold ].each do |step|
    spanned = __send__( step, spanned, rs )
  end

  # Hard breaks: two or more spaces before a newline
  spanned.gsub!( / {2,}\n/, "<br#{EmptyElementSuffix}\n" )

  @log.debug "Done with span transforms:\n %p" % spanned
  spanned
end
-
-
# Tags treated as block-level constructs, and alternation patterns built
# from them for use in the regexps below.
StrictBlockTags = %w[
  p div h[1-6] blockquote pre table dl ol ul script noscript
  form fieldset iframe math ins del
]
StrictTagPattern = StrictBlockTags.join( '|' )

LooseBlockTags = StrictBlockTags - %w[ins del]
LooseTagPattern = LooseBlockTags.join( '|' )

# Nested blocks:
# <div>
#     <div>
#     tags for inner block must be indented.
#     </div>
# </div>
StrictBlockRegex = %r{
  ^                         # Start of line
  <(#{StrictTagPattern})    # Start tag: \1
  \b                        # word break
  (.*\n)*?                  # Any number of lines, minimal match
  </\1>                     # Matching end tag
  [ ]*                      # trailing spaces
  $                         # End of line or document
}ix

# More-liberal block-matching: the end tag may follow other text on its line
LooseBlockRegex = %r{
  ^                         # Start of line
  <(#{LooseTagPattern})     # start tag: \1
  \b                        # word break
  (.*\n)*?                  # Any number of lines, minimal match
  .*</\1>                   # Anything + Matching end tag
  [ ]*                      # trailing spaces
  $                         # End of line or document
}ix

# Special case for <hr />.
HruleBlockRegex = %r{
  (                         # $1
    \A\n?                   # Start of doc + optional \n
    |                       # or
    .*\n\n                  # anything + blank line
  )
  (                         # save in $2
    [ ]*                    # Any spaces
    <hr                     # Tag open
    \b                      # Word break
    ([^<>])*?               # Attributes
    /?>                     # Tag close
    $                       # followed by a blank line or end of document
  )
}ix

### Return a copy of +str+ in which every HTML block that starts in the
### left margin is replaced by a placeholder (the MD5 of the block, set off
### by blank lines). The original blocks are stored in rs.html_blocks,
### keyed by placeholder.
def hide_html_blocks( str, rs )
  @log.debug "Hiding HTML blocks in %p" % str

  # Stores a matched block in the render state and yields its placeholder
  stash = lambda {|block|
    digest = Digest::MD5::hexdigest( block )
    rs.html_blocks[ digest ] = block
    @log.debug "Replacing %p with %p" % [ block, digest ]
    "\n\n#{digest}\n\n"
  }

  @log.debug "Finding blocks with the strict regex..."
  hidden = str.gsub( StrictBlockRegex, &stash )

  @log.debug "Finding blocks with the loose regex..."
  hidden = hidden.gsub( LooseBlockRegex, &stash )

  @log.debug "Finding hrules..."
  hidden.gsub( HruleBlockRegex ) { $1 + stash[$2] }
end
-
-
# Link defs are in the form: ^[id]: url "optional title"
LinkRegex = %r{
  ^[ ]*\[(.+)\]:        # id = $1
  [ ]*
  \n?                   # maybe *one* newline
  [ ]*
  <?(\S+?)>?            # url = $2
  [ ]*
  \n?                   # maybe one newline
  [ ]*
  (?:
    # Titles are delimited by "quotes" or (parens).
    ["(]
    (.+?)               # title = $3
    [")]                # Matching ) or "
    [ ]*
  )?                    # title is optional
  (?:\n+|\Z)
}x

### Return a copy of +str+ with all link definitions removed. Each
### definition is recorded in the RenderState +rs+ under its lowercased
### id: the HTML-encoded URL in rs.urls and, when present, the title (with
### double quotes turned into entities) in rs.titles.
def strip_link_definitions( str, rs )
  str.gsub( LinkRegex ) {|match|
    id, url, title = $1, $2, $3
    key = id.downcase

    rs.urls[ key ] = encode_html( url )
    # The title ends up inside a double-quoted attribute, so any literal
    # quote in it has to become an entity.
    rs.titles[ key ] = title.gsub( /"/, "&quot;" ) unless title.nil?

    ""
  }
end
-
-
### Return a copy of +str+ with Markdown-significant characters replaced by
### their MD5 placeholders: '*' and '_' inside tags, backslash escapes in
### plain text.
def escape_special_chars( str )
  @log.debug " Escaping special characters"
  escaped = ''

  # The original Markdown source has something called '$tags_to_skip'
  # declared here, but it's never used, so it is not defined.

  tokenize_html( str ) {|kind, chunk|
    @log.debug " Adding %p token %p" % [ kind, chunk ]

    escaped +=
      case kind
      when :tag
        # Within tags, encode * and _
        chunk.
          gsub( /\*/, EscapeTable['*'][:md5] ).
          gsub( /_/, EscapeTable['_'][:md5] )
      when :text
        # Encode backslashed stuff in regular text
        encode_backslash_escapes( chunk )
      else
        raise TypeError, "Unknown token type %p" % kind
      end
  }

  @log.debug " Text with escapes is now: %p" % escaped
  escaped
end
-
-
### Return a copy of +str+ in which every MD5 placeholder from EscapeTable
### is swapped back for the character it stands for. The argument itself
### is left untouched, so frozen strings are accepted.
def unescape_special_chars( str )
  text = str.dup

  EscapeTable.each {|char, entry|
    @log.debug "Unescaping escaped %p with %p" % [ char, entry[:md5re] ]
    # Block form so the replacement (which may be a backslash) is taken
    # literally.
    text.gsub!( entry[:md5re] ) { char }
  }

  return text
end
-
-
### Return a copy of +str+ in which every backslash-escaped special
### character is replaced by that character's MD5 placeholder.
def encode_backslash_escapes( str )
  # Doubled backslashes go first so that their second backslash cannot be
  # taken as the start of another escape.
  seed = str.gsub( /\\\\/, EscapeTable['\\'][:md5] )

  EscapeTable.inject( seed ) {|text, (char, esc)|
    char == '\\' ? text : text.gsub( esc[:re], esc[:md5] )
  }
end
-
-
### Return a copy of +str+ in which every Markdown horizontal rule (a line
### of three or more '-', '*' or '_', optionally spaced) becomes an <hr>
### tag. +rs+ is accepted for signature parity and not used.
def transform_hrules( str, rs )
  @log.debug " Transforming horizontal rules"

  rule_line = /^( ?[\-\*_] ?){3,}$/
  rule_tag = "\n<hr#{EmptyElementSuffix}\n"
  str.gsub( rule_line, rule_tag )
end
-
-
-
# Patterns to match list markers: numbered ("1.") and bulleted
ListMarkerOl  = /\d+\./
ListMarkerUl  = /[*+-]/
ListMarkerAny = Regexp.union( ListMarkerOl, ListMarkerUl )

# Matches one whole list: a marker indented by less than a tab width,
# followed by everything up to end of input or a blank line that is not
# followed by another list item.
ListRegexp = %r{
  (?:
    ^[ ]{0,#{TabWidth - 1}}   # Indent < tab width
    (#{ListMarkerAny})        # unordered or ordered ($1)
    [ ]+                      # At least one space
  )
  (?m:.+?)                    # item content (include newlines)
  (?:
    \z                        # Either EOF
    |                         # or
    \n{2,}                    # Blank line...
    (?=\S)                    # ...followed by non-space
    (?![ ]*                   # ...but not another item
      (#{ListMarkerAny})
      [ ]+)
  )
}x
-
### Return a copy of +str+ in which each Markdown list is wrapped in <ul>
### or <ol> (chosen from the list's first marker), with its items rendered
### by #transform_list_items.
def transform_lists( str, rs )
  @log.debug " Transforming lists at %p" % (str[0,100] + '...')

  str.gsub( ListRegexp ) {|list|
    @log.debug " Found list %p" % list
    marker = $1
    tag = ListMarkerUl.match( marker ) ? "ul" : "ol"

    # Turn every run of blank lines into exactly two blank lines
    items = transform_list_items( list.gsub( /\n{2,}/, "\n\n\n" ), rs )

    "<#{tag}>\n#{items}</#{tag}>\n"
  }
end
-
-
# Pattern matching a single list item, up to (but not including) the next
# marker at the same indentation or the end of the list text.
ListItemRegexp = %r{
  (\n)?                       # leading line = $1
  (^[ ]*)                     # leading whitespace = $2
  (#{ListMarkerAny}) [ ]+     # list marker = $3
  ((?m:.+?)                   # list item text = $4
  (\n{1,2}))
  (?= \n* (\z | \2 (#{ListMarkerAny}) [ ]+))
}x
-
### Return a copy of the list text +str+ with each item wrapped in <li>
### tags. Items preceded by a blank line, or containing one, get the full
### block transforms; all others get sub-list and span transforms only.
def transform_list_items( str, rs )
  @log.debug " Transforming list items"

  # Trim trailing blank lines
  trimmed = str.sub( /\n{2,}\z/, "\n" )

  trimmed.gsub( ListItemRegexp ) {|line|
    @log.debug " Found item line %p" % line
    leading_line, item = $1, $4

    rendered =
      if leading_line or /\n{2,}/.match( item )
        @log.debug " Found leading line or item has a blank"
        apply_block_transforms( outdent(item), rs )
      else
        # Recursion for sub-lists
        @log.debug " Recursing for sublist"
        nested = transform_lists( outdent(item), rs ).chomp
        apply_span_transforms( nested, rs )
      end

    "<li>#{rendered}</li>\n"
  }
end
-
-
# Pattern for matching codeblocks: one or more lines indented by a tab or a
# full tab width of spaces, starting at a blank line or the start of input.
CodeBlockRegexp = %r{
  (?:\n\n|\A)
  (                                   # $1 = the code block
    (?:
      (?:[ ]{#{TabWidth}} | \t)       # a tab or tab-width of spaces
      .*\n+
    )+
  )
  (^[ ]{0,#{TabWidth - 1}}\S|\Z)      # $2 = non-space at line-start
                                      # (consumed, not a lookahead),
                                      # or end of doc
}x
-
### Return a copy of +str+ in which each indented Markdown code block is
### outdented, encoded and wrapped in <pre><code> tags.
def transform_code_blocks( str, rs )
  @log.debug " Transforming code blocks"

  str.gsub( CodeBlockRegexp ) {|block|
    codeblock, remainder = $1, $2
    body = encode_code( outdent(codeblock), rs ).rstrip

    # The pattern also consumed the start of the following line, so it is
    # put back after the generated block.
    "\n\n<pre><code>#{body}\n</code></pre>\n\n#{remainder}"
  }
end
-
-
# Pattern for matching Markdown blockquote blocks
BlockQuoteRegexp = %r{
  (?:
    ^[ ]*>[ ]?        # '>' at the start of a line
    .+\n              # rest of the first line
    (?:.+\n)*         # subsequent consecutive lines
    \n*               # blanks
  )+
}x

# A <pre> element (with any leading whitespace) inside rendered output
PreChunk = %r{ ( ^ \s* <pre> .+? </pre> ) }xm

### Return a copy of +str+ in which each Markdown blockquote is stripped of
### one level of '>' quoting, run through the block transforms, indented
### and wrapped in <blockquote> tags.
def transform_block_quotes( str, rs )
  @log.debug " Transforming block quotes"

  str.gsub( BlockQuoteRegexp ) {|quote|
    @log.debug "Making blockquote from %p" % quote

    unquoted = quote.
      gsub( /^ *> ?/, '' ).   # Trim one level of quoting
      gsub( /^ +$/, '' )      # Trim whitespace-only lines

    indent = " " * TabWidth
    indented = apply_block_transforms( unquoted, rs ).gsub( /^/, indent )

    # Remove the indent again from the lines of <pre> chunks
    body = indented.gsub( PreChunk ) {|pre| pre.gsub( /^#{indent}/, '' ) }

    quoted = "<blockquote>\n#{body}\n</blockquote>\n\n"
    @log.debug "Blockquoted chunk is: %p" % quoted
    quoted
  }
end
-
-
# <http://...>, <https://...> or <ftp:...> written as an automatic link
AutoAnchorURLRegexp = /<((https?|ftp):[^'">\s]+)>/

# <user@host.tld> written as an automatic mail link
AutoAnchorEmailRegexp = %r{
  <
  (
    [-.\w]+
    \@
    [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
  )
  >
}xi

### Return a copy of +str+ in which bracketed URLs become anchors and
### bracketed email addresses become encoded mailto: anchors.
def transform_auto_links( str, rs )
  @log.debug " Transforming auto-links"

  with_urls = str.gsub( AutoAnchorURLRegexp ) { %{<a href="#{$1}">#{$1}</a>} }

  with_urls.gsub( AutoAnchorEmailRegexp ) {|addr|
    encode_email_address( unescape_special_chars($1) )
  }
end
-
-
# Encoder functions to turn one byte (an Integer) of an email address into
# a decimal entity, a hexadecimal entity, or the raw character.
Encoders = [
  lambda {|char| "&#%03d;" % char},
  lambda {|char| "&#x%X;" % char},
  lambda {|char| char.chr },
]

### Return an anchor tag for the email address +addr+ whose href
### ("mailto:" + addr) and link text (addr) are obfuscated with randomly
### chosen character entities. The ':' after "mailto" is always left
### literal and '@' is always written as an entity.
def encode_email_address( addr )
  # String#each_byte yields Integers, so compare against byte values; the
  # ?: and ?@ literals are one-character Strings on Ruby 1.9+ and would
  # never match.
  colon   = ':'.ord
  at_sign = '@'.ord

  rval = ''
  ("mailto:" + addr).each_byte {|b|
    case b
    when colon
      rval += ":"
    when at_sign
      rval += Encoders[ rand(2) ][ b ]
    else
      # Roughly 10% raw, 45% hexadecimal, 45% decimal
      r = rand(100)
      rval += (
        r > 90 ? Encoders[2][ b ] :
        r < 45 ? Encoders[1][ b ] :
                 Encoders[0][ b ]
      )
    end
  }

  # The link text is the encoded string minus everything up to the colon
  return %{<a href="%s">%s</a>} % [ rval, rval.sub(/.+?:/, '') ]
end
-
-
# Regex for matching Setext-style headers
SetextHeaderRegexp = %r{
  (.+)            # The title text ($1)
  \n
  ([\-=])+        # Match a line of = or -. Save only one in $2.
  [ ]*\n+
}x

# Regexp for matching ATX-style headers
AtxHeaderRegexp = %r{
  ^(\#{1,6})      # $1 = string of #'s
  [ ]*
  (.+?)           # $2 = Header text
  [ ]*
  \#*             # optional closing #'s (not counted)
  \n+
}x

### Apply the Markdown header transforms to a copy of the given +str+ and
### render state +rs+ and return the result.
def transform_headers( str, rs )
  @log.debug " Transforming headers"

  # Setext-style headers:
  #   Header 1
  #   ========
  #
  #   Header 2
  #   --------
  #
  setext_levels = { '=' => 1, '-' => 2 }
  after_setext = str.gsub( SetextHeaderRegexp ) {|m|
    @log.debug "Found setext-style header"
    title, hdrchar = $1, $2
    title = apply_span_transforms( title, rs )

    level = setext_levels[ hdrchar ]
    level ? "<h#{level}>#{title}</h#{level}>\n\n" : title
  }

  # ATX-style headers: the number of leading #'s is the header level
  after_setext.gsub( AtxHeaderRegexp ) {|m|
    @log.debug "Found ATX-style header"
    hdrchars, title = $1, $2
    title = apply_span_transforms( title, rs )

    level = hdrchars.length
    "<h#{level}>#{title}</h#{level}>\n\n"
  }
end
-
-
### Split +str+ on blank lines and return the chunks re-joined with blank
### lines, where each chunk is either swapped for the hidden HTML block it
### is a placeholder for (per rs.html_blocks) or span-transformed and
### wrapped in <p> tags.
def form_paragraphs( str, rs )
  @log.debug " Forming paragraphs"

  stripped = str.sub( /\A\n+/, '' ).sub( /\n+\z/, '' )

  chunks = stripped.split( /\n{2,}/ ).map do |graf|
    if rs.html_blocks.key?( graf )
      # Unhashify HTML blocks if this is a placeholder
      rs.html_blocks[ graf ]
    else
      # Otherwise, wrap in <p> tags (dropping the first line's indent)
      apply_span_transforms( graf, rs ).sub( /^[ ]*/, '<p>' ) + '</p>'
    end
  end
  rval = chunks.join( "\n\n" )

  @log.debug " Formed paragraphs: %p" % rval
  rval
end
-
-
# Pattern to match the linkid part of an anchor tag for reference-style
# links.
RefLinkIdRegex = %r{
  [ ]?              # Optional leading space
  (?:\n[ ]*)?       # Optional newline + spaces
  \[
  (.*?)             # Id = $1
  \]
}x

# Pattern to match the "(url "title")" part of an inline link
InlineLinkRegex = %r{
  \(                # Literal paren
  [ ]*              # Zero or more spaces
  <?(.+?)>?         # URI = $1
  [ ]*              # Zero or more spaces
  (?:               #
    ([\"\'])        # Opening quote char = $2
    (.*?)           # Title = $3
    \2              # Matching quote char
  )?                # Title is optional
  \)
}x

### Apply Markdown anchor transforms to a copy of the specified +str+ with
### the given render state +rs+ and return it. Handles reference-style
### links ([text][id], [text][]) looked up in rs.urls / rs.titles and
### inline links ([text](url "title")); bracketed text that is not a link
### is copied through unchanged. Uses the shared @scanner.
def transform_anchors( str, rs )
  @log.debug " Transforming anchors"
  @scanner.string = str.dup
  text = ''

  # Scan the whole string (#eos? replaces the obsolete #empty?)
  until @scanner.eos?

    if @scanner.scan( /\[/ )
      link = ''; linkid = ''
      depth = 1
      startpos = @scanner.pos
      @log.debug " Found a bracket-open at %d" % startpos

      # Scan the rest of the tag, allowing unlimited nested []s. If
      # the scanner runs out of text before the opening bracket is
      # closed, append the text and return (wasn't a valid anchor).
      while depth.nonzero?
        linktext = @scanner.scan_until( /\]|\[/ )

        if linktext
          @log.debug " Found a bracket at depth %d: %p" % [ depth, linktext ]
          link += linktext

          # Decrement depth for each closing bracket
          depth += ( linktext[-1, 1] == ']' ? -1 : 1 )
          @log.debug " Depth is now #{depth}"

        # If there's no more brackets, it must not be an anchor, so
        # just abort.
        else
          @log.debug " Missing closing brace, assuming non-link."
          link += @scanner.rest
          @scanner.terminate
          return text + '[' + link
        end
      end
      link.slice!( -1 ) # Trim final ']'
      @log.debug " Found leading link %p" % link

      # Look for a reference-style second part
      if @scanner.scan( RefLinkIdRegex )
        linkid = @scanner[1]
        linkid = link.dup if linkid.empty?   # [text][] uses the text as id
        linkid.downcase!
        @log.debug " Found a linkid: %p" % linkid

        # If there's a matching link in the link table, build an
        # anchor tag for it.
        if rs.urls.key?( linkid )
          @log.debug " Found link key in the link table: %p" % rs.urls[linkid]
          url = escape_md( rs.urls[linkid] )

          text += %{<a href="#{url}"}
          if rs.titles.key?(linkid)
            text += %{ title="%s"} % escape_md( rs.titles[linkid] )
          end
          text += %{>#{link}</a>}

        # If the link referred to doesn't exist, just append the raw
        # source to the result
        else
          @log.debug " Linkid %p not found in link table" % linkid
          @log.debug " Appending original string instead: "
          @log.debug "%p" % @scanner.string[ startpos-1 .. @scanner.pos-1 ]
          text += @scanner.string[ startpos-1 .. @scanner.pos-1 ]
        end

      # ...or for an inline style second part
      elsif @scanner.scan( InlineLinkRegex )
        url = @scanner[1]
        title = @scanner[3]
        @log.debug " Found an inline link to %p" % url

        text += %{<a href="%s"} % escape_md( url )
        if title
          # The title sits inside a double-quoted attribute, so literal
          # quotes must become entities.
          title.gsub!( /"/, "&quot;" )
          text += %{ title="%s"} % escape_md( title )
        end
        text += %{>#{link}</a>}

      # No linkid part: just append the first part as-is.
      else
        @log.debug "No linkid, so no anchor. Appending literal text."
        text += @scanner.string[ startpos-1 .. @scanner.pos-1 ]
      end # if linkid

    # Plain text
    else
      @log.debug " Scanning to the next link from %p" % @scanner.rest
      text += @scanner.scan( /[^\[]+/ )
    end

  end # until @scanner.eos?

  return text
end
-
-
# Pattern to match strong emphasis in Markdown text
BoldRegexp = %r{ (\*\*|__) (\S|\S.+?\S) \1 }x

# Pattern to match normal emphasis in Markdown text
ItalicRegexp = %r{ (\*|_) (\S|\S.+?\S) \1 }x

### Return a copy of +str+ with **bold** / __bold__ text wrapped in
### <strong> and *italic* / _italic_ text wrapped in <em>. Bold is handled
### first so its doubled markers are not taken for emphasis.
def transform_italic_and_bold( str, rs )
  @log.debug " Transforming italic and bold"

  with_strong = str.gsub( BoldRegexp ) { "<strong>#{$2}</strong>" }
  with_strong.gsub( ItalicRegexp ) { "<em>#{$2}</em>" }
end
-
-
### Return a copy of +str+ in which backtick-delimited spans are encoded
### with #encode_code and wrapped in <code> tags. A span is closed by the
### same number of backticks that opened it. Returns +str+ itself when it
### contains no backtick. Raises FormatError if an opened span is never
### closed. Uses the shared @scanner.
def transform_code_spans( str, rs )
  @log.debug " Transforming code spans"

  # Set up the string scanner and just return the string unless there's at
  # least one backtick.
  @scanner.string = str.dup
  unless @scanner.exist?( /`/ )
    @scanner.terminate
    @log.debug "No backticks found for code span in %p" % str
    return str
  end

  @log.debug "Transforming code spans in %p" % str

  # Build the transformed text anew
  text = ''

  # Scan to the end of the string (#eos? replaces the obsolete #empty?)
  until @scanner.eos?

    # Scan up to, but not into, an opening backtick. The pattern is a pure
    # lookahead so that no backtick of the opening run is consumed here.
    if pre = @scanner.scan_until( /(?=`)/ )
      text += pre
      @log.debug "Found backtick at %d after '...%s'" % [ @scanner.pos, text[-10, 10] ]

      # Make a pattern to find the end of the span
      opener = @scanner.scan( /`+/ )
      len = opener.length
      closer = Regexp::new( opener )
      @log.debug "Scanning for end of code span with %p" % closer

      # Scan until the end of the closing backtick sequence. Chop the
      # backticks off the resultant string, strip leading and trailing
      # whitespace, and encode any entities contained in it.
      codespan = @scanner.scan_until( closer ) or
        raise FormatError::new( @scanner.rest[0,20],
          "No %p found before end" % opener )

      @log.debug "Found close of code span at %d: %p" % [ @scanner.pos - len, codespan ]
      codespan.slice!( -len, len )
      text += "<code>%s</code>" %
        encode_code( codespan.strip, rs )

    # If there's no more backticks, just append the rest of the string
    # and move the scan pointer to the end
    else
      text += @scanner.rest
      @scanner.terminate
    end
  end

  return text
end
-
-
# Inline images: ![alt text](url "optional title")
# (No encoding flag here: in Ruby the regexp option `s` selects the
# Windows-31J encoding, which makes matching any non-ASCII UTF-8 text
# raise.)
InlineImageRegexp = %r{
  (                     # Whole match = $1
    !\[ (.*?) \]        # alt text = $2
    \([ ]*
    <?(\S+?)>?          # source url = $3
    [ ]*
    (?:                 #
      (["'])            # quote char = $4
      (.*?)             # title = $5
      \4                # matching quote
      [ ]*
    )?                  # title is optional
    \)
  )
}x


# Reference-style images: ![alt text][id]
ReferenceImageRegexp = %r{
  (                     # Whole match = $1
    !\[ (.*?) \]        # Alt text = $2
    [ ]?                # Optional space
    (?:\n[ ]*)?         # One optional newline + spaces
    \[ (.*?) \]         # id = $3
  )
}x

### Return a copy of +str+ with image markup turned into <img> tags.
### Reference-style images are resolved through rs.urls / rs.titles and
### left untouched when their id is unknown; inline images carry their own
### URL and optional title. Double quotes in alt text and inline titles are
### written as entities because both end up in quoted attributes.
def transform_images( str, rs )
  @log.debug " Transforming images"

  # Handle reference-style labeled images: ![alt text][id]
  str.
    gsub( ReferenceImageRegexp ) {|match|
      whole, alt, linkid = $1, $2, $3.downcase
      @log.debug "Matched %p" % match
      alt.gsub!( /"/, '&quot;' )

      # for shortcut links like ![this][].
      linkid = alt.downcase if linkid.empty?

      if rs.urls.key?( linkid )
        url = escape_md( rs.urls[linkid] )
        @log.debug "Found url '%s' for linkid '%s' " % [ url, linkid ]

        # Build the tag
        result = %{<img src="%s" alt="%s"} % [ url, alt ]
        if rs.titles.key?( linkid )
          result += %{ title="%s"} % escape_md( rs.titles[linkid] )
        end
        result += EmptyElementSuffix

      else
        result = whole
      end

      @log.debug "Replacing %p with %p" % [ match, result ]
      result
    }.

    # Inline image style
    gsub( InlineImageRegexp ) {|match|
      @log.debug "Found inline image %p" % match
      whole, alt, title = $1, $2, $5
      url = escape_md( $3 )
      alt.gsub!( /"/, '&quot;' )

      # Build the tag
      result = %{<img src="%s" alt="%s"} % [ url, alt ]
      unless title.nil?
        title.gsub!( /"/, '&quot;' )
        result += %{ title="%s"} % escape_md( title )
      end
      result += EmptyElementSuffix

      @log.debug "Replacing %p with %p" % [ match, result ]
      result
    }
end
-
-
# Regexp to match characters inside code that are special to Markdown
CodeEscapeRegexp = %r{( \* | _ | \{ | \} | \[ | \] | \\ )}x

### Return a copy of +str+ made safe for use inside <code>: '&', '<' and
### '>' become HTML entities, and characters special to Markdown are
### replaced by their MD5 placeholders from EscapeTable. +rs+ is not used.
def encode_code( str, rs )
  # '&' must go first so the entities produced below are not re-encoded
  str.gsub( %r{&}, '&amp;' ).
    gsub( %r{<}, '&lt;' ).
    gsub( %r{>}, '&gt;' ).
    gsub( CodeEscapeRegexp ) {|match| EscapeTable[match][:md5]}
end
-
-
-
- #################################################################
- ### U T I L I T Y F U N C T I O N S
- #################################################################
-
### Return a copy of +str+ with every '*' and '_' replaced by its MD5
### placeholder from EscapeTable.
def escape_md( str )
  star = EscapeTable['*'][:md5]
  underscore = EscapeTable['_'][:md5]

  str.gsub( /\*/, star ).gsub( /_/, underscore )
end
-
-
# Matching constructs for tokenizing X/HTML
HTMLCommentRegexp = %r{ <! ( -- .*? -- \s* )+ > }mx
XMLProcInstRegexp = %r{ <\? .*? \?> }mx
MetaTag = Regexp::union( HTMLCommentRegexp, XMLProcInstRegexp )

HTMLTagOpenRegexp = %r{ < [a-z/!$] [^<>]* }imx
HTMLTagCloseRegexp = %r{ > }x
HTMLTagPart = Regexp::union( HTMLTagOpenRegexp, HTMLTagCloseRegexp )

### Break the HTML source in +str+ into a series of tokens and return
### them. The tokens are 2-element Arrays of a type (:tag or :text) and the
### actual content. If called with a block, the type and text of each
### token are also yielded to it one at a time as they are extracted.
### Raises RuntimeError for a tag that is opened but never closed. Uses the
### shared @scanner.
def tokenize_html( str )
  tokens = []
  @scanner.string = str.dup
  type, token = nil, nil

  # (#eos? replaces the obsolete #empty?)
  until @scanner.eos?
    @log.debug "Scanning from %p" % @scanner.rest

    # Match comments and PIs without nesting
    if (( token = @scanner.scan(MetaTag) ))
      type = :tag

    # Do nested matching for HTML tags
    elsif (( token = @scanner.scan(HTMLTagOpenRegexp) ))
      tagstart = @scanner.pos
      @log.debug " Found the start of a plain tag at %d" % tagstart

      # Start the token with the opening angle
      depth = 1
      type = :tag

      # Scan the rest of the tag, allowing unlimited nested <>s. If
      # the scanner runs out of text before the tag is closed, raise
      # an error.
      while depth.nonzero?

        # Scan either an opener or a closer
        chunk = @scanner.scan( HTMLTagPart ) or
          raise "Malformed tag at character %d: %p" %
            [ tagstart, token + @scanner.rest ]

        @log.debug " Found another part of the tag at depth %d: %p" % [ depth, chunk ]

        token += chunk

        # If the last character of the token so far is a closing
        # angle bracket, decrement the depth. Otherwise increment
        # it for a nested tag.
        depth += ( token[-1, 1] == '>' ? -1 : 1 )
        @log.debug " Depth is now #{depth}"
      end

    # Match text segments
    else
      @log.debug " Looking for a chunk of text"
      type = :text

      # Scan forward, always matching at least one character to move
      # the pointer beyond any non-tag '<'. When only '<' characters
      # remain there is nothing for scan_until to find; take the rest as
      # text so the scanner still advances.
      token = @scanner.scan_until( /[^<]+/m ) || @scanner.scan( /.+/m )
    end

    @log.debug " type: %p, token: %p" % [ type, token ]

    # If a block is given, feed it one token at a time. Add the token to
    # the token list to be returned regardless.
    if block_given?
      yield( type, token )
    end
    tokens << [ type, token ]
  end

  return tokens
end
-
-
### Return a copy of +str+ in which ampersands that do not already begin
### an entity become "&amp;", and '<' characters that cannot begin a tag
### (not followed by a letter, '/', '?', '$' or '!') become "&lt;".
def encode_html( str )
  str.gsub( /&(?!#?[x]?(?:[0-9a-f]+|\w+);)/i, "&amp;" ).
    gsub( %r{<(?![a-z/?\$!])}i, "&lt;" )
end
-
-
### Return a copy of +str+ with one level of indentation (a tab, or up to
### TabWidth spaces) removed from the start of every line.
def outdent( str )
  one_level = /^(\t|[ ]{1,#{TabWidth}})/
  str.gsub( one_level, '' )
end
-
- end # class BlueCloth
-