/lib/rpagedown/converter.rb
Ruby | 740 lines | 687 code | 53 blank | 0 comment | 20 complexity | a21c1b1a710140d31fc9d3d7b50095ac MD5 | raw file
- require 'rpagedown/hook_collection'
- module RPageDown
- class Converter
- attr_reader :hooks
# Creates a converter: registers the three no-op plugin hooks and puts the
# per-conversion state into its "idle" configuration.
def initialize
  @pluginHooks = @hooks = HookCollection.new
  [:plain_link_text, :pre_conversion, :post_conversion].each do |hook_name|
    @pluginHooks.add_noop(hook_name)
  end

  # Populated for the duration of a make_html call, nil otherwise.
  @g_urls = @g_titles = @g_html_blocks = nil
  @g_list_level = -1

  # Regexp source fragments for the list item markers of each list type.
  @_list_item_markers = { :ol => '\d+[.]', :ul => '[*+-]' }
  @_problem_url_chars = /(?:["'*()\[\]:]|~D)/
end
- # Note on $1..$9: every read of a capture global yields a fresh String, so
- # calling gsub! directly on it only changes that temporary copy.
- # Assign the capture to a local variable first and mutate the local instead;
- # the capture globals themselves stay unchanged
- # until the next regexp match replaces them.
# Converts Markdown source into HTML.
#
# Steps: pre_conversion hook, input normalisation ("~" and "$" are escaped,
# line endings unified, tabs expanded, whitespace-only lines blanked), HTML
# block hashing, link-definition stripping, the block gamut, un-escaping and
# finally the post_conversion hook.
#
# @param text [String] Markdown source
# @return the value returned by the post_conversion hook for the generated HTML
# @raise [RuntimeError] if a conversion is already running on this instance
def make_html(text)
  # @g_urls doubles as the "conversion in progress" flag.
  raise "Recursive call to converter.makeHtml" if @g_urls

  begin
    @g_urls = {}
    @g_titles = {}
    @g_html_blocks = []
    @g_list_level = 0

    # Interpolation yields a new String, so the gsub! calls below never
    # mutate the object handed in by the caller (or returned by the hook).
    text = "#{@pluginHooks.pre_conversion(text)}"

    # "~" and "$" are used as internal escape markers, so hide the literal ones.
    text.gsub!(/~/, '~T')
    text.gsub!(/\$/, '~D')

    # Normalise line endings and pad with sentinel newlines.
    text.gsub!(/\r\n/, "\n")
    text.gsub!(/\r/, "\n")
    text = "\n\n#{text}\n\n"

    text = _detab(text)
    text.gsub!(/^[ \t]+$/, '')

    text = _hash_html_blocks(text)
    text = _strip_link_definitions(text)
    text = _run_block_gamut(text)
    text = _unescape_special_chars(text)

    # Restore the literal characters hidden above.
    text.gsub!(/~D/, '$')
    text.gsub!(/~T/, '~')

    @pluginHooks.post_conversion(text)
  ensure
    # Always clear the per-conversion state; otherwise one failing conversion
    # would make every later call raise the "Recursive call" error.
    @g_html_blocks = @g_titles = @g_urls = nil
  end
end
- private
# Removes reference-style link definitions (`[id]: url "title"`) from the
# text and records them in @g_urls / @g_titles, keyed by the lower-cased id.
#
# @param text [String] document text
# @return [String] a copy of the text with the definitions removed
def _strip_link_definitions(text)
  definition = %r{
    ^[ ]{0,3}\[(.+)\]:   # id = $1  attacklab: g_tab_width - 1
    [ \t]*
    \n?                  # maybe *one* newline
    [ \t]*
    <?(\S+?)>?           # url = $2
    (?=\s|$)             # lookahead for whitespace instead of the lookbehind removed below
    [ \t]*
    \n?                  # maybe one newline
    [ \t]*
    (                    # (potential) title = $3
      (\n*)              # any lines skipped = $4  attacklab: lookbehind removed
      [ \t]+
      ["(]
      (.+?)              # title = $5
      [")]
      [ \t]*
    )?                   # title is optional
    (?:\n+|$)
  }x

  text.gsub(definition) do
    m = Regexp.last_match
    id = m[1].downcase
    @g_urls[id] = _encode_amps_and_angles(m[2])

    # Newlines were skipped before the would-be title, so it is not treated as
    # a title: hand the captured text back to the document.
    next m[3] if m[4] && !m[4].empty?

    # The title ends up inside a double-quoted HTML attribute.
    @g_titles[id] = m[5].gsub(/"/, "&quot;") if m[5] && !m[5].empty?
    ""
  end
end
# Replaces raw block-level HTML in the text with `~K<n>K` placeholders (via
# hash_element) so that later Markdown passes leave it untouched.
#
# Five patterns are applied in order: nested block tags, the more liberal
# single-block form, <hr>, HTML comments and <? ?> / <% %> blocks.
#
# @param text [String] document text; modified in place
# @return [String] the same string
def _hash_html_blocks(text)
  nestable_tags = "p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del"
  liberal_tags = "p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math"

  patterns = [
    %r{
      (                       # save in $1
        ^                     # start of line (with /m)
        <(#{nestable_tags})   # start tag = $2
        \b                    # word break
                              # attacklab: hack around khtml/pcre bug...
        [^\r]*?\n             # any number of lines, minimally matching
        </\2>                 # the matching end tag
        [ \t]*                # trailing spaces/tabs
        (?=\n+)               # followed by a newline
      )                       # attacklab: there are sentinel newlines at end of document
    }x,
    %r{
      (                       # save in $1
        ^                     # start of line (with /m)
        <(#{liberal_tags})    # start tag = $2
        \b                    # word break
                              # attacklab: hack around khtml/pcre bug...
        [^\r]*?               # any number of lines, minimally matching
        .*</\2>               # the matching end tag
        [ \t]*                # trailing spaces/tabs
        (?=\n+)               # followed by a newline
      )                       # attacklab: there are sentinel newlines at end of document
    }x,
    %r{
      \n                      # Starting after a blank line
      [ ]{0,3}
      (                       # save in $1
        (<(hr)                # start tag = $2
        \b                    # word break
        ([^<>])*?
        \/?>)                 # the matching end tag
        [ \t]*
        (?=\n{2,})            # followed by a blank line
      )
    }x,
    %r{
      \n\n                    # Starting after a blank line
      [ ]{0,3}                # attacklab: g_tab_width - 1
      (                       # save in $1
        <!
        (--(?:|(?:[^>-]|-[^>])(?:[^-]|-[^-])*)--)   # see http://www.w3.org/TR/html-markup/syntax.html#comments and http://meta.stackoverflow.com/q/95256
        >
        [ \t]*
        (?=\n{2,})            # followed by a blank line
      )
    }x,
    %r{
      (?:
        \n\n                  # Starting after a blank line
      )
      (                       # save in $1
        [ ]{0,3}              # attacklab: g_tab_width - 1
        (?:
          <([?%])             # $2
          [^\r]*?
          \2>
        )
        [ \t]*
        (?=\n{2,})            # followed by a blank line
      )
    }x
  ]

  patterns.each do |pattern|
    text.gsub!(pattern) { |found| hash_element(found, Regexp.last_match(1)) }
  end
  text
end
# Stores one HTML block in @g_html_blocks and returns its placeholder.
#
# @param match [String] the whole regexp match (unused)
# @param m1 [String] the captured block; leading/trailing newlines are
#   stripped in place before it is stored
# @return [String] "~K<index>K" surrounded by blank lines
def hash_element(match, m1)
  m1.sub!(/\A\n+/, '')
  m1.sub!(/\n+\z/, '')
  @g_html_blocks << m1
  "\n\n~K#{@g_html_blocks.size - 1}K\n\n"
end
# Applies every block-level transformation to the text: headers, horizontal
# rules, lists, code blocks, block quotes, HTML block hashing and finally
# paragraph formation.
#
# @param text [String] block-level Markdown
# @param do_not_unhash passed through to _form_paragraphs
# @return [String] the result of _form_paragraphs
def _run_block_gamut(text, do_not_unhash = nil)
  text = _do_headers(text)

  # Horizontal rules: three or more "*", "-" or "_" on a line of their own.
  ['\*', '-', '_'].each do |rule_char|
    text.gsub!(/^[ ]{0,2}([ ]?#{rule_char}[ ]?){3,}[ \t]*$/, "<hr />\n")
  end

  text = _do_block_quotes(_do_code_blocks(_do_lists(text)))

  # Hash again so the HTML generated above is not wrapped in <p> tags.
  _form_paragraphs(_hash_html_blocks(text), do_not_unhash)
end
# Applies every span-level (inline) transformation to the text: code spans,
# escaping inside tag attributes, backslash escapes, images, anchors,
# auto-links, ampersand/angle encoding, emphasis and hard line breaks.
#
# @param text [String] inline Markdown
# @return [String] the converted text
def _run_span_gamut(text)
  text = _do_code_spans(text)
  text = _escape_special_chars_within_tag_attributes(text)
  text = _encode_backslash_escapes(text)

  # Images must be processed before anchors because the two syntaxes overlap.
  text = _do_images(text)
  text = _do_anchors(text)
  text = _do_auto_links(text)

  # "~P" stands in for "://" in link text and code spans; restore it now that
  # auto-linking is done.
  text.gsub!(/~P/, '://')

  text = _encode_amps_and_angles(text)
  text = _do_italics_and_bold(text)

  # Hard line break: a line ending in two or more spaces. A single trailing
  # space must not produce a <br />.
  text.gsub(/ {2,}\n/, " <br />\n")
end
# Escapes Markdown-significant characters that occur inside HTML tags and
# HTML comments so that later span-level passes do not interpret them.
#
# Inside ordinary tags backslash, backtick, "*" and "_" are escaped; inside
# markup starting with "<!" (comments) "/" is escaped as well.
#
# @param text [String] inline text; modified in place
# @return [String] the same string
def _escape_special_chars_within_tag_attributes(text)
  tag_or_comment = /(<[a-z\/!$]("[^"]*"|'[^']*'|[^'">])*>|<!(--(?:|(?:[^>-]|-[^>])(?:[^-]|-[^-])*)--)>)/i
  text.gsub!(tag_or_comment) do |match|
    # Code spans already converted inside the tag are turned back into backticks.
    tag = match.gsub(/(.)<\/?code>(?=.)/, "\\1`")
    # match always starts with "<", so the "!" of "<!" is the second character.
    chars = match[1] == '!' ? "\\`*_/" : "\\`*_"
    escape_characters(tag, chars)
  end
  text
end
# Turns Markdown links into <a> tags. Three forms are handled, in order:
# reference links `[text][id]`, inline links `[text](url "title")` and
# shortcut reference links `[text]`. Each match is rendered by
# write_anchor_tag, which always receives seven capture groups.
#
# @param text [String] inline text; modified in place
# @return [String] the same string
def _do_anchors(text)
  reference_link = %r{
    (                         # wrap whole match in $1
      \[
      (
        (?:
          \[[^\]]*\]          # allow brackets nested one level
          |
          [^\[]               # or anything else
        )*
      )
      \]
      [ ]?                    # one optional space
      (?:\n[ ]*)?             # one optional newline followed by spaces
      \[
      (.*?)                   # id = $3
      \]
    )
    ()()()()                  # pad remaining backreferences
  }x

  inline_link = %r{
    (                         # wrap whole match in $1
      \[
      (
        (?:
          \[[^\]]*\]          # allow brackets nested one level
          |
          [^\[\]]             # or anything else
        )*
      )
      \]
      \(                      # literal paren
      [ \t]*
      ()                      # no id, so leave $3 empty
      <?(                     # href = $4
        (?:
          \([^)]*\)           # allow one level of (correctly nested) parens (think MSDN)
          |
          [^()\s]
        )*?
      )>?
      [ \t]*
      (                       # $5
        (['"])                # quote char = $6
        (.*?)                 # Title = $7
        \6                    # matching quote
        [ \t]*                # ignore any spaces/tabs between closing quote and )
      )?                      # title is optional
      \)
    )
  }x

  shortcut_link = %r{
    (                         # wrap whole match in $1
      \[
      ([^\[\]]+)              # link text = $2; can't contain '[' or ']'
      \]
    )
    ()()()()()                # pad rest of backreferences
  }x

  [reference_link, inline_link, shortcut_link].each do |pattern|
    text.gsub!(pattern) do |found|
      m = Regexp.last_match
      write_anchor_tag(found, m[1], m[2], m[3], m[4], m[5], m[6], m[7])
    end
  end
  text
end
# Renders one link matched by _do_anchors.
#
# @param match [String] whole regexp match (unused)
# @param m1 [String] the full matched text, returned unchanged when a
#   reference cannot be resolved
# @param m2 [String] link text
# @param m3 [String] reference id ('' when absent)
# @param m4 [String] inline URL ('' for reference-style links)
# @param m5 unused
# @param m6 unused
# @param m7 [String, nil] inline title
# @return [String] an <a> tag, or m1 for an unresolved reference
def write_anchor_tag(match, m1, m2, m3, m4, m5, m6, m7)
  label = m2.gsub(/:\/\//, "~P")
  ref = m3.downcase
  href = m4
  tip = m7 || ''

  if href == ''
    # An empty id means the link text itself is the id.
    ref = label.downcase.gsub(/ ?\n/, " ") if ref == ''

    if @g_urls.has_key?(ref)
      href = @g_urls[ref]
      tip = @g_titles[ref] if @g_titles.has_key?(ref)
    elsif m1 =~ /\(\s*\)\z/
      # "[text]()" is a deliberate link with an empty URL.
      href = ''
    else
      return m1
    end
  end

  href = escape_characters(encode_problem_url_chars(href), "*_")
  tag = %Q{<a href="#{href}"}
  unless tip.empty?
    tag << %Q{ title="#{escape_characters(attribute_encode(tip), "*_")}"}
  end
  tag << %Q{>#{label}</a>}
end
# Turns Markdown images into <img> tags. Reference-style images
# `![alt][id]` are handled first, then inline images `![alt](url "title")`.
# Each match is rendered by write_image_tag, which always receives seven
# capture groups.
#
# @param text [String] inline text; modified in place
# @return [String] the same string
def _do_images(text)
  reference_image = %r{
    (                 # wrap whole match in $1
      !\[
      (.*?)           # alt text = $2
      \]
      [ ]?            # one optional space
      (?:\n[ ]*)?     # one optional newline followed by spaces
      \[
      (.*?)           # id = $3
      \]
    )
    ()()()()          # pad rest of backreferences
  }x

  inline_image = %r{
    (                 # wrap whole match in $1
      !\[
      (.*?)           # alt text = $2
      \]
      \s?             # One optional whitespace character
      \(              # literal paren
      [ \t]*
      ()              # no id, so leave $3 empty
      <?(\S+?)>?      # src url = $4
      [ \t]*
      (               # $5
        (['"])        # quote char = $6
        (.*?)         # title = $7
        \6            # matching quote
        [ \t]*
      )?              # title is optional
      \)
    )
  }x

  [reference_image, inline_image].each do |pattern|
    text.gsub!(pattern) do |found|
      m = Regexp.last_match
      write_image_tag(found, m[1], m[2], m[3], m[4], m[5], m[6], m[7])
    end
  end
  text
end
# Makes text safe for use inside a double-quoted HTML attribute by encoding
# ">", "<" and the double quote as entities. Ampersands are left alone.
#
# @param text [String] raw attribute value
# @return [String] an encoded copy
def attribute_encode(text)
  text.gsub(/>/, '&gt;').gsub(/</, '&lt;').gsub(/"/, '&quot;')
end
# Renders one image matched by _do_images.
#
# @param match [String] whole regexp match (unused)
# @param m1 [String] the full matched text, returned unchanged when a
#   reference cannot be resolved
# @param m2 [String] alt text
# @param m3 [String] reference id ('' when absent)
# @param m4 [String] inline URL ('' for reference-style images)
# @param m5 unused
# @param m6 unused
# @param m7 [String, nil] inline title
# @return [String] an <img> tag (the title attribute is always emitted), or
#   m1 for an unresolved reference
def write_image_tag(match, m1, m2, m3, m4, m5, m6, m7)
  alt = m2
  ref = m3.downcase
  src = m4
  tip = m7 || ''

  if src == ''
    # An empty id means the alt text itself is the id.
    ref = alt.downcase.gsub(/ ?\n/, " ") if ref == ''
    return m1 unless @g_urls.has_key?(ref)

    src = @g_urls[ref]
    tip = @g_titles[ref] if @g_titles.has_key?(ref)
  end

  alt = escape_characters(attribute_encode(alt), "*_[]()")
  src = escape_characters(src, "*_")
  tip = escape_characters(attribute_encode(tip), "*_")
  %Q{<img src="#{src}" alt="#{alt}" title="#{tip}" />}
end
# Converts Setext headers (text underlined with "=" or "-") and atx headers
# ("#" to "######" prefixes) into <h1>..<h6> elements. Header text is run
# through the span gamut.
#
# @param text [String] block-level text; modified in place
# @return [String] the same string
def _do_headers(text)
  setext = [
    [/^(.+)[ \t]*\n=+[ \t]*\n+/, "h1"],
    [/^(.+)[ \t]*\n-+[ \t]*\n+/, "h2"]
  ]
  setext.each do |underlined, tag|
    text.gsub!(underlined) do
      %Q{<#{tag}>#{_run_span_gamut(Regexp.last_match(1))}</#{tag}>\n\n}
    end
  end

  atx = %r{
    ^(\#{1,6})    # $1 = string of #'s
    [ \t]*
    (.+?)         # $2 = Header text
    [ \t]*
    \#*           # optional closing #'s (not counted)
    \n+
  }x
  text.gsub!(atx) do
    m = Regexp.last_match
    level = m[1].length
    %Q{<h#{level}>#{_run_span_gamut(m[2])}</h#{level}>\n\n}
  end
  text
end
# Converts Markdown lists into <ul>/<ol> markup.
#
# A "~0" sentinel is appended to the argument (in place) to mark the end of
# the text and removed from the returned copy. When @g_list_level is
# positive a list may start anywhere; otherwise it must start at the
# beginning of the text or after a blank line.
#
# @param text [String] block-level text; "~0" is appended to it
# @return [String] a new string with lists converted
def _do_lists(text)
  text << "~0"

  if @g_list_level && @g_list_level > 0
    nested_list = %r{
      (                             # $1 = whole list
        (                           # $2
          [ ]{0,3}                  # attacklab: g_tab_width - 1
          ([*+-]|\d+[.])            # $3 = first list item marker
          [ \t]+
        )
        [^\r]+?
        (                           # $4
          ~0                        # sentinel for workaround; should be $
          |
          \n{2,}
          (?=\S)
          (?!                       # Negative lookahead for another list item marker
            [ \t]*
            (?:[*+-]|\d+[.])[ \t]+
          )
        )
      )
    }x
    text.gsub!(nested_list) do
      m = Regexp.last_match
      tag = (m[2] =~ /[*+-]/) ? "ul" : "ol"
      items = _process_list_items(m[1], tag)
      # Trailing whitespace is trimmed so the closing tag follows the last item directly.
      items.sub!(/\s+\z/, "")
      %Q{<#{tag}>#{items}</#{tag}>\n}
    end
  else
    top_level_list = /(\n\n|\A\n?)(([ ]{0,3}([*+-]|\d+[.])[ \t]+)[^\r]+?(~0|\n{2,}(?=\S)(?![ \t]*(?:[*+-]|\d+[.])[ \t]+)))/
    text.gsub!(top_level_list) do
      m = Regexp.last_match
      lead_in = m[1]
      tag = (m[3] =~ /[*+-]/) ? "ul" : "ol"
      items = _process_list_items(m[2], tag)
      %Q{#{lead_in}<#{tag}>\n#{items}</#{tag}>\n}
    end
  end

  text.sub(/~0/, '')
end
# Converts the items of one list into <li> elements.
#
# An item is rendered through the block gamut when it contains a blank line
# anywhere (or when the previous item ended with one); otherwise it is
# rendered through nested list processing and the span gamut.
# @g_list_level is incremented while the items are processed.
#
# @param list_str [String] the raw list; modified in place
# @param list_type [String] "ul" or "ol"
# @return [String] the same string, now holding the <li> markup
def _process_list_items(list_str, list_type)
  @g_list_level += 1

  # Trim trailing blank lines and mark the end with a sentinel.
  list_str.sub!(/\n{2,}\z/, "\n")
  list_str << "~0"

  marker = "#{@_list_item_markers[list_type.to_sym]}"
  item_re = %r{(^[ \t]*)(#{marker})[ \t]+([^\r]+?(\n+))(?=(~0|\1(#{marker})[ \t]+))}
  last_item_had_a_double_newline = false

  list_str.gsub!(item_re) do
    item = Regexp.last_match(3)
    ends_with_double_newline = item =~ /\n\n\z/
    # `||` (not `or`): `or` binds looser than `=`, which silently dropped the
    # second test and made items with an interior blank line render inline.
    contains_double_newline = ends_with_double_newline || item =~ /\n{2,}/

    if contains_double_newline || last_item_had_a_double_newline
      item = _run_block_gamut(_outdent(item), true)
    else
      # Recurse for sub-lists, then treat the rest as inline content.
      item = _do_lists(_outdent(item))
      item.sub!(/\n\z/, '')
      item = _run_span_gamut(item)
    end

    last_item_had_a_double_newline = ends_with_double_newline
    %Q{<li>#{item}</li>\n}
  end

  list_str.gsub!(/~0/, '')
  @g_list_level -= 1
  list_str
end
-
# Converts indented code blocks (lines starting with four spaces or a tab)
# into <pre><code> elements.
#
# A "~0" sentinel is appended to the argument (in place) to mark the end of
# the text and removed from the returned copy.
#
# @param text [String] block-level text; "~0" is appended to it
# @return [String] a new string with code blocks converted
def _do_code_blocks(text)
  text << "~0"

  indented_block = %r{
    (?:\n\n|\A)
    (                   # $1 = the code block -- one or more lines, starting with a space/tab
      (?:
        (?:[ ]{4}|\t)   # Lines must start with a tab or a tab-width of spaces - attacklab: g_tab_width
        .*\n+
      )+
    )
    (\n*[ ]{0,3}[^ \t\n]|(?=~0))   # attacklab: g_tab_width
  }x

  text.gsub!(indented_block) do
    m = Regexp.last_match
    following = m[2]
    code = _detab(_encode_code(_outdent(m[1])))
    code.gsub!(/\A\n+/, '')
    code.gsub!(/\n+\z/, '')
    # The text consumed after the block ($2) is put back behind it.
    %Q{\n\n<pre><code>#{code}\n</code></pre>\n\n#{following}}
  end

  text.sub(/~0/, '')
end
# Stores a generated block in @g_html_blocks and returns its placeholder.
#
# @param text [String] block markup; surrounding newlines are stripped in place
# @return [String] "~K<index>K" surrounded by blank lines
def hash_block(text)
  text.sub!(/\A\n+/, '')
  text.sub!(/\n+\z/, '')
  @g_html_blocks << text
  %Q{\n\n~K#{@g_html_blocks.size - 1}K\n\n}
end
# Converts backtick-delimited code spans into <code> elements. The opening
# and closing backtick runs must have the same length, and an opening run
# preceded by a backslash is ignored.
#
# @param text [String] inline text; modified in place
# @return [String] the same string
def _do_code_spans(text)
  code_span = %r{
    (^|[^\\])     # Character before opening ` can't be a backslash
    (`+)          # $2 = Opening run of `
    (             # $3 = The code block
      [^\r]*?
      [^`]        # attacklab: work around lack of lookbehind
    )
    \2            # Matching closer
    (?!`)
  }x

  text.gsub!(code_span) do
    m = Regexp.last_match
    code = m[3].sub(/\A([ \t]*)/, '').sub(/[ \t]*\z/, '')
    # Hide "://" so the auto-linker leaves URLs inside code alone.
    code = _encode_code(code).gsub(/:\/\//, '~P')
    %Q{#{m[1]}<code>#{code}</code>}
  end
  text
end
# Encodes the content of a code span or code block: "&", "<" and ">" become
# HTML entities, then the Markdown-significant characters * _ { } [ ] and
# backslash are escaped through escape_characters so later passes ignore them.
#
# @param text [String] raw code
# @return [String] the encoded code
def _encode_code(text)
  # "&" must go first so the entities produced below are not re-encoded.
  encoded = text.gsub('&', '&amp;').gsub('<', '&lt;').gsub('>', '&gt;')
  escape_characters(encoded, "\*_{}[]\\", false)
end
# Converts emphasis markers: ***x*** / ___x___ to <strong><em>,
# **x** / __x__ to <strong> and *x* / _x_ to <em>, in that order.
#
# @param text [String] inline text; modified in place
# @return [String] the same string
def _do_italics_and_bold(text)
  rules = [
    [/([\W_]|^)(\*\*\*|___)(?=\S)([^\r]*?\S[\*_]*)\2([\W_]|$)/, "\\1<strong><em>\\3</em></strong>\\4"],
    [/([\W_]|^)(\*\*|__)(?=\S)([^\r]*?\S[\*_]*)\2([\W_]|$)/, "\\1<strong>\\3</strong>\\4"],
    [/([\W_]|^)(\*|_)(?=\S)([^\r\*_]*?\S)\2([\W_]|$)/, "\\1<em>\\3</em>\\4"]
  ]
  rules.each { |pattern, replacement| text.gsub!(pattern, replacement) }
  text
end
# Converts runs of lines quoted with ">" into <blockquote> elements. The
# quoted content is run through the block gamut and the finished element is
# stored through hash_block.
#
# @param text [String] block-level text; modified in place
# @return [String] the same string
def _do_block_quotes(text)
  quoted_run = %r{
    (                       # Wrap whole match in $1
      (
        ^[ \t]*>[ \t]?      # '>' at the start of a line
        .+\n                # rest of the first line
        (.+\n)*             # subsequent consecutive lines
        \n*                 # blanks
      )+
    )
  }x

  text.gsub!(quoted_run) do
    quote = Regexp.last_match(1)

    # Strip one level of quoting, then blank whitespace-only lines.
    quote.gsub!(/^[ \t]*>[ \t]?/, "~0")
    quote.gsub!(/~0/, "")
    quote.gsub!(/^[ \t]+$/, "")

    quote = _run_block_gamut(quote)

    # Indent the quoted markup, but take the indent back out of <pre> content.
    quote.gsub!(/(\A|\n)/, "\\1 ")
    quote.gsub!(/(\s*<pre>[^\r]+?<\/pre>)/) do
      preformatted = Regexp.last_match(1)
      preformatted.gsub!(/^ /, "~0")
      preformatted.gsub!(/^~0/, '')
      preformatted
    end

    hash_block(%Q{<blockquote>\n#{quote}\n</blockquote>})
  end
  text
end
# Splits the text on blank lines and wraps every chunk that is not a hashed
# block placeholder in <p> tags (after running it through the span gamut).
# Unless do_not_unhash is set, `~K<n>K` placeholders are then replaced by the
# stored blocks.
#
# @param text [String] block-level text; surrounding newlines are stripped in place
# @param do_not_unhash truthy to leave the placeholders in the output
# @return [String] the paragraphs joined by blank lines
def _form_paragraphs(text, do_not_unhash)
  text.gsub!(/\A\n+/, '')
  text.gsub!(/\n+\z/, '')

  paragraphs = []
  text.split(/\n{2,}/).each do |chunk|
    if chunk =~ /~K(\d+)K/
      # A hashed block: pass through untouched.
      paragraphs << chunk
    elsif chunk =~ /\S/
      html = _run_span_gamut(chunk)
      html.gsub!(/\A([ \t]*)/, '<p>')
      html << "</p>"
      paragraphs << html
    end
  end

  unless do_not_unhash
    paragraphs.each do |paragraph|
      # Repeat until nothing is left: a restored block may itself contain
      # placeholders. gsub! returns nil once no placeholder was found.
      nil while paragraph.gsub!(/~K(\d+)K/) { @g_html_blocks[Regexp.last_match(1).to_i] }
    end
  end

  paragraphs.join("\n\n")
end
# Encodes "&" characters that do not start an entity reference and "<"
# characters that do not start a tag (or the internal "~D" marker).
#
# @param text [String] inline text; the ampersand pass modifies it in place
# @return [String] a new string with both encodings applied
def _encode_amps_and_angles(text)
  # Leave existing entities such as "&copy;" or "&#123;" untouched.
  text.gsub!(/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/, "&amp;")
  text.gsub(/<(?![a-z\/?!]|~D)/i, '&lt;')
end
# Replaces backslash-escaped characters with their `~E<code>E` escape
# sequences. Escaped backslashes are handled first, then the other escapable
# punctuation characters.
#
# @param text [String] inline text; the first pass modifies it in place
# @return [String] a new string with all escapes encoded
def _encode_backslash_escapes(text)
  text.gsub!(/\\(\\)/) do |sequence|
    escape_characters_callback(sequence, Regexp.last_match(1))
  end
  # NOTE(review): "+-." inside the class is a range, so "," is matched too.
  text.gsub(/\\([`*_{}\[\]()>#+-.!])/) do |sequence|
    escape_characters_callback(sequence, Regexp.last_match(1))
  end
end
# Links bare http/https/ftp URLs: URLs found in running text are first
# wrapped in angle brackets, then every <url> is turned into an <a> tag
# whose text comes from the plain_link_text hook.
#
# @param text [String] inline text; the first pass modifies it in place
# @return [String] a new string with the links inserted
def _do_auto_links(text)
  bare_url = /(^|\s)(https?|ftp)(:\/\/[-A-Z0-9+&@#\/%?=~_|\[\]\(\)!:,\.;]*[-A-Z0-9+&@#\/%=~_|\[\]])($|\W)/i
  bracketed_url = /<((https?|ftp):[^'">\s]+)>/i

  text.gsub!(bare_url, "\\1<\\2\\3>\\4")
  text.gsub(bracketed_url) do
    m = Regexp.last_match
    %Q{<a href="#{m[1]}">#{@pluginHooks.plain_link_text(m[1])}</a>}
  end
end
# Turns every `~E<code>E` escape sequence back into the character with that
# code point.
#
# @param text [String] text containing escape sequences
# @return [String] a new string with the sequences replaced
def _unescape_special_chars(text)
  text.gsub(/~E(\d+)E/) do
    code_point = Regexp.last_match(1).to_i
    [code_point].pack("U*")
  end
end
# Removes one level of indentation (a tab or up to four spaces) from the
# start of every line.
#
# @param text [String] indented text; modified in place
# @return [String] the same string
def _outdent(text)
  # Two passes through the "~0" marker, so a "~0" already sitting at the
  # start of a line is removed as well.
  indent = /^(\t|[ ]{1,4})/
  text.gsub!(indent, "~0")
  text.gsub!(/^~0/, '')
  text
end
# Expands tabs to spaces using 4-column tab stops.
#
# @param text [String] text that may contain tabs
# @return [String] the argument itself when it has no tabs, otherwise a new
#   string in which every tab is replaced by 1 to 4 spaces
def _detab(text)
  return text unless text =~ /\t/

  # Indexed by the column offset within the current tab stop: a tab at
  # offset 0 needs four spaces, at offset 3 only one.
  padding = ["    ", "   ", "  ", " "]
  # Offset (in the original text) of the last position known to sit on a tab
  # stop: just after the previous newline or tab.
  skew = 0

  text.gsub(/[\n\t]/) do |match|
    offset = Regexp.last_match.begin(0)
    column = (offset - skew) % 4
    skew = offset + 1
    match == "\n" ? match : padding[column]
  end
end
# Percent-encodes the characters matched by @_problem_url_chars so that
# later Markdown passes do not misread them.
#
# The internal "~D" marker (a hidden "$") becomes "%24". A ":" is kept when
# it is the last character or is followed by a digit or "/".
#
# @param url [String, nil] raw URL
# @return [String] the encoded URL, or "" for nil/empty input
def encode_problem_url_chars(url)
  return "" if url.nil? || url.empty?

  url.gsub(@_problem_url_chars) do |match|
    next "%24" if match == "~D"

    if match == ":"
      follower = url[Regexp.last_match.begin(0) + 1]
      next ":" if follower.nil? || follower =~ /[0-9\/]/
    end

    # Percent-encoding uses the hexadecimal code, e.g. "(" becomes "%28".
    "%#{match.ord.to_s(16)}"
  end
end
# Replaces every occurrence of the given characters with the escape sequence
# produced by escape_characters_callback.
#
# @param text [String] text to process
# @param chars_to_escape [String] the characters to escape; each one is
#   matched literally, and an empty string matches nothing
# @param after_backslash truthy to match only occurrences preceded by a
#   backslash (the backslash is consumed by the replacement)
# @return [String] a new string with the escapes applied
def escape_characters(text, chars_to_escape, after_backslash = nil)
  # Regexp.union escapes each character, so "^", "-", "]" and friends are
  # always taken literally; an empty list yields a pattern that never matches.
  any_char = Regexp.union(chars_to_escape.chars.to_a)
  pattern = after_backslash ? /\\(#{any_char})/ : /(#{any_char})/
  text.gsub(pattern) do |match|
    escape_characters_callback(match, Regexp.last_match(1))
  end
end
# Builds the escape sequence for one character: "~E", the decimal code point
# of the first character of m1, then "E".
#
# @param match [String] whole regexp match (unused)
# @param m1 [String] the captured character
# @return [String] e.g. "~E42E" for "*"
def escape_characters_callback(match, m1)
  code_point = m1[0].ord
  format("~E%dE", code_point)
end
- end
- end