converter.rb | searchcode

/lib/rpagedown/converter.rb

https://github.com/ArieShout/rpagedown · Ruby · 740 lines · 687 code · 53 blank · 0 comment · 20 complexity · a21c1b1a710140d31fc9d3d7b50095ac MD5 · raw file

require 'rpagedown/hook_collection'

module RPageDown
  class Converter
    attr_reader :hooks

    # Builds a converter with a fresh hook collection and idle conversion state.
    #
    # One HookCollection instance is stored under both @hooks (exposed by the
    # attr_reader) and @pluginHooks (the name used by the conversion methods).
    # The three conversion hooks are registered through add_noop.
    def initialize
      @hooks = HookCollection.new
      @pluginHooks = @hooks
      [:plain_link_text, :pre_conversion, :post_conversion].each do |hook_name|
        @pluginHooks.add_noop(hook_name)
      end

      # Per-conversion state; make_html fills these in and clears them again.
      @g_urls = @g_titles = @g_html_blocks = nil
      @g_list_level = -1

      # Regexp source fragments matching a list item marker, keyed by list type.
      @_list_item_markers = { :ol => "\\d+[.]", :ul => "[*+-]" }

      # Characters (plus the "~D" dollar placeholder) that
      # encode_problem_url_chars percent-encodes inside link URLs.
      @_problem_url_chars = /(?:["'*()\[\]:]|~D)/
    end

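    # NOTE: $1..$9 return a new String on every access, so calling gsub! (or any
    # other mutating method) directly on them does not change the captured text.
    # To modify a capture, first assign it to a local variable and mutate that
    # variable instead; $1..$9 themselves stay unchanged.
    # They only change when a new regexp match replaces the capture groups.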

    # Converts Markdown source +text+ into HTML and returns the HTML string.
    #
    # The input is first passed through the pre_conversion hook and copied, so
    # the caller's string is not modified by the in-place substitutions below.
    # "~" and "$" are temporarily rewritten to the placeholders "~T" and "~D"
    # and restored just before the post_conversion hook runs.
    #
    # Raises RuntimeError when called while another conversion on this instance
    # is still in progress (detected through @g_urls being set).
    #
    # The per-conversion state is cleared in an +ensure+ clause: without it an
    # exception raised by any step would leave @g_urls set and every later call
    # would fail with the "Recursive call" error.
    def make_html(text)
      raise "Recursive call to converter.makeHtml" if @g_urls

      begin
        @g_urls = Hash.new
        @g_titles = Hash.new
        @g_html_blocks = []
        @g_list_level = 0

        # Interpolation yields a private copy that is safe to mutate.
        text = "#{@pluginHooks.pre_conversion(text)}"

        # "~" is the escape character for internal placeholders, so literal
        # tildes and dollar signs are hidden first.
        text.gsub!(/~/, '~T')
        text.gsub!(/\$/, '~D')

        # Normalize line endings to "\n".
        text.gsub!(/\r\n/, "\n")
        text.gsub!(/\r/, "\n")

        # Pad with blank lines so block patterns can rely on them at both ends.
        text = "\n\n#{text}\n\n"

        text = _detab(text)

        # Blank out lines that consist only of spaces and tabs.
        text.gsub!(/^[ \t]+$/, '')

        text = _hash_html_blocks(text)
        text = _strip_link_definitions(text)
        text = _run_block_gamut(text)
        text = _unescape_special_chars(text)

        # Restore the characters hidden at the start.
        text.gsub!(/~D/, '$')
        text.gsub!(/~T/, '~')

        @pluginHooks.post_conversion(text)
      ensure
        # Always leave the converter reusable, even after a failure.
        @g_html_blocks = @g_titles = @g_urls = nil
      end
    end

    private
    # Collects reference-style link definitions and removes them from the text.
    #
    # For every definition found, the lower-cased id is stored in @g_urls (URL
    # passed through _encode_amps_and_angles) and, when a title is present, in
    # @g_titles with double quotes turned into &quot;. Returns a new string;
    # +text+ itself is not modified.
    def _strip_link_definitions(text)
      link_definition = %r{
        ^[ ]{0,3}\[(.+)\]:  # id = $1  attacklab: g_tab_width - 1
        [ \t]*
        \n?                 # maybe *one* newline
        [ \t]*
        <?(\S+?)>?          # url = $2
        (?=\s|$)            # lookahead for whitespace instead of the lookbehind removed below
        [ \t]*
        \n?                 # maybe one newline
        [ \t]*
        (                   # (potential) title = $3
          (\n*)             # any lines skipped = $4 attacklab: lookbehind removed
          [ \t]+
          ["(]
          (.+?)             # title = $5
          [")]
          [ \t]*
        )?                  # title is optional
        (?:\n+|$)
      }x

      text.gsub(link_definition) do
        found = Regexp.last_match
        id = found[1].downcase
        @g_urls[id] = _encode_amps_and_angles(found[2])

        skipped_lines = found[4]
        title = found[5]

        if skipped_lines && !skipped_lines.empty?
          # Blank lines were skipped, so the quoted text is not a title:
          # put the whole captured span back into the document.
          found[3]
        else
          @g_titles[id] = title.gsub(/"/, "&quot;") if title && !title.empty?
          ""
        end
      end
    end

    # Replaces raw block-level HTML in +text+ with "~K<n>K" placeholders.
    #
    # Every match is handed to hash_element, which stores the HTML in
    # @g_html_blocks and returns the placeholder surrounded by blank lines.
    # The substitutions are done in place with gsub!, and +text+ is returned.
    #
    # Five passes run in this order:
    # 1. block tags from block_tags_a whose closing tag starts its own line;
    # 2. block tags from block_tags_b (the same list without ins/del) whose
    #    closing tag may appear anywhere on a line;
    # 3. <hr> tags;
    # 4. HTML comments;
    # 5. <? ... ?> and <% ... %> blocks.
    def _hash_html_blocks(text)
      block_tags_a = "p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del"
      block_tags_b = "p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math"

      # Pass 1: nested blocks -- the closing tag must be at the start of a line.
      text.gsub!(%r{
        (                     # save in $1
          ^                   # start of line  (with /m)
          <(#{block_tags_a})  # start tag = $2
          \b                  # word break
                              # attacklab: hack around khtml/pcre bug...
          [^\r]*?\n           # any number of lines, minimally matching
          </\2>               # the matching end tag
          [ \t]*              # trailing spaces/tabs
          (?=\n+)             # followed by a newline
        )                     # attacklab: there are sentinel newlines at end of document
      }x) {|match| hash_element(match, $1)}

      # Pass 2: more liberal -- the closing tag may follow other text on its line.
      text.gsub!(%r{
        (                     # save in $1
          ^                   # start of line  (with /m)
          <(#{block_tags_b})  # start tag = $2
          \b                  # word break
                              # attacklab: hack around khtml/pcre bug...
          [^\r]*?             # any number of lines, minimally matching
          .*</\2>             # the matching end tag
          [ \t]*              # trailing spaces/tabs
          (?=\n+)             # followed by a newline
        )                     # attacklab: there are sentinel newlines at end of document
      }x) {|match| hash_element(match, $1) }

      # Pass 3: <hr> tags indented by at most three spaces.
      text.gsub!(%r{
          \n                  # Starting after a blank line
          [ ]{0,3}
          (                   # save in $1
            (<(hr)            # start tag = $2
              \b              # word break
              ([^<>])*?
            \/?>)             # the matching end tag
            [ \t]*
            (?=\n{2,})        # followed by a blank line
          )
      }x) {|match| hash_element(match, $1) }

      # Pass 4: stand-alone HTML comments.
      text.gsub!(%r{
        \n\n                                          # Starting after a blank line
        [ ]{0,3}                                      # attacklab: g_tab_width - 1
        (                                             # save in $1
          <!
          (--(?:|(?:[^>-]|-[^>])(?:[^-]|-[^-])*)--)   # see http://www.w3.org/TR/html-markup/syntax.html#comments and http://meta.stackoverflow.com/q/95256
          >
          [ \t]*
          (?=\n{2,})                                  # followed by a blank line
        )
      }x) {|match| hash_element(match, $1) }

      # Pass 5: <? ... ?> and <% ... %> blocks; \2 forces the closing
      # character to equal the opening one.
      text.gsub!(%r{
        (?:
          \n\n            # Starting after a blank line
        )
        (                 # save in $1
          [ ]{0,3}        # attacklab: g_tab_width - 1
          (?:
            <([?%])       # $2
            [^\r]*?
            \2>
          )
          [ \t]*
          (?=\n{2,})      # followed by a blank line
        )
      }x) {|match| hash_element(match, $1) }

      text
    end

    # gsub callback used by _hash_html_blocks.
    #
    # Trims leading and trailing newlines from +m1+ (in place), appends it to
    # @g_html_blocks and returns the "~K<index>K" placeholder wrapped in blank
    # lines. +match+ is accepted for call-site symmetry and is not used.
    def hash_element(match, m1)
      m1.gsub!(/\A\n+/, '')
      m1.gsub!(/\n+\z/, '')
      @g_html_blocks << m1
      "\n\n~K#{@g_html_blocks.length - 1}K\n\n"
    end

    # Applies all block-level transformations to +text+ and returns the result.
    #
    # Order: headers, horizontal rules, lists, code blocks, block quotes,
    # a second HTML-block hashing pass, and finally paragraph formation.
    # +do_not_unhash+ is forwarded unchanged to _form_paragraphs.
    def _run_block_gamut(text, do_not_unhash = nil)
      text = _do_headers(text)

      # Horizontal rules written with *, - or _ (three or more per line).
      [
        /^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$/,
        /^[ ]{0,2}([ ]?-[ ]?){3,}[ \t]*$/,
        /^[ ]{0,2}([ ]?_[ ]?){3,}[ \t]*$/
      ].each { |rule| text.gsub!(rule, "<hr />\n") }

      text = _do_lists(text)
      text = _do_code_blocks(text)
      text = _do_block_quotes(text)

      # The steps above may have produced new HTML blocks; hide them again
      # before paragraphs are formed.
      text = _hash_html_blocks(text)
      _form_paragraphs(text, do_not_unhash)
    end

    # Applies all span-level (inline) transformations to +text+.
    #
    # Order: code spans, escaping inside HTML tags, backslash escapes, images,
    # anchors, auto-links, restoring the "~P" placeholder to "://", encoding of
    # stray "&" and "<", emphasis, and finally hard line breaks (two or more
    # spaces before a newline). Returns the transformed string.
    def _run_span_gamut(text)
      text = _encode_backslash_escapes(
        _escape_special_chars_within_tag_attributes(_do_code_spans(text))
      )

      text = _do_auto_links(_do_anchors(_do_images(text)))

      # "~P" stood in for "://" so the auto-linker would skip it; restore it now.
      text.gsub!(/~P/, '://')

      text = _do_italics_and_bold(_encode_amps_and_angles(text))

      text.gsub(/  +\n/, " <br />\n")
    end

    # Escapes Markdown-significant characters that occur inside HTML tags and
    # comments so later span processing does not treat them as markup.
    #
    # For each tag or comment found, "<code>"/"</code>" between other characters
    # is collapsed to a backtick, then escape_characters is applied with the
    # set \ ` * _ -- plus "/" when the match is a "<!" construct such as a
    # comment. +text+ is modified in place and returned.
    #
    # The "<!" test looks at the second character of the match: the first one
    # is always "<", so checking index 0 could never select the comment set.
    def _escape_special_chars_within_tag_attributes(text)
      regexp = /(<[a-z\/!$]("[^"]*"|'[^']*'|[^'">])*>|<!(--(?:|(?:[^>-]|-[^>])(?:[^-]|-[^-])*)--)>)/i
      text.gsub!(regexp) do |match|
        tag = match.gsub(/(.)<\/?code>(?=.)/, "\\1`")
        chars = match[1] == '!' ? "\\`*_/" : "\\`*_"
        escape_characters(tag, chars)
      end
      text
    end

    def _do_anchors(text)
      text.gsub!(%r{
        (                           # wrap whole match in $1
          \[
          (
            (?:
              \[[^\]]*\]            # allow brackets nested one level
              |
              [^\[]                 # or anything else
            )*
          )
          \]

          [ ]?                      # one optional space
          (?:\n[ ]*)?               # one optional newline followed by spaces

          \[
          (.*?)                     # id = $3
          \]
        )
        ()()()()                    # pad remaining backreferences
      }x) {|match| write_anchor_tag(match, $1, $2, $3, $4, $5, $6, $7)}

      text.gsub!(%r{
        (                           # wrap whole match in $1
          \[
          (
            (?:
              \[[^\]]*\]            # allow brackets nested one level
              |
              [^\[\]]               # or anything else
            )*
          )
          \]
          \(                        # literal paren
          [ \t]*
          ()                        # no id, so leave $3 empty
          <?(                       # href = $4
            (?:
              \([^)]*\)             # allow one level of (correctly nested) parens (think MSDN)
              |
              [^()\s]
            )*?
          )>?                
          [ \t]*
          (                         # $5
            (['"])                  # quote char = $6
            (.*?)                   # Title = $7
            \6                      # matching quote
            [ \t]*                  # ignore any spaces/tabs between closing quote and )
          )?                        # title is optional
          \)
        )
      }x) {|match| write_anchor_tag(match, $1, $2, $3, $4, $5, $6, $7)}
      
      text.gsub!(%r{
        (                           # wrap whole match in $1
          \[
          ([^\[\]]+)                # link text = $2; can't contain '[' or ']'
          \]
        )
        ()()()()()                  # pad rest of backreferences
      }x) {|match| write_anchor_tag(match, $1, $2, $3, $4, $5, $6, $7)}

      text
    end

    # Builds the <a> tag for one link matched by _do_anchors.
    #
    # Arguments mirror the capture groups: +m1+ whole match, +m2+ link text,
    # +m3+ reference id, +m4+ inline URL, +m7+ inline title (may be nil).
    # +match+, +m5+ and +m6+ are not used.
    #
    # With an empty URL the link is resolved through @g_urls / @g_titles, the id
    # defaulting to the lower-cased link text. An unknown id returns the
    # original text unchanged unless the match ends in "()", which is treated
    # as an explicit empty URL.
    def write_anchor_tag(match, m1, m2, m3, m4, m5, m6, m7)
      # "://" inside the link text is hidden so the auto-linker skips it.
      link_text = m2.gsub(/:\/\//, "~P")
      link_id = m3.downcase
      url = m4
      title = m7 || ''

      if url == ''
        link_id = link_text.downcase.gsub(/ ?\n/, " ") if link_id == ''

        if @g_urls.has_key?(link_id)
          url = @g_urls[link_id]
          title = @g_titles[link_id] if @g_titles.has_key?(link_id)
        elsif m1 =~ /\(\s*\)\z/
          url = ''
        else
          return m1
        end
      end

      url = escape_characters(encode_problem_url_chars(url), "*_")
      tag = %Q{<a href="#{url}"}
      unless title.empty?
        tag << %Q{ title="#{escape_characters(attribute_encode(title), "*_")}"}
      end
      tag << %Q{>#{link_text}</a>}
    end

    # Turns Markdown image syntax in +text+ into <img> tags (via write_image_tag).
    #
    # Reference-style images (![alt][id]) are handled first, then inline images
    # (![alt](url "optional title")). Both patterns expose seven capture groups
    # which are forwarded positionally. +text+ is modified in place and returned.
    def _do_images(text)
      reference_image = %r{
        (                   # wrap whole match in $1
          !\[
          (.*?)             # alt text = $2
          \]

          [ ]?              # one optional space
          (?:\n[ ]*)?       # one optional newline followed by spaces

          \[
          (.*?)             # id = $3
          \]
        )
        ()()()()            # pad rest of backreferences
      }x

      inline_image = %r{
        (                   # wrap whole match in $1
          !\[
          (.*?)             # alt text = $2
          \]
          \s?               # One optional whitespace character
          \(                # literal paren
          [ \t]*
          ()                # no id, so leave $3 empty
          <?(\S+?)>?        # src url = $4
          [ \t]*
          (                 # $5
            (['"])          # quote char = $6
            (.*?)           # title = $7
            \6              # matching quote
            [ \t]*
          )?                # title is optional
          \)
        )
      }x

      [reference_image, inline_image].each do |pattern|
        text.gsub!(pattern) { |hit| write_image_tag(hit, *Regexp.last_match.captures) }
      end

      text
    end

    # Returns a copy of +text+ that is safe inside a double-quoted HTML
    # attribute: ">", "<" and '"' are replaced by their entities.
    def attribute_encode(text)
      entities = { '>' => '&gt;', '<' => '&lt;', '"' => '&quot;' }
      text.gsub(/[<>"]/, entities)
    end

    # Builds the <img> tag for one image matched by _do_images.
    #
    # Arguments mirror the capture groups: +m1+ whole match, +m2+ alt text,
    # +m3+ reference id, +m4+ inline URL, +m7+ inline title (may be nil).
    # +match+, +m5+ and +m6+ are not used.
    #
    # With an empty URL the image is resolved through @g_urls / @g_titles, the
    # id defaulting to the lower-cased alt text; an unknown id returns the
    # original text unchanged. The title attribute is always emitted, even
    # when it is empty.
    def write_image_tag(match, m1, m2, m3, m4, m5, m6, m7)
      alt_text = m2
      link_id = m3.downcase
      url = m4
      title = m7 || ''

      if url == ''
        link_id = alt_text.downcase.gsub(/ ?\n/, " ") if link_id == ''
        return m1 unless @g_urls.has_key?(link_id)

        url = @g_urls[link_id]
        title = @g_titles[link_id] if @g_titles.has_key?(link_id)
      end

      alt_attr = escape_characters(attribute_encode(alt_text), "*_[]()")
      src_attr = escape_characters(url, "*_")
      title_attr = escape_characters(attribute_encode(title), "*_")

      %Q{<img src="#{src_attr}" alt="#{alt_attr}" title="#{title_attr}" />}
    end

    # Converts Markdown headers in +text+ to <h1>..<h6> elements.
    #
    # Setext headers (a line underlined with "=" or "-") become <h1> / <h2>;
    # atx headers ("#" to "######", optional closing "#"s) get the level given
    # by the number of leading "#". Header text is run through _run_span_gamut.
    # +text+ is modified in place and returned.
    def _do_headers(text)
      setext_styles = [
        [/^(.+)[ \t]*\n=+[ \t]*\n+/, 'h1'],
        [/^(.+)[ \t]*\n-+[ \t]*\n+/, 'h2']
      ]
      setext_styles.each do |pattern, tag|
        text.gsub!(pattern) do
          "<#{tag}>#{_run_span_gamut(Regexp.last_match(1))}</#{tag}>\n\n"
        end
      end

      atx_style = %r{
        ^(\#{1,6})      # $1 = string of #'s
        [ \t]*
        (.+?)           # $2 = Header text
        [ \t]*
        \#*             # optional closing #'s (not counted)
        \n+
      }x
      text.gsub!(atx_style) do
        found = Regexp.last_match
        level = found[1].length
        "<h#{level}>#{_run_span_gamut(found[2])}</h#{level}>\n\n"
      end

      text
    end

    # Converts Markdown lists in +text+ into <ul>/<ol> elements.
    #
    # A "~0" sentinel is appended to +text+ (in place) to mark the end of input
    # and removed again before returning. Inside a list item (@g_list_level > 0)
    # any line that starts with a list marker begins a nested list; at the top
    # level a list must follow a blank line or start the text. List contents are
    # rendered by _process_list_items. Returns the converted string.
    #
    # The nested pattern is anchored with "^": without the anchor, a marker in
    # the middle of a line (e.g. "hello - world" inside a list item) was
    # mistaken for the start of a sub-list.
    def _do_lists(text)
      text << "~0"

      if @g_list_level && @g_list_level > 0
        nested_list = %r{
          ^                               # a nested list must start at the beginning of a line
          (                               # $1 = whole list
            (                             # $2
              [ ]{0,3}                    # attacklab: g_tab_width - 1
              ([*+-]|\d+[.])              # $3 = first list item marker
              [ \t]+
            )
            [^\r]+?
            (                             # $4
              ~0                          # sentinel for workaround; should be $
              |
              \n{2,}
              (?=\S)
              (?!                         # Negative lookahead for another list item marker
                [ \t]*
                (?:[*+-]|\d+[.])[ \t]+
              )
            )
          )
        }x

        text.gsub!(nested_list) do
          found = Regexp.last_match
          list_type = found[2] =~ /[*+-]/ ? "ul" : "ol"

          result = _process_list_items(found[1], list_type)

          # Nested lists are emitted compactly, without trailing whitespace.
          result.sub!(/\s+\z/, "")
          %Q{<#{list_type}>#{result}</#{list_type}>\n}
        end
      else
        top_level_list = /(\n\n|\A\n?)(([ ]{0,3}([*+-]|\d+[.])[ \t]+)[^\r]+?(~0|\n{2,}(?=\S)(?![ \t]*(?:[*+-]|\d+[.])[ \t]+)))/

        text.gsub!(top_level_list) do
          found = Regexp.last_match
          runup = found[1]
          list_type = found[3] =~ /[*+-]/ ? "ul" : "ol"

          result = _process_list_items(found[2], list_type)
          %Q{#{runup}<#{list_type}>\n#{result}</#{list_type}>\n}
        end
      end

      text.sub(/~0/, '')
    end

    # Renders the items of one list as "<li>...</li>\n" lines.
    #
    # +list_str+ is the raw text of the whole list (modified in place) and
    # +list_type+ is "ul" or "ol", selecting the marker pattern from
    # @_list_item_markers. @g_list_level is incremented for the duration of the
    # call so that nested _do_lists calls know they are inside a list.
    #
    # An item that contains a blank line -- or that follows an item ending in
    # one -- is rendered with the block gamut (paragraphs); every other item is
    # rendered inline after its sub-lists have been processed.
    # Returns the converted +list_str+.
    def _process_list_items(list_str, list_type)
      @g_list_level += 1

      # Trim trailing blank lines, then mark the end of the list.
      list_str.sub!(/\n{2,}\z/, "\n")
      list_str << "~0"

      marker = "#{@_list_item_markers[list_type.to_sym]}"
      re = %r{(^[ \t]*)(#{marker})[ \t]+([^\r]+?(\n+))(?=(~0|\1(#{marker})[ \t]+))}
      last_item_had_a_double_newline = false

      list_str.gsub!(re) do
        item = Regexp.last_match(3)
        ends_with_double_newline = item =~ /\n\n\z/
        # "||" rather than "or": with "or" the assignment bound tighter than the
        # alternative, so a blank line in the middle of an item was ignored.
        contains_double_newline = ends_with_double_newline || item =~ /\n{2,}/

        if contains_double_newline || last_item_had_a_double_newline
          item = _run_block_gamut(_outdent(item), true)
        else
          # Recurse for sub-lists, drop the final newline, then render inline.
          item = _do_lists(_outdent(item))
          item.sub!(/\n\z/, '')
          item = _run_span_gamut(item)
        end

        last_item_had_a_double_newline = ends_with_double_newline
        %Q{<li>#{item}</li>\n}
      end

      list_str.gsub!(/~0/, '')

      @g_list_level -= 1
      list_str
    end
    
    # Wraps indented code blocks (four spaces or a tab) in <pre><code>.
    #
    # A "~0" sentinel is appended to +text+ (in place) to stand for the end of
    # input and removed from the returned string. Each block is outdented,
    # encoded with _encode_code, detabbed and trimmed of surrounding newlines.
    def _do_code_blocks(text)
      text << "~0"

      code_block = %r{
        (?:\n\n|\A)
        (                               # $1 = the code block -- one or more lines, starting with a space/tab
          (?:
            (?:[ ]{4}|\t)               # Lines must start with a tab or a tab-width of spaces - attacklab: g_tab_width
            .*\n+
          )+
        )
        (\n*[ ]{0,3}[^ \t\n]|(?=~0))    # attacklab: g_tab_width
      }x

      text.gsub!(code_block) do
        found = Regexp.last_match
        following = found[2]

        body = _detab(_encode_code(_outdent(found[1])))
        body = body.gsub(/\A\n+/, '').gsub(/\n+\z/, '')

        "\n\n<pre><code>#{body}\n</code></pre>\n\n#{following}"
      end

      text.sub(/~0/, '')
    end

    # Stores +text+ as a finished HTML block and returns its placeholder.
    #
    # Leading and trailing newlines are stripped from +text+ in place, the
    # string is appended to @g_html_blocks, and "~K<index>K" surrounded by
    # blank lines is returned.
    def hash_block(text)
      text.gsub!(/(\A\n+|\n+\z)/, '')
      index = @g_html_blocks.length
      @g_html_blocks << text
      "\n\n~K" + index.to_s + "K\n\n"
    end

    # Converts backtick-delimited code spans in +text+ into <code> elements.
    #
    # The opening run of backticks must not be preceded by a backslash and must
    # be closed by a run of the same length. The span content is trimmed of
    # surrounding spaces/tabs, encoded with _encode_code, and "://" is replaced
    # by the "~P" placeholder. +text+ is modified in place and returned.
    def _do_code_spans(text)
      code_span = %r{
        (^|[^\\])       # Character before opening ` can't be a backslash
        (`+)            # $2 = Opening run of `
        (               # $3 = The code block
          [^\r]*?
          [^`]          # attacklab: work around lack of lookbehind
        )
        \2              # Matching closer
        (?!`)
      }x

      text.gsub!(code_span) do
        found = Regexp.last_match
        prefix = found[1]

        code = found[3].gsub(/\A([ \t]*)/, '').gsub(/[ \t]*\z/, '')
        code = _encode_code(code).gsub(/:\/\//, '~P')

        "#{prefix}<code>#{code}</code>"
      end

      text
    end

    # Encodes +text+ for use inside <code>: "&", "<" and ">" become entities
    # (in place, "&" first so the new entities are not encoded again), then the
    # Markdown-significant characters * _ { } [ ] \ are escaped through
    # escape_characters. Returns the result of escape_characters.
    def _encode_code(text)
      { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;' }.each do |raw, entity|
        text.gsub!(raw, entity)
      end
      escape_characters(text, "\*_{}[]\\", false)
    end

    # Converts emphasis markers in +text+ to HTML, modifying it in place.
    #
    # Triple markers (*** or ___) are handled first, producing
    # <strong><em>; then double markers produce <strong>; single markers
    # produce <em>. Returns +text+.
    def _do_italics_and_bold(text)
      [
        [/([\W_]|^)(\*\*\*|___)(?=\S)([^\r]*?\S[\*_]*)\2([\W_]|$)/, "\\1<strong><em>\\3</em></strong>\\4"],
        [/([\W_]|^)(\*\*|__)(?=\S)([^\r]*?\S[\*_]*)\2([\W_]|$)/, "\\1<strong>\\3</strong>\\4"],
        [/([\W_]|^)(\*|_)(?=\S)([^\r\*_]*?\S)\2([\W_]|$)/, "\\1<em>\\3</em>\\4"]
      ].each { |pattern, markup| text.gsub!(pattern, markup) }
      text
    end

    # Converts ">"-prefixed block quotes in +text+ into <blockquote> blocks.
    #
    # For each quote, one level of ">" is stripped, whitespace-only lines are
    # emptied, the content is run through _run_block_gamut and indented by two
    # spaces (except inside <pre>, where the indentation is removed again).
    # The finished block is stored through hash_block, so only its placeholder
    # ends up in +text+. +text+ is modified in place and returned.
    def _do_block_quotes(text)
      block_quote = %r{
        (                           # Wrap whole match in $1
          (
            ^[ \t]*>[ \t]?          # '>' at the start of a line
            .+\n                    # rest of the first line
            (.+\n)*                 # subsequent consecutive lines
            \n*                     # blanks
          )+
        )
      }x

      text.gsub!(block_quote) do
        quoted = Regexp.last_match(1)

        # Trim one level of quoting.
        quoted = quoted.gsub(/^[ \t]*>[ \t]?/, "~0").gsub(/~0/, "")

        # Empty whitespace-only lines, then recurse into the quoted content.
        quoted = _run_block_gamut(quoted.gsub(/^[ \t]+$/, ""))

        quoted = quoted.gsub(/(\A|\n)/, "\\1  ")

        # The added indentation would show up inside <pre>; take it out there.
        quoted = quoted.gsub(/(\s*<pre>[^\r]+?<\/pre>)/) do |pre|
          pre.gsub(/^  /, "~0").gsub(/^~0/, '')
        end

        hash_block(%Q{<blockquote>\n#{quoted}\n</blockquote>})
      end

      text
    end

    # Splits +text+ into paragraphs and wraps plain ones in <p>...</p>.
    #
    # Leading and trailing newlines are stripped from +text+ in place, then the
    # text is split on blank lines. Chunks containing a "~K<n>K" placeholder
    # are kept as they are, chunks with only whitespace are dropped, and all
    # others are run through _run_span_gamut and wrapped in <p>. Unless
    # +do_not_unhash+ is truthy, placeholders are then replaced by the stored
    # @g_html_blocks entries, repeating until none is left (stored blocks may
    # contain placeholders themselves). Returns the chunks joined by blank lines.
    def _form_paragraphs(text, do_not_unhash)
      text.gsub!(/\A\n+/, '')
      text.gsub!(/\n+\z/, '')
      marker_re = /~K(\d+)K/

      grafs_out = text.split(/\n{2,}/).each_with_object([]) do |graf, out|
        if graf =~ marker_re
          out << graf
        elsif graf =~ /\S/
          para = _run_span_gamut(graf)
          para.gsub!(/\A([ \t]*)/, '<p>')
          out << (para << "</p>")
        end
      end

      unless do_not_unhash
        grafs_out.each do |graf|
          loop do
            # gsub! returns nil once no placeholder is left in this chunk.
            replaced = graf.gsub!(/~K(\d+)K/) { @g_html_blocks[Regexp.last_match(1).to_i] }
            break unless replaced
          end
        end
      end

      grafs_out.join("\n\n")
    end

    # Encodes stray "&" and "<" characters as HTML entities.
    #
    # An ampersand is left alone when it already starts an entity-like sequence;
    # it is rewritten in place in +text+. A "<" is left alone when followed by
    # a letter, "/", "?", "!" or the "~D" placeholder; that replacement is made
    # on a copy, which is returned.
    def _encode_amps_and_angles(text)
      bare_ampersand = /&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/
      bare_left_angle = /<(?![a-z\/?!]|~D)/i

      text.gsub!(bare_ampersand, "&amp;")
      text.gsub(bare_left_angle, '&lt;')
    end

    # Replaces backslash-escaped characters with "~E<code>E" placeholders
    # (produced by escape_characters_callback).
    #
    # Escaped backslashes are handled first, in place; the other escapable
    # characters are handled on a copy, which is returned. Note that "+-." in
    # the second character class is a range, so it also covers ",".
    def _encode_backslash_escapes(text)
      # Each match is a backslash plus exactly one character, so the escaped
      # character is always the second character of the match.
      encode = lambda { |hit| escape_characters_callback(hit, hit[1]) }

      text.gsub!(/\\(\\)/, &encode)
      text.gsub(/\\([`*_{}\[\]()>#+-.!])/, &encode)
    end

    # Turns bare and angle-bracketed URLs into <a> elements.
    #
    # First, unadorned http/https/ftp URLs preceded by whitespace or a line
    # start are wrapped in "<...>" (in place). Then every "<scheme:...>" is
    # replaced by a link whose text comes from the plain_link_text hook; that
    # replacement is made on a copy, which is returned.
    def _do_auto_links(text)
      bare_url = /(^|\s)(https?|ftp)(:\/\/[-A-Z0-9+&@#\/%?=~_|\[\]\(\)!:,\.;]*[-A-Z0-9+&@#\/%=~_|\[\]])($|\W)/i
      text.gsub!(bare_url, "\\1<\\2\\3>\\4")

      bracketed_url = /<((https?|ftp):[^'">\s]+)>/i
      text.gsub(bracketed_url) do
        %Q{<a href="#{Regexp.last_match(1)}">#{@pluginHooks.plain_link_text(Regexp.last_match(1))}</a>}
      end
    end

    # Replaces every "~E<code>E" placeholder in +text+ with the character whose
    # code point is <code>. Returns a new string.
    def _unescape_special_chars(text)
      text.gsub(/~E(\d+)E/) { [Regexp.last_match(1).to_i].pack("U*") }
    end

    # Removes one level of indentation (a tab or up to four spaces) from the
    # start of every line of +text+. The string is modified in place and
    # returned.
    def _outdent(text)
      # Two steps: mark the indentation with "~0", then drop the marker.
      [[/^(\t|[ ]{1,4})/, "~0"], [/^~0/, '']].each do |pattern, replacement|
        text.gsub!(pattern, replacement)
      end
      text
    end

    # Expands tabs in +text+ to spaces, using tab stops every four columns.
    #
    # Returns +text+ itself when it contains no tab, otherwise a new string.
    def _detab(text)
      return text unless text =~ /\t/

      # Offset just after the last newline or tab. A tab always ends on a tab
      # stop, so columns can be counted from there as well.
      anchor = 0

      text.gsub(/[\n\t]/) do |ch|
        position = Regexp.last_match.begin(0)
        column = (position - anchor) % 4
        anchor = position + 1
        ch == "\n" ? ch : ' ' * (4 - column)
      end
    end

    # Percent-encodes characters in +url+ that would interfere with later
    # Markdown processing (the set is defined by @_problem_url_chars).
    #
    # - nil or empty input returns "".
    # - The "~D" placeholder (a hidden "$") becomes "%24".
    # - A ":" is kept when it is the last character or is followed by a digit
    #   or "/" (as in "http://" or "host:8080"); any other ":" is encoded.
    # - Every other matched character becomes "%" followed by its code point in
    #   hexadecimal, as percent-encoding requires. The decimal value used
    #   before produced wrong escapes, e.g. "(" became "%40", which decodes to
    #   "@", instead of "%28".
    #
    # Returns a new string; +url+ is not modified.
    def encode_problem_url_chars(url)
      return "" if not url or url.empty?

      len = url.length

      url.gsub(@_problem_url_chars) do |match|
        next "%24" if match == "~D"

        offset = Regexp.last_match.begin(0)
        if match == ":" && (offset == len - 1 || url[offset + 1] =~ /[0-9\/]/)
          next ":"
        end

        "%#{match[0].ord.to_s(16)}"
      end
    end

    # Replaces each occurrence in +text+ of a character listed in
    # +chars_to_escape+ with its "~E<code>E" placeholder (built by
    # escape_characters_callback). When +after_backslash+ is truthy, only
    # characters preceded by a backslash are replaced, and the backslash is
    # consumed as well. Returns a new string.
    def escape_characters(text, chars_to_escape, after_backslash = nil)
      # "[", "]" and "\" need a backslash to be literal inside a character class.
      char_class = chars_to_escape.gsub(/([\[\]\\])/) { "\\#{Regexp.last_match(1)}" }
      source = after_backslash ? "\\\\([#{char_class}])" : "([#{char_class}])"
      pattern = Regexp.new(source)

      text.gsub(pattern) { |hit| escape_characters_callback(hit, Regexp.last_match(1)) }
    end

    # Returns the "~E<code>E" placeholder for the first character of +m1+,
    # where <code> is that character's decimal code point. +match+ is unused.
    def escape_characters_callback(match, m1)
      format("~E%dE", m1[0].ord)
    end
  end
end