markup.rb | searchcode

/lib/gollum/markup.rb

https://bitbucket.org/destructuring/gollum
Ruby | 664 lines | 397 code | 76 blank | 191 comment | 66 complexity | 1716c45bc88fc73c660c3b85ce0dea13 MD5 | raw file
Possible License(s): BSD-3-Clause, AGPL-1.0, CC-BY-SA-3.0, MIT

# ~*~ encoding: utf-8 ~*~
require 'digest/sha1'
require 'cgi'
require 'pygments'
require 'base64'

require File.expand_path '../frontend/helpers', __FILE__
require File.expand_path '../gitcode', __FILE__

# Initialize Pygments once, at load time of this file. Markup#process_code
# later calls Pygments.highlight, presumably relying on this having run.
Pygments.start

module Gollum

  class Markup
    include Precious::Helpers

    attr_accessor :toc
    attr_reader   :metadata

    # Initialize a new Markup object.
    #
    # page - The Gollum::Page.
    #
    # Returns a new Gollum::Markup object, ready for rendering.
    def initialize(page)
      # Source of the page and its raw content.
      @wiki    = page.wiki
      @name    = page.filename
      @data    = page.text_data
      @version = page.version.id if page.version
      @format  = page.format

      # Sub-page relationship and the directory containing the page.
      @sub_page    = page.sub_page
      @parent_page = page.parent_page
      @dir         = ::File.dirname(page.path)

      # Placeholder id => extracted content, one map per kind of extraction.
      @tagmap, @codemap, @wsdmap, @premap = {}, {}, {}, {}

      # Both are filled in while rendering.
      @toc = @metadata = nil

      # Nokogiri save options are OR-ed together and FORMAT has the value 1,
      # so XOR-ing with 1 switches pretty-printing off.
      save_options = Nokogiri::XML::Node::SaveOptions::DEFAULT_XHTML ^ 1
      @to_xml = { :save_with => save_options, :indent => 0, :encoding => 'UTF-8' }
    end

    # Render the content with Gollum wiki syntax on top of the file's own
    # markup language.
    #
    # no_follow - Boolean that determines if rel="nofollow" is added to all
    #             <a> tags.
    # encoding  - Encoding Constant or String.
    #
    # Returns the formatted String content.
    def render(no_follow = false, encoding = nil)
      # no_follow selects the wiki's history sanitizer, otherwise the regular
      # one is used. Either may be nil, in which case nothing is cleaned.
      sanitize = no_follow ? @wiki.history_sanitizer : @wiki.sanitizer

      # Pull Gollum-specific syntax out of the raw text, leaving placeholders
      # behind, before the text is handed to the markup renderer.
      data = @data.dup
      data = extract_metadata(data)
      data = extract_gitcode(data)
      data = extract_code(data)
      data = extract_wsd(data)
      data = extract_tags(data)

      begin
        data = GitHub::Markup.render(@name, data)
        raise "There was an error converting #{@name} to HTML." if data.nil?
      rescue StandardError, ScriptError => e
        # Renderer failures (including LoadError for a missing markup library,
        # which is a ScriptError) are shown in the page. Interrupt, SystemExit
        # and NoMemoryError are deliberately NOT rescued. The message is
        # escaped because it may contain the page name or markup fragments.
        data = %{<p class="gollum-error">#{CGI.escapeHTML(e.message.to_s)}</p>}
      end

      # Swap the placeholders back in as final HTML.
      data = process_tags(data)
      data = process_code(data, encoding)

      doc = Nokogiri::HTML::DocumentFragment.parse(data)
      doc = sanitize.clean_node!(doc) if sanitize
      doc, toc = process_headers(doc)

      # A sub page shows its parent's TOC when there is a parent, otherwise it
      # keeps the literal [[_TOC_]] marker; a regular page uses its own TOC.
      @toc = @sub_page ? ( @parent_page ? @parent_page.toc_data : "[[_TOC_]]" ) : toc
      yield doc if block_given?

      # nokogiri's save options are ored together. FORMAT has a value of 1 so ^ 1 removes it.
      # formatting will create extra spaces in pre tags.
      # https://github.com/sparklemotion/nokogiri/issues/782
      # DEFAULT_HTML encodes unicode so XHTML is used for proper unicode support in href.
      data = doc.to_xml( @to_xml )

      data = process_toc_tags(data)
      data = process_wsd(data)

      # Drop empty paragraphs left behind by the substitutions above.
      data.gsub!(/<p><\/p>/, '')

      data
    end

    # Inserts header anchors and creates TOC
    #
    # doc - Nokogiri parsed document
    #
    # Returns doc Document and toc String
    def process_headers(doc)
      toc = nil
      # tail / tail_level track the current insertion point in the TOC across
      # headers. They must be declared outside the block: variables first
      # assigned inside a block are block-local and would reset on every
      # header, so the TOC would never nest.
      tail       = nil
      tail_level = 0

      doc.css('h1,h2,h3,h4,h5,h6').each do |h|
        # must escape "
        h_name = h.content.gsub(' ','-').gsub('"','%22')
        # h.content is decoded text; escape it before it is parsed as markup
        # again, so header text cannot inject elements after sanitisation.
        anchor = CGI.escapeHTML(h_name)
        title  = CGI.escapeHTML(h.content)

        level = h.name.gsub(/[hH]/,'').to_i

        # Add anchors
        h.add_child(%Q{<a class="anchor" id="#{anchor}" href="##{anchor}"></a>})

        # Build TOC
        toc  ||= Nokogiri::XML::DocumentFragment.parse('<div class="toc"><div class="toc-title">Table of Contents</div></div>')
        tail ||= toc.child

        # Descend: open one <ul> per missing level.
        while tail_level < level
          node = Nokogiri::XML::Node.new('ul', doc)
          tail = tail.add_child(node)
          tail_level += 1
        end
        # Ascend: walk back up to the list for this level.
        while tail_level > level
          tail = tail.parent
          tail_level -= 1
        end

        # NOTE(review): the previous comment here mentioned rewriting % to %25
        # for Firefox (issue #475), but no such substitution is performed.
        node = Nokogiri::XML::Node.new('li', doc)
        node.add_child(%Q{<a href="##{anchor}">#{title}</a>})
        tail.add_child(node)
      end

      # toc stays nil when the document has no headers.
      toc = toc.to_xml(@to_xml) if toc
      [doc, toc]
    end

    #########################################################################
    #
    # Tags
    #
    #########################################################################

    # Extract all tags into the tagmap and replace with placeholders.
    #
    # data - The raw String data.
    #
    # Returns the placeholder'd String data.
    def extract_tags(data)
      return data if @format == :asciidoc

      data.gsub!(/(.?)\[\[(.+?)\]\]([^\[]?)/m) do
        # Copy the match data into locals first; later regexp calls would
        # otherwise overwrite the $-variables.
        whole, pre, body, post = $&, $1, $2, $3

        if pre == "'" && post != "'"
          # A leading single quote escapes the tag: the quote is dropped and
          # the tag is left in the text literally.
          "[[#{body}]]#{post}"
        elsif !body.include?('][')
          # Ordinary tag: remember it and leave its SHA1 as the placeholder.
          id = Digest::SHA1.hexdigest(body)
          @tagmap[id] = body
          "#{pre}#{id}#{post}"
        elsif body[0..4] == 'file:'
          # [[file:target][label]] form: rewritten to "label|target" with the
          # "file:" prefix and the first ".org" removed from the target.
          target, label = body.split('][')
          link = "#{label}|#{target[5..-1].sub(/\.org/,'')}"
          id = Digest::SHA1.hexdigest(link)
          @tagmap[id] = link
          "#{pre}#{id}#{post}"
        else
          # Any other "][" form is left exactly as written.
          whole
        end
      end
      data
    end

    # Process all tags from the tagmap and replace the placeholders with the
    # final markup.
    #
    # data      - The String data (with placeholders).
    #
    # Returns the marked up String data.
    def process_tags(data)
      @tagmap.each_pair do |id, tag|
        # Decided once per tag, before any of its placeholders are replaced.
        literal = is_preformatted?(data, id)

        data.gsub!(id) do
          if literal
            # Inside preformatted markup the tag is restored verbatim.
            "[[#{tag}]]"
          else
            process_tag(tag).gsub('%2F', '/')
          end
        end
      end
      data
    end

    # Find `id` within `data` and determine if it's within
    # preformatted tags.
    #
    # data      - The String data (with placeholders).
    # id        - The String SHA1 hash.
    PREFORMATTED_TAGS = %w(code tt)
    def is_preformatted?(data, id)
      fragment = Nokogiri::HTML::DocumentFragment.parse(data)
      # Only the first node matching the id is inspected.
      match = fragment.search("[text()*='#{id}']").first
      return match unless match

      # Preformatted when the node itself, or anything above it, is one of
      # PREFORMATTED_TAGS.
      return true if PREFORMATTED_TAGS.include?(match.name)
      match.ancestors.any? { |ancestor| PREFORMATTED_TAGS.include?(ancestor.name) }
    end

    # Process a single tag into its final HTML form.
    #
    # tag       - The String tag contents (the stuff inside the double
    #             brackets).
    #
    # Returns the String HTML version of the tag.
    def process_tag(tag)
      # The TOC marker is put back untouched; it is filled in by
      # process_toc_tags.
      return %{[[#{tag}]]} if tag =~ /^_TOC_$/

      # Try each interpretation in turn; page links are the fallback.
      process_image_tag(tag) ||
        process_file_link_tag(tag) ||
        process_page_link_tag(tag)
    end

    # Attempt to process the tag as an image tag.
    #
    # tag - The String tag contents (the stuff inside the double brackets).
    #
    # Returns the String HTML if the tag is a valid image tag or nil
    #   if it is not.
    def process_image_tag(tag)
      parts = tag.split('|')
      return if parts.size.zero?

      # The image is either a file in the repo or an absolute http(s) URL
      # ending in a known image extension.
      name = parts[0].strip
      path = if file = find_file(name)
        ::File.join @wiki.base_path, file.path
      elsif name =~ /^https?:\/\/.+(jpg|png|gif|svg|bmp)$/i
        name
      end
      return unless path

      opts = parse_image_tag_options(tag)

      containered = false

      classes = [] # applied to whatever the outermost container is
      attrs   = [] # applied to the image

      align = opts['align']
      if opts['float']
        containered = true
        align ||= 'left'
        classes << "float-#{align}" if %w{left right}.include?(align)
      elsif %w{top texttop middle absmiddle bottom absbottom baseline}.include?(align)
        attrs << %{align="#{align}"}
      elsif align
        if %w{left center right}.include?(align)
          containered = true
          classes << "align-#{align}"
        end
      end

      # Options given without a value are stored as `true`. `true =~ regex`
      # raises NoMethodError on Ruby >= 3.2, so only Strings are validated.
      dimension = /^\d+(\.\d+)?(em|px)$/
      %w{width height}.each do |key|
        value = opts[key]
        attrs << %{#{key}="#{value}"} if value.is_a?(String) && value =~ dimension
      end

      if alt = opts['alt']
        # Escape so quotes or angle brackets in the alt text cannot break out
        # of the attribute. `to_s` keeps a bare `alt` option rendering as
        # alt="true".
        attrs << %{alt="#{CGI.escapeHTML(alt.to_s)}"}
      end

      attr_string = attrs.size > 0 ? attrs.join(' ') + ' ' : ''

      if opts['frame'] || containered
        classes << 'frame' if opts['frame']
        # The caption span keeps the alt text as written; only the attribute
        # above is escaped.
        %{<span class="#{classes.join(' ')}">} +
        %{<span>} +
        %{<img src="#{path}" #{attr_string}/>} +
        (alt ? %{<span>#{alt}</span>} : '') +
        %{</span>} +
        %{</span>}
      else
        %{<img src="#{path}" #{attr_string}/>}
      end
    end

    # Parse any options present on the image tag and extract them into a
    # Hash of option names and values.
    #
    # tag - The String tag contents (the stuff inside the double brackets).
    #
    # Returns the options Hash:
    #   key - The String option name.
    #   val - The String option value or true if it is a binary option.
    def parse_image_tag_options(tag)
      # Everything after the first "|" is an option. `drop(1)` yields [] for
      # an empty tag, where `[1..-1]` would yield nil and crash.
      tag.split('|').drop(1).inject({}) do |memo, attr|
        # Split on the first "=" only, so values that contain "=" themselves
        # (e.g. alt=E=mc2) are kept whole.
        key, value = attr.split('=', 2)
        next memo if key.nil? # empty segment, e.g. "a||b"

        # An option with no value (or nothing after the "=") is a binary
        # option and is stored as true.
        memo[key.strip] = (value.nil? || value.empty?) ? true : value.strip
        memo
      end
    end

    # Attempt to process the tag as a file link tag.
    #
    # tag       - The String tag contents (the stuff inside the double
    #             brackets).
    #
    # Returns the String HTML if the tag is a valid file link tag or nil
    #   if it is not.
    def process_file_link_tag(tag)
      parts = tag.split('|')
      return if parts.size.zero?

      # Form is "name|target"; without a target this is not a file link.
      name   = parts[0].strip
      target = parts[1] && parts[1].strip
      file   = target && find_file(target)

      # A file in the repo wins; otherwise only absolute http(s) URLs count.
      href = if file
        ::File.join @wiki.base_path, file.path
      elsif target =~ %r{^https?://}
        target
      end

      href && %{<a href="#{href}">#{name}</a>}
    end

    # Attempt to process the tag as a page link tag.
    #
    # tag       - The String tag contents (the stuff inside the double
    #             brackets).
    #
    # Returns the String HTML if the tag is a valid page link tag or nil
    #   if it is not.
    def process_page_link_tag(tag)
      pieces = tag.split('|')
      # MediaWiki puts the page first and the label second.
      pieces.reverse! if @format == :mediawiki

      name, page_name = *pieces.compact.map(&:strip)
      cname = @wiki.page_class.cname(page_name || name)

      # A lone absolute URL is an external link rather than a wiki page.
      return %{<a href="#{name}">#{name}</a>} if name =~ %r{^https?://} && page_name.nil?

      page, extra = find_page_from_name(cname)
      presence, link_name =
        if page
          ["present", @wiki.page_class.cname(page.name)]
        else
          ["absent", cname]
        end

      target = page ? page.escaped_url_path : CGI.escape(link_name)

      # //page is invalid
      # strip all duplicate forward slashes using helpers.rb trim_leading_slash
      # //page => /page
      link = trim_leading_slash(::File.join(@wiki.base_path, target))

      %{<a class="internal #{presence}" href="#{link}#{extra}">#{name}</a>}
    end


    # Process the special table of contents tag [[_TOC_]]
    #
    # data      - The String data (with placeholders).
    #
    # Returns the marked up String data.
    def process_toc_tags(data)
      # Every [[_TOC_]] marker becomes the rendered TOC, or disappears when
      # there is none. The String is modified in place and returned.
      data.tap do |text|
        text.gsub!("[[_TOC_]]") { @toc.nil? ? '' : @toc }
      end
    end

    # Find the given file in the repo.
    #
    # name - The String absolute or relative path of the file.
    #
    # Returns the Gollum::File or nil if none was found.
    def find_file(name, version=@version)
      lookup =
        if name =~ /^\//
          # Absolute: relative to the repo root, without the leading slash.
          name[1..-1]
        elsif @dir == '.'
          # The page lives at the repo root, so the name is already a full path.
          name
        else
          # Relative to the directory of the page being rendered.
          ::File.join(@dir, name)
        end

      @wiki.file(lookup, version)
    end

    # Find a page from a given cname.  If the page has an anchor (#) and has
    # no match, strip the anchor and try again.
    #
    # cname - The String canonical page name including path.
    #
    # Returns a Gollum::Page instance if a page is found, or an Array of
    # [Gollum::Page, String extra] if a page without the extra anchor data
    # is found.
    def find_page_from_name(cname)
      slash = cname.rindex('/')

      page =
        if slash.nil?
          # No directory part: look in the root first, then anywhere.
          @wiki.paged(cname, '/') || @wiki.page(cname)
        else
          # Split into the directory (trailing slash kept) and the page name.
          @wiki.paged(cname[slash+1..-1], cname[0..slash])
        end
      return page if page

      # No direct hit: retry without the anchor and hand the anchor back
      # separately. Yields nil when there is no "#" at all.
      anchor = cname.index('#')
      [@wiki.page(cname[0...anchor]), cname[anchor..-1]] if anchor
    end

    #########################################################################
    #
    # Gitcode - fetch code from github search path and replace the contents
    #           to a code-block that gets run the next parse.
    #           Acceptable formats:
    #              ```language:local-file.ext```
    #              ```language:/abs/other-file.ext```
    #              ```language:github/gollum/master/somefile.txt```
    #
    #########################################################################

    def extract_gitcode data
      data.gsub(/^[ \t]*``` ?([^:\n\r]+):([^`\n\r]+)```/) do
        language = $1
        # Use empty string if the uri capture is nil.
        uri = $2 || ''

        contents =
          if uri[0..6] == 'github/'
            Gollum::Gitcode.new(uri).contents
          else
            # Anything else is treated as a file in this wiki, looked up at
            # the wiki's ref.
            file = self.find_file(uri, @wiki.ref)
            # How do we communicate a render error?
            next "File not found: #{Rack::Utils::escape_html(uri)}" unless file
            file.raw_data
          end

        # Re-emit as an ordinary fenced block for extract_code to pick up.
        "```#{language}\n#{contents}\n```\n"
      end
    end

    #########################################################################
    #
    # Code
    #
    #########################################################################

    # Extract all code blocks into the codemap and replace with placeholders.
    #
    # data - The raw String data.
    #
    # Returns the placeholder'd String data.
    def extract_code(data)
      # Tilde fences (~~~), optionally followed by a pandoc-style attribute
      # block such as { .ruby }.
      data.gsub!(/^([ \t]*)(~~~+) ?([^\r\n]+)?\r?\n(.+?)\r?\n\1(~~~+)[ \t\r]*$/m) do
        m_indent = $1
        m_start  = $2 # ~~~
        m_lang   = $3
        m_code   = $4
        m_end    = $5 # ~~~

        # start and finish tilde fence must be the same length.
        # `next`, not `return`: a `return` inside this block exits
        # extract_code itself and would hand '' back as the WHOLE page,
        # blanking it. `next ''` only replaces the offending fence.
        next '' if m_start.length != m_end.length

        lang   = m_lang ? m_lang.strip : nil
        # The id is derived from the raw language text, before the lexer
        # name is extracted from it below.
        id     = Digest::SHA1.hexdigest("#{lang}.#{m_code}")
        cached = check_cache(:code, id)

        # extract lang from { .ruby } or { #stuff .ruby .indent }
        # see http://johnmacfarlane.net/pandoc/README.html#delimited-code-blocks
        # (a language given without a leading dot yields no lexer)
        if lang
          lang = lang.match(/\.([^}\s]+)/)
          lang = lang[1] unless lang.nil?
        end

        @codemap[id] = cached   ?
          { :output => cached } :
          { :lang => lang, :code => m_code, :indent => m_indent }

        "#{m_indent}#{id}" # print the SHA1 ID with the proper indentation
      end

      # Backtick fences (```), with the language named directly.
      data.gsub!(/^([ \t]*)``` ?([^\r\n]+)?\r?\n(.+?)\r?\n\1```[ \t]*\r?$/m) do
        indent = $1
        lang   = $2 ? $2.strip : nil
        code   = $3
        id     = Digest::SHA1.hexdigest("#{lang}.#{code}")
        cached = check_cache(:code, id)
        @codemap[id] = cached   ?
          { :output => cached } :
          { :lang => lang, :code => code, :indent => indent }
        "#{indent}#{id}" # print the SHA1 ID with the proper indentation
      end
      data
    end

    # Remove the leading space from a code block. Leading space
    # is only removed if every single line in the block has leading
    # whitespace.
    #
    # code      - The code block to remove spaces from
    # regex     - A regex to match whitespace
    def remove_leading_space(code, regex)
      # Blank lines are exempt; every other line must match before anything
      # is stripped.
      strippable = code.lines.all? do |line|
        line =~ /\A\r?\n\Z/ || line =~ regex
      end
      return unless strippable

      # Strips in place.
      code.gsub!(regex, '')
    end

    # Process all code from the codemap and replace the placeholders with the
    # final HTML.
    #
    # data     - The String data (with placeholders).
    # encoding - Encoding Constant or String.
    #
    # Returns the marked up String data.
    def process_code(data, encoding = nil)
      return data if data.nil? || data.size.zero? || @codemap.size.zero?

      encoding ||= 'utf-8'

      # Highlighted HTML keyed by placeholder id, so a result can never be
      # paired with the wrong block.
      highlighted = {}
      @codemap.each do |id, spec|
        next if spec[:output] # cached

        code = spec[:code]

        # Strips in place, so spec[:code] is de-indented as well.
        remove_leading_space(code, /^#{spec[:indent]}/m)
        remove_leading_space(code, /^(  |\t)/m)

        highlighted[id] = begin
          # must set startinline to true for php to be highlighted without <?
          # http://pygments.org/docs/lexers/
          Pygments.highlight(code, :lexer => spec[:lang], :options => {:encoding => encoding.to_s, :startinline => true})
        rescue
          # A failed highlight yields nil so the escaped <pre><code> fallback
          # below is used. Inserting the raw code here would put unescaped
          # source text into the HTML and into the cache.
          nil
        end
      end

      @codemap.each do |id, spec|
        body = spec[:output]
        unless body
          body = highlighted[id].to_s
          if body.size > 0
            update_cache(:code, id, body)
          else
            body = "<pre><code>#{CGI.escapeHTML(spec[:code])}</code></pre>"
          end
        end

        data.gsub!(id) { body }
      end

      data
    end

    #########################################################################
    #
    # Sequence Diagrams
    #
    #########################################################################

    # Extract all sequence diagram blocks into the wsdmap and replace with
    # placeholders.
    #
    # data - The raw String data.
    #
    # Returns the placeholder'd String data.
    def extract_wsd(data)
      data.gsub(/^\{\{\{\{\{\{ ?(.+?)\r?\n(.+?)\r?\n\}\}\}\}\}\}\r?$/m) do
        style, code = $1, $2
        # The SHA1 of the diagram source is both the map key and the
        # placeholder left in the text.
        Digest::SHA1.hexdigest(code).tap do |id|
          @wsdmap[id] = { :style => style, :code => code }
        end
      end
    end

    # Process all diagrams from the wsdmap and replace the placeholders with
    # the final HTML.
    #
    # data - The String data (with placeholders).
    #
    # Returns the marked up String data.
    def process_wsd(data)
      @wsdmap.each_pair do |id, spec|
        diagram_style  = spec[:style]
        diagram_source = spec[:code]

        # The diagram object is only built when a placeholder is present,
        # once per occurrence.
        data.gsub!(id) { Gollum::WebSequenceDiagram.new(diagram_source, diagram_style).to_tag }
      end
      data
    end

    #########################################################################
    #
    # Metadata
    #
    #########################################################################

    # Extract metadata for data and build metadata table. Metadata
    # is content found between markers, and must
    # be a valid YAML mapping.
    #
    # Because ri and ruby 1.8.7 are awesome, the markers can't
    # be included in this documentation without triggering
    # `Unhandled special: Special: type=17`
    # Please read the source code for the exact markers
    #
    # Returns the String of formatted data with metadata removed.
    def extract_metadata(data)
      # No markers are parsed here at present: the metadata is reset to an
      # empty Hash and the text is handed back untouched.
      @metadata = Hash.new
      data
    end

    # Hook for getting the formatted value of extracted tag data.
    #
    # type - Symbol value identifying what type of data is being extracted.
    # id   - String SHA1 hash of original extracted tag data.
    #
    # Returns the String cached formatted data, or nil.
    def check_cache(type, id)
      # Default hook: nothing is cached, so every lookup misses.
      nil
    end

    # Hook for caching the formatted value of extracted tag data.
    #
    # type - Symbol value identifying what type of data is being extracted.
    # id   - String SHA1 hash of original extracted tag data.
    # data - The String formatted value to be cached.
    #
    # Returns nothing.
    def update_cache(type, id, data)
      # Default hook: intentionally does nothing.
      nil
    end
  end

  # Second name for the same class object: Gollum::MarkupGFM is
  # Gollum::Markup, not a subclass or a copy.
  MarkupGFM = Markup
end