internal_attributes.rb

/lib/pismo/internal_attributes.rb

https://github.com/bigfolio/pismo
Ruby | 202 lines | 145 code | 32 blank | 25 comment | 13 complexity | a590521a6e7b49f8881c2985c6c37365 MD5 | raw file
Possible License(s): Apache-2.0

module Pismo
  # Internal attributes are different pieces of data we can extract from a document's content
  module InternalAttributes
    # Returns the title of the page/content - attempts to strip site name, etc, if possible.
    #
    # A list of selectors for common blog/CMS title markup is tried first (in the order given)
    # and whatever @doc.match returns for them is handed back untouched. Only when none of them
    # match is the HTML <title> consulted; it is split on " - ", " | " or " : " and the longest
    # segment, stripped of surrounding whitespace, is returned.
    #
    # Returns nil when there is no match at all, or when the <title> is an empty string.
    def title
      title = @doc.match( 'h2.title',
                          '.entry h2',                                                      # Common style
                          '.entryheader h1',                                                # Ruby Inside/Kubrick
                          '.entry-title a',                                               # Common Blogger/Blogspot rules
                          '.post-title a',
                          '.posttitle a',
                          '.entry-title',
                          '.post-title',
                          '.posttitle',
                          ['meta[@name="title"]', lambda { |el| el.attr('content') }],
                          '#pname a',                                                       # Google Code style
                          'h1.headermain',
                          'h1.title',
                          '.mxb h1'                                                         # BBC News
                        )

      return title if title

      # If all else fails, go to the HTML title
      title = @doc.match('title')
      return unless title

      # Strip off any leading or trailing site names - a scrappy way to try it out..
      # The group is non-capturing so the separator characters never end up among the
      # candidate segments, and max_by yields nil (instead of blowing up on nil.strip)
      # when the title is empty and split has nothing to offer.
      longest = title.split(/\s+(?:\-|\||\:)\s+/).max_by { |segment| segment.length }
      longest && longest.strip
    end
    
    # Return an estimate of when the page/content was created
    # As clients of this library should be doing HTTP retrieval themselves, they can fall to the
    # Last-Updated HTTP header if they so wish. This method is just rough and based on content only.
    #
    # The document's HTML is searched with a list of date-shaped patterns, most specific first.
    # The first hit is tidied up (day names, a leading "on", commas and ordinal suffixes are
    # removed) and handed to Chronic.parse.
    #
    # Returns Chronic's result, or the tidied string if Chronic returns nil, or nil when no
    # pattern matched or the match is four characters or shorter.
    def datetime
      # TODO: Clean all this mess up

      mo = %r{(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)}i

      regexen = [
        /#{mo}\b\s+\d+\D{1,10}\d{4}/i,
        /(on\s+)?\d+\s+#{mo}\s+\D{1,10}\d+/i,
        /(on[^\d+]{1,10})?\d+(th|st|nd|rd)?.{1,10}#{mo}\b[^\d]{1,10}\d+/i,
        /on\s+#{mo}\s+\d+/i,
        /#{mo}\s+\d+/i,
        /\d{4}[\.\/\-]\d{2}[\.\/\-]\d{2}/,
        /\d{2}[\.\/\-]\d{2}[\.\/\-]\d{4}/
      ]

      # Serialise the document once instead of once per pattern
      html = @doc.to_html

      datetime = nil
      regexen.each do |r|
        datetime = html[r]
        break if datetime
      end

      return unless datetime && datetime.length > 4

      # Clean up the string for use by Chronic
      datetime = datetime.strip
      datetime = datetime.gsub(/(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)[^\w]*/i, '')
      datetime = datetime.gsub(/(mon|tues|tue|weds|wed|thurs|thur|thu|fri|sat|sun)[^\w]*/i, '')
      datetime = datetime.sub(/on\s+/, '')
      datetime = datetime.gsub(/\,/, '')
      # "nd" is included so that "2nd" / "22nd" lose their suffix like every other ordinal
      datetime = datetime.sub(/(\d+)(th|st|nd|rd)/, '\1')

      Chronic.parse(datetime) || datetime
    end
    
    # TODO: Attempts to work out what type of site or page the page is from the provided URL
    # def site_type
    # end
    
    # Returns the author of the page/content, or nil if none of the known selectors match.
    #
    # A leading "by ", "post by " or "posted by " (any case) is removed from the matched text.
    # The matched string itself is left untouched - a new string is returned - so the value
    # handed back by @doc.match is never modified and frozen strings are handled fine.
    def author
      author = @doc.match('.post-author .fn',
                          '.wire_author',
                          '.cnnByline b',
                          ['meta[@name="author"]', lambda { |el| el.attr('content') }],     # Traditional meta tag style
                          ['meta[@name="AUTHOR"]', lambda { |el| el.attr('content') }],     # CNN style
                          '.byline a',                                                      # Ruby Inside style
                          '.post_subheader_left a',                                         # TechCrunch style
                          '.byl',                                                           # BBC News style
                          '.meta a',
                          '.articledata .author a',
                          '#owners a',                                                      # Google Code style
                          '.author a',
                          '.author',
                          '.auth a',
                          '.auth',
                          '.cT-storyDetails h5',                                            # smh.com.au - worth dropping maybe..
                          ['meta[@name="byl"]', lambda { |el| el.attr('content') }],
                          '.fn a',
                          '.fn',
                          '.byline-author'
                          )

      return unless author

      # Strip off any "By [whoever]" section
      author.sub(/^(post(ed)?\s)?by\W+/i, '')
    end
    
    # The page's "description": the content attribute of a description meta tag (lower-case
    # or capitalised name), falling back to whatever the '.description' selector matches
    def description
      meta_content = lambda { |el| el.attr('content') }

      candidates = [
        ['meta[@name="description"]', meta_content],
        ['meta[@name="Description"]', meta_content],
        '.description'
      ]

      @doc.match(*candidates)
    end
    
    # Returns the "lede" or first paragraph of the story/page.
    #
    # The first selector that matches supplies the text. If that text starts with two
    # sentences (each ending in a full stop followed by whitespace) only those two sentences
    # are returned, otherwise the whole match is. Returns nil when nothing matches.
    def lede
      # The below is a horrible, horrible way to pluck out lead paras from crappy Blogspot blogs that
      # don't use <p> tags.. It grabs the HTML from the first few characters of the element's
      # text up to the first <br. Those characters are escaped before being dropped into the
      # pattern so that text such as "(AP)" or "[1]" is matched literally.
      up_to_first_break = lambda do |el|
        opening = Regexp.escape(el.inner_text[0..4].strip)
        el.inner_html[/(#{opening}.*?)\<br/, 1]
      end

      lede = @doc.match(
                  '#blogpost p',
                  '.subhead',
                  '//div[@class="entrytext"]//p[string-length()>10]',                      # Ruby Inside / Kubrick style
                  'section p',
                  '.entry .text p',
                  '.entry-content p',
                  '#wikicontent p',                                                        # Google Code style
                  '//td[@class="storybody"]/p[string-length()>10]',                        # BBC News style
                  '//div[@class="entry"]//p[string-length()>100]',
                  ['.entry-content', up_to_first_break],
                  ['.entry', up_to_first_break],
                  '.entry',
                  '#content p',
                  '#article p',
                  '.post-body',
                  '.entry-content'
                  )

      # Nothing matched - same convention as title/author
      return unless lede

      lede[/^(.*?\.\s){2}/m] || lede
    end
    
    # Returns the "keywords" in the document (not the meta keywords - they're next to useless now)
    #
    # The readable body is lower-cased, tags and entities are scrubbed, and the remaining words
    # are counted. Stop words (read from stopwords.txt next to this file) are dropped, as are
    # words seen only once while more than 80 distinct words remain.
    #
    # options:
    #   :stem_at           - stop words longer than this are stemmed before comparison (default 10).
    #                        Only the stop-word list is stemmed; counted words are compared as-is.
    #   :word_length_limit - words longer than this are ignored (default 15)
    #   :limit             - maximum number of keywords returned (default 20)
    #
    # Returns an array of [word, count] pairs, highest counts first.
    def keywords(options = {})
      options = { :stem_at => 10, :word_length_limit => 15, :limit => 20 }.merge(options)

      # Convert doc to lowercase, scrub out most HTML tags and entities
      text = body.downcase.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\&\w+\;/, '')

      words = Hash.new(0)
      text.scan(/\b[a-z][a-z\'\#\.]*\b/).each do |word|
        next if word.length > options[:word_length_limit]
        # Drop possessive/contraction tails ("ruby's" -> "ruby") before counting
        words[word.gsub(/\'\w+/, '')] += 1
      end

      # Stem the stop words if necessary
      stopwords_path = File.join(File.dirname(__FILE__), 'stopwords.txt')
      stopwords = File.read(stopwords_path).split.map { |a| a.length > options[:stem_at] ? a.stem : a }

      # words.size shrinks as entries are deleted, so one-off words stop being discarded once
      # 80 or fewer distinct words are left
      kept = words.delete_if { |word, count| stopwords.include?(word) || (count < 2 && words.size > 80) }
      kept.sort_by { |_word, count| count }.reverse.first(options[:limit])
    end
    
    # Body text of the document as extracted by Readability (Arc90's algorithm), memoised
    # in @body after the first call
    def body
      unless @body
        @body = Readability::Document.new(@doc.to_s).content.strip
      end

      # HACK: Readability leaves a wrapping DIV around the content - peel off the opening tag
      # at the very start and the closing tag at the very end. Both patterns are anchored, so
      # there is at most one occurrence of each to remove.
      @body.sub!(/\A\<div\>/, '')
      @body.sub!(/\<\/div\>\Z/, '')

      @body
    end
    
    # URL of the site's favicon, taken from the href of the first matching <link>
    # (a Fluid icon is preferred, then "shortcut icon", then "icon"). An href that does not
    # start with "http" is resolved against @url when @url is set.
    def favicon
      href = lambda { |el| el.attr('href') }

      icon = @doc.match(['link[@rel="fluid-icon"]', href],
                        ['link[@rel="shortcut icon"]', href],
                        ['link[@rel="icon"]', href])

      needs_base = icon && icon !~ /^http/
      needs_base && @url ? URI.join(@url , icon).to_s : icon
    end
    
    # URL of the page's Web feed: the href of an RSS <link> if present, otherwise of an Atom
    # one. An href that does not start with "http" is resolved against @url when @url is set.
    def feed
      href_of = lambda { |el| el.attr('href') }
      feed_types = ['application/rss+xml', 'application/atom+xml']
      queries = feed_types.map { |type| ["link[@type=\"#{type}\"]", href_of] }

      url = @doc.match(*queries)

      # Nothing found, already absolute, or no base URL to resolve against
      return url unless url && url !~ /^http/ && @url

      URI.join(@url , url).to_s
    end
  end
end