PageRenderTime 24ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/pismo/internal_attributes.rb

https://github.com/bigfolio/pismo
Ruby | 202 lines | 145 code | 32 blank | 25 comment | 13 complexity | a590521a6e7b49f8881c2985c6c37365 MD5 | raw file
Possible License(s): Apache-2.0
  module Pismo
    # Internal attributes are different pieces of data we can extract from a document's content.
    #
    # Every method here reads the instance variable @doc and, for favicon/feed, @url. Neither is
    # assigned in this module, so the including class is expected to provide them.
    #
    # NOTE(review): @doc.match is defined elsewhere. The code below assumes it takes a list of
    # queries (a selector String, or a [selector, lambda] pair whose lambda post-processes the
    # matched element) and returns the first successful result as a String, or nil - confirm.
    module InternalAttributes
      # Returns the title of the page/content - attempts to strip site name, etc, if possible.
      #
      # Returns nil when neither a known title element nor an HTML <title> is found.
      def title
        title = @doc.match(
          'h2.title',
          '.entry h2', # Common style
          '.entryheader h1', # Ruby Inside/Kubrick
          '.entry-title a', # Common Blogger/Blogspot rules
          '.post-title a',
          '.posttitle a',
          '.entry-title',
          '.post-title',
          '.posttitle',
          ['meta[@name="title"]', lambda { |el| el.attr('content') }],
          '#pname a', # Google Code style
          'h1.headermain',
          'h1.title',
          '.mxb h1' # BBC News
        )
        return title if title

        # If all else fails, go to the HTML title
        title = @doc.match('title')
        return unless title

        # Strip off any leading or trailing site names - a scrappy way to try it out: split on
        # " - ", " | " or " : " and keep the longest piece. The group is non-capturing so the
        # separators themselves are never candidates.
        longest = title.split(/\s+(?:\-|\||\:)\s+/).sort_by { |part| part.length }.last

        # An empty (or separator-only) <title> splits into nothing; fall back to the raw title
        # instead of calling #strip on nil.
        (longest || title).strip
      end

      # Return an estimate of when the page/content was created.
      #
      # As clients of this library should be doing HTTP retrieval themselves, they can fall to the
      # Last-Updated HTTP header if they so wish. This method is just rough and based on content only.
      #
      # Returns whatever Chronic.parse makes of the first date-like string found in the document's
      # HTML, the cleaned-up string itself when Chronic cannot parse it, or nil when no date-like
      # string longer than four characters is found.
      def datetime
        # TODO: Clean all this mess up
        mo = %r{(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)}i

        # Tried in order; the first pattern that matches anywhere in the HTML wins.
        regexen = [
          /#{mo}\b\s+\d+\D{1,10}\d{4}/i,
          /(on\s+)?\d+\s+#{mo}\s+\D{1,10}\d+/i,
          /(on[^\d+]{1,10})?\d+(th|st|rd)?.{1,10}#{mo}\b[^\d]{1,10}\d+/i,
          /on\s+#{mo}\s+\d+/i,
          /#{mo}\s+\d+/i,
          /\d{4}[\.\/\-]\d{2}[\.\/\-]\d{2}/,
          /\d{2}[\.\/\-]\d{2}[\.\/\-]\d{4}/
        ]

        # Render the document once rather than once per pattern.
        html = @doc.to_html
        found = nil
        regexen.each do |pattern|
          found = html[pattern]
          break if found
        end

        return unless found && found.length > 4

        # Clean up the string for use by Chronic: drop weekday names, a leading "on", commas and
        # the ordinal suffix of the day number.
        found.strip!
        found.gsub!(/(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)[^\w]*/i, '')
        found.gsub!(/(mon|tues|tue|weds|wed|thurs|thur|thu|fri|sat|sun)[^\w]*/i, '')
        found.sub!(/on\s+/, '')
        found.gsub!(/\,/, '')
        # "nd" is included so that "2nd" / "22nd" are cleaned like "1st", "3rd" and "4th".
        found.sub!(/(\d+)(th|st|nd|rd)/, '\1')

        Chronic.parse(found) || found
      end

      # TODO: Attempts to work out what type of site or page the page is from the provided URL
      # def site_type
      # end

      # Returns the author of the page/content, with any leading "By ", "Post by " or
      # "Posted by " removed. Returns nil when no author element is found.
      def author
        content = lambda { |el| el.attr('content') }

        author = @doc.match(
          '.post-author .fn',
          '.wire_author',
          '.cnnByline b',
          ['meta[@name="author"]', content], # Traditional meta tag style
          ['meta[@name="AUTHOR"]', content], # CNN style
          '.byline a', # Ruby Inside style
          '.post_subheader_left a', # TechCrunch style
          '.byl', # BBC News style
          '.meta a',
          '.articledata .author a',
          '#owners a', # Google Code style
          '.author a',
          '.author',
          '.auth a',
          '.auth',
          '.cT-storyDetails h5', # smh.com.au - worth dropping maybe..
          ['meta[@name="byl"]', content],
          '.fn a',
          '.fn',
          '.byline-author'
        )
        return unless author

        # Strip off any "By [whoever]" section (modifies the matched string in place)
        author.sub!(/^(post(ed)?\s)?by\W+/i, '')
        author
      end

      # Returns the "description" of the page, usually comes from a meta tag; falls back to an
      # element with class "description".
      def description
        content = lambda { |el| el.attr('content') }

        @doc.match(
          ['meta[@name="description"]', content],
          ['meta[@name="Description"]', content],
          '.description'
        )
      end

      # Returns the "lede" or first paragraph of the story/page.
      #
      # When the matched text contains at least two sentence endings (a period followed by
      # whitespace), only the text up to and including the second one is returned; otherwise the
      # whole match is returned. Returns nil when nothing matches.
      def lede
        # The below is a horrible, horrible way to pluck out lead paras from crappy Blogspot blogs
        # that don't use <p> tags: take the HTML from the first few characters of the element's
        # text up to the first <br. The text is escaped so that characters such as "(" or "*" at
        # the start of a post are matched literally instead of being read as regex syntax.
        up_to_first_br = lambda do |el|
          opening = Regexp.escape(el.inner_text[0..4].strip)
          el.inner_html[/(#{opening}.*?)<br/, 1]
        end

        lede = @doc.match(
          '#blogpost p',
          '.subhead',
          '//div[@class="entrytext"]//p[string-length()>10]', # Ruby Inside / Kubrick style
          'section p',
          '.entry .text p',
          '.entry-content p',
          '#wikicontent p', # Google Code style
          '//td[@class="storybody"]/p[string-length()>10]', # BBC News style
          '//div[@class="entry"]//p[string-length()>100]',
          ['.entry-content', up_to_first_br],
          ['.entry', up_to_first_br],
          '.entry',
          '#content p',
          '#article p',
          '.post-body',
          '.entry-content'
        )
        # Nothing matched: return nil like the other attribute methods instead of raising.
        return unless lede

        lede[/^(.*?\.\s){2}/m] || lede
      end

      # Returns the "keywords" in the document (not the meta keywords - they're next to useless now)
      #
      # options - Hash, merged over the defaults:
      #   :stem_at           - stopwords longer than this are stemmed via String#stem (default 10)
      #   :word_length_limit - words longer than this are ignored (default 15)
      #   :limit             - maximum number of keywords returned (default 20)
      #
      # Returns an Array of [word, count] pairs ordered by descending count.
      #
      # NOTE(review): only the stopword list is stemmed; document words are counted unstemmed, so
      # a stemmed stopword matches only words that equal its stem. The original code computed
      # stems for document words but never used them - confirm which behavior is intended.
      def keywords(options = {})
        options = { :stem_at => 10, :word_length_limit => 15, :limit => 20 }.merge(options)
        words = {}

        # Convert doc to lowercase, scrub out most HTML tags and entities, then count each word
        body.downcase.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\&\w+\;/, '').scan(/\b[a-z][a-z\'\#\.]*\b/).each do |word|
          next if word.length > options[:word_length_limit]
          word.gsub!(/\'\w+/, '') # drop possessive/contraction endings such as 's
          words[word] ||= 0
          words[word] += 1
        end

        # Load the stop words (stemming the long ones) into a Hash for constant-time lookup
        stopwords = {}
        File.read(File.join(File.dirname(__FILE__), 'stopwords.txt')).split.each do |stopword|
          stopwords[stopword.length > options[:stem_at] ? stopword.stem : stopword] = true
        end

        # Drop stop words, and words seen only once while more than 80 distinct words remain,
        # then rank what is left by frequency.
        words.delete_if { |word, count| stopwords.key?(word) || (count < 2 && words.size > 80) }
        words.sort_by { |_word, count| count }.reverse.first(options[:limit])
      end

      # Returns body text as determined by Arc90's Readability algorithm.
      # The result is computed once and memoized in @body.
      def body
        @body ||= begin
          content = Readability::Document.new(@doc.to_s).content.strip
          # HACK: Remove annoying DIV that readability leaves around
          content.gsub(/\A\<div\>/, '').gsub(/\<\/div\>\Z/, '')
        end
      end

      # Returns URL to the site's favicon, or nil when the document declares none.
      # A relative href is resolved against @url when @url is set.
      def favicon
        href = lambda { |el| el.attr('href') }

        resolve_against_page_url(
          @doc.match(
            ['link[@rel="fluid-icon"]', href], # Get a Fluid icon if possible..
            ['link[@rel="shortcut icon"]', href],
            ['link[@rel="icon"]', href]
          )
        )
      end

      # Returns URL of Web feed (RSS preferred over Atom), or nil when the document declares none.
      # A relative href is resolved against @url when @url is set.
      def feed
        href = lambda { |el| el.attr('href') }

        resolve_against_page_url(
          @doc.match(
            ['link[@type="application/rss+xml"]', href],
            ['link[@type="application/atom+xml"]', href]
          )
        )
      end

      private

      # Returns +href+ joined onto @url when +href+ does not start with "http" and @url is set;
      # otherwise returns +href+ unchanged (including nil).
      def resolve_against_page_url(href)
        return href unless href && href !~ /^http/ && @url

        URI.join(@url, href).to_s
      end
    end
  end