PageRenderTime 46ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/web-page-parser/parsers/bbc_news_page_parser.rb

https://bitbucket.org/imcnulty/web-page-parser
Ruby | 138 lines | 108 code | 21 blank | 9 comment | 1 complexity | 9eef65cf487cd11b1ba5a7eb0d9be85a MD5 | raw file
Possible License(s): MIT
  1. # -*- coding: utf-8 -*-
  2. module WebPageParser
  # Factory for BBC News article pages: decides from the URL whether a page
  # can be handled and builds the parser instance for it.
  class BbcNewsPageParserFactory < WebPageParser::ParserFactory
    # Article URLs: a www/news bbc.co.uk host, some path, then an optional
    # lower-case slug followed by a numeric id and an optional ".stm" suffix.
    #
    # The pattern is deliberately single-quoted. In a double-quoted Ruby
    # string "\." is reduced to a plain "." before the regex engine sees it,
    # which would make every dot here match any character.
    URL_RE = ORegexp.new('(www|news)\.bbc\.co\.uk/.+/([a-z-]+-)?[0-9]+(\.stm)?$')

    # URLs containing any of these fragments are rejected even when they
    # otherwise match URL_RE.
    INVALID_URL_RE = ORegexp.new('in_pictures|pop_ups|sport1')

    # Reports whether this factory's parser can handle the given page.
    #
    # options - Hash whose :url entry is the page URL; anything from the
    #           first '#' onwards is ignored.
    #
    # Returns the result of URL_RE.match (truthy on a match), or nil when
    # the URL is missing, empty, black-listed or does not match.
    def self.can_parse?(options)
      # to_s tolerates a missing :url; an empty string splits to [] so
      # `first` yields nil, which must not be handed to the regexes.
      url = options[:url].to_s.split('#').first
      return nil if url.nil?
      return nil if INVALID_URL_RE.match(url)

      URL_RE.match(url)
    end

    # Builds the parser used for BBC News pages.
    #
    # options - Hash passed straight through to the parser's constructor.
    #
    # Returns a new BbcNewsPageParserV4.
    def self.create(options = {})
      BbcNewsPageParserV4.new(options)
    end
  end
  # BbcNewsPageParserV1 parses BBC News web pages exactly the way the old
  # News Sniffer BbcNewsPage class did. It exists only for backwards
  # compatibility with News Sniffer and is never supplied for use by a
  # factory.
  class BbcNewsPageParserV1 < WebPageParser::BaseParser
    TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
    DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
    CONTENT_RE = ORegexp.new('S (?:SF) -->(.*?)<!-- E BO', 'm')
    STRIP_TAGS_RE = ORegexp.new('</?(div|img|tr|td|!--|table)[^>]*>','i')
    # NOTE(review): as written this is "a tab OR an empty alternative";
    # kept verbatim because this class must behave like the legacy code.
    WHITESPACE_RE = ORegexp.new('\t|')
    PARA_RE = /<p>/i

    # MD5 hex digest of the content only. The old News Sniffer did not
    # include the title in its hash, so neither does this.
    def hash
      body_text = content.to_s
      Digest::MD5.hexdigest(body_text)
    end

    private

    # Replaces the raw @date string with a parsed DateTime. The
    # OriginalPublicationDate value is in GMT/UTC, which DateTime seems to
    # use by default. Falls back to the current UTC time when parsing
    # raises ArgumentError.
    def date_processor
      @date = DateTime.parse(@date)
    rescue ArgumentError
      @date = Time.now.utc
    end

    # Strips the listed tags and whitespace pattern from @content, decodes
    # entities, then splits the text on <p> tags, leaving @content as an
    # Array of paragraph strings.
    def content_processor
      [STRIP_TAGS_RE, WHITESPACE_RE].each do |pattern|
        @content = pattern.gsub(@content, '')
      end
      @content = decode_entities(@content).split(PARA_RE)
    end
  end
  # BbcNewsPageParserV2 parses BBC News web pages.
  class BbcNewsPageParserV2 < WebPageParser::BaseParser
    TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
    DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
    CONTENT_RE = ORegexp.new('S BO -->(.*?)<!-- E BO', 'm')
    STRIP_BLOCKS_RE = ORegexp.new('<(table|noscript|script|object|form)[^>]*>.*?</\1>', 'i')
    STRIP_TAGS_RE = ORegexp.new('</?(b|div|img|tr|td|br|font|span)[^>]*>','i')
    STRIP_COMMENTS_RE = ORegexp.new('<!--.*?-->')
    STRIP_CAPTIONS_RE = ORegexp.new('<!-- caption .+?<!-- END - caption -->')
    WHITESPACE_RE = ORegexp.new('[\t ]+')
    PARA_RE = %r{</?p[^>]*>}i

    private

    # Cleans @content and turns it into an Array of paragraph strings.
    # Order matters: captions are delimited by HTML comments, so they are
    # removed before comments in general are stripped.
    def content_processor
      removal_order = [STRIP_CAPTIONS_RE, STRIP_COMMENTS_RE, STRIP_BLOCKS_RE, STRIP_TAGS_RE]
      removal_order.each do |pattern|
        @content = pattern.gsub(@content, '')
      end
      # Runs of tabs/spaces collapse to a single space rather than vanish.
      @content = WHITESPACE_RE.gsub(@content, ' ')
      @content = @content.split(PARA_RE)
    end

    # Replaces the raw @date string with a parsed DateTime. The
    # OriginalPublicationDate value is in GMT/UTC, which DateTime seems to
    # use by default. Falls back to the current UTC time when parsing
    # raises ArgumentError.
    def date_processor
      @date = DateTime.parse(@date)
    rescue ArgumentError
      @date = Time.now.utc
    end
  end
  # BbcNewsPageParserV3 extends V2 with a different content pattern and
  # two extra clean-up passes that run before the inherited processing.
  class BbcNewsPageParserV3 < BbcNewsPageParserV2
    CONTENT_RE = ORegexp.new('<div id="story\-body">(.*?)<div class="bookmark-list">', 'm')
    STRIP_FEATURES_RE = ORegexp.new('<div class="story-feature">(.*?)</div>', 'm')
    STRIP_MARKET_DATA_WIDGET_RE = ORegexp.new('<\!\-\- S MD_WIDGET.*? E MD_WIDGET \-\->')
    # BBC news is now in utf8.
    # NOTE(review): presumably nil tells the base parser to skip charset
    # conversion - confirm against BaseParser.
    ICONV = nil

    # Removes story-feature blocks and the market-data widget from
    # @content, then hands over to the parent's content_processor.
    # NOTE(review): declared outside any `private` section, so it is
    # public here, unlike the parent's version.
    def content_processor
      [STRIP_FEATURES_RE, STRIP_MARKET_DATA_WIDGET_RE].each do |pattern|
        @content = pattern.gsub(@content, '')
      end
      super
    end
  end
  # BbcNewsPageParserV4 extends V3 with its own content and title patterns
  # plus a further set of fragments stripped from the story body before
  # the inherited processing runs.
  class BbcNewsPageParserV4 < BbcNewsPageParserV3
    CONTENT_RE = ORegexp.new('<div class=.story-body.>(.*?)<!-- / story\-body', 'm')
    STRIP_PAGE_BOOKMARKS = ORegexp.new('<div id="page-bookmark-links-head".+?</div>', 'm')
    STRIP_STORY_DATE = ORegexp.new('<span class="date".+?</span>', 'm')
    STRIP_STORY_LASTUPDATED = ORegexp.new('<span class="time\-text".+?</span>', 'm')
    STRIP_STORY_TIME = ORegexp.new('<span class="time".+?</span>', 'm')
    TITLE_RE = ORegexp.new('<h1 class="story\-header">(.+?)</h1>', 'm')
    STRIP_CAPTIONS_RE2 = ORegexp.new('<div class=.caption.+?</div>','m')
    STRIP_HIDDEN_A = ORegexp.new('<a class=.hidden.+?</a>','m')
    STRIP_STORY_FEATURE = ORegexp.new('<div class=.story\-feature.+?</div>', 'm')
    STRIP_HYPERPUFF_RE = ORegexp.new('<div class=.embedded-hyper.+?<div class=.hyperpuff.+?</div>.+?</div>', 'm')
    STRIP_MARKETDATA_RE = ORegexp.new('<div class=.market\-data.+?</div>', 'm')
    STRIP_EMBEDDEDHYPER_RE = ORegexp.new('<div class=.embedded\-hyper.+?</div>', 'm')

    # Deletes each V4-specific fragment from @content, in the order listed,
    # and then defers to the parent's content_processor. TITLE_RE is part
    # of the list, so the headline element is removed from the body text.
    # The hyperpuff pattern runs before the plain embedded-hyper one.
    def content_processor
      removal_order = [
        STRIP_PAGE_BOOKMARKS,
        STRIP_STORY_DATE,
        STRIP_STORY_LASTUPDATED,
        STRIP_STORY_TIME,
        TITLE_RE,
        STRIP_CAPTIONS_RE2,
        STRIP_HIDDEN_A,
        STRIP_STORY_FEATURE,
        STRIP_HYPERPUFF_RE,
        STRIP_MARKETDATA_RE,
        STRIP_EMBEDDEDHYPER_RE
      ]
      removal_order.each do |pattern|
        @content = pattern.gsub(@content, '')
      end
      super
    end
  end
  117. end