/lib/web-page-parser/parsers/bbc_news_page_parser.rb
Ruby | 138 lines | 108 code | 21 blank | 9 comment | 1 complexity | 9eef65cf487cd11b1ba5a7eb0d9be85a MD5 | raw file
Possible License(s): MIT
- # -*- coding: utf-8 -*-
- module WebPageParser
# Factory for BBC News article pages: decides from the URL whether a page
# can be handled and builds the parser for it.
class BbcNewsPageParserFactory < WebPageParser::ParserFactory
  # Matches article URLs on www.bbc.co.uk / news.bbc.co.uk that end in a
  # numeric id, optionally preceded by a slug and followed by ".stm".
  #
  # The pattern is single-quoted on purpose: in a double-quoted Ruby string
  # "\." collapses to ".", which would let the host-name dots and the ".stm"
  # dot match any character.
  URL_RE = ORegexp.new('(www|news)\.bbc\.co\.uk/.+/([a-z-]+-)?[0-9]+(\.stm)?$')

  # URLs containing any of these fragments are rejected even when they
  # match URL_RE.
  INVALID_URL_RE = ORegexp.new('in_pictures|pop_ups|sport1')

  # Reports whether the URL in +options+ can be parsed by this factory.
  #
  # options:: Hash carrying the page URL under the +:url+ key.
  #
  # Returns the URL_RE match object (truthy) when the URL is supported,
  # otherwise nil. A missing or empty URL yields nil rather than raising.
  def self.can_parse?(options)
    # Ignore the fragment; "".split('#').first is nil, hence the guard.
    url = options[:url].to_s.split('#').first
    return nil if url.nil?
    return nil if INVALID_URL_RE.match(url)

    URL_RE.match(url)
  end

  # Builds the parser handed out for BBC News pages (currently V4).
  #
  # options:: Hash passed straight through to the parser constructor.
  def self.create(options = {})
    BbcNewsPageParserV4.new(options)
  end
end
# BbcNewsPageParserV1 parses BBC News web pages exactly like the
# old News Sniffer BbcNewsPage class did. This should only ever
# be used for backwards compatibility with News Sniffer and is
# never supplied for use by a factory.
class BbcNewsPageParserV1 < WebPageParser::BaseParser
  # Headline and publication date are taken from the page's <meta> tags.
  TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
  DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
  # Story body sits between the "S SF" and "E BO" marker comments.
  CONTENT_RE = ORegexp.new('S (?:SF) -->(.*?)<!-- E BO', 'm')
  STRIP_TAGS_RE = ORegexp.new('</?(div|img|tr|td|!--|table)[^>]*>','i')
  # Kept verbatim for compatibility with the old implementation.
  WHITESPACE_RE = ORegexp.new('\t|')
  PARA_RE = Regexp.new(/<p>/i)

  # MD5 hex digest of the content only; the old News Sniffer did not
  # include the title in its hash.
  def hash
    body_text = content.to_s
    Digest::MD5.hexdigest(body_text)
  end

  private

  # Replaces the raw date string in @date with a parsed DateTime, falling
  # back to the current UTC time when the string cannot be parsed.
  # The original author notes the OPD value is GMT/UTC, which DateTime
  # appears to assume by default.
  def date_processor
    @date = DateTime.parse(@date)
  rescue ArgumentError
    @date = Time.now.utc
  end

  # Strips unwanted tags and whitespace from @content, decodes entities,
  # then splits the result on <p> tags so @content ends up as an array.
  def content_processor
    [STRIP_TAGS_RE, WHITESPACE_RE].each do |pattern|
      @content = pattern.gsub(@content, '')
    end
    @content = decode_entities(@content)
    @content = @content.split(PARA_RE)
  end
end
# BbcNewsPageParserV2 parses BBC News web pages
class BbcNewsPageParserV2 < WebPageParser::BaseParser
  # Headline and publication date come from the page's <meta> tags.
  TITLE_RE = ORegexp.new('<meta name="Headline" content="(.*)"', 'i')
  DATE_RE = ORegexp.new('<meta name="OriginalPublicationDate" content="(.*)"', 'i')
  # Story body sits between the "S BO" and "E BO" marker comments.
  CONTENT_RE = ORegexp.new('S BO -->(.*?)<!-- E BO', 'm')
  # Whole elements (opening tag through matching closing tag) to discard.
  STRIP_BLOCKS_RE = ORegexp.new('<(table|noscript|script|object|form)[^>]*>.*?</\1>', 'i')
  # Individual tags to discard while keeping the text around them.
  STRIP_TAGS_RE = ORegexp.new('</?(b|div|img|tr|td|br|font|span)[^>]*>','i')
  STRIP_COMMENTS_RE = ORegexp.new('<!--.*?-->')
  STRIP_CAPTIONS_RE = ORegexp.new('<!-- caption .+?<!-- END - caption -->')
  # Runs of tabs/spaces, collapsed to a single space by content_processor.
  WHITESPACE_RE = ORegexp.new('[\t ]+')
  PARA_RE = Regexp.new('</?p[^>]*>', Regexp::IGNORECASE)

  private

  # Cleans @content and splits it into an array of paragraphs.
  #
  # The passes run in a fixed order: captions are delimited by comment
  # markers, so they must be removed before comments are stripped.
  def content_processor
    cleanup_passes = [
      [STRIP_CAPTIONS_RE, ''],
      [STRIP_COMMENTS_RE, ''],
      [STRIP_BLOCKS_RE, ''],
      [STRIP_TAGS_RE, ''],
      [WHITESPACE_RE, ' ']
    ]
    cleanup_passes.each do |pattern, replacement|
      @content = pattern.gsub(@content, replacement)
    end
    @content = @content.split(PARA_RE)
  end

  # Replaces the raw date string in @date with a parsed DateTime, falling
  # back to the current UTC time when the string cannot be parsed.
  # The original author notes the OPD value is GMT/UTC, which DateTime
  # appears to assume by default.
  def date_processor
    @date = DateTime.parse(@date)
  rescue ArgumentError
    @date = Time.now.utc
  end
end
# BbcNewsPageParserV3 extends the V2 parser for pages whose story is wrapped
# in a "story-body" div, removing feature boxes and market-data widgets
# before the inherited clean-up runs.
class BbcNewsPageParserV3 < BbcNewsPageParserV2
  # Story body runs from the story-body div up to the bookmark list.
  CONTENT_RE = ORegexp.new('<div id="story\-body">(.*?)<div class="bookmark-list">', 'm')
  STRIP_FEATURES_RE = ORegexp.new('<div class="story-feature">(.*?)</div>', 'm')
  # Market-data widget, delimited by its "S MD_WIDGET"/"E MD_WIDGET" comments.
  STRIP_MARKET_DATA_WIDGET_RE = ORegexp.new('<\!\-\- S MD_WIDGET.*? E MD_WIDGET \-\->')
  ICONV = nil # BBC news is now in utf8

  # Removes story-feature boxes and the market-data widget from @content,
  # then hands over to the V2 processing via super.
  def content_processor
    [STRIP_FEATURES_RE, STRIP_MARKET_DATA_WIDGET_RE].each do |pattern|
      @content = pattern.gsub(@content, '')
    end
    super
  end
end
# BbcNewsPageParserV4 extends the V3 parser for pages that mark the story
# with a "story-body" class and carry the headline in an <h1>. It removes a
# series of page furniture fragments before the inherited clean-up runs.
class BbcNewsPageParserV4 < BbcNewsPageParserV3
  # Story body runs from the story-body div to its closing marker comment.
  CONTENT_RE = ORegexp.new('<div class=.story-body.>(.*?)<!-- / story\-body', 'm')
  STRIP_PAGE_BOOKMARKS = ORegexp.new('<div id="page-bookmark-links-head".+?</div>', 'm')
  STRIP_STORY_DATE = ORegexp.new('<span class="date".+?</span>', 'm')
  STRIP_STORY_LASTUPDATED = ORegexp.new('<span class="time\-text".+?</span>', 'm')
  STRIP_STORY_TIME = ORegexp.new('<span class="time".+?</span>', 'm')
  # Headline element; also removed from the content by content_processor.
  TITLE_RE = ORegexp.new('<h1 class="story\-header">(.+?)</h1>', 'm')
  STRIP_CAPTIONS_RE2 = ORegexp.new('<div class=.caption.+?</div>','m')
  STRIP_HIDDEN_A = ORegexp.new('<a class=.hidden.+?</a>','m')
  STRIP_STORY_FEATURE = ORegexp.new('<div class=.story\-feature.+?</div>', 'm')
  STRIP_HYPERPUFF_RE = ORegexp.new('<div class=.embedded-hyper.+?<div class=.hyperpuff.+?</div>.+?</div>', 'm')
  STRIP_MARKETDATA_RE = ORegexp.new('<div class=.market\-data.+?</div>', 'm')
  STRIP_EMBEDDEDHYPER_RE = ORegexp.new('<div class=.embedded\-hyper.+?</div>', 'm')

  # Deletes each fragment listed below from @content, in this exact order
  # (the hyperpuff pattern must run before the plain embedded-hyper one,
  # which would otherwise consume its opening div), then continues with
  # the V3 processing via super.
  def content_processor
    removal_order = [
      STRIP_PAGE_BOOKMARKS,
      STRIP_STORY_DATE,
      STRIP_STORY_LASTUPDATED,
      STRIP_STORY_TIME,
      TITLE_RE,
      STRIP_CAPTIONS_RE2,
      STRIP_HIDDEN_A,
      STRIP_STORY_FEATURE,
      STRIP_HYPERPUFF_RE,
      STRIP_MARKETDATA_RE,
      STRIP_EMBEDDEDHYPER_RE
    ]
    removal_order.each do |pattern|
      @content = pattern.gsub(@content, '')
    end
    super
  end
end
-
- end