/cola/core/extractor/readability.py

https://gitlab.com/zouxc/cola
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Copyright (c) 2013 Qin Xuye <qin@qinxuye.me>

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Created on 2013-7-15

@author: Chine
'''

import re

from cola.core.logs import get_logger
from cola.core.errors import DependencyNotInstalledError
from cola.core.utils import beautiful_soup

try:
    from bs4 import NavigableString
except ImportError:
    raise DependencyNotInstalledError("BeautifulSoup4")

from cola.core.extractor.preprocess import PreProcessor

__all__ = ['Extractor']
REGEXES = {
    'unlikelyCandidatesRe': re.compile(r'combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|aside|sponsor', re.I),
    'okMaybeItsACandidateRe': re.compile(r'and|article|body|column|main', re.I),
    'positiveRe': re.compile(r'article|body|content|entry|hentry|page|pagination|post|text', re.I),
    'negativeRe': re.compile(r'combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget', re.I),
    'divToPElementsRe': re.compile(r'<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
    'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}', re.I),
    'replaceFontsRe': re.compile(r'<(/?)font[^>]*>', re.I),
    'trimRe': re.compile(r'^\s+|\s+$'),
    'normalizeRe': re.compile(r'\s{2,}'),
    'killBreaksRe': re.compile(r'(<br\s*/?>(\s|&nbsp;?)*){1,}'),
    'videoRe': re.compile(r'http://(www\.)?(youtube|vimeo)\.com', re.I),
}
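# Illustrative note on the patterns above (these class/id heuristics follow
# Arc90's readability): REGEXES['unlikelyCandidatesRe'].search('div sidebar')
# matches, so such a node would be stripped, while a node named 'main-content'
# is rescued by okMaybeItsACandidateRe and later scored up by positiveRe.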
class HashableElement(object):
    # Wraps a node so it can be used as a dict key: two wrappers hash and
    # compare equal when their paths from the node up to the root match.
    def __init__(self, node):
        self.node = node
        self._path = None

    def _get_path(self):
        if self._path is None:
            reverse_path = []
            node = self.node
            while node:
                node_id = (node.name, tuple(node.attrs), node.string)
                reverse_path.append(node_id)
                node = node.parent
            self._path = tuple(reverse_path)
        return self._path
    path = property(_get_path)

    def __hash__(self):
        return hash(self.path)

    def __eq__(self, other):
        return self.path == other.path

    def __getattr__(self, name):
        return getattr(self.node, name)
class Extractor(object):
    TEXT_LENGTH_THRESHOLD = 25
    RETRY_LENGTH = 250

    def __init__(self, content, base_url=None, logger=None, debug=False, **options):
        self._content = content
        self.logger = logger
        self.base_url = base_url
        if self.logger is None:
            self.logger = get_logger('cola_extractor')
        self.on_debug = debug
        self.debug = self.logger.info if debug else (lambda s: None)
        self.options = options

        self._title = None
        self._html = None

    def preprocess(self, force=False):
        if force is True or self._html is None:
            preprocessor = PreProcessor(self._content, base_url=self.base_url)
            self._title, self._html = preprocessor.process()

    def title(self, force=False):
        self.preprocess(force=force)
        return self._title

    def content(self, force=False):
        self.preprocess(force=force)
        return self._html
    def _tags(self, node, *tag_names):
        for tag_name in tag_names:
            for n in node.find_all(tag_name):
                yield n

    def _text(self, node):
        return ''.join(node.find_all(text=True))

    def _describe(self, node):
        if not hasattr(node, 'name'):
            return "[text]"
        return "%s#%s.%s" % (
            node.name, node.get('id', ''), node.get('class', ''))
    def _remove_unlikely_candidates(self):
        for elem in self._html.find_all():
            s = '%s%s%s' % (
                elem.name, elem.get('class', ''), elem.get('id', '')
            )
            if REGEXES['unlikelyCandidatesRe'].search(s) and \
                (not REGEXES['okMaybeItsACandidateRe'].search(s)) and \
                elem.name != 'body':
                self.debug("Removing unlikely candidate - %s" % (s,))
                elem.extract()

    def _transform_misused_divs_into_p(self):
        for elem in self._html.find_all('div'):
            if not REGEXES['divToPElementsRe'].search(''.join(map(unicode, elem.contents))):
                self.debug("Altering div(#%s.%s) to p" % (elem.get('id', ''), elem.get('class', '')))
                elem.name = 'p'

    def _get_link_density(self, node):
        link_length = len("".join([i.text or "" for i in node.find_all("a")]))
        text_length = len(self._text(node))
        return float(link_length) / max(text_length, 1)
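    # Worked example for _get_link_density (illustrative): a node containing
    # 200 characters of text, 50 of which sit inside <a> descendants, has a
    # link density of 50 / 200 = 0.25; an empty node divides by max(0, 1) = 1.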
    def _weight_node(self, node):
        weight = 0
        if node.get('class', None):
            cls = ''.join(node['class'])

            if REGEXES['negativeRe'].search(cls):
                weight -= 25

            if REGEXES['positiveRe'].search(cls):
                weight += 25

        if node.get('id', None):
            if REGEXES['negativeRe'].search(node['id']):
                weight -= 25

            if REGEXES['positiveRe'].search(node['id']):
                weight += 25

        return weight

    def _score_node(self, node):
        content_score = self._weight_node(node)
        name = node.name.lower()
        if name in ("div", "article"):
            content_score += 5
        elif name == "blockquote":
            content_score += 3
        elif name == "form":
            content_score -= 3
        elif name == "th":
            content_score -= 5
        return {'content_score': content_score, 'elem': node}
    def _score_paragraphs(self, min_text_length=None):
        if min_text_length is None:
            min_text_length = self.TEXT_LENGTH_THRESHOLD

        candidates = {}
        elems = self._tags(self._html, 'p', 'td')

        for elem in elems:
            parent_node = elem.parent
            grand_parent_node = parent_node.parent
            parent_key = HashableElement(parent_node)
            grand_parent_key = HashableElement(grand_parent_node)

            inner_text = self._text(elem)

            # If this paragraph is shorter than the threshold, don't count it.
            if (not inner_text) or len(inner_text) < min_text_length:
                continue

            if parent_key not in candidates:
                candidates[parent_key] = self._score_node(parent_node)
            if grand_parent_node and grand_parent_key not in candidates:
                candidates[grand_parent_key] = self._score_node(grand_parent_node)

            content_score = 1
            content_score += len(re.split(ur',|，', inner_text))
            content_score += min([(len(inner_text) / 100), 3])

            candidates[parent_key]['content_score'] += content_score
            if grand_parent_node:
                candidates[grand_parent_key]['content_score'] += content_score / 2.0

        # Scale the final candidates score based on link density. Good content
        # should have a relatively small link density (5% or less) and be
        # mostly unaffected by this operation.
        for elem, candidate in candidates.items():
            candidate['content_score'] *= (1 - self._get_link_density(elem))
            self.debug("candidate %s scored %s" % (self._describe(elem), candidate['content_score']))

        return candidates
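    # Worked example for _score_paragraphs (illustrative): a 250-character
    # paragraph containing four commas adds 1 + 5 + min(250 / 100, 3) = 8
    # points to its parent and half that (4.0) to its grandparent, before the
    # link-density scaling above is applied.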
    def _select_best_candidate(self, candidates):
        sorted_candidates = sorted(candidates.values(),
                                   key=lambda x: x['content_score'],
                                   reverse=True)
        self.debug("Top 5 candidates:")
        for candidate in sorted_candidates[:5]:
            elem = candidate['elem']
            self.debug("Candidate %s with score %s" % \
                       (self._describe(elem), candidate['content_score']))

        if len(sorted_candidates) == 0:
            return None
        best_candidate = sorted_candidates[0]
        self.debug("Best candidate %s with score %s" % \
                   (self._describe(best_candidate['elem']), best_candidate['content_score']))
        return best_candidate
    def _get_article(self, candidates, best_candidate):
        # Now that we have the top candidate, look through its siblings for
        # content that might also be related: preambles, content split by ads
        # that we removed, etc. Iterate over a copy of the contents, since
        # appending a sibling to the output moves it out of its parent.
        sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
        output = beautiful_soup("<div/>")
        for sibling in list(best_candidate['elem'].parent.contents):
            if isinstance(sibling, NavigableString):
                continue
            append = False
            if sibling is best_candidate['elem']:
                append = True
            sibling_key = HashableElement(sibling)
            if sibling_key in candidates and \
                candidates[sibling_key]['content_score'] >= sibling_score_threshold:
                append = True

            if sibling.name == "p":
                link_density = self._get_link_density(sibling)
                node_content = sibling.string or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    append = True
                elif node_length < 80 and link_density == 0 and re.search(r'\.( |$)', node_content):
                    append = True

            if append:
                output.div.append(sibling)

        return output
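    # Illustrative: with a best candidate scoring 120, a sibling needs at
    # least max(10, 120 * 0.2) = 24 points in _get_article to be merged into
    # the output; short <p> siblings can still qualify via the link-density
    # checks above.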
    def _sanitize(self, node, candidates):
        for header in self._tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
            if self._weight_node(header) < 0 or \
                self._get_link_density(header) > 0.33:
                header.extract()

        for elem in self._tags(node, "form", "iframe"):
            elem.extract()

        # Conditionally clean <table>s, <ul>s, and <div>s
        for el in self._tags(node, "table", "ul", "div"):
            weight = self._weight_node(el)
            el_key = HashableElement(el)
            if el_key in candidates:
                content_score = candidates[el_key]['content_score']
            else:
                content_score = 0
            name = el.name

            if weight + content_score < 0:
                el.extract()
                self.debug("Conditionally cleaned %s with weight %s and content score %s because weight + content score was less than zero." %
                    (self._describe(el), weight, content_score))
            elif len(re.split(ur',|，', self._text(el))) < 10:
                counts = {}
                for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                    counts[kind] = len(el.find_all(kind))
                counts["li"] -= 100

                content_length = len(self._text(el))  # total text length of the element
                link_density = self._get_link_density(el)
                to_remove = False
                reason = ""

                if counts["img"] > counts["p"]:
                    reason = "too many images"
                    to_remove = True
                elif counts["li"] > counts["p"] and name != "ul" and name != "ol":
                    reason = "more <li>s than <p>s"
                    to_remove = True
                elif counts["input"] > (counts["p"] / 3):
                    reason = "fewer than 3x <p>s per <input>"
                    to_remove = True
                elif content_length < (self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)) and (counts["img"] == 0 or counts["img"] > 2):
                    reason = "too short a content length without a single image"
                    to_remove = True
                elif weight < 25 and link_density > 0.2:
                    reason = "too many links for its weight (%s)" % weight
                    to_remove = True
                elif weight >= 25 and link_density > 0.5:
                    reason = "too many links for its weight (%s)" % weight
                    to_remove = True
                elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                    reason = "<embed>s with too short a content length, or too many <embed>s"
                    to_remove = True

                if to_remove:
                    self.debug("Conditionally cleaned %s#%s.%s with weight %s and content score %s because it has %s." %
                        (el.name, el.get('id', ''), el.get('class', ''), weight, content_score, reason))
                    el.extract()

        # Strip attributes unless the caller asked to keep them (bs4 stores
        # them in .attrs).
        for el in ([node] + node.find_all()):
            if not (self.options.get('attributes')):
                el.attrs = {}

        return unicode(node)
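    # Illustrative: under _sanitize's conditional rules a <div> holding three
    # <img>s but only one <p> is dropped ("too many images"), and a <div>
    # whose text is 30% links with a weight below 25 is dropped ("too many
    # links for its weight").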
    def extract(self):
        try:
            ruthless = True
            while True:
                self.preprocess(force=True)
                for tag in self._tags(self._html, 'script', 'style'):
                    tag.extract()

                if ruthless:
                    self._remove_unlikely_candidates()
                self._transform_misused_divs_into_p()
                candidates = self._score_paragraphs(self.options.get('min_text_length'))
                best_candidate = self._select_best_candidate(candidates)
                if best_candidate:
                    article = self._get_article(candidates, best_candidate)
                else:
                    if ruthless:
                        ruthless = False
                        self.debug("ended up stripping too much - going for a safer parse")
                        # try again
                        continue
                    else:
                        article = self._html.find('body') or self._html

                cleaned_article = self._sanitize(article, candidates)
                retry_length = self.options.get('retry_length') or self.RETRY_LENGTH
                of_acceptable_length = len(cleaned_article or '') >= retry_length
                if ruthless and not of_acceptable_length:
                    ruthless = False
                    continue  # try again
                else:
                    return cleaned_article

        except Exception, e:
            self.logger.exception(e)
            if self.on_debug:
                raise e
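
# A minimal usage sketch (illustrative: 'page.html' and the base_url are
# placeholders, and the cola package must be importable):
if __name__ == '__main__':
    with open('page.html') as f:
        html = f.read()
    extractor = Extractor(html, base_url='http://example.com/', debug=True)
    print extractor.title()    # page title found during preprocessing
    print extractor.extract()  # cleaned article markup as a unicode string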