
/cola/core/extractor/readability.py

https://gitlab.com/zouxc/cola
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Copyright (c) 2013 Qin Xuye <qin@qinxuye.me>

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Created on 2013-7-15

@author: Chine
'''

import re

from cola.core.logs import get_logger
from cola.core.errors import DependencyNotInstalledError
from cola.core.utils import beautiful_soup

try:
    from bs4 import NavigableString
except ImportError:
    raise DependencyNotInstalledError("BeautifulSoup4")

from cola.core.extractor.preprocess import PreProcessor

__all__ = ['Extractor']
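# Heuristic patterns for classifying nodes by their tag/class/id strings;
# the names and patterns match those in Arc90's readability.js, from which
# this extractor is ported.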
REGEXES = {
    'unlikelyCandidatesRe': re.compile('combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|aside|sponsor', re.I),
    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main', re.I),
    'positiveRe': re.compile('article|body|content|entry|hentry|page|pagination|post|text', re.I),
    'negativeRe': re.compile('combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget', re.I),
    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
    'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}', re.I),
    'replaceFontsRe': re.compile('<(\/?)font[^>]*>', re.I),
    'trimRe': re.compile('^\s+|\s+$'),
    'normalizeRe': re.compile('\s{2,}'),
    'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}'),
    'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
}
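# Wraps a soup node so nodes can be used as dictionary keys: equality and
# hashing are defined over the node's path from the document root.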
class HashableElement():
    def __init__(self, node):
        self.node = node
        self._path = None

    def _get_path(self):
        if self._path is None:
            reverse_path = []
            node = self.node
            while node:
                node_id = (node.name, tuple(node.attrs), node.string)
                reverse_path.append(node_id)
                node = node.parent
            self._path = tuple(reverse_path)
        return self._path
    path = property(_get_path)

    def __hash__(self):
        return hash(self.path)

    def __eq__(self, other):
        return self.path == other.path

    def __getattr__(self, name):
        return getattr(self.node, name)
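# Extractor implements the readability-style content-scoring algorithm:
# paragraphs vote for their parents, link-dense or negatively named nodes
# are penalized, and the best-scoring subtree is sanitized and returned.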
class Extractor(object):
    TEXT_LENGTH_THRESHOLD = 25
    RETRY_LENGTH = 250

    def __init__(self, content, base_url=None, logger=None, debug=False, **options):
        self._content = content
        self.logger = logger
        self.base_url = base_url
        if self.logger is None:
            self.logger = get_logger('cola_extractor')
        self.on_debug = debug
        self.debug = self.logger.info if debug else (lambda s: None)
        self.options = options

        self._title = None
        self._html = None

    def preprocess(self, force=False):
        if force is True or self._html is None:
            preprocessor = PreProcessor(self._content, base_url=self.base_url)
            self._title, self._html = preprocessor.process()

    def title(self, force=False):
        self.preprocess(force=force)
        return self._title

    def content(self, force=False):
        self.preprocess(force=force)
        return self._html

    def _tags(self, node, *tag_names):
        for tag_name in tag_names:
            for n in node.find_all(tag_name):
                yield n

    def _text(self, node):
        return ''.join(node.find_all(text=True))

    def _describe(self, node):
        if not hasattr(node, 'name'):
            return "[text]"
        return "%s#%s.%s" % (
            node.name, node.get('id', ''), node.get('class', ''))
    def _remove_unlikely_candidates(self):
        for elem in self._html.find_all():
            s = '%s%s%s' % (
                elem.name, elem.get('class', ''), elem.get('id', '')
            )
            if REGEXES['unlikelyCandidatesRe'].search(s) and \
                    (not REGEXES['okMaybeItsACandidateRe'].search(s)) and \
                    elem.name != 'body':
                self.debug("Removing unlikely candidate - %s" % (s,))
                elem.extract()

    def _transform_misused_divs_into_p(self):
        for elem in self._html.find_all('div'):
            # Only convert divs that contain no block-level children.
            if not REGEXES['divToPElementsRe'].search(''.join(map(unicode, elem.contents))):
                self.debug("Altering div(#%s.%s) to p" % (elem.get('id', ''), elem.get('class', '')))
                elem.name = 'p'
    def _get_link_density(self, node):
        link_length = len("".join([i.text or "" for i in node.find_all("a")]))
        text_length = len(self._text(node))
        return float(link_length) / max(text_length, 1)

    def _weight_node(self, node):
        weight = 0
        if node.get('class', None):
            cls = ''.join(node['class'])
            if REGEXES['negativeRe'].search(cls):
                weight -= 25
            if REGEXES['positiveRe'].search(cls):
                weight += 25
        if node.get('id', None):
            if REGEXES['negativeRe'].search(node['id']):
                weight -= 25
            if REGEXES['positiveRe'].search(node['id']):
                weight += 25
        return weight

    def _score_node(self, node):
        content_score = self._weight_node(node)
        name = node.name.lower()
        if name in ("div", "article"):
            content_score += 5
        elif name == "blockquote":
            content_score += 3
        elif name == "form":
            content_score -= 3
        elif name == "th":
            content_score -= 5
        return {'content_score': content_score, 'elem': node}
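    # Scoring pass: every sufficiently long <p>/<td> votes for its parent and
    # grandparent; commas and raw length add points, link density discounts them.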
    def _score_paragraphs(self, min_text_length=None):
        if min_text_length is None:
            min_text_length = self.TEXT_LENGTH_THRESHOLD

        candidates = {}
        elems = self._tags(self._html, 'p', 'td')

        for elem in elems:
            parent_node = elem.parent
            grand_parent_node = parent_node.parent
            parent_key = HashableElement(parent_node)
            grand_parent_key = HashableElement(grand_parent_node)

            inner_text = self._text(elem)

            # If this paragraph is less than 25 characters, don't even count it.
            if (not inner_text) or len(inner_text) < min_text_length:
                continue

            if parent_key not in candidates:
                candidates[parent_key] = self._score_node(parent_node)
            if grand_parent_node and grand_parent_key not in candidates:
                candidates[grand_parent_key] = self._score_node(grand_parent_node)

            content_score = 1
            # One point per comma-separated clause (ASCII or fullwidth comma).
            content_score += len(re.split(ur',|，', inner_text))
            content_score += min([(len(inner_text) / 100), 3])

            candidates[parent_key]['content_score'] += content_score
            if grand_parent_node:
                candidates[grand_parent_key]['content_score'] += content_score / 2.0

        # Scale the final candidates score based on link density. Good content should have a
        # relatively small link density (5% or less) and be mostly unaffected by this operation.
        for elem, candidate in candidates.items():
            candidate['content_score'] *= (1 - self._get_link_density(elem))
            self.debug("candidate %s scored %s" % (self._describe(elem), candidate['content_score']))

        return candidates
    def _select_best_candidate(self, candidates):
        sorted_candidates = sorted(candidates.values(),
                                   key=lambda x: x['content_score'],
                                   reverse=True)

        self.debug("Top 5 candidates:")
        for candidate in sorted_candidates[:5]:
            elem = candidate['elem']
            self.debug("Candidate %s with score %s" %
                       (self._describe(elem), candidate['content_score']))

        if len(sorted_candidates) == 0:
            return None

        best_candidate = sorted_candidates[0]
        self.debug("Best candidate %s with score %s" %
                   (self._describe(best_candidate['elem']), best_candidate['content_score']))
        return best_candidate
    def _get_article(self, candidates, best_candidate):
        # Now that we have the top candidate, look through its siblings for content that might also be related.
        # Things like preambles, content split by ads that we removed, etc.
        sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
        output = beautiful_soup("<div/>")
        # Iterate over a copy: appending a sibling to `output` re-parents it,
        # which would otherwise mutate the list while it is being traversed.
        for sibling in list(best_candidate['elem'].parent.contents):
            if isinstance(sibling, NavigableString):
                continue

            append = False
            if sibling is best_candidate['elem']:
                append = True

            sibling_key = HashableElement(sibling)
            if sibling_key in candidates and \
                    candidates[sibling_key]['content_score'] >= sibling_score_threshold:
                append = True

            if sibling.name == "p":
                link_density = self._get_link_density(sibling)
                node_content = sibling.string or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    append = True
                elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
                    append = True

            if append:
                output.div.append(sibling)

        return output
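    # Sanitize pass: drop link-heavy headers, all forms/iframes, and any
    # tables/lists/divs whose combined weight and score suggest boilerplate.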
    def _sanitize(self, node, candidates):
        for header in self._tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
            if self._weight_node(header) < 0 or \
                    self._get_link_density(header) > 0.33:
                header.extract()

        for elem in self._tags(node, "form", "iframe"):
            elem.extract()

        # Conditionally clean <table>s, <ul>s, and <div>s
        for el in self._tags(node, "table", "ul", "div"):
            weight = self._weight_node(el)
            el_key = HashableElement(el)
            if el_key in candidates:
                content_score = candidates[el_key]['content_score']
            else:
                content_score = 0
            name = el.name

            if weight + content_score < 0:
                el.extract()
                self.debug("Conditionally cleaned %s with weight %s and content score %s because score + content score was less than zero." %
                           (self._describe(el), weight, content_score))
            elif len(re.split(ur',|，', self._text(el))) < 10:
                counts = {}
                for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                    counts[kind] = len(el.find_all(kind))
                counts["li"] -= 100

                # Count the text length excluding any surrounding whitespace
                content_length = len(self._text(el))
                link_density = self._get_link_density(el)
                to_remove = False
                reason = ""

                if counts["img"] > counts["p"]:
                    reason = "too many images"
                    to_remove = True
                elif counts["li"] > counts["p"] and name != "ul" and name != "ol":
                    reason = "more <li>s than <p>s"
                    to_remove = True
                elif counts["input"] > (counts["p"] / 3):
                    reason = "fewer than 3x <p>s than <input>s"
                    to_remove = True
                elif content_length < (self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)) and (counts["img"] == 0 or counts["img"] > 2):
                    reason = "too short a content length without a single image"
                    to_remove = True
                elif weight < 25 and link_density > 0.2:
                    reason = "too many links for its weight (%s)" % weight
                    to_remove = True
                elif weight >= 25 and link_density > 0.5:
                    reason = "too many links for its weight (%s)" % weight
                    to_remove = True
                elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                    reason = "<embed>s with too short a content length, or too many <embed>s"
                    to_remove = True

                if to_remove:
                    self.debug("Conditionally cleaned %s#%s.%s with weight %s and content score %s because it has %s." %
                               (el.name, el.get('id', ''), el.get('class', ''), weight, content_score, reason))
                    el.extract()

        # Strip attributes unless the caller asked to keep them.
        for el in ([node] + node.find_all()):
            if not (self.options.get('attributes')):
                el.attrs = {}

        return unicode(node)
    def extract(self):
        try:
            ruthless = True
            while True:
                self.preprocess(force=True)
                for tag in self._tags(self._html, 'script', 'style'):
                    tag.extract()

                if ruthless:
                    self._remove_unlikely_candidates()
                self._transform_misused_divs_into_p()
                candidates = self._score_paragraphs(self.options.get('min_text_length'))

                best_candidate = self._select_best_candidate(candidates)
                if best_candidate:
                    article = self._get_article(candidates, best_candidate)
                else:
                    if ruthless:
                        ruthless = False
                        self.debug("ended up stripping too much - going for a safer parse")
                        # try again
                        continue
                    else:
                        article = self._html.find('body') or self._html

                cleaned_article = self._sanitize(article, candidates)
                retry_length = self.options.get('retry_length') or self.RETRY_LENGTH
                of_acceptable_length = len(cleaned_article or '') >= retry_length
                if ruthless and not of_acceptable_length:
                    ruthless = False
                    continue  # try again
                else:
                    return cleaned_article
        except Exception, e:
            self.logger.exception(e)
            if self.on_debug:
                raise e
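
# --- Illustrative usage ---
# A minimal sketch of how this module might be driven; the input file and
# base_url below are hypothetical and not part of the original module.
if __name__ == '__main__':
    html = open('page.html').read()  # hypothetical input document
    extractor = Extractor(html, base_url='http://example.com/', debug=True)
    print extractor.title()    # title recovered by the preprocessor
    print extractor.extract()  # cleaned article HTML as a unicode string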