
/py/lib/readability/readability.py

https://github.com/dewbot/laterbox
#!/usr/bin/env python
import logging
import re
import sys

from collections import defaultdict
from lxml.etree import tostring
from lxml.etree import tounicode
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring

from cleaners import clean_attributes
from cleaners import html_cleaner
from htmls import build_doc
from htmls import get_body
from htmls import get_title
from htmls import shorten_title

logging.basicConfig(level=logging.INFO)
log = logging.getLogger()
REGEXES = {
    'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
    'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I),
    'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I),
    'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I),
    'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
    #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
    #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
    #'trimRe': re.compile('^\s+|\s+$/'),
    #'normalizeRe': re.compile('\s{2,}/'),
    #'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
    #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
    #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
}


class Unparseable(ValueError):
    pass
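

# Build a short selector-style description of a node (tag#id.class, plus one
# ancestor), used by the debug log messages below.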
def describe(node, depth=1):
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)
    name = node.tag
    if node.get('id', ''):
        name += '#' + node.get('id')
    if node.get('class', ''):
        name += '.' + node.get('class').replace(' ', '.')
    if name[:4] in ['div#', 'div.']:
        name = name[3:]
    if depth and node.getparent() is not None:
        return name + ' - ' + describe(node.getparent(), depth - 1)
    return name
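

# Parse a CSS length such as '12px' or '2em' into an int; an em is assumed to
# equal 12px. Only referenced from the commented-out image heuristics below.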
def to_int(x):
    if not x:
        return None
    x = x.strip()
    if x.endswith('px'):
        return int(x[:-2])
    if x.endswith('em'):
        return int(x[:-2]) * 12
    return int(x)
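

# Collapse whitespace: newlines absorb the spaces around them, and runs of
# spaces/tabs shrink to a single space.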
def clean(text):
    text = re.sub(r'\s*\n\s*', '\n', text)
    text = re.sub(r'[ \t]{2,}', ' ', text)
    return text.strip()


def text_length(i):
    return len(clean(i.text_content() or ""))


class Document:
    """Class to build an etree document out of html."""
    TEXT_LENGTH_THRESHOLD = 25
    RETRY_LENGTH = 250

    def __init__(self, input, **options):
        """Generate the document

        :param input: string of the html content.

        kwargs:
            - attributes: checked in sanitize(), but currently a no-op
            - debug: output debug messages
            - min_text_length: minimum paragraph length to score
              (default TEXT_LENGTH_THRESHOLD)
            - retry_length: minimum acceptable summary length before
              retrying with a more lenient parse (default RETRY_LENGTH)
            - url: will allow adjusting links to be absolute
        """
        self.input = input
        self.options = options
        self.html = None

    def _html(self, force=False):
        if force or self.html is None:
            self.html = self._parse(self.input)
        return self.html

    def _parse(self, input):
        doc = build_doc(input)
        doc = html_cleaner.clean_html(doc)
        base_href = self.options.get('url', None)
        if base_href:
            doc.make_links_absolute(base_href, resolve_base_href=True)
        else:
            doc.resolve_base_href()
        return doc
    def content(self):
        return get_body(self._html(True))

    def title(self):
        return get_title(self._html(True))

    def short_title(self):
        return shorten_title(self._html(True))

    def get_clean_html(self):
        return clean_attributes(tounicode(self.html))

    def summary(self, html_partial=False):
        """Generate the summary of the html document

        :param html_partial: return only the div of the document, don't wrap
        in html and body tags.
        """
        try:
            ruthless = True
            while True:
                self._html(True)
                for i in self.tags(self.html, 'script', 'style'):
                    i.drop_tree()
                for i in self.tags(self.html, 'body'):
                    i.set('id', 'readabilityBody')
                if ruthless:
                    self.remove_unlikely_candidates()
                self.transform_misused_divs_into_paragraphs()
                candidates = self.score_paragraphs()

                best_candidate = self.select_best_candidate(candidates)

                if best_candidate:
                    article = self.get_article(candidates, best_candidate,
                                               html_partial=html_partial)
                else:
                    if ruthless:
                        log.debug("ruthless removal did not work. ")
                        ruthless = False
                        self.debug(
                            ("ended up stripping too much - "
                             "going for a safer _parse"))
                        # try again
                        continue
                    else:
                        log.debug(
                            ("Ruthless and lenient parsing did not work. "
                             "Returning raw html"))
                        article = self.html.find('body')
                        if article is None:
                            article = self.html

                cleaned_article = self.sanitize(article, candidates)
                article_length = len(cleaned_article or '')
                retry_length = self.options.get(
                    'retry_length',
                    self.RETRY_LENGTH)
                of_acceptable_length = article_length >= retry_length
                if ruthless and not of_acceptable_length:
                    ruthless = False
                    # Loop through and try again.
                    continue
                else:
                    return cleaned_article
        except StandardError, e:
            log.exception('error getting summary: ')
            raise Unparseable(str(e)), None, sys.exc_info()[2]

    def get_article(self, candidates, best_candidate, html_partial=False):
        # Now that we have the top candidate, look through its siblings for
        # content that might also be related.
        # Things like preambles, content split by ads that we removed, etc.
        sibling_score_threshold = max([
            10,
            best_candidate['content_score'] * 0.2])
        # create a new html document with a html->body->div
        if html_partial:
            output = fragment_fromstring('<div/>')
        else:
            output = document_fromstring('<div/>')
        best_elem = best_candidate['elem']
        for sibling in best_elem.getparent().getchildren():
            # in lxml there is no concept of simple text
            # if isinstance(sibling, NavigableString): continue
            append = False
            if sibling is best_elem:
                append = True
            sibling_key = sibling  # HashableElement(sibling)
            if sibling_key in candidates and \
                candidates[sibling_key]['content_score'] >= sibling_score_threshold:
                append = True

            if sibling.tag == "p":
                link_density = self.get_link_density(sibling)
                node_content = sibling.text or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    append = True
                elif node_length <= 80 \
                    and link_density == 0 \
                    and re.search(r'\.( |$)', node_content):
                    append = True

            if append:
                # We don't want to append directly to output, but the div
                # in html->body->div
                if html_partial:
                    output.append(sibling)
                else:
                    output.getchildren()[0].getchildren()[0].append(sibling)
        # if output is not None:
        #     output.append(best_elem)
        return output

    def select_best_candidate(self, candidates):
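        """Return the candidate with the highest content score, logging the
        top five for debugging, or None if there are no candidates."""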
        sorted_candidates = sorted(candidates.values(),
                                   key=lambda x: x['content_score'],
                                   reverse=True)
        for candidate in sorted_candidates[:5]:
            elem = candidate['elem']
            self.debug("Top 5 : %6.3f %s" % (
                candidate['content_score'],
                describe(elem)))

        if len(sorted_candidates) == 0:
            return None

        best_candidate = sorted_candidates[0]
        return best_candidate

    def get_link_density(self, elem):
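        """Return the fraction of an element's text that sits inside <a>
        tags (0.0 = no link text, 1.0 = all link text)."""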
        link_length = 0
        for i in elem.findall(".//a"):
            link_length += text_length(i)
        # if len(elem.findall(".//div") or elem.findall(".//p")):
        #     link_length = link_length
        total_length = text_length(elem)
        return float(link_length) / max(total_length, 1)

    def score_paragraphs(self):
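        """Score the parents and grandparents of every substantial <p>, <pre>
        and <td> by comma count and text length, then scale each candidate's
        score by its link density."""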
        MIN_LEN = self.options.get(
            'min_text_length',
            self.TEXT_LENGTH_THRESHOLD)
        candidates = {}
        ordered = []
        for elem in self.tags(self._html(), "p", "pre", "td"):
            parent_node = elem.getparent()
            if parent_node is None:
                continue
            grand_parent_node = parent_node.getparent()

            inner_text = clean(elem.text_content() or "")
            inner_text_len = len(inner_text)

            # If this paragraph is less than 25 characters
            # don't even count it.
            if inner_text_len < MIN_LEN:
                continue

            if parent_node not in candidates:
                candidates[parent_node] = self.score_node(parent_node)
                ordered.append(parent_node)

            if grand_parent_node is not None and grand_parent_node not in candidates:
                candidates[grand_parent_node] = self.score_node(
                    grand_parent_node)
                ordered.append(grand_parent_node)

            content_score = 1
            content_score += len(inner_text.split(','))
            content_score += min((inner_text_len / 100), 3)
            # if elem not in candidates:
            #     candidates[elem] = self.score_node(elem)
            # WTF? candidates[elem]['content_score'] += content_score
            candidates[parent_node]['content_score'] += content_score
            if grand_parent_node is not None:
                candidates[grand_parent_node]['content_score'] += content_score / 2.0

        # Scale the final candidates score based on link density. Good content
        # should have a relatively small link density (5% or less) and be
        # mostly unaffected by this operation.
        for elem in ordered:
            candidate = candidates[elem]
            ld = self.get_link_density(elem)
            score = candidate['content_score']
            self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
                score,
                describe(elem),
                ld,
                score * (1 - ld)))
            candidate['content_score'] *= (1 - ld)

        return candidates

    def class_weight(self, e):
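        """Score an element +/-25 for each of its class and id attributes
        matching the positive/negative regexes."""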
        weight = 0
        if e.get('class', None):
            if REGEXES['negativeRe'].search(e.get('class')):
                weight -= 25
            if REGEXES['positiveRe'].search(e.get('class')):
                weight += 25
        if e.get('id', None):
            if REGEXES['negativeRe'].search(e.get('id')):
                weight -= 25
            if REGEXES['positiveRe'].search(e.get('id')):
                weight += 25
        return weight

    def score_node(self, elem):
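        """Seed a candidate entry for elem: class/id weight plus a prior
        based on the tag (divs up; lists, forms and headers down)."""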
        content_score = self.class_weight(elem)
        name = elem.tag.lower()
        if name == "div":
            content_score += 5
        elif name in ["pre", "td", "blockquote"]:
            content_score += 3
        elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
            content_score -= 3
        elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
            content_score -= 5
        return {
            'content_score': content_score,
            'elem': elem
        }

    def debug(self, *a):
        if self.options.get('debug', False):
            log.debug(*a)

    def remove_unlikely_candidates(self):
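        """Drop subtrees whose class/id looks like page chrome (comments,
        sidebars, footers, ...) unless it also looks like it could hold the
        article."""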
        for elem in self.html.iter():
            s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
            if len(s) < 2:
                continue
            # self.debug(s)
            if (REGEXES['unlikelyCandidatesRe'].search(s)
                    and (not REGEXES['okMaybeItsACandidateRe'].search(s))
                    and elem.tag not in ['html', 'body']):
                self.debug("Removing unlikely candidate - %s" % describe(elem))
                elem.drop_tree()

    def transform_misused_divs_into_paragraphs(self):
        for elem in self.tags(self.html, 'div'):
            # transform <div>s that do not contain other block elements into
            # <p>s
            # FIXME: The current implementation ignores all descendants that
            # are not direct children of elem
            # This results in incorrect results in case there is an <img>
            # buried within an <a> for example
            if not REGEXES['divToPElementsRe'].search(
                    unicode(''.join(map(tostring, list(elem))))):
                # self.debug("Altering %s to p" % (describe(elem)))
                elem.tag = "p"
                # print "Fixed element "+describe(elem)

        for elem in self.tags(self.html, 'div'):
            if elem.text and elem.text.strip():
                p = fragment_fromstring('<p/>')
                p.text = elem.text
                elem.text = None
                elem.insert(0, p)
                # print "Appended "+tounicode(p)+" to "+describe(elem)

            for pos, child in reversed(list(enumerate(elem))):
                if child.tail and child.tail.strip():
                    p = fragment_fromstring('<p/>')
                    p.text = child.tail
                    child.tail = None
                    elem.insert(pos + 1, p)
                    # print "Inserted "+tounicode(p)+" to "+describe(elem)
                if child.tag == 'br':
                    # print 'Dropped <br> at '+describe(elem)
                    child.drop_tree()

    def tags(self, node, *tag_names):
        for tag_name in tag_names:
            for e in node.findall('.//%s' % tag_name):
                yield e

    def reverse_tags(self, node, *tag_names):
        for tag_name in tag_names:
            for e in reversed(node.findall('.//%s' % tag_name)):
                yield e

    def sanitize(self, node, candidates):
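        """Clean the extracted article: drop linky or negatively-weighted
        headers, all forms/iframes/textareas, and conditionally remove
        tables, lists and divs via the heuristics below; return the cleaned
        html string."""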
        MIN_LEN = self.options.get('min_text_length',
                                   self.TEXT_LENGTH_THRESHOLD)
        for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
            if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
                header.drop_tree()

        for elem in self.tags(node, "form", "iframe", "textarea"):
            elem.drop_tree()
        allowed = {}
        # Conditionally clean <table>s, <ul>s, and <div>s
        for el in self.reverse_tags(node, "table", "ul", "div"):
            if el in allowed:
                continue
            weight = self.class_weight(el)
            if el in candidates:
                content_score = candidates[el]['content_score']
                # print '!', el, '-> %6.3f' % content_score
            else:
                content_score = 0
            tag = el.tag

            if weight + content_score < 0:
                self.debug("Cleaned %s with score %6.3f and weight %-3s" %
                           (describe(el), content_score, weight, ))
                el.drop_tree()
            elif el.text_content().count(",") < 10:
                counts = {}
                for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                    counts[kind] = len(el.findall('.//%s' % kind))
                counts["li"] -= 100

                # Count the text length excluding any surrounding whitespace
                content_length = text_length(el)
                link_density = self.get_link_density(el)
                parent_node = el.getparent()
                if parent_node is not None:
                    if parent_node in candidates:
                        content_score = candidates[parent_node]['content_score']
                    else:
                        content_score = 0
                # if parent_node is not None:
                #     pweight = self.class_weight(parent_node) + content_score
                #     pname = describe(parent_node)
                # else:
                #     pweight = 0
                #     pname = "no parent"
                to_remove = False
                reason = ""

                # if el.tag == 'div' and counts["img"] >= 1:
                #     continue
                if counts["p"] and counts["img"] > counts["p"]:
                    reason = "too many images (%s)" % counts["img"]
                    to_remove = True
                elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
                    reason = "more <li>s than <p>s"
                    to_remove = True
                elif counts["input"] > (counts["p"] / 3):
                    reason = "less than 3x <p>s than <input>s"
                    to_remove = True
                elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
                    reason = "too short content length %s without a single image" % content_length
                    to_remove = True
                elif weight < 25 and link_density > 0.2:
                    reason = "too many links %.3f for its weight %s" % (
                        link_density, weight)
                    to_remove = True
                elif weight >= 25 and link_density > 0.5:
                    reason = "too many links %.3f for its weight %s" % (
                        link_density, weight)
                    to_remove = True
                elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                    reason = "<embed>s with too short content length, or too many <embed>s"
                    to_remove = True

                # if el.tag == 'div' and counts['img'] >= 1 and to_remove:
                #     imgs = el.findall('.//img')
                #     valid_img = False
                #     self.debug(tounicode(el))
                #     for img in imgs:
                #
                #         height = img.get('height')
                #         text_length = img.get('text_length')
                #         self.debug("height %s text_length %s" % (repr(height), repr(text_length)))
                #         if to_int(height) >= 100 or to_int(text_length) >= 100:
                #             valid_img = True
                #             self.debug("valid image" + tounicode(img))
                #             break
                #     if valid_img:
                #         to_remove = False
                #         self.debug("Allowing %s" % el.text_content())
                #         for desnode in self.tags(el, "table", "ul", "div"):
                #             allowed[desnode] = True
                # find x non empty preceding and succeeding siblings
                i, j = 0, 0
                x = 1
                siblings = []
                for sib in el.itersiblings():
                    # self.debug(sib.text_content())
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        i += 1
                        siblings.append(sib_content_length)
                        if i == x:
                            break
                for sib in el.itersiblings(preceding=True):
                    # self.debug(sib.text_content())
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        j += 1
                        siblings.append(sib_content_length)
                        if j == x:
                            break
                # self.debug(str(siblings))
                if siblings and sum(siblings) > 1000:
                    to_remove = False
                    self.debug("Allowing %s" % describe(el))
                    for desnode in self.tags(el, "table", "ul", "div"):
                        allowed[desnode] = True

                if to_remove:
                    self.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
                               (content_score, describe(el), weight, reason))
                    # print tounicode(el)
                    # self.debug("pname %s pweight %.3f" % (pname, pweight))
                    el.drop_tree()

        for el in ([node] + [n for n in node.iter()]):
            if not self.options.get('attributes', None):
                # el.attrib = {}  # FIXME: Checkout the effects of disabling this
                pass

        self.html = node
        return self.get_clean_html()


class HashableElement():
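    """Wrapper that makes an lxml node hashable via its path of
    (tag, attrs, text) tuples up to the root; its only call site above is
    commented out, so it is currently unused."""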
    def __init__(self, node):
        self.node = node
        self._path = None

    def _get_path(self):
        if self._path is None:
            reverse_path = []
            node = self.node
            while node is not None:
                node_id = (node.tag, tuple(node.attrib.items()), node.text)
                reverse_path.append(node_id)
                node = node.getparent()
            self._path = tuple(reverse_path)
        return self._path
    path = property(_get_path)

    def __hash__(self):
        return hash(self.path)

    def __eq__(self, other):
        return self.path == other.path

    def __getattr__(self, tag):
        return getattr(self.node, tag)


def main():
    from optparse import OptionParser
    parser = OptionParser(usage="%prog: [options] [file]")
    parser.add_option('-v', '--verbose', action='store_true')
    parser.add_option('-u', '--url', default=None, help="use URL instead of a local file")
    (options, args) = parser.parse_args()

    if not (len(args) == 1 or options.url):
        parser.print_help()
        sys.exit(1)

    file = None
    if options.url:
        import urllib
        file = urllib.urlopen(options.url)
    else:
        file = open(args[0], 'rt')
    enc = sys.__stdout__.encoding or 'utf-8'
    try:
        print Document(file.read(),
                       debug=options.verbose,
                       url=options.url).summary().encode(enc, 'replace')
    finally:
        file.close()


if __name__ == '__main__':
    main()
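
# Library usage, for reference -- a minimal sketch assuming this module is
# importable as `readability` and that a local `page.html` exists (both the
# import path and the file name here are illustrative):
#
#   from readability import Document
#
#   html = open('page.html').read()
#   doc = Document(html, url='http://example.com/page.html')
#   print doc.short_title()   # cleaned-up title
#   print doc.summary()       # extracted article html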