PageRenderTime 44ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/src/spyder/readability.py

https://github.com/fireyy/spyder
Python | 221 lines | 212 code | 3 blank | 6 comment | 0 complexity | 937556256a50ba8bd2bc0792d234c5cc MD5 | raw file
  1. #coding: utf-8
  2. '''
  3. Format (clean up) an HTML page.
  4. Article images are now collected directly here as well.
  5. '''
  6. import re
  7. import lxml
  8. from spyder.pyquery import PyQuery as pq
  9. from urlparse import urljoin
  10. from libs.utils import safestr, safeunicode
  11. __all__ = [  # names exported by a star-import of this module
  12. 'Readability'
  13. ]
  14. class Readability:
  15. regexps = {
  16. 'unlikelyCandidates': re.compile("combx|comment|community|disqus|extra|foot|header|menu|"
  17. "remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|"
  18. "pagination|pager|popup|tweet|twitter",re.I),
  19. 'okMaybeItsACandidate': re.compile("and|article|body|column|main|shadow", re.I),
  20. 'positive': re.compile("article|body|content|entry|hentry|main|page|pagination|post|text|"
  21. "blog|story",re.I),
  22. 'negative': re.compile("combx|comment|com|contact|foot|footer|footnote|masthead|media|"
  23. "meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|"
  24. "shopping|tags|tool|widget", re.I),
  25. 'extraneous': re.compile("print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|"
  26. "sign|single",re.I),
  27. 'divToPElements': re.compile("<(a|blockquote|dl|div|img|ol|p|pre|table|ul)",re.I),
  28. 'replaceBrs': re.compile("(<br[^>]*>[ \n\r\t]*){2,}",re.I),
  29. 'replaceFonts': re.compile("<(/?)font[^>]*>",re.I),
  30. 'trim': re.compile("^\s+|\s+$",re.I),
  31. 'normalize': re.compile("\s{2,}",re.I),
  32. 'killBreaks': re.compile("(<br\s*/?>(\s|&nbsp;?)*)+",re.I),
  33. 'videos': re.compile("http://(www\.)?(youtube|vimeo)\.com",re.I),
  34. 'skipFootnoteLink': re.compile("^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$",re.I),
  35. 'nextLink': re.compile("(next|weiter|continue|>([^\|]|$)|»([^\|]|$))",re.I),
  36. 'prevLink': re.compile("(prev|earl|old|new|<|«)",re.I),
  37. "spp_reg" : re.compile(u"""[ ]*""", re.I|re.M|re.S)
  38. }
  39. images = []
  40. '''
  41. image 保留字段
  42. '''
  43. image_attr = ["src", "alt", "width", "height"]
  44. def __init__(self, content, baseurl, filters):
  45. self.content = content;
  46. self.baseurl = baseurl
  47. self.filters = filters
  48. self.replaceBrs();
  49. self.replaceFonts();
  50. self.replace_spp();
  51. self.specialFilter();
  52. self.getHtml();
  53. self.clean_comments()
  54. self.removeScript();
  55. self.removeStyle();
  56. self.removeLink();
  57. #移除所有 a 标记
  58. for e in self.tags(self.html, "a"):
  59. self.drop_anchor(e)
  60. try:
  61. for e in self.tags(self.html, "hr", "font", "p", "span", "div", "ul", "li", "from", "iframe", "center"):
  62. self.clean_attributes(e)
  63. #self.removeEmptyEl(e)
  64. except:
  65. pass
  66. for e in self.tags(self.html, "img"):
  67. self.processingImage(e)
  68. def getContent(self):
  69. content = self.html.html(method='html');
  70. content = content.replace("<body>", "");
  71. content = content.replace("</body>", "");
  72. return content
  73. def replaceBrs(self):
  74. try:
  75. self.content = self.regexps["replaceBrs"].sub("<p></p>", self.content)
  76. self.content = self.regexps["killBreaks"].sub("<br />", self.content)
  77. except:
  78. pass
  79. def replace_spp(self):
  80. try:
  81. self.content = self.content.strip()
  82. self.content = self.regexps["spp_reg"].sub("", self.content)
  83. self.content = self.regexps["trim"].sub("", self.content)
  84. self.content = self.regexps["normalize"].sub("", self.content)
  85. except:
  86. pass
  87. def replaceFonts(self):
  88. try:
  89. self.content = self.regexps["replaceFonts"].sub("<\g<1>span>", self.content)
  90. except:
  91. pass
  92. def getHtml(self):
  93. #自动加上html标记
  94. if self.content.find("<html>") == -1:
  95. content = "<html><body>" + self.content + "</body></html>"
  96. self.html = pq(content)
  97. @staticmethod
  98. def tags(node, *tag_names):
  99. for tag_name in tag_names:
  100. for e in node.find(tag_name):
  101. yield e
  102. def removeScript(self):
  103. self.html.remove("script");
  104. def removeStyle(self):
  105. self.html.remove("style");
  106. def removeLink(self):
  107. self.html.remove("link");
  108. def removeEmptyEl(self, element):
  109. innerText = element.html
  110. if innerText is None:
  111. element.getparent().remove(element)
  112. def clean_comments(self):
  113. def clean_comment(i, element):
  114. if (isinstance(element, lxml.html.HtmlComment)):
  115. element.getparent().remove(element)
  116. self.html.children().each(clean_comment)
  117. def drop_anchor(self, element):
  118. for k in element.attrib:
  119. del element.attrib[k]
  120. try:
  121. element.drop_tag()
  122. except:
  123. element.tag = "span"
  124. pass
  125. def clean_attributes(self, element):
  126. if element.tag == "font":
  127. element.tag = "span"
  128. if element.tag == "center":
  129. element.tag = "div"
  130. for att in ["color", "width", "height", "background", "style", "class", "id", "face"]:
  131. if element.get(att) is not None:
  132. del element.attrib[att]
  133. def getImages(self):
  134. return self.images;
  135. def specialFilter(self):
  136. if len(self.filters) > 0:
  137. for filter in self.filters:
  138. rule = filter;
  139. rule = rule.replace('(*)', '(.+)?')
  140. if isinstance(self.content, unicode):
  141. rule = safeunicode(rule)
  142. else:
  143. rule = safestr(rule)
  144. self.content = re.compile(rule, re.I).sub("", self.content);
  145. def processingImage(self, image):
  146. #首先处理图片层
  147. parent = image.getparent()
  148. if parent is not None and parent.tag is "a":
  149. parentAttrs = parent.attrib
  150. for k in parentAttrs:
  151. del parentAttrs[k]
  152. parent.drop_tag()
  153. imgAttrs = image.attrib
  154. need_deleted_attrs = list(set(imgAttrs) - set(self.image_attr))
  155. if need_deleted_attrs:
  156. for k in need_deleted_attrs:
  157. del imgAttrs[k]
  158. image_src = image.get("src");
  159. '''
  160. 这里的图片url也要修正
  161. '''
  162. image_src = urljoin(self.baseurl, image_src)
  163. image.set("src", image_src)
  164. self.images.append(image_src)
  165. '''
  166. def specialFilter( content):
  167. if len(self.articleRule.filters) > 0:
  168. for filter in self.articleRule.filters:
  169. element = getElementData(content, filter, True)
  170. if element is not None:
  171. element.getparent().remove(element);
  172. '''