spyder /src/spyder/readability.py

Language Python Lines 222
MD5 Hash 937556256a50ba8bd2bc0792d234c5cc
Repository https://github.com/fireyy/spyder.git View Raw File
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#coding: utf-8

'''
格式化页面

现在文章图片 这里直接获得
'''
import re
import lxml
from spyder.pyquery import PyQuery as pq
from urlparse import urljoin
from libs.utils import safestr, safeunicode

__all__ = [
    'Readability'
]

class Readability:
    regexps = {
	'unlikelyCandidates': re.compile("combx|comment|community|disqus|extra|foot|header|menu|"
	    "remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|"
	    "pagination|pager|popup|tweet|twitter",re.I),
	'okMaybeItsACandidate': re.compile("and|article|body|column|main|shadow", re.I),
	'positive': re.compile("article|body|content|entry|hentry|main|page|pagination|post|text|"
	    "blog|story",re.I),
	'negative': re.compile("combx|comment|com|contact|foot|footer|footnote|masthead|media|"
	    "meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|"
	    "shopping|tags|tool|widget", re.I),
	'extraneous': re.compile("print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|"
	    "sign|single",re.I),
	'divToPElements': re.compile("<(a|blockquote|dl|div|img|ol|p|pre|table|ul)",re.I),
	'replaceBrs': re.compile("(<br[^>]*>[ \n\r\t]*){2,}",re.I),
	'replaceFonts': re.compile("<(/?)font[^>]*>",re.I),
	'trim': re.compile("^\s+|\s+$",re.I),
	'normalize': re.compile("\s{2,}",re.I),
	'killBreaks': re.compile("(<br\s*/?>(\s|&nbsp;?)*)+",re.I),
	'videos': re.compile("http://(www\.)?(youtube|vimeo)\.com",re.I),
	'skipFootnoteLink': re.compile("^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$",re.I),
	'nextLink': re.compile("(next|weiter|continue|>([^\|]|$)|»([^\|]|$))",re.I),
	'prevLink': re.compile("(prev|earl|old|new|<|«)",re.I),
	"spp_reg" : re.compile(u"""[ ]*""", re.I|re.M|re.S)
    }

    images = []

    '''
    image 保留字段
    '''
    image_attr = ["src", "alt", "width", "height"]

    def __init__(self, content, baseurl, filters):
	self.content = content;

	self.baseurl = baseurl

	self.filters = filters
	
	self.replaceBrs();
	self.replaceFonts();
	self.replace_spp();

	self.specialFilter();

	self.getHtml();

	self.clean_comments()
	self.removeScript();
	self.removeStyle();
	self.removeLink();

	#移除所有 a 标记
	for e in self.tags(self.html, "a"):
	    self.drop_anchor(e)
    
	try:
	    for e in self.tags(self.html, "hr", "font", "p", "span", "div", "ul", "li", "from", "iframe", "center"):
		self.clean_attributes(e)
		#self.removeEmptyEl(e)
	except:
	    pass

	for e in self.tags(self.html, "img"):
	    self.processingImage(e)

    def getContent(self):
	content = self.html.html(method='html');

	content = content.replace("<body>", "");
	content = content.replace("</body>", "");

	return content

    def replaceBrs(self):
	try:
	    self.content = self.regexps["replaceBrs"].sub("<p></p>", self.content)
	    self.content = self.regexps["killBreaks"].sub("<br />", self.content)
	except:
	    pass

    def replace_spp(self):
	try:
	    self.content = self.content.strip()
	    self.content = self.regexps["spp_reg"].sub("", self.content)
	    self.content = self.regexps["trim"].sub("", self.content)
	    self.content = self.regexps["normalize"].sub("", self.content)
	except:
	    pass

    def replaceFonts(self):
	try:
	    self.content = self.regexps["replaceFonts"].sub("<\g<1>span>", self.content)
	except:
	    pass


    def getHtml(self):
	#自动加上html标记
	if self.content.find("<html>") == -1:
	    content = "<html><body>" + self.content + "</body></html>"
	    self.html = pq(content)


    @staticmethod
    def tags(node, *tag_names):
	for tag_name in tag_names:
	    for e in node.find(tag_name):
		yield e

    def removeScript(self):
	self.html.remove("script");

    def removeStyle(self):
	self.html.remove("style");

    def removeLink(self):
	self.html.remove("link");

    def removeEmptyEl(self, element):
        innerText = element.html
        if innerText is None:
        	element.getparent().remove(element)

    def clean_comments(self):
	def clean_comment(i, element):
	    if (isinstance(element, lxml.html.HtmlComment)):
		element.getparent().remove(element)

	self.html.children().each(clean_comment)

    def drop_anchor(self, element):
	for k in element.attrib:
	    del element.attrib[k]
	try:
	    element.drop_tag()
	except:
	    element.tag = "span"
	    pass


    def clean_attributes(self, element):
	if element.tag == "font":
	    element.tag = "span"

	if element.tag == "center":
	    element.tag = "div"

	for att in ["color", "width", "height", "background", "style", "class", "id", "face"]:
	    if element.get(att) is not None:
		del element.attrib[att]


    def getImages(self):
	return self.images;

    def specialFilter(self):
        if len(self.filters) > 0:
            for filter in self.filters:
                rule = filter;
                rule = rule.replace('(*)', '(.+)?')
                if isinstance(self.content, unicode):
                        rule = safeunicode(rule)
                else:
                        rule = safestr(rule)
                self.content = re.compile(rule, re.I).sub("", self.content);

    def processingImage(self, image):
	#首先处理图片层
	parent = image.getparent()

	if parent is not None and parent.tag is "a":
	    parentAttrs = parent.attrib
	    for k in parentAttrs:
		del parentAttrs[k]
	    parent.drop_tag()

	imgAttrs = image.attrib
	need_deleted_attrs = list(set(imgAttrs) - set(self.image_attr))
	if need_deleted_attrs:
	    for k in need_deleted_attrs:
		del imgAttrs[k]

	image_src = image.get("src");
	'''
	这里的图片url也要修正
	'''
	image_src = urljoin(self.baseurl, image_src)
	image.set("src", image_src)
	self.images.append(image_src)

'''
def specialFilter( content):
    if len(self.articleRule.filters) > 0:
	for filter in self.articleRule.filters:
	    element = getElementData(content, filter, True)
	    if element is not None:
		element.getparent().remove(element);
'''
Back to Top