
/src/spyder/document.py

https://github.com/fireyy/spyder
#coding=utf-8
import os, sys
parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

import re, urlparse
import json
from hashlib import md5

import spyder.feedparser as feedparser
from spyder.pyquery import PyQuery as pq
from spyder.pybits import ansicolor
from libs.utils import safestr, safeunicode
from spyder.fetch import Fetch
from spyder.readability import Readability
from spyder.seed import Seed
from spyder.field import Field, Item, get_field_from_cache
from spyder.publish import publish_server

__all__ = [
    "getElementData",
    "Document",
    "Grab"
]
images_type = ["jpg", "jpeg", "png", "gif"]

def is_image(url):
    attr = url.split(".")[-1]
    return attr in images_type

attrParrent = re.compile(r"(\w+)?\((.+)?\)")
def getElementData(obj, rule, images=None, fetch_all=0):
    """
    Parse `obj` according to `rule`.
    `obj` may be a PyQuery object or a raw html page.
    Any image links found while parsing are appended to `images`.
    Rules come in two modes:
    1. DOM selector
       1.1 jQuery-like selectors, e.g. to get the url of an <a>:
           >> a.attr("href")
       1.2 to get the text content of a tag:
           >> div[id="content"].text()
       1.3 to get the content of one child element:
           >> li.eq(1).text()  # text of the 2nd element in the li group
    2. Regex mode
       Mark the wanted content with the [arg] tag; the rest can be
       filled with (*).
    """
    if not isinstance(obj, pq):
        obj = pq(obj)
    old_rule = rule
    rule = rule.split(".")
    #guard against rules that are really urls
    if len(rule) > 1 and old_rule.find("[arg]") == -1:
        #the first segment is always the dom selector
        selectRule = rule.pop(0)
        #strip ( )
        selectRule = selectRule.replace("(", "")
        selectRule = selectRule.replace(")", "")
        selecteddom = obj.find(selectRule)
        for attr in rule:
            m = attrParrent.match(attr)
            if m:
                action, v = m.groups()
                if v:
                    v = v.encode("utf-8")
                    #strip quotes
                    v = v.strip("\'").strip('\"')
                if action == "attr" and hasattr(selecteddom, "attr") and v:
                    if fetch_all == 1:
                        values = []
                        dom_count = len(selecteddom)
                        for i in range(dom_count):
                            vv = selecteddom.eq(i).attr(v)
                            if vv:
                                values.append(vv)
                                if is_image(vv):
                                    images.append(vv)
                        return values
                    else:
                        value = selecteddom.attr(v)
                        if selecteddom and selecteddom[0].tag == "img" and v == "src" and images is not None:
                            images.append(value)
                        return value
                elif action == "eq" and hasattr(selecteddom, "eq"):
                    _rules = attr.split(" ")
                    if len(rule) > 1:
                        selecteddom = selecteddom.eq(int(v))
                        if len(_rules) > 1:
                            '''
                            eq may carry a child selector, e.g.
                            eq(1) a
                            '''
                            _rules.pop(0)
                            _dom = " ".join(_rules)
                            selecteddom = selecteddom.find(_dom)
                    else:
                        return selecteddom.eq(int(v))
                elif action == "text" and hasattr(selecteddom, "text"):
                    return safeunicode(selecteddom.text()).strip()
                elif action == "html" and hasattr(selecteddom, "html"):
                    return safeunicode(selecteddom.html()).strip()
    elif len(rule) == 1:
        rule = rule.pop()
        #regex mode
        if rule.find('[arg]') != -1:
            content = obj.html()
            content_text = obj.text()
            rule = rule.replace('[arg]', '(.+)?')
            rule = rule.replace('(*)', '.+?')
            if isinstance(content, unicode):
                rule = safeunicode(rule)
            else:
                rule = safestr(rule)
            parrent = re.compile(rule, re.MULTILINE | re.UNICODE)
            try:
                result = parrent.search(content)
                if result is not None:
                    return safeunicode(result.group(1)).strip()
                result = parrent.search(content_text)
                if result is not None:
                    return safeunicode(result.group(1)).strip()
            except Exception:
                return None
    return None
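# A minimal usage sketch of the two rule modes (hypothetical markup and
# rules, for illustration only; not part of the original module):
#
#   html = '<body><div id="content">size: 42 MB <a href="/pic/a.jpg">pic</a></div></body>'
#   getElementData(html, 'a.attr("href")')            # DOM mode  -> "/pic/a.jpg"
#   getElementData(html, 'div[id="content"].text()')  # DOM mode  -> "size: 42 MB pic"
#   getElementData(html, 'size: [arg] MB')            # regex mode -> "42"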
  128. r"""
  129. 从种子表中获得并且分析成文章数据
  130. 如果调用了.push 将会根据你的配置直接入库
  131. 如果你需要进行调试
  132. 直接使用
  133. g = Grab(seed)
  134. g[guid]
  135. """
class Grab(object):
    dont_craw_content = [
        'kaifu', 'kaice', "gift"
    ]

    def __init__(self, seed):
        if isinstance(seed, Seed):
            self.items = {}
            self.seed = seed
            self.seed_id = seed["sid"]
            self.seed_type = self.seed["type"]
            rule = seed.getRule()
            listtype = seed["listtype"]
            self.guid_rule = seed.getGUID()
            if listtype == "feed":
                self.parseFeed()
            elif listtype == "html" or listtype == "json":
                self.listRule = rule.getListRule()
                self.fetchListPages(listtype)
            else:
                print "Can't support `%s` type" % listtype
        else:
            print "You must give a `Seed` instance"
    def getItemGUID(self, data):
        guid_rule = self.guid_rule
        s = ""
        if isinstance(guid_rule, list):
            for field_id in guid_rule:
                field = get_field_from_cache(field_id)
                if field:
                    field_name = field["name"]
                    if field_name and data[field_name]:
                        if "value" in data[field_name] and data[field_name].value:
                            s += safestr(data[field_name].value)
                        elif isinstance(data[field_name], (unicode, str)):
                            s += safestr(data[field_name])
        elif isinstance(guid_rule, str) or isinstance(guid_rule, unicode):
            s = data[guid_rule]
        return md5(s).hexdigest()
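    # Sketch of the GUID scheme (hypothetical values): with guid_rule = "url"
    # the GUID is md5("http://example.com/1.html").hexdigest(); with a list of
    # field ids it is the MD5 of the concatenated field values.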
    def parseFeed(self):
        print "Start to fetch and parse Feed list"
        seed = self.seed
        f = Fetch(seed.prefixurl, seed.charset, self.seed.timeout)
        if f.isReady():
            feed = feedparser.parse(f.read())
            items = feed["entries"]
            if len(items) > 0:
                for item in items:
                    _item = Item({
                        "url" : item["link"],
                        "type" : self.seed_type
                    })
                    if self.guid_rule is None:
                        self.guid_rule = "url"
                    guid = self.getItemGUID(item)
                    self.items[guid] = _item
        print "List has finished parsing. It has %s docs." % ansicolor.red(self.__len__())
    def fetchListPages(self, listtype="html"):
        print "Start to fetch and parse List"
        urls = self.listRule.getListUrls()
        for url in urls:
            print "Fetching list page: ", url, "charset:", safestr(self.seed["charset"]), "timeout:", safestr(self.seed["timeout"])
            f = Fetch(url, charset = self.seed["charset"], timeout = self.seed["timeout"])
            if f.isReady():
                doc = f.read()
                if listtype == "html":
                    self.parseListPage(f, doc, url)
                elif listtype == "json":
                    self.parseJsonPage(f, doc, url)
        print "List has finished parsing. It has %s docs." % ansicolor.red(self.__len__())
    def parseListPage(self, site, doc, listurl):
        '''
        Parse a fetched list page.
        @param site Fetch instance
        @param doc page String stream
        @param listurl link of the list page
        '''
        doc = pq(doc)
        list = doc.find(self.listRule.getListParent())
        extrarules = self.listRule.extrarules
        if list:
            def entry(i, e):
                #link
                urlParent = self.listRule.getContentUrl()
                if e.tag == "a":
                    link = e.get("href")
                else:
                    link = getElementData(e, urlParent)
                if link is not None:
                    link = urlparse.urljoin(listurl, link)
                _item = Item({
                    "type" : self.seed_type,
                    "images" : []
                })
                for field_id, _rule, fetch_all in extrarules:
                    field = Field(field_id = field_id, rule = _rule)
                    value = getElementData(e, _rule, _item["images"])
                    #TODO:
                    # filter HOOK
                    field.value = value
                    _item[field["name"]] = field
                if link is not None:
                    _item['url'] = link
                # get item guid
                if self.guid_rule:
                    guid = self.getItemGUID(_item)
                elif self.seed_type in self.dont_craw_content:
                    self.guid_rule = []
                    for f in _item.fields:
                        self.guid_rule.append(_item[f]["id"])
                    guid = self.getItemGUID(_item)
                    self.guid_rule = None
                else:
                    self.guid_rule = "url"
                    guid = self.getItemGUID(_item)
                    self.guid_rule = None
                self.items[guid] = _item
            if len(self.listRule.getEntryItem()) == 0:
                list.children().map(entry)
            else:
                list.find(self.listRule.getEntryItem()).map(entry)
    def parseJsonPage(self, site, doc, listurl):
        try:
            doc = json.loads(doc, encoding=site.getCharset())
            item = self.listRule.getEntryItem()
            if item and item in doc:
                data = doc[item]
            else:
                data = doc
            urlParent = self.listRule.getContentUrl()
            extrarules = self.listRule.extrarules
            if isinstance(data, list) and urlParent:
                for _data in data:
                    if urlParent in _data:
                        link = urlparse.urljoin(listurl, _data[urlParent])
                        guid = md5(link).hexdigest()
                        _item = Item({
                            "type" : self.seed_type,
                            "images" : []
                        })
                        #pull out the wanted keys
                        for field_id, _rule, fetch_all in extrarules:
                            field = Field(field_id = field_id, rule = _rule)
                            if _rule in _data:
                                value = _data[_rule]
                                if is_image(value):
                                    _item["images"].append(value)
                                field.value = value
                            _item[field["name"]] = field
                        if link is not None:
                            _item['url'] = link
                        # get item guid
                        if self.guid_rule:
                            guid = self.getItemGUID(_item)
                        elif self.seed_type in self.dont_craw_content:
                            self.guid_rule = []
                            for f in _item.fields:
                                self.guid_rule.append(_item[f]["id"])
                            guid = self.getItemGUID(_item)
                            self.guid_rule = None
                        else:
                            self.guid_rule = "url"
                            guid = self.getItemGUID(_item)
                            self.guid_rule = None
                        self.items[guid] = _item
        except Exception:
            raise ValueError("Can't parse json file")
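    # Sketch of the JSON list shape this method expects (hypothetical feed;
    # the real entry key and url key come from the seed's list rule):
    #
    #   {"data": [{"link": "/news/1.html", "title": "..."}]}
    #
    # with listRule.getEntryItem() == "data", listRule.getContentUrl() == "link",
    # and each extra rule looked up as a plain key on every entry.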
    def __len__(self):
        '''
        How many items are in the list.
        '''
        return len(self.items)

    def keys(self):
        return self.items.keys()

    def __getitem__(self, key):
        '''
        @param key MD5 string
        '''
        if key in self.items:
            _item = self.items[key]
            if "url" in _item and ("article" not in _item):
                _item["article"] = Document(_item, self.seed)
            return _item

    def push(self):
        '''
        Publish / push the items.
        '''
        print ansicolor.cyan("Start fetching these articles", True)
        for k in self.keys():
            publish_server.push(k, self[k])
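# Typical debugging flow (a sketch, assuming a seed row loaded from
# web.models as in the __main__ block at the bottom of this file):
#
#   seed = Seed(db.view(34).list()[0])
#   g = Grab(seed)          # fetches and parses the list page(s)
#   for k in g.keys():      # GUIDs of the parsed items
#       print g[k]          # accessing an item lazily fetches its article
#   g.push()                # publish everything via publish_server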
  325. r"""
  326. 文章数据
  327. 包括抓取 分析 提取
  328. """
class Document(object):
    def __init__(self, item, seed):
        '''
        document base url
        '''
        self.url = item["url"]
        self.data = item
        self.seed = seed
        item["tags"] = ",".join(self.seed.tags)
        #article extraction rule
        self.articleRule = seed.getRule().getArticleRule()
        print "Document %s is fetching" % ansicolor.green(self.url)
        firstContent = Fetch(self.url, charset = seed["charset"], timeout = seed["timeout"]).read()
        if firstContent:
            self.parseDocument(firstContent)
    def _getContent(self, html, wrapparent, content_re):
        if not html:
            return
        html = pq(html).find(wrapparent)
        _content = getElementData(html, content_re)
        if _content:
            return _content
    def parseDocument(self, doc):
        doc = pq(doc)
        wrapparent = self.articleRule.wrapparent
        pageparent = self.articleRule.pageparent
        content_re = ""
        #sub-page urls
        urls = []
        #text content
        content = ""
        article = doc.find(wrapparent)
        #pages
        if pageparent:
            urls = self.parsePage(article, pageparent)
        #need title, tags
        extrarules = self.articleRule.extrarules
        #only articles have content
        if len(extrarules):
            for key, rule, fetch_all in extrarules:
                field = Field(field_id = key, rule = rule)
                value = getElementData(doc, rule, self.data["images"], fetch_all)
                self.data[field.get('name')] = field
                if field.is_article_content():
                    content_re = field.get("rule")
                    content = value
                elif field.is_gallery_content():
                    content_re = field.get("rule")
                    content = []
                    if isinstance(value, list):
                        content += value
                else:
                    field.value = value
        #fetch the paginated content
        if len(urls) > 0 and content_re:
            for next_url in urls:
                next_page = Fetch(next_url, charset = self.seed["charset"], timeout = self.seed["timeout"]).read()
                if next_page is not None:
                    next_page = self._getContent(next_page, wrapparent, content_re)
                    if next_page:
                        if isinstance(content, list):
                            content.append(next_page)
                        else:
                            content += next_page
        if content and content_re:
            if isinstance(content, list):
                self.data['content'].value = content
                self.data['images'] += content
            else:
                content = Readability(content, self.url, self.articleRule.filters)
                images = content.getImages()
                self.data['content'].value = content.getContent()
                self.data['images'] += images
    def parsePage(self, doc, pageparent):
        pages = doc.find(pageparent + " a")
        urls = []
        if len(pages) > 0:
            for link in pages:
                if link is not None and link.tag == "a" and hasattr(link, "get") and ('href' in link.keys()):
                    url = link.get("href")
                    #filter out javascript: links
                    if url and re.match(r"javascript", url) is None:
                        url = urlparse.urljoin(self.url, url)
                        if url != self.url and url not in urls:
                            urls.append(url)
                else:
                    continue
        self.data["pageurls"] = urls
        return urls
if __name__ == "__main__":
    from web.models import Seed as Seed_Model
    db = Seed_Model()
    #article test
    #r = db.view(34);
    #seed = Seed(r.list()[0])
    #articles = Grab(seed)
    #print articles[md5("http://www.265g.com/news/gamenews/321219.html").hexdigest()]
    #Document("http://www.265g.com/newgame/abroad/320618.html", seed)
    #articles.push()
    #game test
    #r = db.view(7);
    #seed = Seed(r.list()[0])
    #games= Grab(seed)
    ##games.push()
    #print games[md5("http://www.kaifu.com/gameinfo-long2.html").hexdigest()]
    #game server launches (kaifu)
    #r = db.view(8);
    #seed = Seed(r.list()[0])
    #kaifus = Grab(seed)
    #kaifus.push()
    #print kaifus['43d4eaccab7675ac175c030455d0cbb2']
    #game beta tests (kaice)
    #r = db.view(20);
    #seed = Seed(r.list()[0])
    #kaices = Grab(seed)
    #for k in kaices.keys():
    #    print kaices[k]
    #kaices.push()
    ##print kaifus['43d4eaccab7675ac175c030455d0cbb2']
    #gift packs
    #r = db.view(21);
    #seed = Seed(r.list()[0])
    #gifts = Grab(seed)
    #gifts.push()
    #vendors
    #r = db.view(22);
    #seed = Seed(r.list()[0])
    #c = Grab(seed)
    #c.push()
    #gallery
    #r = db.view(23)
    #seed = Seed(r.list()[0])
    #gas = Grab(seed)
    #for k in gas.keys():
    #    print gas[k]
    #gas.push()
    #r = db.view(24)
    #seed = Seed(r.list()[0])
    #gas = Grab(seed)
    #gas.push()