/news-crawler/gzb.py

https://github.com/DarkSand/fetchman · Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import copy
import hashlib
import random
import sys
import time

from bs4 import BeautifulSoup as bs

from util.seq_util import SeqUtil
from fetchman.downloader.http.spider_request import Request
from fetchman.pipeline.pipe_item import pipeItem
from fetchman.processor.base_processor import BaseProcessor
from fetchman.spider.spider_core import SpiderCore
from fetchman.utils.decorator import check
from pipelines.console_pipeline import ConsolePipeline
from pipelines.database_pipeline import DataBasePipeline
from pipelines.pic_pipeline import PicPipeline

# Python 2 only: force the default string encoding to UTF-8
reload(sys)
sys.setdefaultencoding('utf-8')


# Processor that parses listing and detail pages from gengzhongbang.com
class Gzb_Processor(BaseProcessor):
    spider_id = 'gzb_spider'
    allowed_domains = ['gengzhongbang.com']

    # Push the initial requests: pages 1-8 of two category listings
    @classmethod
    def init_start_requests(cls):
        cls.start_requests.extend([Request(url='http://www.gengzhongbang.com/14/index.php?page=%s' % page,
                                           priority=0,
                                           meta={'newsCateId': '20171102111913008'})
                                   for page in range(1, 9)])
        cls.start_requests.extend([Request(url='http://www.gengzhongbang.com/10/index.php?page=%s' % page,
                                           priority=0,
                                           meta={'newsCateId': '20171102111913008'})
                                   for page in range(1, 9)])

    @check
    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')
        gzb_div_list = soup.select('div.bm_c.xld dl.bbda.cl')
        for gzb_div in gzb_div_list:
            if gzb_div.select('a img'):
                detail_url = gzb_div.select('a')[0]['href']
                img_url = 'http://www.gengzhongbang.com/' + gzb_div.select('a img')[0]['src']
                name = gzb_div.select('dt.xs2')[0].text.strip()
                createTime = gzb_div.select('span.xg1')[0].text.strip()
                shortDes = gzb_div.select('dd.xs2.cl')[0].text.strip()
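                # Name the image by hashing the current timestamp plus a random
                # number, so concurrently downloaded images don't collide on disk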
                md5 = hashlib.md5()
                rand_name = str(time.time()) + str(random.random())
                md5.update(rand_name)
                img_name = md5.hexdigest() + '.jpg'
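                # Queue the image download; process_pic will hand the bytes to the 'pic' pipeline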
                request = Request(url=img_url, priority=1, callback=self.process_pic)
                request.meta['img_name'] = img_name
                yield request
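                # Queue the detail page, carrying the fields scraped from the
                # list page along in request.meta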
                request = Request(url=detail_url, priority=1, callback=self.process_detail)
                request.meta['name'] = name
                request.meta['createTime'] = createTime
                request.meta['shortDes'] = shortDes
                request.meta['img_name'] = img_name
                request.meta['newsCateId'] = response.request.meta['newsCateId']
                yield request

    # Fetch the image content and push it into PicPipeline
    @check
    def process_pic(self, response):
        item = dict()
        item['content'] = response.m_response.content
        item['name'] = response.request.meta['img_name']
        yield pipeItem(['pic'], item)

    # Fetch the news detail page and push it into DataBasePipeline
    @check
    def process_detail(self, response):
        soup = bs(response.m_response.content, 'lxml')
        result = dict()
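        # Record id: the current timestamp plus a sequence number from SeqUtil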
        result['newsProductId'] = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())) + SeqUtil.get_seq()
        result['newsCateId'] = response.request.meta['newsCateId']
        result['name'] = response.request.meta['name']
        result['imageUrl'] = response.request.meta['img_name']
        result['shortDes'] = response.request.meta['shortDes']
        result['createTime'] = response.request.meta['createTime']
        result['newsFromWebUrl'] = response.request.url
        result['newsFrom'] = '互联网'  # 'Internet'
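        # Pull the article body and rename its <td> wrapper to <div> so the
        # saved fragment is a standalone block rather than a table cell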
        longDes = soup.select('td#article_content')[0]
        longDes.name = 'div'
        tag_list = longDes.find_all()
        # Strip styling: drop every attribute except src, and absolutize image paths
        for tag in tag_list:
            attrs = copy.copy(tag.attrs)
            for key in attrs.iterkeys():  # Python 2; use attrs.keys() on Python 3
                if key != 'src':
                    del tag.attrs[key]
                else:
                    tag.attrs[key] = 'http://www.gengzhongbang.com/' + tag.attrs[key]
        result['longDes'] = str(longDes)
        yield pipeItem(['database', 'console'], result)


if __name__ == '__main__':
    # Build the spider object, register the pipelines, and start the crawl
    SpiderCore(Gzb_Processor()) \
        .set_pipeline(ConsolePipeline(), 'console') \
        .set_pipeline(PicPipeline(), 'pic') \
        .set_pipeline(DataBasePipeline(), 'database') \
        .start()
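
# For reference -- a minimal sketch of what the 'pic' pipeline might look like.
# Hypothetical: the real PicPipeline lives in pipelines/pic_pipeline.py, and the
# process_items hook and 'images/' directory here are assumptions, not
# fetchman's confirmed pipeline API:
#
#     class PicPipeline(object):
#         def process_items(self, items):
#             # items is the dict yielded by process_pic above
#             with open('images/' + items['name'], 'wb') as f:
#                 f.write(items['content'])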