/Day66-75/code/main_redis.py

https://github.com/jackfrued/Python-100-Days

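"""Multi-threaded crawler for m.sohu.com: worker threads coordinate through
a shared Redis task queue and visited-URL set, and archive fetched pages
in MongoDB."""
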
import pickle
import zlib
from enum import Enum, unique
from hashlib import sha1
from random import random
from threading import Thread, current_thread, local
from time import sleep
from urllib.parse import urlparse

import pymongo
import redis
import requests
from bs4 import BeautifulSoup
from bson import Binary
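

# Worker states: main() polls these to decide when the crawl has finished.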
@unique
class SpiderStatus(Enum):
    IDLE = 0
    WORKING = 1
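

# Try the candidate charsets in order; return the first successful decode,
# or None if every attempt raises UnicodeDecodeError.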
def decode_page(page_bytes, charsets=('utf-8',)):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html
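

# Decorator that retries the wrapped function with a randomized wait between
# attempts and returns None once all attempts are exhausted.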
class Retry(object):

    def __init__(self, *, retry_times=3,
                 wait_secs=5, errors=(Exception, )):
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):

        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    sleep((random() + 1) * self.wait_secs)
            return None

        return wrapper
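

# Fetch/parse/extract/store pipeline for a single page; extract() and store()
# are placeholders to be filled in.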
class Spider(object):

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8', ),
              user_agent=None, proxies=None):
        thread_name = current_thread().name
        print(f'[{thread_name}]: {current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        # A timeout keeps a stalled download from blocking a worker forever;
        # timeouts and other errors are retried by the @Retry decorator.
        resp = requests.get(current_url, timeout=10,
                            headers=headers, proxies=proxies)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html_page, *, domain='m.sohu.com'):
        soup = BeautifulSoup(html_page, 'lxml')
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            scheme = parser.scheme or 'http'
            netloc = parser.netloc or domain
            # Follow only same-domain, non-javascript links.
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                redis_client = thread_local.redis_client
                # Enqueue only URLs that no worker has visited yet.
                if not redis_client.sismember('visited_urls', full_url):
                    redis_client.rpush('m_sohu_task', full_url)

    def extract(self, html_page):
        pass

    def store(self, data_dict):
        # redis_client = thread_local.redis_client
        # mongo_db = thread_local.mongo_db
        pass
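

# Daemon worker thread: pops URLs from the shared Redis queue, marks them
# visited, stores the page in MongoDB, and feeds new links back to the queue.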
class SpiderThread(Thread):

    def __init__(self, name, spider):
        super().__init__(name=name, daemon=True)
        self.spider = spider

    def run(self):
        redis_client = redis.Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
        mongo_client = pymongo.MongoClient(host='1.2.3.4', port=27017)
        # Per-thread handles so Spider.parse() can reach them without locking.
        thread_local.redis_client = redis_client
        thread_local.mongo_db = mongo_client.msohu
        while True:
            current_url = redis_client.lpop('m_sohu_task')
            while not current_url:
                # Queue is empty; back off briefly instead of hammering Redis.
                sleep(0.05)
                current_url = redis_client.lpop('m_sohu_task')
            self.spider.status = SpiderStatus.WORKING
            current_url = current_url.decode('utf-8')
            if not redis_client.sismember('visited_urls', current_url):
                redis_client.sadd('visited_urls', current_url)
                html_page = self.spider.fetch(current_url)
                if html_page not in [None, '']:
                    # The SHA-1 of the URL doubles as the MongoDB document id,
                    # so the same page is never stored twice.
                    hasher = hasher_proto.copy()
                    hasher.update(current_url.encode('utf-8'))
                    doc_id = hasher.hexdigest()
                    sohu_data_coll = mongo_client.msohu.webpages
                    if not sohu_data_coll.find_one({'_id': doc_id}):
                        sohu_data_coll.insert_one({
                            '_id': doc_id,
                            'url': current_url,
                            'page': Binary(zlib.compress(pickle.dumps(html_page)))
                        })
                    self.spider.parse(html_page)
            self.spider.status = SpiderStatus.IDLE


def is_any_alive(spider_threads):
    return any(spider_thread.spider.status == SpiderStatus.WORKING
               for spider_thread in spider_threads)
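

# Module-level shared state: thread_local carries the per-thread DB handles;
# hasher_proto is only ever copied, so each URL gets a fresh SHA-1.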
thread_local = local()
hasher_proto = sha1()


def main():
    redis_client = redis.Redis(host='1.2.3.4', port=6379, password='1qaz2wsx')
    # Seed the task queue on the first run only.
    if not redis_client.exists('m_sohu_task'):
        redis_client.rpush('m_sohu_task', 'http://m.sohu.com/')
    spider_threads = [SpiderThread('thread-%d' % i, Spider())
                      for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()
    # Stay alive while there is queued work or a busy worker; the workers are
    # daemon threads, so they exit together with the main thread.
    while redis_client.exists('m_sohu_task') or is_any_alive(spider_threads):
        sleep(5)
    print('Over!')


if __name__ == '__main__':
    main()
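

# Note: instead of polling with lpop plus a short sleep, redis-py's blocking
# pop could park an idle worker on the server side. A minimal sketch (not
# part of the original file):
#
#     item = redis_client.blpop('m_sohu_task', timeout=30)
#     if item:
#         _, raw_url = item            # blpop returns a (key, value) pair
#         current_url = raw_url.decode('utf-8')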