/Python爬虫日记系列/Python爬取日记八:斗鱼弹幕相关信息保存到mongodb.py

https://github.com/rieuse/learnPython · Python · 94 lines · 77 code · 16 blank · 1 comment · 10 complexity · 0c004e3a4beca5fe60e62d8bbf8251ae MD5 · raw file

  1. # 这个抓取弹幕,然后把用户的uid昵称等级弹幕内容都保存到mongodb中
  2. __author__ = '布咯咯_rieuse'
  3. __time__ = '2017.6.2'
  4. __github__ = 'https://github.com/rieuse'
  5. import multiprocessing
  6. import re
  7. import socket
  8. import time
  9. import pymongo
  10. import requests
  11. from bs4 import BeautifulSoup
  12. clients = pymongo.MongoClient('localhost')
  13. db = clients["DouyuTV_danmu"]
  14. col = db["info"]
  15. client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
  16. host = socket.gethostbyname("openbarrage.douyutv.com")
  17. port = 8601
  18. client.connect((host, port))
  19. danmu_path = re.compile(b'txt@=(.+?)/cid@')
  20. uid_path = re.compile(b'uid@=(.+?)/nn@')
  21. nickname_path = re.compile(b'nn@=(.+?)/txt@')
  22. level_path = re.compile(b'level@=([1-9][0-9]?)/sahf')
  23. def sendmsg(msgstr):
  24. msg = msgstr.encode('utf-8')
  25. data_length = len(msg) + 8
  26. code = 689
  27. msgHead = int.to_bytes(data_length, 4, 'little') \
  28. + int.to_bytes(data_length, 4, 'little') + int.to_bytes(code, 4, 'little')
  29. client.send(msgHead)
  30. sent = 0
  31. while sent < len(msg):
  32. tn = client.send(msg[sent:])
  33. sent = sent + tn
  34. def start(roomid):
  35. msg = 'type@=loginreq/username@=rieuse/password@=douyu/roomid@={}/\0'.format(roomid)
  36. sendmsg(msg)
  37. msg_more = 'type@=joingroup/rid@={}/gid@=-9999/\0'.format(roomid)
  38. sendmsg(msg_more)
  39. print('---------------欢迎连接到{}的直播间---------------'.format(get_name(roomid)))
  40. while True:
  41. data = client.recv(1024)
  42. uid_more = uid_path.findall(data)
  43. nickname_more = nickname_path.findall(data)
  44. level_more = level_path.findall(data)
  45. danmu_more = danmu_path.findall(data)
  46. if not level_more:
  47. level_more = b'0'
  48. if not data:
  49. break
  50. else:
  51. for i in range(0, len(danmu_more)):
  52. try:
  53. product = {
  54. 'uid': uid_more[0].decode(encoding='utf-8'),
  55. 'nickname': nickname_more[0].decode(encoding='utf-8'),
  56. 'level': level_more[0].decode(encoding='utf-8'),
  57. 'danmu': danmu_more[0].decode(encoding='utf-8')
  58. }
  59. print(product)
  60. col.insert(product)
  61. print('成功导入mongodb')
  62. except Exception as e:
  63. print(e)
  64. def keeplive():
  65. while True:
  66. msg = 'type@=keeplive/tick@=' + str(int(time.time())) + '/\0'
  67. sendmsg(msg)
  68. time.sleep(15)
  69. def get_name(roomid):
  70. r = requests.get("http://www.douyu.com/" + roomid)
  71. soup = BeautifulSoup(r.text, 'lxml')
  72. return soup.find('a', {'class', 'zb-name'}).string
  73. if __name__ == '__main__':
  74. room_id = input('请出入房间ID: ')
  75. p1 = multiprocessing.Process(target=start, args=(room_id,))
  76. p2 = multiprocessing.Process(target=keeplive)
  77. p1.start()
  78. p2.start()