/examples/baidu_spider_progress.py

https://github.com/kkyon/botflow · Python · 123 lines · 64 code · 35 blank · 24 comment · 4 complexity · 11116452a666feb4d07ec90675cad3ce MD5 · raw file

  1. from botflow import Pipe, Branch, Timer
  2. from botflow import BotFlow
  3. from bs4 import BeautifulSoup
  4. from dataclasses import dataclass
  5. from botflow.ex.http import HttpLoader
  6. from botflow.config import config
  7. @dataclass
  8. class ResultItem:
  9. id: str = ''
  10. name: str = ''
  11. url: str = ' '
  12. page_rank: int = 0
  13. page_no: int = 0
  14. def __repr__(self):
  15. return self.name
  16. @dataclass
  17. class UrlItem:
  18. name: str
  19. url: str
  20. # 解析具体条目
  21. def get_all_items(response):
  22. soup = BeautifulSoup(response.text, "lxml")
  23. items = soup.select('div.result.c-container')
  24. result = []
  25. for rank, item in enumerate(items):
  26. import uuid
  27. id = uuid.uuid4()
  28. r = ResultItem()
  29. r.id = id
  30. r.page_rank = rank
  31. r.name = item.h3.get_text()
  32. yield r
  33. # 解析 分页 链接
  34. def get_all_page_url(response):
  35. itemList = []
  36. #BD_URL='https://180.97.33.108' #
  37. BD_URL='https://www.baidu.com'
  38. soup = BeautifulSoup(response.text, "lxml")
  39. page = soup.select('div#page')
  40. for item in page[0].find_all('a'):
  41. href = item.get('href')
  42. no = item.get_text()
  43. if '下一页' in no:
  44. break
  45. yield BD_URL + href
  46. result = []
  47. delay=5
  48. def collect(i):
  49. result.append(i)
  50. def show_progress(count):
  51. n=len(result)
  52. speed=n/(count*delay)
  53. print('got len item %s speed:%03f per second,total cost: %ss'%(n,speed,count*delay))
  54. config.exception_policy=config.Exception_ignore
  55. def main():
  56. words = ['贸易战', '世界杯']*50
  57. baidu_url = 'https://www.baidu.com/s?wd=%s'
  58. urls = [baidu_url % (word) for word in words]
  59. # make data flow net
  60. p1=Pipe(
  61. urls,
  62. HttpLoader(),
  63. Branch(get_all_items, collect),
  64. Branch(get_all_page_url, HttpLoader(), get_all_items, collect),
  65. )
  66. Pipe(Timer(delay=delay), show_progress)
  67. BotFlow.run(silent=True)
  68. main()
  69. #
  70. # ---run result----
# Postman test result for one page request: 1100 ms
  72. #
  73. # PING www.a.shifen.com (180.97.33.108): 56 data bytes
  74. # 64 bytes from 180.97.33.108: icmp_seq=0 ttl=55 time=41.159 ms
  75. # got len item 9274 speed:52.994286 per second,total cost: 175s
  76. # got len item 9543 speed:53.016667 per second,total cost: 180s
  77. # got len item 9614 speed:51.967568 per second,total cost: 185s
# Best test data:
# 25 pages per second.
  80. # got len item 1540 speed:102.666667 per second,total cost: 15s
  81. # got len item 2549 speed:127.450000 per second,total cost: 20s
  82. # got len item 3450 speed:138.000000 per second,total cost: 25s
  83. # got len item 4843 speed:161.433333 per second,total cost: 30s
  84. # got len item 6070 speed:173.428571 per second,total cost: 35s
  85. # got len item 6826 speed:170.650000 per second,total cost: 40s
  86. # got len item 7773 speed:172.733333 per second,total cost: 45s
  87. # got len item 8681 speed:173.620000 per second,total cost: 50s
  88. # got len item 9700 speed:176.363636 per second,total cost: 55s