/scripts/ci.py

https://github.com/pwxcoo/chinese-xinhua · Python · 72 lines · 47 code · 15 blank · 10 comment · 12 complexity · b47f9382e089ae82482cc3ec6f9adb99 MD5 · raw file

  1. """
  2. author: pwxcoo
  3. date: 2018-08-02
  4. description: 多线程抓取下载词语并保存
  5. """
  6. import requests, csv
  7. from bs4 import BeautifulSoup
  8. import time
  9. from multiprocessing.dummy import Pool as ThreadPool
  10. def downloader(url):
  11. """
  12. 下载词语并保存
  13. """
  14. res = []
  15. try:
  16. response = requests.get(url)
  17. if response.status_code != 200:
  18. print(f'{url} is failed!')
  19. return len(res)
  20. print(f'{url} is parsing')
  21. html = BeautifulSoup(response.content.decode('gbk', errors='ignore'), "lxml")
  22. a = html.find_all('a', target="_blank")
  23. prefix = 'http://www.zd9999.com'
  24. words = [prefix + w.get('href') for w in a]
  25. for i in range(0, len(words)):
  26. print(f'{[words[i]]} is parsing')
  27. try:
  28. response = requests.get(words[i])
  29. wordhtml = BeautifulSoup(response.content.decode('gbk', errors='ignore').replace('<br/>', '\n').replace('<br>', '\n')\
  30. , "lxml")
  31. td = wordhtml.find_all('table')[5].find_all('td')
  32. res.append([td[0].text.strip(), td[1].text.strip()])
  33. except Exception as e:
  34. with open('../data/error.csv', mode='a+', encoding='utf-8', newline='') as error_file:
  35. csv.writer(error_file).writerows([0, e, words[i]])
  36. print(f'{words[i]} is failed! {e}')
  37. continue
  38. except Exception as e:
  39. with open('../data/error.csv', mode='a+', encoding='utf-8', newline='') as error_file:
  40. csv.writer(error_file).writerows([1, e, url])
  41. print(f'{url} is failed! {e}')
  42. with open('../data/ci.csv', mode='a+', encoding='utf-8', newline='') as csv_file:
  43. csv.writer(csv_file).writerows(res)
  44. return len(res)
  45. if __name__ == '__main__':
  46. start_time = time.time()
  47. pool = ThreadPool(100)
  48. urls = ['http://www.zd9999.com/ci/index.htm']
  49. for i in range(2, 1959):
  50. urls.append(f'http://www.zd9999.com/ci/index_{i}.htm')
  51. responses = pool.map(downloader, urls)
  52. pool.close()
  53. pool.join()
  54. end_time = time.time()
  55. print(f'总共耗时 {end_time - start_time}, 抓取了 {sum(responses)} 条数据')