PageRenderTime 57ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/scrape/single.py

https://gitlab.com/skororu/pysnippets
Python | 67 lines | 32 code | 12 blank | 23 comment | 10 complexity | 1a1533170c55a08dbac793b5b155859d MD5 | raw file
  1. #!/usr/bin/env python3
  2. """
  3. A simple web scraping test using generators, single thread/process:
  4. Examine the word frequency of the titles from a random sample of XKCD cartoons
  5. """
  6. import collections as col # Counter
  7. import random # randint
  8. import bs4 # BeautifulSoup
  9. import requests # codes.ok, get
  10. def generate_urls(base, limit, quantity):
  11. """
  12. generate a series of URLs, each of which represents a randomly selected,
  13. valid XKCD cartoon webpage
  14. """
  15. for url_num in range(quantity):
  16. yield f'{base}{random.randint(1, limit)}/'
  17. def words_from_titles(base_url, upper_limit, num):
  18. """
  19. iterate through the URLs, obtaining the page title for each one,
  20. take each page title and split it into individual words and yield them
  21. """
  22. # pick 10 random cartoon strips
  23. for url in generate_urls(base_url, upper_limit, num):
  24. # access the permanent URL for the selected cartoon
  25. req = requests.get(url)
  26. page = bs4.BeautifulSoup(req.text, 'lxml')
  27. # obtain cartoon strip title from the ctitle block
  28. title = page.find(id='ctitle').text
  29. words = title.split(' ')
  30. for word in words:
  31. yield word
  32. def main():
  33. """simple web scraping test"""
  34. quantity = 80
  35. base_url = 'http://xkcd.com/'
  36. # obtain integer value of most recent cartoon
  37. # so our random selection has an upper bound
  38. with requests.get(base_url) as req:
  39. page = bs4.BeautifulSoup(req.text, 'lxml')
  40. num_previous = page.find('a', rel='prev')['href']
  41. upper_limit = int(num_previous[1:-1]) + 1
  42. # calculate word frequencies
  43. word_freq = col.Counter(words_from_titles(base_url, upper_limit, quantity))
  44. # display overview
  45. multiple = len([x for x in word_freq.values() if x > 1])
  46. print(f'The titles from {quantity} randomly selected XKCD cartoons contained '
  47. f'{len(word_freq)} unique words.\n'
  48. f'{multiple} words were used more than once:')
  49. # display frequent words
  50. frequent_words = sorted(k for k, v in word_freq.items() if v > 1)
  51. for word in frequent_words:
  52. print(word)
  53. ##############################################################################
# Run the scraping test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()