/scrape/single.py
Python | 67 lines | 32 code | 12 blank | 23 comment | 10 complexity | 1a1533170c55a08dbac793b5b155859d MD5 | raw file
- #!/usr/bin/env python3
- """
- A simple web scraping test using generators, single thread/process:
- Examine the word frequency of the titles from a random sample of XKCD cartoons
- """
- import collections as col # Counter
- import random # randint
- import bs4 # BeautifulSoup
- import requests # codes.ok, get
def generate_urls(base: str, limit: int, quantity: int):
    """
    Lazily produce ``quantity`` randomly selected cartoon page URLs.

    Each URL is ``base`` followed by a random integer drawn from
    1..``limit`` (both ends inclusive) and a trailing slash.  Every draw is
    independent, so the same URL may be produced more than once.

    Args:
        base: URL prefix the cartoon number is appended to.
        limit: highest cartoon number that may be selected.
        quantity: how many URLs to produce; zero or less produces none.

    Yields:
        One URL string per draw.

    Raises:
        ValueError: propagated from ``random.randint`` when ``limit`` is
            below 1.  Because this is a generator, it surfaces on the first
            iteration rather than at call time.
    """
    # only the number of iterations matters, the counter itself is unused
    for _ in range(quantity):
        yield f'{base}{random.randint(1, limit)}/'
def words_from_titles(base_url, upper_limit, num):
    """
    Yield the individual words of the titles of randomly chosen cartoons.

    ``num`` URLs are drawn via ``generate_urls(base_url, upper_limit, num)``.
    Each page is downloaded, its title is read from the element whose id is
    'ctitle', and the title is split on whitespace.

    Pages that do not answer with an OK status, or that have no 'ctitle'
    element, are skipped instead of crashing the whole run, so fewer than
    ``num`` titles may contribute words.
    NOTE(review): xkcd appears to serve cartoon number 404 as a "not found"
    page, which a random draw can hit -- confirm.

    Args:
        base_url: URL prefix passed through to ``generate_urls``.
        upper_limit: highest cartoon number that may be selected.
        num: how many random cartoon pages to request.

    Yields:
        One word (a non-empty string) at a time.
    """
    for url in generate_urls(base_url, upper_limit, num):
        # the context manager releases the connection once the page is parsed
        with requests.get(url) as req:
            if req.status_code != requests.codes.ok:
                continue
            page = bs4.BeautifulSoup(req.text, 'lxml')
        # obtain cartoon strip title from the ctitle block
        title_tag = page.find(id='ctitle')
        if title_tag is None:
            continue
        # split() without an argument never produces empty strings, so
        # repeated or surrounding whitespace is not counted as a "word"
        yield from title_tag.text.split()
def main():
    """
    simple web scraping test

    Downloads the site's front page to work out the number of the most
    recent cartoon, counts how often each word occurs in the titles of
    ``quantity`` randomly selected cartoons, then prints a summary followed
    by the alphabetically sorted words that occurred more than once.

    Raises:
        requests.HTTPError: when the front page does not answer with a
            success status.
    """
    quantity = 80
    base_url = 'http://xkcd.com/'
    # obtain integer value of most recent cartoon
    # so our random selection has an upper bound
    with requests.get(base_url) as req:
        # fail with a clear HTTP error rather than a confusing TypeError
        # further down when the expected link is missing from an error page
        req.raise_for_status()
        page = bs4.BeautifulSoup(req.text, 'lxml')
    # the slice drops the first and last character of the "previous" link's
    # href before converting it to int (presumably of the form '/N/');
    # the most recent cartoon is one after the previous one
    num_previous = page.find('a', rel='prev')['href']
    upper_limit = int(num_previous[1:-1]) + 1
    # calculate word frequencies
    word_freq = col.Counter(words_from_titles(base_url, upper_limit, quantity))
    # words seen more than once, computed a single time and reused for both
    # the count in the overview and the listing below
    frequent_words = sorted(
        word for word, count in word_freq.items() if count > 1
    )
    # display overview
    print(f'The titles from {quantity} randomly selected XKCD cartoons contained '
          f'{len(word_freq)} unique words.\n'
          f'{len(frequent_words)} words were used more than once:')
    # display frequent words
    for word in frequent_words:
        print(word)
- ##############################################################################
# script entry point: run the scraping test only when this file is executed
# directly, not when it is imported as a module
if __name__ == "__main__":
    main()