single.py | searchcode

/scrape/single.py

https://gitlab.com/skororu/pysnippets
Python | 67 lines | 32 code | 12 blank | 23 comment | 10 complexity | 1a1533170c55a08dbac793b5b155859d MD5 | raw file

#!/usr/bin/env python3
"""
A simple web scraping test using generators, single thread/process:
Examine the word frequency of the titles from a random sample of XKCD cartoons
"""
import collections as col # Counter
import random             # randint

import bs4                # BeautifulSoup
import requests           # codes.ok, get

def generate_urls(base, limit, quantity):
    """
    lazily produce `quantity` cartoon URLs of the form '<base><number>/'

    each number is drawn independently with random.randint(1, limit), so it
    lies in 1..limit inclusive and the same URL may be produced more than once
    """
    for _ in range(quantity):
        cartoon_number = random.randint(1, limit)
        yield f'{base}{cartoon_number}/'

def words_from_titles(base_url, upper_limit, num):
    """
    iterate through the URLs, obtaining the page title for each one,
    take each page title and split it into individual words and yield them

    base_url    -- prefix that each randomly chosen cartoon number is added to
    upper_limit -- largest cartoon number that may be chosen
    num         -- how many random cartoon pages to request

    a randomly chosen number is not guaranteed to map to an existing page, so
    any response whose status is not OK, or whose page has no element with
    id 'ctitle', is skipped; fewer than `num` titles may therefore contribute

    network failures (including the request timeout) propagate to the caller
    as requests exceptions
    """
    # request `num` randomly chosen cartoon strips
    for url in generate_urls(base_url, upper_limit, num):

        # access the permanent URL for the selected cartoon; the context
        # manager releases the connection and the timeout keeps a stalled
        # server from hanging the whole run
        with requests.get(url, timeout=10) as req:
            if req.status_code != requests.codes.ok:
                continue
            page = bs4.BeautifulSoup(req.text, 'lxml')

        # obtain cartoon strip title from the ctitle block
        title_block = page.find(id='ctitle')
        if title_block is None:
            continue

        # split on any run of whitespace so that repeated or surrounding
        # spaces never produce empty "words"
        yield from title_block.text.split()

def main():
    """
    simple web scraping test

    fetches the front page to find the number of the most recent cartoon,
    samples `quantity` random cartoons up to that number, counts the words in
    their titles and prints every word that occurred more than once

    raises requests.HTTPError if the front page request is not successful;
    other network failures (including the timeout) propagate as requests
    exceptions
    """
    quantity = 80
    base_url = 'http://xkcd.com/'

    # obtain integer value of most recent cartoon
    # so our random selection has an upper bound
    with requests.get(base_url, timeout=10) as req:
        # fail with a clear HTTP error rather than an obscure parsing error
        req.raise_for_status()
        page = bs4.BeautifulSoup(req.text, 'lxml')

    # the "previous" link is expected to look like '/<number>/' (assumed from
    # the slicing below); the newest cartoon is one after that number
    num_previous = page.find('a', rel='prev')['href']
    upper_limit = int(num_previous[1:-1]) + 1

    # calculate word frequencies
    word_freq = col.Counter(words_from_titles(base_url, upper_limit, quantity))

    # words seen more than once, computed a single time for both the
    # summary count and the listing
    frequent_words = sorted(k for k, v in word_freq.items() if v > 1)

    # display overview
    print(f'The titles from {quantity} randomly selected XKCD cartoons contained '
          f'{len(word_freq)} unique words.\n'
          f'{len(frequent_words)} words were used more than once:')

    # display frequent words
    for word in frequent_words:
        print(word)


##############################################################################
# run the scraping test only when executed as a script, not when imported
if __name__ == '__main__':
    main()