scrape-stock-index.py

/scrape-stock-index.py

https://bitbucket.org/pombredanne/stock-index-scraper
Python | 238 lines | 192 code | 16 blank | 30 comment | 4 complexity | 266d85bd08431d7ff950b9d51400fd85 MD5 | raw file

#!/usr/bin/python

"""
Command-line tool to scrape topyields.nl to extract dividend yield and
price/earnings data for a given stock index. Outputs to stdout as CSV.

Note that downloaded pages are cached in _CACHE_DIR to avoid lots of requests.

Still to do:
    - Support more indices.
    - Make price/earnings and dividend yield more defensive.
    - Create a unit test class with typical variations of the html (i.e. a new
      column) to ensure that the scraper fails rather than producing incorrect
      data.
    - Create an alternative main function which can be used if this module
      is imported by another.
"""

import os
import sys
import urllib2
import re
from collections import namedtuple
from BeautifulSoup import BeautifulSoup

# Directory in which downloaded index pages are cached between runs.
_CACHE_DIR = '/tmp/topyields_cache'

# ------------------------------------------------------------------------------

# One scraped table row: company name, ticker symbol, price/earnings ratio
# and dividend yield.
Stock = namedtuple('Stock', 'name symbol pe dividend_yield')

# Site root; the per-index paths below are appended to it.
BASE_URL = 'http://www.topyields.nl'

# Supported index name (as given on the command line) -> page path on the site.
INDEX_URLS = {
    'ftse100': '/Top-dividend-yields-of-FTSE100.php',
    'ftse250': '/Top-dividend-yields-of-FTSE250.php',
    'nyse': '/Top-dividend-yields-of-NYSE.php',
    'nasdaq': '/Top-dividend-yields-of-NASDAQ.php',
}

class InvalidStockIndexError(Exception):
    """Raised when a stock index name has no entry in INDEX_URLS."""

def main():
    """
    Runs the scraper for the stock index specified as the first argument.
    Writes the result as CSV to stdout.

    Exits with status 1 when no index was given and with status 2 when the
    index name is not recognised; the usage message goes to stderr in both
    cases.
    """
    if len(sys.argv) == 1:
        usage()
        sys.exit(1)
    try:
        index_page = download_index(sys.argv[1])
    except InvalidStockIndexError:
        usage()
        sys.exit(2)

    # download_index hands back an open file: close it as soon as the HTML
    # has been read instead of leaking the handle.
    try:
        html = index_page.read()
    finally:
        index_page.close()

    stocks = parse_index_page_html(html)
    sys.stdout.write("Symbol,Name,Price/Earnings Ratio,Dividend Yield\n")
    for stock in stocks:
        # Inside a quoted CSV field an embedded double quote must be doubled,
        # otherwise a name such as 'Foo "Bar" PLC' produces a malformed row.
        sys.stdout.write("\"%s\",\"%s\",%s,%s\n" % (
            stock.symbol.replace('"', '""'),
            stock.name.replace('"', '""'),
            stock.pe,
            stock.dividend_yield))

def usage():
    """
    Prints the command synopsis, followed by the comma-separated list of
    supported stock indices, to stderr.
    """
    script_name = sys.argv[0]
    valid_indices = ','.join(get_valid_indices())
    message = "Usage: %s <stock index>\n    Valid stock indices: %s\n" % (
        script_name, valid_indices)
    sys.stderr.write(message)

def get_valid_indices():
    """Lists the index names that have an entry in INDEX_URLS."""
    return list(INDEX_URLS)

def download_index(index_name):
    """
    Resolves the index name to its URL and fetches that page through the
    cache, returning whatever download_with_cache returns for it.

    An unknown index name surfaces as the InvalidStockIndexError raised by
    get_index_url.
    """
    return download_with_cache(get_index_url(index_name))

def get_index_url(index_name):
    """
    Builds the absolute URL of the page for the given index by appending
    its INDEX_URLS path to BASE_URL.

    >>> get_index_url('ftse100')
    'http://www.topyields.nl/Top-dividend-yields-of-FTSE100.php'

    >>> get_index_url('ftse101')
    Traceback (most recent call last):
    ...
    InvalidStockIndexError: Unknown index: ftse101
    """
    try:
        page_path = INDEX_URLS[index_name]
    except KeyError:
        raise InvalidStockIndexError("Unknown index: %s" % index_name)
    return BASE_URL + page_path

def download_with_cache(url):
    """
    Attempts to read the URL from the cache directory - if not present, the URL
    is downloaded and stored to cache, then read back from there.

    The cache directory is created if it does not exist yet, and the cache
    file name is the slugified URL.

    Returns an open, readable file object for the cached copy; the caller is
    responsible for closing it.
    """
    if not os.path.isdir(_CACHE_DIR):
        os.mkdir(_CACHE_DIR)

    filename = os.path.join(_CACHE_DIR, slugify(url))
    if not os.path.exists(filename):
        sys.stderr.write("Downloading %s to %s\n" % (url, filename))
        downloaded = urllib2.urlopen(url)
        try:
            # Read the whole body before touching the cache, so a failed
            # transfer cannot leave an empty file behind that later runs
            # would mistake for a valid cache entry.
            content = downloaded.read()
        finally:
            downloaded.close()
        with open(filename, 'w') as cache_file:
            cache_file.write(content)

    return open(filename, 'r')

def parse_index_page_html(html):
    """
    Parses the page html with BeautifulSoup, looks up the main stock table
    and turns its rows into a list of Stock entries.

    Raises ValueError if the table cannot be found.
    """
    main_table = find_main_table(BeautifulSoup(html))
    if not main_table:
        raise ValueError("Failed to find a table.")
    rows = main_table.findAll('tr')
    # The first row is skipped (presumably the column headings - confirm
    # against the page); every remaining row is parsed as one stock.
    data_rows = rows[1:]
    return parse_stock_table_rows(data_rows)

def find_main_table(soup):
    """
    Looks up the stock table in the parsed page: the <table> element whose
    id attribute is 'data'. Returns the result of the BeautifulSoup lookup
    (the table, or None when there is no such element).
    """
    table_attrs = {'id': 'data'}
    return soup.find('table', attrs=table_attrs)

def parse_stock_table_rows(trs):
    """
    Converts table rows (tr elements as BeautifulSoup instances), one per
    stock, into a list of Stock tuples in the same order. Each field is
    extracted by its dedicated find_*_from_row helper, so any error raised
    by a helper propagates to the caller.
    """
    return [
        Stock(
            name=find_name_from_row(row),
            symbol=find_symbol_from_row(row),
            dividend_yield=find_dividend_yield_from_row(row),
            pe=find_pe_from_row(row))
        for row in trs]

def find_name_from_row(tr):
    """
    Reads the stock name (ie 'Aviva PLC') from the value of the row's hidden
    input named 'NAME'. Raises ValueError when the row has no such input.
    """
    name_input = tr.find('input', attrs={'type': 'hidden', 'name': 'NAME'})
    if name_input:
        return name_input['value']
    raise ValueError("Failed to extract stock name from row.")

def find_symbol_from_row(tr):
    """
    Reads the stock symbol (ie 'ULVR') from the value of the row's hidden
    input named 'SYMBOL', keeping only the part before the first ':'.
    Raises ValueError when the row has no such input.
    """
    symbol_input = tr.find(
        'input', attrs={'type': 'hidden', 'name': 'SYMBOL'})
    if symbol_input:
        qualified_symbol = symbol_input['value']
        return qualified_symbol.split(':')[0]
    raise ValueError("Failed to extract stock symbol from row.")

def find_dividend_yield_from_row(tr):
    """
    Finds and converts the dividend yield to float, or raises if that
    fails or the value isn't in a sensible range.

    The value is taken from the second-to-last <td> of the row and must lie
    between 0 and 100. Raises ValueError (via convert_to_float_in_range) if
    the text is not a number or is out of range, and IndexError if the row
    has fewer than two cells.
    """
    # Pass the raw cell text to the validator rather than calling float()
    # here: a malformed cell then fails with the descriptive "Dividend yield
    # of '...'" error, consistent with find_pe_from_row, instead of a bare
    # float() ValueError.
    raw = tr.findAll('td')[-2].text # NOTE: make this a bit stronger
    return convert_to_float_in_range(raw, 0, 100, description="Dividend yield")

def find_pe_from_row(tr):
    """
    Extracts the price-earnings ratio from the seventh-from-last <td> of the
    row and returns it as a float, delegating conversion and the 0..1000
    sanity check to convert_to_float_in_range (which raises on failure).
    """
    cells = tr.findAll('td')
    pe_text = cells[-7].text # NOTE: make this a bit stronger
    return convert_to_float_in_range(
        pe_text, 0, 1000, description="Price/earnings")

def convert_to_float_in_range(text, lower=None, upper=None, description="Value"):
    """
    Converts a string to a float, and validates the lower and upper. Raises
    ValueError if the conversion failed or the value isn't within bounds.

    Both bounds are inclusive; a bound of None means "no limit" on that side.
    A bound of 0 is a real bound (it is not treated as "no limit"). When a
    bound is given, NaN is rejected because it is not within any range.
    `description` only labels the value in the error message. A failed
    conversion is additionally reported on stderr.

    >>> convert_to_float_in_range('123.45', 100, 200)
    123.45

    >>> convert_to_float_in_range('0', 0, 100)
    0.0

    >>> convert_to_float_in_range('-1', 0, 100)
    Traceback (most recent call last):
    ...
    ValueError: Value of '-1' doesn't seem right!

    >>> convert_to_float_in_range('xxx')
    Traceback (most recent call last):
    ...
    ValueError: Value of 'xxx' doesn't seem right!

    >>> convert_to_float_in_range('100', 50, 75)
    Traceback (most recent call last):
    ...
    ValueError: Value of '100' doesn't seem right!

    >>> convert_to_float_in_range(100, 150, 200)
    Traceback (most recent call last):
    ...
    ValueError: Value of '100' doesn't seem right!
    """
    try:
        value = float(text)
    except (TypeError, ValueError):
        # TypeError covers non-string, non-numeric input such as None, so the
        # documented ValueError contract holds for every kind of bad input.
        sys.stderr.write(
            "Failed to convert '%s' to floating point.\n" % (text,))
        suspect_value = True
    else:
        # Compare against None explicitly: a plain truthiness test would
        # silently skip a bound of 0. The "not value >= bound" form (rather
        # than "value < bound") also flags NaN, for which every comparison
        # is False.
        too_low = lower is not None and not value >= lower
        too_high = upper is not None and not value <= upper
        suspect_value = too_low or too_high

    if suspect_value:
        raise ValueError("%s of '%s' doesn't seem right!" % (
            description, text))
    return value


# Matches any single character that is not a word character, whitespace or
# a hyphen; slugify() deletes these.
_slugify_strip_re = re.compile(r'[^\w\s-]')
# Matches a run of hyphens and/or whitespace; slugify() collapses each run
# into one hyphen.
_slugify_hyphenate_re = re.compile(r'[-\s]+')

def slugify(value):
    """
    Turns a value into a lowercase ASCII slug: the text is NFKD-normalized
    and characters with no ASCII form are dropped, anything that is not a
    word character, whitespace or hyphen is removed, surrounding whitespace
    is stripped, and each run of whitespace/hyphens becomes a single hyphen.

    From Django's "django/template/defaultfilters.py".
    """
    import unicodedata
    text = value if isinstance(value, unicode) else unicode(value)
    ascii_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    cleaned = _slugify_strip_re.sub('', ascii_text).strip().lower()
    return _slugify_hyphenate_re.sub('-', unicode(cleaned))


# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()