/scrape-stock-index.py
Python | 238 lines | 192 code | 16 blank | 30 comment | 4 complexity | 266d85bd08431d7ff950b9d51400fd85 MD5 | raw file
- #!/usr/bin/python
- """
- Command-line tool to scrape topyields.nl to extract dividend yield and
- price/earnings data for a given stock index. Outputs to stdout as CSV.
- Note that it caches downloaded pages to _CACHE_DIR to avoid lots of requests.
- Still to do:
- - Support more indices.
- - Make price/earnings and dividend yield more defensive.
- - Create a unit test class with typical variations of the HTML (e.g. a new
- column) to ensure that the scraper fails rather than producing incorrect data.
- - Create an alternative main function which can be used if this module
- is imported by another.
- """
- import os
- import sys
- import urllib2
- import re
- from collections import namedtuple
- from BeautifulSoup import BeautifulSoup
# Directory in which downloaded pages are cached between runs.
_CACHE_DIR = '/tmp/topyields_cache'

# ------------------------------------------------------------------------------

# One scraped stock entry: company name, ticker symbol, price/earnings ratio
# and dividend yield.
Stock = namedtuple('Stock', ['name', 'symbol', 'pe', 'dividend_yield'])

# Site root; the per-index paths in INDEX_URLS are appended to it.
BASE_URL = 'http://www.topyields.nl'

# Maps each supported stock index name to the path of its listing page.
INDEX_URLS = {
    'ftse100': '/Top-dividend-yields-of-FTSE100.php',
    'ftse250': '/Top-dividend-yields-of-FTSE250.php',
    'nyse': '/Top-dividend-yields-of-NYSE.php',
    'nasdaq': '/Top-dividend-yields-of-NASDAQ.php',
}
class InvalidStockIndexError(Exception):
    """Raised when a stock index name has no entry in INDEX_URLS."""
def main():
    """
    Runs the scraper for the stock index specified as the first argument.
    Writes the result as CSV to stdout.

    Exits with status 1 if no index name was given and with status 2 if the
    name is not a supported index; the usage message is written to stderr in
    both cases.
    """
    if len(sys.argv) < 2:
        usage()
        sys.exit(1)

    try:
        index_page = download_index(sys.argv[1])
    except InvalidStockIndexError:
        usage()
        sys.exit(2)

    # Release the file handle as soon as the HTML has been read, even if the
    # read itself fails.
    try:
        html = index_page.read()
    finally:
        index_page.close()

    stocks = parse_index_page_html(html)

    def quoted(field):
        # Inside a quoted CSV field a double quote is escaped by doubling it;
        # without this a name containing '"' would corrupt the row.
        return '"%s"' % field.replace('"', '""')

    sys.stdout.write("Symbol,Name,Price/Earnings Ratio,Dividend Yield\n")
    for stock in stocks:
        sys.stdout.write("%s,%s,%s,%s\n" % (
            quoted(stock.symbol), quoted(stock.name),
            stock.pe, stock.dividend_yield))
def usage():
    """Writes the usage text, listing the valid stock indices, to stderr."""
    program = sys.argv[0]
    index_names = ','.join(get_valid_indices())
    message = "Usage: %s <stock index>\n Valid stock indices: %s\n" % (
        program, index_names)
    sys.stderr.write(message)
def get_valid_indices():
    """
    Returns a list of supported stock index names, sorted alphabetically.
    """
    # sorted() always yields a real list in a stable order, whereas
    # dict.keys() has arbitrary ordering (and is a view on Python 3), which
    # made the usage message vary between runs/interpreters.
    return sorted(INDEX_URLS)
def download_index(index_name):
    """
    Resolves the index name to its URL and returns the opened HTML file
    for that page, as provided by download_with_cache.
    """
    return download_with_cache(get_index_url(index_name))
def get_index_url(index_name):
    """
    Builds the absolute URL of the listing page for the given index.

    Raises InvalidStockIndexError if the index name is not in INDEX_URLS.

    >>> get_index_url('ftse100')
    'http://www.topyields.nl/Top-dividend-yields-of-FTSE100.php'

    >>> get_index_url('ftse101')
    Traceback (most recent call last):
        ...
    InvalidStockIndexError: Unknown index: ftse101
    """
    if index_name not in INDEX_URLS:
        raise InvalidStockIndexError("Unknown index: %s" % index_name)
    return BASE_URL + INDEX_URLS[index_name]
def download_with_cache(url):
    """
    Returns an open, readable file holding the body of the given URL.

    The body is looked up in _CACHE_DIR under a file name derived from the
    URL via slugify(); if no cached copy exists, the URL is downloaded and
    stored there first, then read back from the cache. The caller is
    responsible for closing the returned file.
    """
    if not os.path.isdir(_CACHE_DIR):
        # makedirs also copes with a cache location whose parent is missing.
        os.makedirs(_CACHE_DIR)

    filename = os.path.join(_CACHE_DIR, slugify(url))
    if not os.path.exists(filename):
        sys.stderr.write("Downloading %s to %s\n" % (url, filename))
        downloaded = urllib2.urlopen(url)
        try:
            # Read the whole body BEFORE creating the cache file, so that a
            # failed download never leaves an empty/truncated file behind
            # that would be treated as a cache hit on every later run.
            content = downloaded.read()
        finally:
            downloaded.close()
        # The payload is raw bytes from the network; store it verbatim.
        with open(filename, 'wb') as cache_file:
            cache_file.write(content)
    return open(filename, 'r')
def parse_index_page_html(html):
    """
    Parses the page html with BeautifulSoup, locates the main table and
    returns the list of Stock entries built from its rows.

    Raises ValueError if the main table cannot be found.
    """
    table = find_main_table(BeautifulSoup(html))
    if not table:
        raise ValueError("Failed to find a table.")
    # The first row is skipped (presumably the column headers); every
    # remaining row is parsed as one stock entry.
    data_rows = table.findAll('tr')[1:]
    return parse_stock_table_rows(data_rows)
def find_main_table(soup):
    """
    Looks up the table element whose id is 'data' in the given soup and
    returns whatever soup.find() yields for it (None when there is no match).
    """
    wanted_attrs = {'id': 'data'}
    return soup.find('table', attrs=wanted_attrs)
def parse_stock_table_rows(trs):
    """
    Converts table rows (tr elements as BeautifulSoup instances, one stock
    per row) into a list of Stock tuples, delegating the extraction of each
    field to the find_*_from_row helpers.
    """
    return [
        Stock(
            name=find_name_from_row(row),
            symbol=find_symbol_from_row(row),
            dividend_yield=find_dividend_yield_from_row(row),
            pe=find_pe_from_row(row))
        for row in trs
    ]
def find_name_from_row(tr):
    """
    Returns the stock name (e.g. 'Aviva PLC') taken from the row's hidden
    NAME input, or raises ValueError if the row has no such input.
    """
    name_input = tr.find('input', attrs={'type': 'hidden', 'name': 'NAME'})
    if name_input:
        return name_input['value']
    raise ValueError("Failed to extract stock name from row.")
def find_symbol_from_row(tr):
    """
    Returns the stock symbol (e.g. 'ULVR') taken from the row's hidden
    SYMBOL input, or raises ValueError if the row has no such input.
    """
    symbol_input = tr.find('input', attrs={'type': 'hidden', 'name': 'SYMBOL'})
    if symbol_input:
        # Only the part of the value before the first ':' is the symbol.
        return symbol_input['value'].partition(':')[0]
    raise ValueError("Failed to extract stock symbol from row.")
def find_dividend_yield_from_row(tr):
    """
    Finds the dividend yield in the second-to-last cell of the row and
    converts it to a float.

    Raises ValueError if convert_to_float_in_range cannot convert the cell
    text or rejects the value for the bounds 0 and 100.
    """
    # Hand the raw cell text to the helper, as find_pe_from_row does.
    # Calling float() here first bypassed the helper's conversion-failure
    # handling, so bad cells raised without the descriptive error message.
    raw = tr.findAll('td')[-2].text  # NOTE: make this a bit stronger
    return convert_to_float_in_range(raw, 0, 100, description="Dividend yield")
def find_pe_from_row(tr):
    """
    Reads the price-earnings ratio from the seventh-from-last cell of the
    row and converts it to a float via convert_to_float_in_range (bounds 0
    and 1000), which raises ValueError when it rejects the text.
    """
    cells = tr.findAll('td')
    pe_text = cells[-7].text  # NOTE: make this a bit stronger
    return convert_to_float_in_range(
        pe_text, 0, 1000, description="Price/earnings")
def convert_to_float_in_range(text, lower=None, upper=None, description="Value"):
    """
    Converts a string to a float and validates it against the optional
    inclusive lower and upper bounds (None means "no bound").

    Raises ValueError if the conversion failed or the value isn't within
    bounds; a conversion failure is additionally reported on stderr.
    `description` is used as the leading word of the error message.

    >>> convert_to_float_in_range('123.45', 100, 200)
    123.45

    >>> convert_to_float_in_range('xxx')
    Traceback (most recent call last):
        ...
    ValueError: Value of 'xxx' doesn't seem right!

    >>> convert_to_float_in_range('100', 50, 75)
    Traceback (most recent call last):
        ...
    ValueError: Value of '100' doesn't seem right!

    >>> convert_to_float_in_range(100, 150, 200)
    Traceback (most recent call last):
        ...
    ValueError: Value of '100' doesn't seem right!

    >>> convert_to_float_in_range('-1', 0, 100)
    Traceback (most recent call last):
        ...
    ValueError: Value of '-1' doesn't seem right!
    """
    try:
        value = float(text)
    except (TypeError, ValueError):
        # TypeError covers non-string, non-numeric input such as None.
        sys.stderr.write(
            "Failed to convert '%s' to floating point.\n" % (text,))
        value = None

    if value is not None:
        # Bounds are compared against None explicitly so that a bound of 0
        # is still enforced (a plain truthiness test silently skipped it).
        # The checks are phrased as "not within" so that NaN, which compares
        # false to everything, is rejected whenever a bound is given.
        below = lower is not None and not value >= lower
        above = upper is not None and not value <= upper
        if not (below or above):
            return value

    raise ValueError("%s of '%s' doesn't seem right!" % (description, text))
# Matches every character that is not a word character, whitespace or hyphen.
_slugify_strip_re = re.compile(r'[^\w\s-]')
# Matches runs of hyphens and/or whitespace, which collapse to one hyphen.
_slugify_hyphenate_re = re.compile(r'[-\s]+')


def slugify(value):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.

    Non-text input is converted to text first. Returns a text (unicode)
    string on both Python 2 and Python 3.

    From Django's "django/template/defaultfilters.py".
    """
    import unicodedata

    # The `unicode` builtin only exists on Python 2; fall back to `str`.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    if not isinstance(value, text_type):
        value = text_type(value)

    # Decompose accented characters and drop anything without an ASCII
    # form, then decode back to text so that the str regexes below never
    # run over a bytes object (which raises TypeError on Python 3).
    ascii_bytes = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
    value = ascii_bytes.decode('ascii')

    value = _slugify_strip_re.sub('', value).strip().lower()
    return _slugify_hyphenate_re.sub('-', value)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()