PageRenderTime 42ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/scrape-stock-index.py

https://bitbucket.org/pombredanne/stock-index-scraper
Python | 238 lines | 192 code | 16 blank | 30 comment | 4 complexity | 266d85bd08431d7ff950b9d51400fd85 MD5 | raw file
  1. #!/usr/bin/python
  2. """
  3. Command-line tool to scrape topyields.nl to extract dividend yield and
  4. price/earnings data for a given stock index. Outputs to stdout as CSV.
  5. Note that it caches to _CACHE_DIR to avoid lots of requests.
  6. Still to do:
  7. - Support more indices.
  8. - Make price/earnings and dividend yield more defensive.
  9. - Create a unit test class with typical variations of the html (ie new
  10. column) to ensure that the scraper breaks rather than producing incorrect data.
  11. - Create an alternative main function which can be used if this module
  12. is imported by another.
  13. """
  14. import os
  15. import sys
  16. import urllib2
  17. import re
  18. from collections import namedtuple
  19. from BeautifulSoup import BeautifulSoup
  20. _CACHE_DIR = '/tmp/topyields_cache'
  21. # ------------------------------------------------------------------------------
  22. Stock = namedtuple('Stock', 'name symbol pe dividend_yield')
  23. BASE_URL = 'http://www.topyields.nl'
  24. INDEX_URLS = {
  25. 'ftse100' : '/Top-dividend-yields-of-FTSE100.php',
  26. 'ftse250' : '/Top-dividend-yields-of-FTSE250.php',
  27. 'nyse' : '/Top-dividend-yields-of-NYSE.php',
  28. 'nasdaq' : '/Top-dividend-yields-of-NASDAQ.php',
  29. }
  30. class InvalidStockIndexError(Exception):
  31. pass
  32. def main():
  33. """
  34. Runs the scraper for the stock index specified as the first argument.
  35. Writes the result as CSV to stdout.
  36. """
  37. if len(sys.argv) == 1:
  38. usage()
  39. sys.exit(1)
  40. try:
  41. index_page = download_index(sys.argv[1])
  42. except InvalidStockIndexError:
  43. usage()
  44. sys.exit(2)
  45. stocks = parse_index_page_html(index_page.read())
  46. sys.stdout.write("Symbol,Name,Price/Earnings Ratio,Dividend Yield\n")
  47. for stock in stocks:
  48. sys.stdout.write("\"%s\",\"%s\",%s,%s\n" % (
  49. stock.symbol,stock.name,stock.pe,stock.dividend_yield))
  50. def usage():
  51. """Writes the usage message out to stderr."""
  52. sys.stderr.write("Usage: %s <stock index>\n Valid stock indices: %s\n" % (
  53. sys.argv[0],
  54. ','.join(get_valid_indices())))
  55. def get_valid_indices():
  56. """Returns a list of supported stock index names."""
  57. return INDEX_URLS.keys()
  58. def download_index(index_name):
  59. """
  60. Get the url for this index and open it, returning the HTML file.
  61. """
  62. url = get_index_url(index_name)
  63. return download_with_cache(url)
  64. def get_index_url(index_name):
  65. """
  66. Returns the absolute URL for the given index.
  67. >>> get_index_url('ftse100')
  68. 'http://www.topyields.nl/Top-dividend-yields-of-FTSE100.php'
  69. >>> get_index_url('ftse101')
  70. Traceback (most recent call last):
  71. ...
  72. InvalidStockIndexError: Unknown index: ftse101
  73. """
  74. try:
  75. return BASE_URL + INDEX_URLS[index_name]
  76. except KeyError:
  77. raise InvalidStockIndexError("Unknown index: %s" % index_name)
  78. def download_with_cache(url):
  79. """
  80. Attempts to read the URL from the cache directory - if not present, the URL
  81. is downloaded and stored to cache, then read back from there.
  82. """
  83. if not os.path.isdir(_CACHE_DIR):
  84. os.mkdir(_CACHE_DIR)
  85. filename = os.path.join(_CACHE_DIR, slugify(url))
  86. if not os.path.exists(filename):
  87. sys.stderr.write("Downloading %s to %s\n" % (url, filename))
  88. downloaded = urllib2.urlopen(url)
  89. f = open(filename, 'w')
  90. f.write(downloaded.read())
  91. f.close()
  92. return open(filename, 'r')
  93. def parse_index_page_html(html):
  94. """
  95. Creates the BeautifulSoup instance from the given html, locates the main
  96. table, then processes each row as an individual stock entry.
  97. """
  98. soup = BeautifulSoup(html)
  99. table = find_main_table(soup)
  100. if not table:
  101. raise ValueError("Failed to find a table.")
  102. table_rows = table.findAll('tr')
  103. return parse_stock_table_rows(table_rows[1:])
  104. def find_main_table(soup):
  105. """
  106. Attempts to find the stock table in the page html, returns None or
  107. a BeautifulSoup instance for the table.
  108. """
  109. return soup.find('table', attrs = {'id' : 'data'})
  110. def parse_stock_table_rows(trs):
  111. """
  112. Takes a list of table rows (tr) as BeautifulSoup instances where one row
  113. contains the data for one stock entry. Uses helper functions to extract
  114. the fields of interest, returning a populated list of Stock structures.
  115. """
  116. stocks = []
  117. for tr in trs:
  118. stocks.append(Stock(
  119. name = find_name_from_row(tr),
  120. symbol = find_symbol_from_row(tr),
  121. dividend_yield = find_dividend_yield_from_row(tr),
  122. pe = find_pe_from_row(tr)))
  123. return stocks
  124. def find_name_from_row(tr):
  125. """Returns the name of the stock ie 'Aviva PLC' or raises ValueError"""
  126. input_tag = tr.find('input', attrs = {'type' : 'hidden', 'name' : 'NAME'})
  127. if not input_tag:
  128. raise ValueError("Failed to extract stock name from row.")
  129. return input_tag['value']
  130. def find_symbol_from_row(tr):
  131. """Returns the stock symbol ie 'ULVR' or raises ValueError"""
  132. input_tag = tr.find('input', attrs = {'type' : 'hidden', 'name' : 'SYMBOL'})
  133. if not input_tag:
  134. raise ValueError("Failed to extract stock symbol from row.")
  135. return input_tag['value'].split(':')[0]
  136. def find_dividend_yield_from_row(tr):
  137. """
  138. Finds and converts the dividend yield to float, or raises if that
  139. fails or the value isn't in a sensible range.
  140. """
  141. raw = float(tr.findAll('td')[-2].text) # NOTE: make this a bit stronger
  142. return convert_to_float_in_range(raw, 0, 100, description="Dividend yield")
  143. def find_pe_from_row(tr):
  144. """
  145. Finds and converts the price-earnings ratio to float, or raises if that
  146. fails or the value isn't in a sensible range.
  147. """
  148. raw = tr.findAll('td')[-7].text # NOTE: make this a bit stronger
  149. return convert_to_float_in_range(raw, 0, 1000, description="Price/earnings")
  150. def convert_to_float_in_range(text, lower=None, upper=None, description="Value"):
  151. """
  152. Converts a string to a float, and validates the lower and upper. Raises
  153. ValueError if the conversion failed or the value isn't within bounds.
  154. >>> convert_to_float_in_range('123.45', 100, 200)
  155. 123.45
  156. >>> convert_to_float_in_range('xxx')
  157. Traceback (most recent call last):
  158. ...
  159. ValueError: Value of 'xxx' doesn't seem right!
  160. >>> convert_to_float_in_range('100', 50, 75)
  161. Traceback (most recent call last):
  162. ...
  163. ValueError: Value of '100' doesn't seem right!
  164. >>> convert_to_float_in_range(100, 150, 200)
  165. Traceback (most recent call last):
  166. ...
  167. ValueError: Value of '100' doesn't seem right!
  168. """
  169. suspect_value = False
  170. try:
  171. value = float(text)
  172. except ValueError:
  173. sys.stderr.write("Failed to convert '%s' to floating point.\n" % text)
  174. suspect_value = True
  175. else:
  176. if lower and value < lower or upper and value > upper:
  177. suspect_value = True
  178. if suspect_value:
  179. raise ValueError("%s of '%s' doesn't seem right!" % (
  180. description, text))
  181. else:
  182. return value
  183. _slugify_strip_re = re.compile(r'[^\w\s-]')
  184. _slugify_hyphenate_re = re.compile(r'[-\s]+')
  185. def slugify(value):
  186. """
  187. Normalizes string, converts to lowercase, removes non-alpha characters,
  188. and converts spaces to hyphens.
  189. From Django's "django/template/defaultfilters.py".
  190. """
  191. import unicodedata
  192. if not isinstance(value, unicode):
  193. value = unicode(value)
  194. value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
  195. value = unicode(_slugify_strip_re.sub('', value).strip().lower())
  196. return _slugify_hyphenate_re.sub('-', value)
  197. if __name__ == '__main__':
  198. main()