PageRenderTime 58ms CodeModel.GetById 26ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/io/data.py

http://github.com/pydata/pandas
Python | 1203 lines | 1172 code | 15 blank | 16 comment | 8 complexity | 01710f94731959407769eca16084f555 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. """
  2. Module contains tools for collecting data from various remote sources
  3. """
  4. import warnings
  5. import tempfile
  6. import datetime as dt
  7. import time
  8. from collections import defaultdict
  9. import numpy as np
  10. from pandas.compat import(
  11. StringIO, bytes_to_str, range, lrange, lmap, zip
  12. )
  13. import pandas.compat as compat
  14. from pandas import Panel, DataFrame, Series, read_csv, concat, to_datetime
  15. from pandas.core.common import is_list_like, PandasError
  16. from pandas.io.parsers import TextParser
  17. from pandas.io.common import urlopen, ZipFile, urlencode
  18. from pandas.tseries.offsets import MonthBegin
  19. from pandas.util.testing import _network_error_classes
  20. class SymbolWarning(UserWarning):
  21. pass
  22. class RemoteDataError(PandasError, IOError):
  23. pass
  24. def DataReader(name, data_source=None, start=None, end=None,
  25. retry_count=3, pause=0.001):
  26. """
  27. Imports data from a number of online sources.
  28. Currently supports Yahoo! Finance, Google Finance, St. Louis FED (FRED)
  29. and Kenneth French's data library.
  30. Parameters
  31. ----------
  32. name : str or list of strs
  33. the name of the dataset. Some data sources (yahoo, google, fred) will
  34. accept a list of names.
  35. data_source: str
  36. the data source ("yahoo", "google", "fred", or "ff")
  37. start : {datetime, None}
  38. left boundary for range (defaults to 1/1/2010)
  39. end : {datetime, None}
  40. right boundary for range (defaults to today)
  41. Examples
  42. ----------
  43. # Data from Yahoo! Finance
  44. gs = DataReader("GS", "yahoo")
  45. # Data from Google Finance
  46. aapl = DataReader("AAPL", "google")
  47. # Data from FRED
  48. vix = DataReader("VIXCLS", "fred")
  49. # Data from Fama/French
  50. ff = DataReader("F-F_Research_Data_Factors", "famafrench")
  51. ff = DataReader("F-F_Research_Data_Factors_weekly", "famafrench")
  52. ff = DataReader("6_Portfolios_2x3", "famafrench")
  53. ff = DataReader("F-F_ST_Reversal_Factor", "famafrench")
  54. """
  55. start, end = _sanitize_dates(start, end)
  56. if data_source == "yahoo":
  57. return get_data_yahoo(symbols=name, start=start, end=end,
  58. adjust_price=False, chunksize=25,
  59. retry_count=retry_count, pause=pause)
  60. elif data_source == "google":
  61. return get_data_google(symbols=name, start=start, end=end,
  62. adjust_price=False, chunksize=25,
  63. retry_count=retry_count, pause=pause)
  64. elif data_source == "fred":
  65. return get_data_fred(name, start, end)
  66. elif data_source == "famafrench":
  67. return get_data_famafrench(name)
  68. def _sanitize_dates(start, end):
  69. from pandas.core.datetools import to_datetime
  70. start = to_datetime(start)
  71. end = to_datetime(end)
  72. if start is None:
  73. start = dt.datetime(2010, 1, 1)
  74. if end is None:
  75. end = dt.datetime.today()
  76. return start, end
  77. def _in_chunks(seq, size):
  78. """
  79. Return sequence in 'chunks' of size defined by size
  80. """
  81. return (seq[pos:pos + size] for pos in range(0, len(seq), size))
  82. _yahoo_codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r',
  83. 'time': 't1', 'short_ratio': 's7'}
  84. _YAHOO_QUOTE_URL = 'http://finance.yahoo.com/d/quotes.csv?'
  85. def get_quote_yahoo(symbols):
  86. """
  87. Get current yahoo quote
  88. Returns a DataFrame
  89. """
  90. if isinstance(symbols, compat.string_types):
  91. sym_list = symbols
  92. else:
  93. sym_list = '+'.join(symbols)
  94. # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm
  95. request = ''.join(compat.itervalues(_yahoo_codes)) # code request string
  96. header = list(_yahoo_codes.keys())
  97. data = defaultdict(list)
  98. url_str = _YAHOO_QUOTE_URL + 's=%s&f=%s' % (sym_list, request)
  99. with urlopen(url_str) as url:
  100. lines = url.readlines()
  101. for line in lines:
  102. fields = line.decode('utf-8').strip().split(',')
  103. for i, field in enumerate(fields):
  104. if field[-2:] == '%"':
  105. v = float(field.strip('"%'))
  106. elif field[0] == '"':
  107. v = field.strip('"')
  108. else:
  109. try:
  110. v = float(field)
  111. except ValueError:
  112. v = np.nan
  113. data[header[i]].append(v)
  114. idx = data.pop('symbol')
  115. return DataFrame(data, index=idx)
  116. def get_quote_google(symbols):
  117. raise NotImplementedError("Google Finance doesn't have this functionality")
  118. def _retry_read_url(url, retry_count, pause, name):
  119. for _ in range(retry_count):
  120. time.sleep(pause)
  121. # kludge to close the socket ASAP
  122. try:
  123. with urlopen(url) as resp:
  124. lines = resp.read()
  125. except _network_error_classes:
  126. pass
  127. else:
  128. rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
  129. parse_dates=True)[::-1]
  130. # Yahoo! Finance sometimes does this awesome thing where they
  131. # return 2 rows for the most recent business day
  132. if len(rs) > 2 and rs.index[-1] == rs.index[-2]: # pragma: no cover
  133. rs = rs[:-1]
  134. return rs
  135. raise IOError("after %d tries, %s did not "
  136. "return a 200 for url %r" % (retry_count, name, url))
  137. _HISTORICAL_YAHOO_URL = 'http://ichart.finance.yahoo.com/table.csv?'
  138. def _get_hist_yahoo(sym, start, end, retry_count, pause):
  139. """
  140. Get historical data for the given name from yahoo.
  141. Date format is datetime
  142. Returns a DataFrame.
  143. """
  144. start, end = _sanitize_dates(start, end)
  145. url = (_HISTORICAL_YAHOO_URL + 's=%s' % sym +
  146. '&a=%s' % (start.month - 1) +
  147. '&b=%s' % start.day +
  148. '&c=%s' % start.year +
  149. '&d=%s' % (end.month - 1) +
  150. '&e=%s' % end.day +
  151. '&f=%s' % end.year +
  152. '&g=d' +
  153. '&ignore=.csv')
  154. return _retry_read_url(url, retry_count, pause, 'Yahoo!')
  155. _HISTORICAL_GOOGLE_URL = 'http://www.google.com/finance/historical?'
  156. def _get_hist_google(sym, start, end, retry_count, pause):
  157. """
  158. Get historical data for the given name from google.
  159. Date format is datetime
  160. Returns a DataFrame.
  161. """
  162. start, end = _sanitize_dates(start, end)
  163. # www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv
  164. url = "%s%s" % (_HISTORICAL_GOOGLE_URL,
  165. urlencode({"q": sym,
  166. "startdate": start.strftime('%b %d, ' '%Y'),
  167. "enddate": end.strftime('%b %d, %Y'),
  168. "output": "csv"}))
  169. return _retry_read_url(url, retry_count, pause, 'Google')
  170. def _adjust_prices(hist_data, price_list=None):
  171. """
  172. Return modifed DataFrame or Panel with adjusted prices based on
  173. 'Adj Close' price. Adds 'Adj_Ratio' column.
  174. """
  175. if price_list is None:
  176. price_list = 'Open', 'High', 'Low', 'Close'
  177. adj_ratio = hist_data['Adj Close'] / hist_data['Close']
  178. data = hist_data.copy()
  179. for item in price_list:
  180. data[item] = hist_data[item] * adj_ratio
  181. data['Adj_Ratio'] = adj_ratio
  182. del data['Adj Close']
  183. return data
  184. def _calc_return_index(price_df):
  185. """
  186. Return a returns index from a input price df or series. Initial value
  187. (typically NaN) is set to 1.
  188. """
  189. df = price_df.pct_change().add(1).cumprod()
  190. mask = df.ix[1].notnull() & df.ix[0].isnull()
  191. df.ix[0][mask] = 1
  192. # Check for first stock listings after starting date of index in ret_index
  193. # If True, find first_valid_index and set previous entry to 1.
  194. if (~mask).any():
  195. for sym in mask.index[~mask]:
  196. tstamp = df[sym].first_valid_index()
  197. t_idx = df.index.get_loc(tstamp) - 1
  198. df[sym].ix[t_idx] = 1
  199. return df
  200. _YAHOO_COMPONENTS_URL = 'http://download.finance.yahoo.com/d/quotes.csv?'
  201. def get_components_yahoo(idx_sym):
  202. """
  203. Returns DataFrame containing list of component information for
  204. index represented in idx_sym from yahoo. Includes component symbol
  205. (ticker), exchange, and name.
  206. Parameters
  207. ----------
  208. idx_sym : str
  209. Stock index symbol
  210. Examples:
  211. '^DJI' (Dow Jones Industrial Average)
  212. '^NYA' (NYSE Composite)
  213. '^IXIC' (NASDAQ Composite)
  214. See: http://finance.yahoo.com/indices for other index symbols
  215. Returns
  216. -------
  217. idx_df : DataFrame
  218. """
  219. stats = 'snx'
  220. # URL of form:
  221. # http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv
  222. url = _YAHOO_COMPONENTS_URL + 's={0}&f={1}&e=.csv&h={2}'
  223. idx_mod = idx_sym.replace('^', '@%5E')
  224. url_str = url.format(idx_mod, stats, 1)
  225. idx_df = DataFrame()
  226. mask = [True]
  227. comp_idx = 1
  228. # LOOP across component index structure,
  229. # break when no new components are found
  230. while True in mask:
  231. url_str = url.format(idx_mod, stats, comp_idx)
  232. with urlopen(url_str) as resp:
  233. raw = resp.read()
  234. lines = raw.decode('utf-8').strip().strip('"').split('"\r\n"')
  235. lines = [line.strip().split('","') for line in lines]
  236. temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange'])
  237. temp_df = temp_df.drop_duplicates()
  238. temp_df = temp_df.set_index('ticker')
  239. mask = ~temp_df.index.isin(idx_df.index)
  240. comp_idx = comp_idx + 50
  241. idx_df = idx_df.append(temp_df[mask])
  242. return idx_df
  243. def _dl_mult_symbols(symbols, start, end, chunksize, retry_count, pause,
  244. method):
  245. stocks = {}
  246. for sym_group in _in_chunks(symbols, chunksize):
  247. for sym in sym_group:
  248. try:
  249. stocks[sym] = method(sym, start, end, retry_count, pause)
  250. except IOError:
  251. warnings.warn('Failed to read symbol: {0!r}, replacing with '
  252. 'NaN.'.format(sym), SymbolWarning)
  253. stocks[sym] = np.nan
  254. try:
  255. return Panel(stocks).swapaxes('items', 'minor')
  256. except AttributeError:
  257. # cannot construct a panel with just 1D nans indicating no data
  258. raise RemoteDataError("No data fetched using "
  259. "{0!r}".format(method.__name__))
  260. _source_functions = {'google': _get_hist_google, 'yahoo': _get_hist_yahoo}
  261. def _get_data_from(symbols, start, end, retry_count, pause, adjust_price,
  262. ret_index, chunksize, source):
  263. src_fn = _source_functions[source]
  264. # If a single symbol, (e.g., 'GOOG')
  265. if isinstance(symbols, (compat.string_types, int)):
  266. hist_data = src_fn(symbols, start, end, retry_count, pause)
  267. # Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT'])
  268. elif isinstance(symbols, DataFrame):
  269. hist_data = _dl_mult_symbols(symbols.index, start, end, chunksize,
  270. retry_count, pause, src_fn)
  271. else:
  272. hist_data = _dl_mult_symbols(symbols, start, end, chunksize,
  273. retry_count, pause, src_fn)
  274. if source.lower() == 'yahoo':
  275. if ret_index:
  276. hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close'])
  277. if adjust_price:
  278. hist_data = _adjust_prices(hist_data)
  279. return hist_data
  280. def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3,
  281. pause=0.001, adjust_price=False, ret_index=False,
  282. chunksize=25):
  283. """
  284. Returns DataFrame/Panel of historical stock prices from symbols, over date
  285. range, start to end. To avoid being penalized by Yahoo! Finance servers,
  286. pauses between downloading 'chunks' of symbols can be specified.
  287. Parameters
  288. ----------
  289. symbols : string, array-like object (list, tuple, Series), or DataFrame
  290. Single stock symbol (ticker), array-like object of symbols or
  291. DataFrame with index containing stock symbols.
  292. start : string, (defaults to '1/1/2010')
  293. Starting date, timestamp. Parses many different kind of date
  294. representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
  295. end : string, (defaults to today)
  296. Ending date, timestamp. Same format as starting date.
  297. retry_count : int, default 3
  298. Number of times to retry query request.
  299. pause : int, default 0
  300. Time, in seconds, to pause between consecutive queries of chunks. If
  301. single value given for symbol, represents the pause between retries.
  302. adjust_price : bool, default False
  303. If True, adjusts all prices in hist_data ('Open', 'High', 'Low',
  304. 'Close') based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops
  305. 'Adj Close'.
  306. ret_index : bool, default False
  307. If True, includes a simple return index 'Ret_Index' in hist_data.
  308. chunksize : int, default 25
  309. Number of symbols to download consecutively before intiating pause.
  310. Returns
  311. -------
  312. hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
  313. """
  314. return _get_data_from(symbols, start, end, retry_count, pause,
  315. adjust_price, ret_index, chunksize, 'yahoo')
  316. def get_data_google(symbols=None, start=None, end=None, retry_count=3,
  317. pause=0.001, adjust_price=False, ret_index=False,
  318. chunksize=25):
  319. """
  320. Returns DataFrame/Panel of historical stock prices from symbols, over date
  321. range, start to end. To avoid being penalized by Google Finance servers,
  322. pauses between downloading 'chunks' of symbols can be specified.
  323. Parameters
  324. ----------
  325. symbols : string, array-like object (list, tuple, Series), or DataFrame
  326. Single stock symbol (ticker), array-like object of symbols or
  327. DataFrame with index containing stock symbols.
  328. start : string, (defaults to '1/1/2010')
  329. Starting date, timestamp. Parses many different kind of date
  330. representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
  331. end : string, (defaults to today)
  332. Ending date, timestamp. Same format as starting date.
  333. retry_count : int, default 3
  334. Number of times to retry query request.
  335. pause : int, default 0
  336. Time, in seconds, to pause between consecutive queries of chunks. If
  337. single value given for symbol, represents the pause between retries.
  338. chunksize : int, default 25
  339. Number of symbols to download consecutively before intiating pause.
  340. Returns
  341. -------
  342. hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
  343. """
  344. return _get_data_from(symbols, start, end, retry_count, pause,
  345. adjust_price, ret_index, chunksize, 'google')
  346. _FRED_URL = "http://research.stlouisfed.org/fred2/series/"
  347. def get_data_fred(name, start=dt.datetime(2010, 1, 1),
  348. end=dt.datetime.today()):
  349. """
  350. Get data for the given name from the St. Louis FED (FRED).
  351. Date format is datetime
  352. Returns a DataFrame.
  353. If multiple names are passed for "series" then the index of the
  354. DataFrame is the outer join of the indicies of each series.
  355. """
  356. start, end = _sanitize_dates(start, end)
  357. if not is_list_like(name):
  358. names = [name]
  359. else:
  360. names = name
  361. urls = [_FRED_URL + '%s' % n + '/downloaddata/%s' % n + '.csv' for
  362. n in names]
  363. def fetch_data(url, name):
  364. with urlopen(url) as resp:
  365. data = read_csv(resp, index_col=0, parse_dates=True,
  366. header=None, skiprows=1, names=["DATE", name],
  367. na_values='.')
  368. try:
  369. return data.truncate(start, end)
  370. except KeyError:
  371. if data.ix[3].name[7:12] == 'Error':
  372. raise IOError("Failed to get the data. Check that {0!r} is "
  373. "a valid FRED series.".format(name))
  374. raise
  375. df = concat([fetch_data(url, n) for url, n in zip(urls, names)],
  376. axis=1, join='outer')
  377. return df
# Base URL under which the Fama/French zip archives are requested.
_FAMAFRENCH_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp'


def get_data_famafrench(name):
    """
    Download ``<name>.zip`` from the Fama/French site and parse the first
    file in the archive into numbered DataFrames.

    Parameters
    ----------
    name : str
        Archive name without the ``.zip`` extension.

    Returns
    -------
    dict mapping the position of a section in the file (int) to a DataFrame.
    Only sections with more than 10 lines are parsed.
    """
    # path of zip files
    zip_file_path = '{0}/{1}.zip'.format(_FAMAFRENCH_URL, name)

    with urlopen(zip_file_path) as url:
        raw = url.read()

    # ZipFile is given a file object, so spool the download to a temp file.
    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).readlines()

    # Lines of length 2 are used as section separators (presumably bare
    # '\r\n' lines -- TODO confirm against an actual file).
    line_lengths = np.array(lmap(len, data))
    file_edges = np.where(line_lengths == 2)[0]

    datasets = {}
    # Pair every separator with the next one: (first line after, next sep).
    edges = zip(file_edges + 1, file_edges[1:])
    for i, (left_edge, right_edge) in enumerate(edges):
        dataset = [d.split() for d in data[left_edge:right_edge]]
        if len(dataset) > 10:
            ncol_raw = np.array(lmap(len, dataset))
            ncol = np.median(ncol_raw)
            # The header is taken to be the last line with one field fewer
            # than the median line (it has no entry for the index column).
            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
            header = dataset[header_index]
            ds_header = dataset[header_index + 1:]
            # to ensure the header is unique
            header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header,
                                                                   start=1)]
            # First field of each row is the integer index, the rest floats.
            index = np.array([d[0] for d in ds_header], dtype=int)
            dataset = np.array([d[1:] for d in ds_header], dtype=float)
            datasets[i] = DataFrame(dataset, index, columns=header)

    return datasets
  407. # Items needed for options class
  408. CUR_MONTH = dt.datetime.now().month
  409. CUR_YEAR = dt.datetime.now().year
  410. CUR_DAY = dt.datetime.now().day
  411. def _unpack(row, kind):
  412. def _parse_row_values(val):
  413. ret = val.text_content()
  414. if 'neg_arrow' in val.xpath('.//@class'):
  415. try:
  416. ret = float(ret.replace(',', ''))*(-1.0)
  417. except ValueError:
  418. ret = np.nan
  419. return ret
  420. els = row.xpath('.//%s' % kind)
  421. return [_parse_row_values(val) for val in els]
  422. def _parse_options_data(table):
  423. rows = table.xpath('.//tr')
  424. header = _unpack(rows[0], kind='th')
  425. data = [_unpack(row, kind='td') for row in rows[1:]]
  426. # Use ',' as a thousands separator as we're pulling from the US site.
  427. return TextParser(data, names=header, na_values=['N/A'],
  428. thousands=',').get_chunk()
  429. def _two_char_month(s):
  430. return '{0:0>2}'.format(s)
  431. class Options(object):
  432. """
  433. ***Experimental***
  434. This class fetches call/put data for a given stock/expiry month.
  435. It is instantiated with a string representing the ticker symbol.
  436. The class has the following methods:
  437. get_options_data:(month, year, expiry)
  438. get_call_data:(month, year, expiry)
  439. get_put_data: (month, year, expiry)
  440. get_near_stock_price(opt_frame, above_below)
  441. get_all_data(call, put)
  442. get_forward_data(months, call, put) (deprecated)
  443. Examples
  444. --------
  445. # Instantiate object with ticker
  446. >>> aapl = Options('aapl', 'yahoo')
  447. # Fetch May 2014 call data
  448. >>> expiry = datetime.date(2014, 5, 1)
  449. >>> calls = aapl.get_call_data(expiry=expiry)
  450. # Can now access aapl.calls instance variable
  451. >>> aapl.calls
  452. # Fetch May 2014 put data
  453. >>> puts = aapl.get_put_data(expiry=expiry)
  454. # Can now access aapl.puts instance variable
  455. >>> aapl.puts
  456. # cut down the call data to be 3 below and 3 above the stock price.
  457. >>> cut_calls = aapl.get_near_stock_price(call=True, above_below=3)
  458. # Fetch call and put data with expiry from now to 8 months out
  459. >>> forward_data = aapl.get_forward_data(8, call=True, put=True)
  460. # Fetch all call and put data
  461. >>> all_data = aapl.get_all_data()
  462. """
  463. _TABLE_LOC = {'calls': 9, 'puts': 13}
  464. def __init__(self, symbol, data_source=None):
  465. """ Instantiates options_data with a ticker saved as symbol """
  466. self.symbol = symbol.upper()
  467. if data_source is None:
  468. warnings.warn("Options(symbol) is deprecated, use Options(symbol,"
  469. " data_source) instead", FutureWarning)
  470. data_source = "yahoo"
  471. if data_source != "yahoo":
  472. raise NotImplementedError("currently only yahoo supported")
  473. def get_options_data(self, month=None, year=None, expiry=None):
  474. """
  475. ***Experimental***
  476. Gets call/put data for the stock with the expiration data in the
  477. given month and year
  478. Parameters
  479. ----------
  480. expiry: datetime.date, optional(default=None)
  481. The date when options expire (defaults to current month)
  482. Returns
  483. -------
  484. pandas.DataFrame
  485. A DataFrame with requested options data.
  486. Index:
  487. Strike: Option strike, int
  488. Expiry: Option expiry, datetime.date
  489. Type: Call or Put, string
  490. Symbol: Option symbol as reported on Yahoo, string
  491. Columns:
  492. Last: Last option price, float
  493. Chg: Change from prior day, float
  494. Bid: Bid price, float
  495. Ask: Ask price, float
  496. Vol: Volume traded, int64
  497. Open_Int: Open interest, int64
  498. IsNonstandard: True if the the deliverable is not 100 shares, otherwise false
  499. Underlying: Ticker of the underlying security, string
  500. Underlying_Price: Price of the underlying security, float64
  501. Quote_Time: Time of the quote, Timestamp
  502. Notes
  503. -----
  504. Note: Format of returned data frame is dependent on Yahoo and may change.
  505. When called, this function will add instance variables named
  506. calls and puts. See the following example:
  507. >>> aapl = Options('aapl', 'yahoo') # Create object
  508. >>> aapl.calls # will give an AttributeError
  509. >>> aapl.get_options() # Get data and set ivars
  510. >>> aapl.calls # Doesn't throw AttributeError
  511. Also note that aapl.calls and appl.puts will always be the calls
  512. and puts for the next expiry. If the user calls this method with
  513. a different month or year, the ivar will be named callsMMYY or
  514. putsMMYY where MM and YY are, respectively, two digit
  515. representations of the month and year for the expiry of the
  516. options.
  517. """
  518. return concat([f(month, year, expiry)
  519. for f in (self.get_put_data,
  520. self.get_call_data)]).sortlevel()
    # Yahoo! options page URL template; ``{sym}`` is filled with the ticker.
    _OPTIONS_BASE_URL = 'http://finance.yahoo.com/q/op?s={sym}'
  522. def _get_option_tables(self, expiry):
  523. root = self._get_option_page_from_yahoo(expiry)
  524. tables = self._parse_option_page_from_yahoo(root)
  525. m1 = _two_char_month(expiry.month)
  526. table_name = '_tables' + m1 + str(expiry.year)[-2:]
  527. setattr(self, table_name, tables)
  528. return tables
  529. def _get_option_page_from_yahoo(self, expiry):
  530. url = self._OPTIONS_BASE_URL.format(sym=self.symbol)
  531. m1 = _two_char_month(expiry.month)
  532. # if this month use other url
  533. if expiry.month == CUR_MONTH and expiry.year == CUR_YEAR:
  534. url += '+Options'
  535. else:
  536. url += '&m={year}-{m1}'.format(year=expiry.year, m1=m1)
  537. root = self._parse_url(url)
  538. return root
  539. def _parse_option_page_from_yahoo(self, root):
  540. tables = root.xpath('.//table')
  541. ntables = len(tables)
  542. if ntables == 0:
  543. raise RemoteDataError("No tables found")
  544. try:
  545. self.underlying_price, self.quote_time = self._get_underlying_price(root)
  546. except IndexError:
  547. self.underlying_price, self.quote_time = np.nan, np.nan
  548. return tables
  549. def _get_underlying_price(self, root):
  550. underlying_price = float(root.xpath('.//*[@class="time_rtq_ticker"]')[0]\
  551. .getchildren()[0].text)
  552. #Gets the time of the quote, note this is actually the time of the underlying price.
  553. quote_time_text = root.xpath('.//*[@class="time_rtq"]')[0].getchildren()[0].text
  554. if quote_time_text:
  555. #weekend and prior to market open time format
  556. split = quote_time_text.split(",")
  557. timesplit = split[1].strip().split(":")
  558. timestring = split[0] + ", " + timesplit[0].zfill(2) + ":" + timesplit[1]
  559. quote_time = dt.datetime.strptime(timestring, "%b %d, %H:%M%p EDT")
  560. quote_time = quote_time.replace(year=CUR_YEAR)
  561. else:
  562. quote_time_text = root.xpath('.//*[@class="time_rtq"]')[0].getchildren()[0].getchildren()[0].text
  563. quote_time = dt.datetime.strptime(quote_time_text, "%H:%M%p EDT")
  564. quote_time = quote_time.replace(year=CUR_YEAR, month=CUR_MONTH, day=CUR_DAY)
  565. return underlying_price, quote_time
  566. def _get_option_data(self, month, year, expiry, name):
  567. year, month, expiry = self._try_parse_dates(year, month, expiry)
  568. m1 = _two_char_month(month)
  569. table_name = '_tables' + m1 + str(year)[-2:]
  570. try:
  571. tables = getattr(self, table_name)
  572. except AttributeError:
  573. tables = self._get_option_tables(expiry)
  574. ntables = len(tables)
  575. table_loc = self._TABLE_LOC[name]
  576. if table_loc - 1 > ntables:
  577. raise RemoteDataError("Table location {0} invalid, {1} tables"
  578. " found".format(table_loc, ntables))
  579. option_data = _parse_options_data(tables[table_loc])
  580. option_data['Type'] = name[:-1]
  581. option_data = self._process_data(option_data, name[:-1])
  582. if month == CUR_MONTH and year == CUR_YEAR:
  583. setattr(self, name, option_data)
  584. name += m1 + str(year)[-2:]
  585. setattr(self, name, option_data)
  586. return option_data
  587. def get_call_data(self, month=None, year=None, expiry=None):
  588. """
  589. ***Experimental***
  590. Gets call/put data for the stock with the expiration data in the
  591. given month and year
  592. Parameters
  593. ----------
  594. expiry: datetime.date, optional(default=None)
  595. The date when options expire (defaults to current month)
  596. Returns
  597. -------
  598. call_data: pandas.DataFrame
  599. A DataFrame with requested options data.
  600. Index:
  601. Strike: Option strike, int
  602. Expiry: Option expiry, datetime.date
  603. Type: Call or Put, string
  604. Symbol: Option symbol as reported on Yahoo, string
  605. Columns:
  606. Last: Last option price, float
  607. Chg: Change from prior day, float
  608. Bid: Bid price, float
  609. Ask: Ask price, float
  610. Vol: Volume traded, int64
  611. Open_Int: Open interest, int64
  612. IsNonstandard: True if the the deliverable is not 100 shares, otherwise false
  613. Underlying: Ticker of the underlying security, string
  614. Underlying_Price: Price of the underlying security, float64
  615. Quote_Time: Time of the quote, Timestamp
  616. Notes
  617. -----
  618. Note: Format of returned data frame is dependent on Yahoo and may change.
  619. When called, this function will add instance variables named
  620. calls and puts. See the following example:
  621. >>> aapl = Options('aapl', 'yahoo') # Create object
  622. >>> aapl.calls # will give an AttributeError
  623. >>> aapl.get_call_data() # Get data and set ivars
  624. >>> aapl.calls # Doesn't throw AttributeError
  625. Also note that aapl.calls will always be the calls for the next
  626. expiry. If the user calls this method with a different month
  627. or year, the ivar will be named callsMMYY where MM and YY are,
  628. respectively, two digit representations of the month and year
  629. for the expiry of the options.
  630. """
  631. return self._get_option_data(month, year, expiry, 'calls').sortlevel()
  632. def get_put_data(self, month=None, year=None, expiry=None):
  633. """
  634. ***Experimental***
  635. Gets put data for the stock with the expiration data in the
  636. given month and year
  637. Parameters
  638. ----------
  639. expiry: datetime.date, optional(default=None)
  640. The date when options expire (defaults to current month)
  641. Returns
  642. -------
  643. put_data: pandas.DataFrame
  644. A DataFrame with requested options data.
  645. Index:
  646. Strike: Option strike, int
  647. Expiry: Option expiry, datetime.date
  648. Type: Call or Put, string
  649. Symbol: Option symbol as reported on Yahoo, string
  650. Columns:
  651. Last: Last option price, float
  652. Chg: Change from prior day, float
  653. Bid: Bid price, float
  654. Ask: Ask price, float
  655. Vol: Volume traded, int64
  656. Open_Int: Open interest, int64
  657. IsNonstandard: True if the the deliverable is not 100 shares, otherwise false
  658. Underlying: Ticker of the underlying security, string
  659. Underlying_Price: Price of the underlying security, float64
  660. Quote_Time: Time of the quote, Timestamp
  661. Notes
  662. -----
  663. Note: Format of returned data frame is dependent on Yahoo and may change.
  664. When called, this function will add instance variables named
  665. puts. See the following example:
  666. >>> aapl = Options('aapl') # Create object
  667. >>> aapl.puts # will give an AttributeError
  668. >>> aapl.get_put_data() # Get data and set ivars
  669. >>> aapl.puts # Doesn't throw AttributeError
  670. return self.__setattr__(self, str(str(x) + str(y)))
  671. Also note that aapl.puts will always be the puts for the next
  672. expiry. If the user calls this method with a different month
  673. or year, the ivar will be named putsMMYY where MM and YY are,
  674. repsectively, two digit representations of the month and year
  675. for the expiry of the options.
  676. """
  677. return self._get_option_data(month, year, expiry, 'puts').sortlevel()
  678. def get_near_stock_price(self, above_below=2, call=True, put=False,
  679. month=None, year=None, expiry=None):
  680. """
  681. ***Experimental***
  682. Returns a data frame of options that are near the current stock price.
  683. Parameters
  684. ----------
  685. above_below: number, int, optional (default=2)
  686. The number of strike prices above and below the stock price that
  687. should be taken
  688. call: bool
  689. Tells the function whether or not it should be using
  690. self.calls
  691. put: bool
  692. Tells the function weather or not it should be using
  693. self.puts
  694. expiry: datetime.date, optional(default=None)
  695. The date when options expire (defaults to current month)
  696. Returns
  697. -------
  698. chopped: DataFrame
  699. The resultant DataFrame chopped down to be 2 * above_below + 1 rows
  700. desired. If there isn't data as far out as the user has asked for
  701. then
  702. Note: Format of returned data frame is dependent on Yahoo and may change.
  703. """
  704. to_ret = Series({'calls': call, 'puts': put})
  705. to_ret = to_ret[to_ret].index
  706. data = {}
  707. for nam in to_ret:
  708. df = self._get_option_data(month, year, expiry, nam)
  709. data[nam] = self.chop_data(df, above_below, self.underlying_price)
  710. return concat([data[nam] for nam in to_ret]).sortlevel()
  711. def chop_data(self, df, above_below=2, underlying_price=None):
  712. """Returns a data frame only options that are near the current stock price."""
  713. if not underlying_price:
  714. try:
  715. underlying_price = self.underlying_price
  716. except AttributeError:
  717. underlying_price = np.nan
  718. if not np.isnan(underlying_price):
  719. start_index = np.where(df.index.get_level_values('Strike')
  720. > underlying_price)[0][0]
  721. get_range = slice(start_index - above_below,
  722. start_index + above_below + 1)
  723. df = df[get_range].dropna(how='all')
  724. return df
  725. @staticmethod
  726. def _try_parse_dates(year, month, expiry):
  727. """
  728. Validates dates provided by user. Ensures the user either provided both a month and a year or an expiry.
  729. Parameters
  730. ----------
  731. year: Calendar year, int (deprecated)
  732. month: Calendar month, int (deprecated)
  733. expiry: Expiry date (month and year), datetime.date, (preferred)
  734. Returns
  735. -------
  736. Tuple of year (int), month (int), expiry (datetime.date)
  737. """
  738. #Checks if the user gave one of the month or the year but not both and did not provide an expiry:
  739. if (month is not None and year is None) or (month is None and year is not None) and expiry is None:
  740. msg = "You must specify either (`year` and `month`) or `expiry` " \
  741. "or none of these options for the current month."
  742. raise ValueError(msg)
  743. if (year is not None or month is not None) and expiry is None:
  744. warnings.warn("month, year arguments are deprecated, use expiry"
  745. " instead", FutureWarning)
  746. if expiry is not None:
  747. year = expiry.year
  748. month = expiry.month
  749. elif year is None and month is None:
  750. year = CUR_YEAR
  751. month = CUR_MONTH
  752. expiry = dt.date(year, month, 1)
  753. else:
  754. expiry = dt.date(year, month, 1)
  755. return year, month, expiry
  756. def get_forward_data(self, months, call=True, put=False, near=False,
  757. above_below=2):
  758. """
  759. ***Experimental***
  760. Gets either call, put, or both data for months starting in the current
  761. month and going out in the future a specified amount of time.
  762. Parameters
  763. ----------
  764. months: number, int
  765. How many months to go out in the collection of the data. This is
  766. inclusive.
  767. call: bool, optional (default=True)
  768. Whether or not to collect data for call options
  769. put: bool, optional (default=False)
  770. Whether or not to collect data for put options.
  771. near: bool, optional (default=False)
  772. Whether this function should get only the data near the
  773. current stock price. Uses Options.get_near_stock_price
  774. above_below: number, int, optional (default=2)
  775. The number of strike prices above and below the stock price that
  776. should be taken if the near option is set to True
  777. Returns
  778. -------
  779. pandas.DataFrame
  780. A DataFrame with requested options data.
  781. Index:
  782. Strike: Option strike, int
  783. Expiry: Option expiry, datetime.date
  784. Type: Call or Put, string
  785. Symbol: Option symbol as reported on Yahoo, string
  786. Columns:
  787. Last: Last option price, float
  788. Chg: Change from prior day, float
  789. Bid: Bid price, float
  790. Ask: Ask price, float
  791. Vol: Volume traded, int64
  792. Open_Int: Open interest, int64
  793. IsNonstandard: True if the the deliverable is not 100 shares, otherwise false
  794. Underlying: Ticker of the underlying security, string
  795. Underlying_Price: Price of the underlying security, float64
  796. Quote_Time: Time of the quote, Timestamp
  797. Note: Format of returned data frame is dependent on Yahoo and may change.
  798. """
  799. warnings.warn("get_forward_data() is deprecated", FutureWarning)
  800. in_months = lrange(CUR_MONTH, CUR_MONTH + months + 1)
  801. in_years = [CUR_YEAR] * (months + 1)
  802. # Figure out how many items in in_months go past 12
  803. to_change = 0
  804. for i in range(months):
  805. if in_months[i] > 12:
  806. in_months[i] -= 12
  807. to_change += 1
  808. # Change the corresponding items in the in_years list.
  809. for i in range(1, to_change + 1):
  810. in_years[-i] += 1
  811. to_ret = Series({'calls': call, 'puts': put})
  812. to_ret = to_ret[to_ret].index
  813. all_data = []
  814. for name in to_ret:
  815. for mon in range(months):
  816. m2 = in_months[mon]
  817. y2 = in_years[mon]
  818. if not near:
  819. m1 = _two_char_month(m2)
  820. nam = name + str(m1) + str(y2)[2:]
  821. try: # Try to access on the instance
  822. frame = getattr(self, nam)
  823. except AttributeError:
  824. meth_name = 'get_{0}_data'.format(name[:-1])
  825. frame = getattr(self, meth_name)(m2, y2)
  826. else:
  827. frame = self.get_near_stock_price(call=call, put=put,
  828. above_below=above_below,
  829. month=m2, year=y2)
  830. frame = self._process_data(frame, name[:-1])
  831. all_data.append(frame)
  832. return concat(all_data).sortlevel()
  833. def get_all_data(self, call=True, put=True):
  834. """
  835. ***Experimental***
  836. Gets either call, put, or both data for all available months starting
  837. in the current month.
  838. Parameters
  839. ----------
  840. call: bool, optional (default=True)
  841. Whether or not to collect data for call options
  842. put: bool, optional (default=True)
  843. Whether or not to collect data for put options.
  844. Returns
  845. -------
  846. pandas.DataFrame
  847. A DataFrame with requested options data.
  848. Index:
  849. Strike: Option strike, int
  850. Expiry: Option expiry, datetime.date
  851. Type: Call or Put, string
  852. Symbol: Option symbol as reported on Yahoo, string
  853. Columns:
  854. Last: Last option price, float
  855. Chg: Change from prior day, float
  856. Bid: Bid price, float
  857. Ask: Ask price, float
  858. Vol: Volume traded, int64
  859. Open_Int: Open interest, int64
  860. IsNonstandard: True if the the deliverable is not 100 shares, otherwise false
  861. Underlying: Ticker of the underlying security, string
  862. Underlying_Price: Price of the underlying security, float64
  863. Quote_Time: Time of the quote, Timestamp
  864. Note: Format of returned data frame is dependent on Yahoo and may change.
  865. """
  866. to_ret = Series({'calls': call, 'puts': put})
  867. to_ret = to_ret[to_ret].index
  868. try:
  869. months = self.months
  870. except AttributeError:
  871. months = self._get_expiry_months()
  872. all_data = []
  873. for name in to_ret:
  874. for month in months:
  875. m2 = month.month
  876. y2 = month.year
  877. m1 = _two_char_month(m2)
  878. nam = name + str(m1) + str(y2)[2:]
  879. try: # Try to access on the instance
  880. frame = getattr(self, nam)
  881. except AttributeError:
  882. meth_name = 'get_{0}_data'.format(name[:-1])
  883. frame = getattr(self, meth_name)(expiry=month)
  884. all_data.append(frame)
  885. return concat(all_data).sortlevel()
  886. def _get_expiry_months(self):
  887. """
  888. Gets available expiry months.
  889. Returns
  890. -------
  891. months : List of datetime objects
  892. """
  893. url = 'http://finance.yahoo.com/q/op?s={sym}'.format(sym=self.symbol)
  894. root = self._parse_url(url)
  895. try:
  896. links = root.xpath('.//*[@id="yfncsumtab"]')[0].xpath('.//a')
  897. except IndexError:
  898. return RemoteDataError('Expiry months not available')
  899. month_gen = (element.attrib['href'].split('=')[-1]
  900. for element in links
  901. if '/q/op?s=' in element.attrib['href']
  902. and '&m=' in element.attrib['href'])
  903. months = [dt.date(int(month.split('-')[0]),
  904. int(month.split('-')[1]), 1)
  905. for month in month_gen]
  906. current_month_text = root.xpath('.//*[@id="yfncsumtab"]')[0].xpath('.//strong')[0].text
  907. current_month = dt.datetime.strptime(current_month_text, '%b %y')
  908. months.insert(0, current_month)
  909. self.months = months
  910. return months
  911. def _parse_url(self, url):
  912. """
  913. Downloads and parses a URL, returns xml root.
  914. """
  915. try:
  916. from lxml.html import parse
  917. except ImportError:
  918. raise ImportError("Please install lxml if you want to use the "
  919. "{0!r} class".format(self.__class__.__name__))
  920. try:
  921. doc = parse(url)
  922. except _network_error_classes:
  923. raise RemoteDataError("Unable to parse URL "
  924. "{0!r}".format(url))
  925. else:
  926. root = doc.getroot()
  927. if root is None:
  928. raise RemoteDataError("Parsed URL {0!r} has no root"
  929. "element".format(url))
  930. return root
  931. def _process_data(self, frame, type):
  932. """
  933. Adds columns for Expiry, IsNonstandard (ie: deliverable is not 100 shares)
  934. and Tag (the tag indicating what is actually deliverable, None if standard).
  935. """
  936. frame["Rootexp"] = frame.Symbol.str[0:-9]
  937. frame["Root"] = frame.Rootexp.str[0:-6]
  938. frame["Expiry"] = to_datetime(frame.Rootexp.str[-6:])
  939. #Removes dashes in equity ticker to map to option ticker.
  940. #Ex: BRK-B to BRKB140517C00100000
  941. frame["IsNonstandard"] = frame['Root'] != self.symbol.replace('-','')
  942. del frame["Rootexp"]
  943. frame["Underlying"] = self.symbol
  944. frame['Underlying_Price'] = self.underlying_price
  945. frame["Quote_Time"] = self.quote_time
  946. frame.rename(columns={'Open Int': 'Open_Int'}, inplace=True)
  947. frame['Type'] = type
  948. frame.set_index(['Strike', 'Expiry', 'Type', 'Symbol'], inplace=True)
  949. return frame