PageRenderTime 53ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/Economics/E588/Project/get_data.py

https://bitbucket.org/spencerlyon2/byuclasses
Python | 326 lines | 242 code | 49 blank | 35 comment | 15 complexity | 0a479da99e5eb3926b47a6497785a268 MD5 | raw file
  1. """
  2. Created Apr 9, 2013
  3. Author: Spencer Lyon
  4. Various routines used in my final project for 588
  5. TODO:
  6. - Follow the data description from the original EPP paper page 147.
  7. """
  8. import pandas as pd
  9. from pandas.io.data import DataReader, get_data_yahoo
  10. import statsmodels.api as sm
  11. import matplotlib.pyplot as plt
  12. import numpy as np
  13. from byumcl.misc.pd_tools import df_to_latex
  14. # Apply monkey patch to pandas DataFrame to get my latex_form function
  15. pd.DataFrame.latex_form = lambda self, *args, **kwargs: \
  16. df_to_latex(self, *args, **kwargs)
  17. def save_data():
  18. start = '1/1/1990'
  19. # Get S&P 500 data from yahoo
  20. sp500 = get_data_yahoo('^GSPC', start=start)['Adj Close']
  21. sp500.name = 'SP500'
  22. vix = get_data_yahoo('^VIX', start=start)['Adj Close']
  23. vix.name = 'VIX'
  24. # Get ten year and 3 month t-bill rates
  25. ten_yr = DataReader('DGS10', 'fred', start=start)
  26. three_mon = DataReader('DGS3MO', 'fred', start=start)
  27. ten_yr = ten_yr.ix[ten_yr.DGS10.str.count(r'^\.') != 1].astype(float)
  28. three_mon = three_mon.ix[three_mon.DGS3MO.str.count(r'^\.') != 1].astype(float)
  29. data = ten_yr.join(three_mon)
  30. data = data.join(sp500)
  31. data = data.join(vix)
  32. # Drop non-like observations (obs on different days)
  33. data = data.dropna()
  34. data.save('SP_YC.db')
  35. data.to_csv('the_data.csv')
  36. def load_data():
  37. return pd.DataFrame.load('SP_YC.db')
  38. def fred_plots():
  39. # Plot FRED data
  40. plt.figure()
  41. fred.plot()
  42. plt.ylabel('%')
  43. plt.title('US Treasury Bill Time Series')
  44. plt.savefig('./Figures/FRED_data.eps', format='eps', dpi=1000)
  45. # Plot logged FRED data
  46. plt.figure()
  47. np.log(fred).plot()
  48. plt.ylabel('%')
  49. plt.title('Logged US Treasury Bill Time Series')
  50. plt.savefig('./Figures/FRED_data_log.eps', format='eps', dpi=1000)
  51. # Plot first differenced FRED data
  52. plt.figure()
  53. fred.diff(1).plot()
  54. plt.ylabel('%')
  55. plt.title('First Differenced US Treasury Bill Time Series')
  56. plt.savefig('./Figures/FRED_data_diff.eps', format='eps', dpi=1000)
  57. def sp_plots():
  58. # Plot sp50 data
  59. plt.figure()
  60. sp.plot()
  61. plt.ylabel('%')
  62. plt.title('US Treasury Bill Time Series')
  63. plt.savefig('./Figures/sp500_data.eps', format='eps', dpi=1000)
  64. # Plot logged sp data
  65. plt.figure()
  66. np.log(sp).plot()
  67. plt.ylabel('%')
  68. plt.title('Logged US Treasury Bill Time Series')
  69. plt.savefig('./Figures/sp500_data_log.eps', format='eps', dpi=1000)
  70. # Plot first differenced sp data
  71. plt.figure()
  72. sp.diff(1).plot()
  73. plt.ylabel('%')
  74. plt.title('First Differenced US Treasury Bill Time Series')
  75. plt.savefig('./Figures/sp500_data_diff.eps', format='eps', dpi=1000)
  76. def data_plot():
  77. (data / data.mean()).plot(linewidth=0.6)
  78. plt.legend(loc=2)
  79. plt.savefig('./Figures/all_data.eps', format='eps', dpi=900)
  80. def ac_pac():
  81. ac = [sm.tsa.stattools.acf(data['%s' % i].values) for i in data.columns]
  82. pac = [sm.tsa.stattools.pacf(data['%s' % i].values) for i in data.columns]
  83. corrs = pd.DataFrame(ac[0], columns=['DGS10ac'])
  84. corrs['DGS3ac'] = ac[1]
  85. corrs['SPac'] = ac[2]
  86. corrs['VIXac'] = ac[3]
  87. corrs['DGS10pac'] = pac[0]
  88. corrs['DGS3pac'] = pac[1]
  89. corrs['SPpac'] = pac[2]
  90. corrs['VIXpac'] = pac[3]
  91. return corrs
  92. def ac_plot():
  93. # Plot the correlation and autocorrelation coefficients.
  94. corrs = ac_pac(45)
  95. fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 5))
  96. corrs[['DGS10ac', 'DGS10pac']].plot(ax=axes[0, 0])
  97. axes[0, 0].set_title('Ten Year T-Bill (P)AC coefs')
  98. corrs[['DGS3ac', 'DGS3pac']].plot(ax=axes[0, 1])
  99. axes[0, 1].set_title('Three Month T-Bill (P)AC coefs')
  100. corrs[['SPac', 'SPpac']].plot(ax=axes[1, 0])
  101. axes[1, 0].set_title('S&P 500 (P)AC coefs')
  102. corrs[['VIXac', 'VIXpac']].plot(ax=axes[1, 1])
  103. axes[1, 1].set_title('VIX (P)AC coefs')
  104. fig.subplots_adjust(hspace=0.3)
  105. fig.savefig('./Figures/all_corrs.eps', format='eps', dpi=1000)
  106. def plot_diffs(ser, n=4, rows_cols=None, subplots=True):
  107. """
  108. Plot the first n differences of the pandas series ser
  109. """
  110. df = pd.DataFrame(ser)
  111. names = ['d%i' % i for i in range(1, n + 1)]
  112. for i in range(n):
  113. df[names[i]] = ser.diff(i + 1)
  114. df = df.drop(ser.name, axis=1)
  115. if subplots:
  116. if rows_cols:
  117. rows, cols = rows_cols
  118. rem = n % (rows * cols)
  119. fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(8, 8))
  120. i = 0
  121. for r in range(rows):
  122. for c in range(cols):
  123. name = names[i]
  124. title = '%s: diff(%i)' % (ser.name, i + 1)
  125. if rows == 1 or cols == 1:
  126. df[name].plot(ax=axes[r + c], xticks=[], use_index=False)
  127. axes[r + c].set_title(title)
  128. else:
  129. df[name].plot(ax=axes[r, c], xticks=[], use_index=False)
  130. axes[r, c].set_title(title)
  131. i += 1
  132. if rem:
  133. plt.show()
  134. # Plot the remainder on separate axis
  135. name = names[i]
  136. title = '%s: diff(%i)' % (ser.name, i + 1)
  137. plt.figure()
  138. df[name].plot()
  139. plt.title(title)
  140. plt.show()
  141. # return df
  142. def diff_analysis():
  143. arr1 = np.array([[i] * 4 for i in data.columns]).ravel()
  144. arr2 = np.array([[1, 2, 3, 4] * 4])
  145. arrays = np.row_stack([arr1, arr2])
  146. tuples = zip(*arrays)
  147. index = pd.MultiIndex.from_tuples(tuples, names=['DataSet', 'Lags'])
  148. diffs = pd.DataFrame(columns=['Mean', 'Variance'], index=index).T
  149. for col in data.columns:
  150. for i in range(1, 5):
  151. index = '%s, %s' % (col, str(i))
  152. exec "d%i = data[col].diff(%i)" % (i, i)
  153. exec "mean = d%i.mean()" % (i)
  154. exec "var = d%i.var()" % (i)
  155. print('DataSet: %s, LAGS: %i, Mean:%.2e, Variance: %.2e'
  156. % (col, i, mean, var))
  157. diffs[col][str(i)]['Mean'] = mean
  158. diffs[col][str(i)]['Variance'] = var
  159. plt.figure()
  160. plot_diffs(data[col], 4, (2, 2))
  161. plt.show()
  162. return diffs
  163. def est_returns(items, freq='A'):
  164. """
  165. Get an estimate for the return on an equity at a given frequency
  166. Parameters
  167. ----------
  168. items : pandas.DataFrame
  169. The pandas data frame containing a TimeSeriesIndex and columns
  170. for the equities you want to estimate returns for
  171. freq : str, optional(default='A')
  172. The frequency you would like to sample the
  173. """
  174. if freq == 'A':
  175. periods = 250 # 250 trading days in a year
  176. elif freq == 'M':
  177. periods = 21 # 21 trading days in a month
  178. elif freq == 'W':
  179. periods = 5 # 5 trading days in a week
  180. changes = items.pct_change(periods=periods)
  181. # Resample annually
  182. period_returns = changes.resample(freq, how='mean') * 100
  183. period_returns.columns.name = 'Returns - ' + freq + ' (%)'
  184. return period_returns.dropna()
  185. def adf_tests(data):
  186. lags = floor((data.shape[0] - 1) ** (1 / 3.)) # use R default
  187. adf_nolag = np.asarray([sm.tsa.adfuller(data[i], maxlag=lags,
  188. regression='ct',
  189. autolag=None)[:2]
  190. for i in data.columns])
  191. adf_1lag = np.asarray([sm.tsa.adfuller(data[i].diff(1).dropna(),
  192. maxlag=lags,
  193. regression='ct',
  194. autolag=None)[:2]
  195. for i in data.columns])
  196. adf_tests = np.concatenate([adf_nolag, adf_1lag])
  197. multi_ind = pd.MultiIndex.from_arrays(
  198. [['DGS10', 'DGS3MO', 'SP500', 'VIX'] * 2,
  199. [0, 0, 0, 0, 1, 1, 1, 1]])
  200. adf = pd.DataFrame(adf_tests, index=multi_ind, columns=['Statistic', 'p-value'])
  201. adf.columns.name = 'Value'
  202. adf.index.names = ['Data', 'Lags']
  203. adf = adf.sort_index()
  204. return adf
  205. # save_data()
  206. data = load_data()
  207. # Describe data and save as latex file
  208. desc = data.describe()
  209. fred = data[['DGS10', 'DGS3MO']]
  210. sp = data[['SP500', 'VIX']]
  211. # Plot the data
  212. # fred_plots()
  213. # sp_plots()
  214. # Generate autocorrelation and partial autocorrelation coefficients
  215. corrs = ac_pac()
  216. ann_returns = est_returns(sp, freq='A')
  217. ann_returns = ann_returns.join(fred.resample('A', how='mean'))
  218. ann_returns.plot(linewidth=2, title='Estimated Annual returns')
  219. plt.savefig('./Figures/returns.eps', format='eps', dpi=1000)
  220. # plot_diffs(data.VIX, 4, (1, 4))nu
  221. # # Read ARIMA forecasts from R
  222. # sp_arima_fore = pd.DataFrame.from_csv('sp_arima_forecast.csv').reset_index().drop('index', axis=1)
  223. # vix_arima_fore = pd.DataFrame.from_csv('vix_arima_forecast.csv').reset_index().drop('index', axis=1)
  224. # d10_arima_fore = pd.DataFrame.from_csv('d10_arima_forecast.csv').reset_index().drop('index', axis=1)
  225. # d3m_arima_fore = pd.DataFrame.from_csv('dg3_arima_forecast.csv').reset_index().drop('index', axis=1)
  226. # # Read GARCH forecasts from Stata
  227. # stata = pd.DataFrame.from_csv('Sata_forecasts.csv')
  228. # sp_garch_fore = stata['sp_archCAST']
  229. # vix_garch_fore = stata['vix_archCAST']
  230. # d10_garch_fore = stata['d10_archCAST']
  231. # d3m_garch_fore = stata['d3m_archCAST']
  232. sp_arima_fore = sp_arima_fore['Point.Forecast']
  233. vix_arima_fore = vix_arima_fore['Point.Forecast']
  234. d10_arima_fore = d10_arima_fore['Point.Forecast']
  235. d3m_arima_fore = d3m_arima_fore['Point.Forecast']
  236. sp_arima_fore.index = sp_garch_fore.index
  237. vix_arima_fore.index = sp_garch_fore.index
  238. d10_arima_fore.index = sp_garch_fore.index
  239. d3m_arima_fore.index = sp_garch_fore.index
  240. forecasts = pd.DataFrame(index=sp_garch_fore.index)
  241. forecasts['SP_ARIMA'] = sp_arima_fore
  242. forecasts['VIX_ARIMA'] = vix_arima_fore
  243. forecasts['DGS10_ARIMA'] = d10_arima_fore
  244. forecasts['DGS3MO_ARIMA'] = d3m_arima_fore
  245. forecasts['SP_GARCH'] = sp_garch_fore
  246. forecasts['VIX_GARCH'] = vix_garch_fore
  247. forecasts['DGS10_GARCH'] = d10_garch_fore
  248. forecasts['DGS3MO_GARCH'] = d3m_garch_fore