PageRenderTime 50ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/TreasuryRegression/dru_hw2_1_9815.py

https://bitbucket.org/vroomzel/vroomzel_mth9815
Python | 270 lines | 247 code | 3 blank | 20 comment | 3 complexity | f99d5f5ebeb8763bb65e2d02a1dfe083 MD5 | raw file
  1. import pandas
  2. import numpy
  3. '''
  4. Input: csv file containing multiple time series represented by a column of dates,
  5. a column of data, and an empty separating column
  6. All data columns should have human-friendly column label if the user is to add or remove columns
  7. Columns do not need to be of the same length or pertain to the same dates
  8. '''
  9. class TimeSeriesCollection(object):
  10. def __init__(self,filename):
  11. '''
  12. Reads 'filename' csv file without indexing. Determines number of columns for data set.
  13. Also determines number of rows in the first time series.
  14. '''
  15. source = pandas.read_csv(open(filename))
  16. date_col_label = source.columns[0]
  17. num_cols = source.shape[1] - 1
  18. num_rows = pandas.Series(source[date_col_label]).dropna().shape[0]
  19. '''
  20. Reads file again, but this time indexing on the 0th column (of dates) and only
  21. reading the first 'num_rows' number of rows. Converts to Series object and then to
  22. DataFrame object to make use of attributes unique to those types
  23. '''
  24. source = pandas.read_csv(open(filename), nrows = num_rows, index_col = 0, parse_dates=True)
  25. data_col = source.columns[0]
  26. source = pandas.Series(source[data_col]).dropna()
  27. data = pandas.DataFrame(source.dropna(), index = source.dropna().index, columns = [data_col])
  28. '''
  29. Loops through remaining time series, one at a time
  30. Each series is identified by the index column of dates, which occurs every third column
  31. Only accept rows with non-empty values
  32. '''
  33. cols = 0
  34. for i in range(3,num_cols,3):
  35. '''
  36. Open file again, recalculate num_rows for series
  37. '''
  38. source = pandas.read_csv(open(filename), header=1)
  39. date_col_label = source.columns[i]
  40. num_rows = pandas.Series(source[date_col_label]).dropna().shape[0]
  41. '''
  42. Re-open file, this time reading the actual data (up to row 'num_rows')
  43. Outer-join the series to 'data', which accumulating the data in the format we want
  44. '''
  45. source = pandas.read_csv(open(filename), nrows = num_rows + 1, index_col = i, parse_dates=True)
  46. data_col = source.columns[i]
  47. source = pandas.Series(source[data_col]).dropna()
  48. source = pandas.DataFrame(source.dropna(), index = source.dropna().index, columns = [data_col])
  49. data = data.join(source, how='outer')
  50. cols += 1
  51. '''
  52. Fill forward any empty rows (which will result from all the joins; correspond to
  53. dates that were not common to all time series
  54. '''
  55. data = data.fillna(method='ffill')
  56. self.data = data
  57. '''
  58. Returns the object's data as a dataframe
  59. '''
  60. def get_data(self):
  61. return self.data
  62. '''
  63. Returns the object's data as a numpy matrix
  64. '''
  65. def get_matrix(self):
  66. mat = numpy.matrix(self.data)
  67. return mat
  68. '''
  69. Returns data as a dataframe, but expressed as a return from the previous date
  70. Should only be used if dates are separated by the same time interval
  71. '''
  72. def get_returns(self):
  73. data = self.data
  74. for col in data.columns:
  75. for row in range(1,data.shape[0])[::-1]:
  76. data[col][row] = (data[col][row] - data[col][row-1])/data[col][row-1]
  77. return data[1:]
  78. '''
  79. Returns data as a dataframe, but expressed as a log-return from the previous date
  80. Should only be used if dates are separated by the same time interval
  81. '''
  82. def get_logreturns(self):
  83. data = self.data
  84. for col in data.columns:
  85. for row in range(1,data.shape[0])[::-1]:
  86. data[col][row] = numpy.log(data[col][row]/data[col][row-1])
  87. return data[1:]
  88. '''
  89. Returns a series, specified by the data column's label
  90. '''
  91. def get_series(self, col_label):
  92. col = pandas.DataFrame(self.data[col_label], index = self.data.index, columns = [col_label])
  93. return col
  94. '''
  95. Deletes a series, specified by the column's label
  96. '''
  97. def pull_series(self, col_label):
  98. del self.data[col_label]
  99. '''
  100. Adds a series to the dataset
  101. '''
  102. def add_series(self,column):
  103. self.data = self.data.join(column, how='outer')
  104. class LinearRegression:
  105. def __init__(self,data_frame):
  106. '''
  107. Reads data in DataFrame form. Data should have relevant column labels.
  108. '''
  109. self.data = data_frame
  110. def get_data(self):
  111. '''
  112. Returns data as DataFrame object
  113. '''
  114. return self.data
  115. def add_constant_term(self):
  116. '''
  117. Void function. Adds a column of 1's with 'const' as label
  118. Allows for linear regression with a constant term
  119. Appends column to right end of table.
  120. '''
  121. const = pandas.Series(1., index = self.data.index)
  122. const = pandas.DataFrame(const, index = const.index, columns = ['const'])
  123. self.data = self.data.join(const)
  124. def regress(self, col_label):
  125. '''
  126. Removes the dependent variable from the data and creates a separate DataFrame from it
  127. '''
  128. data = self.data
  129. col = pandas.DataFrame(data[col_label], index = data.index, columns = [col_label])
  130. del data[col_label]
  131. '''
  132. Converts data into a matrix
  133. '''
  134. A = numpy.matrix(data)
  135. b = numpy.matrix(col)
  136. '''
  137. Solve Ax=b by solving A^t*Ax = A^t*b
  138. '''
  139. M = A.T*A
  140. b_new = A.T*b
  141. '''
  142. Performs Cholesky decomposition on A^t*A and solves for x
  143. x is equal to the linear regression coefficients
  144. '''
  145. L = numpy.linalg.cholesky(M)
  146. y = numpy.linalg.solve(L,b_new)
  147. x = numpy.linalg.solve(L.T,y)
  148. '''
  149. Stores the approximation error from our regression
  150. '''
  151. self.last_error = numpy.linalg.norm(A*x-b)
  152. '''
  153. Returns coefficients as a DataFrame object
  154. '''
  155. return pandas.DataFrame(x, index = data.columns, columns = ['coeffs'])
  156. def get_last_error(self):
  157. '''
  158. Returns the error term for the previously performed regression
  159. '''
  160. return self.last_error
  161. class CovarianceMatrix:
  162. def __init__(self, dataframe):
  163. '''
  164. Creates matrix from DataFrame object and initializes a mean-return vector
  165. Data should correspond to returns
  166. '''
  167. returns = numpy.matrix(dataframe)
  168. m = returns.shape[0]
  169. n = returns.shape[1]
  170. mean_returns = numpy.zeros(n)
  171. '''
  172. For each time series, finds the mean return...
  173. '''
  174. for j in range(n):
  175. for i in range(m):
  176. mean_returns[j] += returns[i,j]
  177. mean_returns[j] /= (m-1)
  178. '''
  179. ...and subtracts it from the time series
  180. '''
  181. for i in range(m):
  182. for j in range(n):
  183. returns[i,j] -= mean_returns[j]
  184. '''
  185. Finds the size of the matrix and calculates the covariance matrix
  186. '''
  187. N = returns.shape[0]
  188. self.returns = (1./(N-1))*returns.T*returns
  189. def get_returns(self):
  190. '''
  191. Returns the matrix of returns as a numpy matrix
  192. '''
  193. return self.returns
  194. '''Part a.
  195. Reads a csv file as a collection of timeseries and outputs result as a DataFrame
  196. Removes the column for 5 year treasuries and calculates the regression coefficients
  197. for 5yr treasuries against all other treasuries
  198. '''
  199. data1 = TimeSeriesCollection('/home/dru/Downloads/treasuries2011.csv')
  200. print 'Treasuries data'
  201. print data1.get_data().to_string()
  202. returns1 = data1.get_data()
  203. reg = LinearRegression(returns1)
  204. reg.add_constant_term()
  205. print ''
  206. print ''
  207. print 'Regression results, GT5 Govt'
  208. print reg.regress('GT5 Govt')
  209. print ''
  210. print 'Residual'
  211. print reg.get_last_error()
  212. print ''
  213. print ''
  214. print 'Regression results, GT10 Govt'
  215. print reg.regress('GT10 Govt')
  216. print ''
  217. print 'Residual'
  218. print reg.get_last_error()
  219. print ''
  220. print ''
  221. print 'Regression results, GT30 Govt'
  222. print reg.regress('GT30 Govt')
  223. print ''
  224. print 'Residual'
  225. print reg.get_last_error()
  226. '''
  227. data2 = TimeSeriesCollection('/home/dru/Downloads/data-indices2011.csv')
  228. returns2 = data2.get_logreturns()
  229. cov = CovarianceMatrix(returns2)
  230. print cov.get_matrix()
  231. '''