/TreasuryRegression/dru_hw2_1_9815.py
Python | 270 lines | 247 code | 3 blank | 20 comment | 3 complexity | f99d5f5ebeb8763bb65e2d02a1dfe083 MD5 | raw file
-
- import pandas
- import numpy
- '''
- Input: csv file containing multiple time series represented by a column of dates,
- a column of data, and an empty separating column
- All data columns should have human-friendly column label if the user is to add or remove columns
- Columns do not need to be of the same length or pertain to the same dates
- '''
class TimeSeriesCollection(object):
    '''
    Collection of time series loaded from a single csv file.

    The file is expected to hold one series every three columns: a column
    of dates, a column of data and an empty separating column. Series do
    not need to have the same length or cover the same dates.

    The series are outer-joined on their dates into one DataFrame
    (`self.data`), and gaps created by the joins are filled forward.
    '''

    def __init__(self, filename):
        '''
        Builds the joined DataFrame from the csv file at path 'filename'.
        '''
        # One un-indexed read is enough to learn the layout: the number of
        # columns and how many dated rows each series has.
        with open(filename) as handle:
            layout = pandas.read_csv(handle)
        num_cols = layout.shape[1] - 1

        # The first series seeds the table; every later series starts three
        # columns further right and is outer-joined onto it.
        data = self._read_series(filename, 0, self._count_dates(layout, 0))
        for date_col in range(3, num_cols, 3):
            num_rows = self._count_dates(layout, date_col)
            series = self._read_series(filename, date_col, num_rows)
            data = data.join(series, how='outer')

        # Dates missing from a series show up as NaN after the joins;
        # carry the last known value forward.
        self.data = data.ffill()

    @staticmethod
    def _count_dates(layout, date_col):
        '''
        Returns the number of non-empty cells in the date column at
        position 'date_col' of the un-indexed frame 'layout'.
        '''
        return layout.iloc[:, date_col].dropna().shape[0]

    @staticmethod
    def _read_series(filename, date_col, num_rows):
        '''
        Reads the first 'num_rows' rows of the file indexed on the date
        column at position 'date_col' and returns the neighbouring data
        column, without empty values, as a one-column DataFrame.
        '''
        with open(filename) as handle:
            block = pandas.read_csv(handle, nrows=num_rows,
                                    index_col=date_col, parse_dates=True)
        # The index column is removed from 'columns', so the data column
        # that followed it now sits at position 'date_col'.
        label = block.columns[date_col]
        return block[label].dropna().to_frame(name=label)

    def get_data(self):
        '''
        Returns the object's data as a DataFrame (the stored object itself,
        not a copy).
        '''
        return self.data

    def get_matrix(self):
        '''
        Returns the object's data as a numpy matrix.
        '''
        return numpy.matrix(self.data)

    def get_returns(self):
        '''
        Returns a new DataFrame holding, for every row but the first, the
        simple return relative to the previous row.
        Should only be used if dates are separated by the same time interval.
        The stored data is left untouched.
        '''
        previous = self.data.shift(1)
        return ((self.data - previous) / previous).iloc[1:]

    def get_logreturns(self):
        '''
        Returns a new DataFrame holding, for every row but the first, the
        log-return relative to the previous row.
        Should only be used if dates are separated by the same time interval.
        The stored data is left untouched.
        '''
        return numpy.log(self.data / self.data.shift(1)).iloc[1:]

    def get_series(self, col_label):
        '''
        Returns one series, selected by its column label, as a one-column
        DataFrame.
        '''
        return self.data[col_label].to_frame(name=col_label)

    def pull_series(self, col_label):
        '''
        Deletes a series, selected by its column label, from the data set.
        '''
        del self.data[col_label]

    def add_series(self, column):
        '''
        Adds a series to the data set by outer-joining it on the index.
        '''
        self.data = self.data.join(column, how='outer')
-
-
class LinearRegression:
    '''
    Ordinary least-squares regression over the columns of a DataFrame.
    '''

    def __init__(self, data_frame):
        '''
        Reads data in DataFrame form. Data should have relevant column labels.
        '''
        self.data = data_frame
        # No regression has been run yet; regress() overwrites this.
        self.last_error = None

    def get_data(self):
        '''
        Returns data as DataFrame object
        '''
        return self.data

    def add_constant_term(self):
        '''
        Void function. Adds a column of 1's with 'const' as label
        Allows for linear regression with a constant term
        Appends column to right end of table.
        '''
        const = pandas.Series(1., index=self.data.index, name='const')
        # join() builds a new frame, so the frame passed to the constructor
        # is not modified.
        self.data = self.data.join(const)

    def regress(self, col_label):
        '''
        Regresses column 'col_label' on all the other columns.

        Returns the coefficients as a DataFrame indexed by predictor label
        with a single 'coeffs' column, and stores the norm of the residual
        for get_last_error(). The stored data is not modified, so successive
        regressions all see the full set of columns.
        '''
        # Split off the dependent variable without touching self.data.
        target = self.data[col_label]
        predictors = self.data.drop(col_label, axis=1)

        A = numpy.asarray(predictors, dtype=float)
        b = numpy.asarray(target, dtype=float).reshape(-1, 1)

        # Solve Ax=b in the least-squares sense through the normal
        # equations A^t*A x = A^t*b.
        M = A.T.dot(A)
        b_new = A.T.dot(b)

        # Cholesky: A^t*A = L*L^t, so solve L y = A^t*b, then L^t x = y.
        L = numpy.linalg.cholesky(M)
        y = numpy.linalg.solve(L, b_new)
        x = numpy.linalg.solve(L.T, y)

        # Norm of the residual of the fitted model.
        self.last_error = numpy.linalg.norm(A.dot(x) - b)

        return pandas.DataFrame(x, index=predictors.columns, columns=['coeffs'])

    def get_last_error(self):
        '''
        Returns the error term for the previously performed regression,
        or None if no regression has been performed yet.
        '''
        return self.last_error
-
-
-
class CovarianceMatrix:
    '''
    Sample covariance matrix of the columns of a table of returns.
    '''

    def __init__(self, dataframe):
        '''
        Computes the covariance matrix of the columns of 'dataframe'.
        Data should correspond to returns, one column per time series.

        The result is stored as a numpy matrix in 'self.returns' (name kept
        for backward compatibility). The input is not modified.
        Raises ZeroDivisionError when the input has exactly one row.
        '''
        # Work on a float copy so integer input is not truncated and the
        # caller's data is never modified.
        values = numpy.atleast_2d(numpy.array(dataframe, dtype=float))
        num_obs = values.shape[0]

        # Unbiased estimator: divide the cross products by N - 1.
        scale = 1. / (num_obs - 1)

        # Centre each series on its mean (sum divided by the number of
        # observations) before forming the cross products.
        centered = values - values.mean(axis=0)

        self.returns = numpy.matrix(scale * centered.T.dot(centered))

    def get_returns(self):
        '''
        Returns the covariance matrix as a numpy matrix.
        '''
        return self.returns

    def get_matrix(self):
        '''
        Returns the covariance matrix as a numpy matrix
        (same value as get_returns()).
        '''
        return self.returns
-
# Default input file for part a.
TREASURIES_CSV = '/home/dru/Downloads/treasuries2011.csv'


def main(filename=TREASURIES_CSV):
    '''Part a.
    Reads a csv file as a collection of timeseries and prints the result
    as a DataFrame. Adds a constant term, then prints the regression
    coefficients and the residual for the 5, 10 and 30 year treasuries,
    each regressed in turn with LinearRegression.regress.
    '''
    data1 = TimeSeriesCollection(filename)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Treasuries data')
    print(data1.get_data().to_string())

    reg = LinearRegression(data1.get_data())
    reg.add_constant_term()

    for label in ('GT5 Govt', 'GT10 Govt', 'GT30 Govt'):
        print('')
        print('')
        print('Regression results, ' + label)
        print(reg.regress(label))
        print('')
        print('Residual')
        print(reg.get_last_error())


# Part b was disabled in the original script and is kept here for reference.
# NOTE: CovarianceMatrix exposes its result through get_returns().
#
# data2 = TimeSeriesCollection('/home/dru/Downloads/data-indices2011.csv')
# returns2 = data2.get_logreturns()
# cov = CovarianceMatrix(returns2)
# print cov.get_matrix()


if __name__ == '__main__':
    main()
-
-
-
-
-
-
-