dru_hw2_1_9815.py | searchcode

/TreasuryRegression/dru_hw2_1_9815.py

https://bitbucket.org/vroomzel/vroomzel_mth9815 · Python · 270 lines · 129 code · 13 blank · 128 comment · 18 complexity · f99d5f5ebeb8763bb65e2d02a1dfe083 MD5 · raw file

import pandas

import numpy

'''
Input: a csv file containing multiple time series. Each series is represented by
a column of dates followed by a column of data, and consecutive series are
separated by an empty column.

Every data column should have a human-friendly column label, since columns are
added and removed by label.

Columns do not need to be of the same length or pertain to the same dates.
'''

class TimeSeriesCollection(object):
    '''
    Collection of time series read from a csv file and aligned on one date index.

    The file holds one series per group of three columns: a column of dates,
    a column of data and an empty separating column. The series are outer-joined
    on their dates and gaps are filled forward. The result is kept in 'self.data'
    as a DataFrame with one column per series.
    '''

    def __init__(self, filename):
        '''
        Reads every time series found in the 'filename' csv file into 'self.data'.

        filename: path of the csv file to load.
        '''
        # First pass, no index: find the number of columns in the data set and
        # the number of rows in the first time series.
        source = self._read_csv(filename)
        date_col_label = source.columns[0]
        num_cols = source.shape[1] - 1
        num_rows = source[date_col_label].dropna().shape[0]

        # Second pass: index on the 0th column (of dates) and read only the
        # first 'num_rows' rows. Keep the first data column, non-empty rows only.
        source = self._read_csv(filename, nrows=num_rows, index_col=0, parse_dates=True)
        data_col = source.columns[0]
        data = source[[data_col]].dropna()

        # Remaining series: each one starts at a date column, which occurs
        # every third column.
        series_starts = list(range(3, num_cols, 3))
        if series_starts:
            # This frame does not depend on the series being processed, so it
            # is read once. header=1 makes the first data row the header, so
            # the row counts taken from it are one short of the series length.
            header_skipped = self._read_csv(filename, header=1)

        for i in series_starts:
            date_col_label = header_skipped.columns[i]
            num_rows = header_skipped[date_col_label].dropna().shape[0]

            # Read the actual data; '+ 1' restores the row used as header above.
            source = self._read_csv(filename, nrows=num_rows + 1, index_col=i, parse_dates=True)
            # The date column at position i became the index and is no longer
            # among the columns, so position i is now the data column after it.
            data_col = source.columns[i]
            series = source[[data_col]].dropna()
            # Outer join keeps every date that appears in any series.
            data = data.join(series, how='outer')

        # Fill forward the empty cells produced by the joins (dates that were
        # not common to all time series).
        self.data = data.ffill()

    @staticmethod
    def _read_csv(filename, **kwargs):
        '''
        Parses 'filename' with pandas.read_csv and closes the file afterwards.

        kwargs are passed through to pandas.read_csv unchanged.
        '''
        with open(filename) as handle:
            return pandas.read_csv(handle, **kwargs)

    def get_data(self):
        '''
        Returns the object's data as a dataframe
        '''
        return self.data

    def get_matrix(self):
        '''
        Returns the object's data as a numpy matrix
        '''
        return numpy.matrix(self.data)

    def get_returns(self):
        '''
        Returns data as a new dataframe, expressed as a return from the previous date

        The first row has no previous date and is dropped. The stored data is
        not modified.

        Should only be used if dates are separated by the same time interval
        '''
        previous = self.data.shift(1)
        returns = (self.data - previous) / previous
        return returns.iloc[1:]

    def get_logreturns(self):
        '''
        Returns data as a new dataframe, expressed as a log-return from the previous date

        The first row has no previous date and is dropped. The stored data is
        not modified.

        Should only be used if dates are separated by the same time interval
        '''
        ratios = self.data / self.data.shift(1)
        return numpy.log(ratios).iloc[1:]

    def get_series(self, col_label):
        '''
        Returns a series, specified by the data column's label, as a one-column dataframe
        '''
        return self.data[[col_label]]

    def pull_series(self, col_label):
        '''
        Deletes a series, specified by the column's label
        '''
        del self.data[col_label]

    def add_series(self, column):
        '''
        Adds a series to the dataset, outer-joining it on the index
        '''
        self.data = self.data.join(column, how='outer')

    



class LinearRegression:
    '''
    Least-squares linear regression on the columns of a DataFrame.

    One column is chosen as the dependent variable at regress() time; all other
    columns held at that moment are the regressors.
    '''

    def __init__(self, data_frame):
        '''
        Reads data in DataFrame form. Data should have relevant column labels.

        The frame is stored as-is (not copied).
        '''
        self.data = data_frame
        # No regression has been run yet; get_last_error() returns None until
        # regress() succeeds for the first time.
        self.last_error = None

    def get_data(self):
        '''
        Returns data as DataFrame object
        '''
        return self.data

    def add_constant_term(self):
        '''
        Void function. Adds a column of 1's with 'const' as label

        Allows for linear regression with a constant term

        Appends column to right end of table.
        '''
        const = pandas.DataFrame({'const': 1.}, index=self.data.index)
        self.data = self.data.join(const)

    def regress(self, col_label):
        '''
        Regresses the column 'col_label' against all other columns of the data.

        The dependent column is removed from the stored data, so it is absent
        from get_data() afterwards and from the regressors of later calls.

        Stores the norm of the residual, retrievable with get_last_error().

        Returns the coefficients as a DataFrame indexed by regressor label,
        with a single column labelled 'coeffs'.
        '''
        data = self.data

        # Dependent variable as a column vector, then drop it from the data so
        # that only the regressors remain.
        b = numpy.asarray(data[col_label], dtype=float).reshape(-1, 1)
        del data[col_label]
        A = numpy.asarray(data, dtype=float)

        # Solve Ax=b in the least-squares sense via A^t*Ax = A^t*b
        M = A.T.dot(A)
        b_new = A.T.dot(b)

        # Cholesky decomposition A^t*A = L*L^t, then two solves:
        # L*y = A^t*b followed by L^t*x = y. x holds the coefficients.
        L = numpy.linalg.cholesky(M)
        y = numpy.linalg.solve(L, b_new)
        x = numpy.linalg.solve(L.T, y)

        # Approximation error of the regression: norm of the residual vector.
        self.last_error = numpy.linalg.norm(A.dot(x) - b)

        return pandas.DataFrame(x, index=data.columns, columns=['coeffs'])

    def get_last_error(self):
        '''
        Returns the error term for the previously performed regression,
        or None if no regression has been performed yet
        '''
        return self.last_error

    





class CovarianceMatrix:
    '''
    Sample covariance matrix of the columns of a DataFrame of returns.
    '''

    def __init__(self, dataframe):
        '''
        Computes the covariance matrix of the columns of 'dataframe'.

        Data should correspond to returns: one row per date, one column per
        time series. The input is copied into a float matrix and is not
        modified.

        Raises ZeroDivisionError if the data has exactly one row.
        '''
        # Float copy so that centering never truncates integer input.
        returns = numpy.matrix(dataframe, dtype=float)
        N = returns.shape[0]

        # Plain Python division so that a single observation fails loudly.
        scale = 1. / (N - 1)

        # For each time series, find the mean return (sum over all N rows
        # divided by N) and subtract it from the time series.
        centered = returns - returns.mean(axis=0)

        # Sample covariance: centered^t * centered / (N - 1).
        # The attribute keeps its historical name although it holds the
        # covariance matrix, not the returns.
        self.returns = scale * centered.T * centered

    def get_returns(self):
        '''
        Returns the covariance matrix as a numpy matrix
        '''
        return self.returns

    def get_matrix(self):
        '''
        Returns the covariance matrix as a numpy matrix (same as get_returns)
        '''
        return self.returns



'''Part a.

Reads a csv file as a collection of time series and prints the result as a DataFrame.

Then, for the 5, 10 and 30 year treasuries in turn, removes that column from the data

and prints the regression coefficients and the residual of that treasury against the

columns that remain.

'''

# Load the treasuries data set and show it in full.
data1 = TimeSeriesCollection('/home/dru/Downloads/treasuries2011.csv')

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print('Treasuries data')
print(data1.get_data().to_string())

returns1 = data1.get_data()
reg = LinearRegression(returns1)
reg.add_constant_term()

# Regress each treasury in turn, printing coefficients and then the residual.
# The same LinearRegression object is reused for all three regressions.
for label in ('GT5 Govt', 'GT10 Govt', 'GT30 Govt'):
    print('')
    print('')
    print('Regression results, ' + label)
    print(reg.regress(label))
    print('')
    print('Residual')
    print(reg.get_last_error())





'''

data2 = TimeSeriesCollection('/home/dru/Downloads/data-indices2011.csv')

returns2 = data2.get_logreturns()

cov = CovarianceMatrix(returns2)

print cov.get_matrix()

'''
Tech Fingerprint

NumPy
Pandas
Alerts (6)

'open(' Use 'with open()' to ensure Files are properly closed
17
'del' Avoid unless necessary; Python's garbage collector typically handles object deletion
107 145
Complexity hotspot; lines 224 to 226 (total complexity: 4)
224 225 226