TimeSeriesCollection.py

/TimeSeriesCollection.py

https://bitbucket.org/jiefujie/hw2_1_9815 · Python · 113 lines · 59 code · 8 blank · 46 comment · 5 complexity · eb5fa08598472e52d0a592221af9945d MD5 · raw file


import pandas
import numpy
'''
Input: a csv file containing multiple time series, each represented by a column of dates,
a column of data, and an empty separating column.
All data columns should have a human-friendly column label if the user is to add or remove columns.
Columns do not need to be of the same length or pertain to the same dates.
'''
class TimeSeriesCollection(object):
    """Collection of time series loaded from a single csv file.

    The csv file holds several time series side by side, each laid out as a
    column of dates, a column of data and an empty separating column.  The
    series do not need to share the same length or the same dates: they are
    outer-joined on their dates into one DataFrame (stored in 'self.data'),
    and the gaps created by the joins are filled forward.
    """

    def __init__(self, filename):
        """Load every time series found in the csv file 'filename'.

        filename: path of the csv file.  The path is handed to pandas
        directly instead of an open file object, so each read closes its
        own file handle.
        """
        # Un-indexed read, used only to learn how many columns the file has
        # and how many dated rows each series contains.
        layout = pandas.read_csv(filename)
        num_cols = layout.shape[1] - 1

        # The first series starts at column 0; every later series is
        # identified by its date column, which occurs every third column.
        data = self._read_series(filename, layout, 0)
        for date_pos in range(3, num_cols, 3):
            series = self._read_series(filename, layout, date_pos)
            data = data.join(series, how='outer')

        # The outer joins leave empty cells on dates that are not common to
        # all series; carry the previous value forward into them.
        self.data = data.ffill()

    @staticmethod
    def _read_series(filename, layout, date_pos):
        """Read one time series as a single-column DataFrame indexed by date.

        filename: path of the csv file.
        layout: un-indexed DataFrame of the whole file, used to count rows.
        date_pos: position of the series' date column in the file.
        """
        # Number of rows that actually carry a date for this series.
        num_rows = layout.iloc[:, date_pos].dropna().shape[0]
        source = pandas.read_csv(filename, nrows=num_rows,
                                 index_col=date_pos, parse_dates=True)
        # The date column became the index, so the data column that followed
        # it in the file now sits at position 'date_pos'.
        data_col = source.columns[date_pos]
        # Only accept rows with non-empty values.
        return source[data_col].dropna().to_frame()

    def get_data(self):
        """Return the object's data as a DataFrame."""
        return self.data

    def get_matrix(self):
        """Return the object's data as a numpy matrix."""
        return numpy.matrix(self.data)

    def get_returns(self):
        """Return the data expressed as a return from the previous date.

        The result is a new DataFrame without the first row (which has no
        previous date); the stored data is left untouched.  Should only be
        used if dates are separated by the same time interval.
        """
        previous = self.data.shift(1)
        returns = self.data.diff() / previous
        return returns.iloc[1:]

    def get_logreturns(self):
        """Return the data expressed as a log-return from the previous date.

        The result is a new DataFrame without the first row (which has no
        previous date); the stored data is left untouched.  Should only be
        used if dates are separated by the same time interval.
        """
        ratios = self.data / self.data.shift(1)
        return numpy.log(ratios).iloc[1:]

    def get_series(self, col_label):
        """Return one series, specified by the data column's label.

        The series is returned as a single-column DataFrame.
        """
        return self.data[[col_label]]

    def pull_series(self, col_label):
        """Delete a series, specified by the column's label."""
        del self.data[col_label]

    def add_series(self, column):
        """Add a series to the dataset by outer-joining it on the index."""
        self.data = self.data.join(column, how='outer')

Tech Fingerprint

NumPy
Pandas

Alerts (2)

'open(' Use 'with open()' to ensure Files are properly closed
17
'del' Avoid unless necessary; Python's garbage collector typically handles object deletion
107