/TimeSeriesCollection.py
Python | 113 lines | 93 code | 2 blank | 18 comment | 0 complexity | eb5fa08598472e52d0a592221af9945d MD5 | raw file
- import pandas
- import numpy
- '''
- Input: csv file containing multiple time series represented by a column of dates,
- a column of data, and an empty separating column
- All data columns should have a human-friendly column label if the user is to add or remove columns
- Columns do not need to be of the same length or pertain to the same dates
- '''
class TimeSeriesCollection(object):
    """Collection of time series loaded from a CSV file and aligned by date.

    The file is expected to hold the series side by side in groups of three
    columns: a date column, a data column and an empty separator column.
    Series may differ in length and in the dates they cover. After loading,
    ``self.data`` is a DataFrame with one column per series, indexed by the
    union of all dates; gaps produced by the alignment are filled forward.
    """

    # Distance, in columns, between the date columns of consecutive series.
    _SERIES_STRIDE = 3

    def __init__(self, filename):
        """Load every series found in the CSV file ``filename``.

        The first series is read from columns 0/1; each further series is
        located by its date column, which occurs every third column. Every
        series is read with its own row count so that shorter series do not
        pick up the empty trailing cells, then outer-joined onto the result.
        """
        # Unindexed pass: discover the column layout and the number of
        # non-empty dates in the first series.
        layout = self._read_csv(filename)
        num_cols = layout.shape[1] - 1
        first_length = int(layout.iloc[:, 0].count())
        data = self._load_series(filename, 0, first_length)

        # Row counts for the remaining series are taken from a pass that uses
        # the second file line as header. That pass is identical for every
        # series, so it is performed once, and only if a further series exists.
        shifted = None
        for position in range(self._SERIES_STRIDE, num_cols, self._SERIES_STRIDE):
            if shifted is None:
                shifted = self._read_csv(filename, header=1)
            # header=1 consumes the first data row as column labels, so the
            # count is one short of the series length; add it back.
            length = int(shifted.iloc[:, position].count()) + 1
            series = self._load_series(filename, position, length)
            data = data.join(series, how='outer')

        # Fill forward the holes left by dates that are not common to all
        # series (DataFrame.ffill replaces the deprecated fillna(method=...)).
        self.data = data.ffill()

    @staticmethod
    def _read_csv(filename, **kwargs):
        """Parse ``filename`` with pandas.read_csv and close the file afterwards."""
        with open(filename) as handle:
            return pandas.read_csv(handle, **kwargs)

    @classmethod
    def _load_series(cls, filename, date_position, num_rows):
        """Return one series as a single-column DataFrame indexed by its dates.

        Only the first ``num_rows`` rows are read, the column at
        ``date_position`` becomes the (date-parsed) index, and rows with an
        empty value are dropped.
        """
        table = cls._read_csv(filename, nrows=num_rows,
                              index_col=date_position, parse_dates=True)
        # With the date column moved into the index, the column now found at
        # `date_position` is the data column that followed it in the file.
        label = table.columns[date_position]
        return table[label].dropna().to_frame()

    def get_data(self):
        """Return the object's data as a DataFrame (the stored object itself)."""
        return self.data

    def get_matrix(self):
        """Return the object's data as a numpy matrix."""
        return numpy.matrix(self.data)

    def get_returns(self):
        """Return the data expressed as simple returns from the previous date.

        Each value is ``(current - previous) / previous``; the first row has
        no predecessor and is omitted. The stored data is left untouched.
        Should only be used if dates are separated by the same time interval.
        """
        previous = self.data.shift(1)
        return ((self.data - previous) / previous).iloc[1:]

    def get_logreturns(self):
        """Return the data expressed as log-returns from the previous date.

        Each value is ``log(current / previous)``; the first row has no
        predecessor and is omitted. The stored data is left untouched.
        Should only be used if dates are separated by the same time interval.
        """
        previous = self.data.shift(1)
        return numpy.log(self.data / previous).iloc[1:]

    def get_series(self, col_label):
        """Return the series labelled ``col_label`` as a one-column DataFrame."""
        return self.data[[col_label]]

    def pull_series(self, col_label):
        """Delete the series labelled ``col_label`` from the data set."""
        del self.data[col_label]

    def add_series(self, column):
        """Add a series to the data set by outer-joining ``column`` on the index."""
        self.data = self.data.join(column, how='outer')