PageRenderTime 55ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/TimeSeriesCollection.py

https://bitbucket.org/jiefujie/hw2_1_9815
Python | 113 lines | 93 code | 2 blank | 18 comment | 0 complexity | eb5fa08598472e52d0a592221af9945d MD5 | raw file
  1. import pandas
  2. import numpy
  3. '''
  4. Input: csv file containing multiple time series represented by a column of dates,
  5. a column of data, and an empty separating column
  6. All data columns should have a human-friendly column label if the user is to add or remove columns
  7. Columns do not need to be of the same length or pertain to the same dates
  8. '''
  9. class TimeSeriesCollection(object):
  10. def __init__(self,filename):
  11. '''
  12. Reads 'filename' csv file without indexing. Determines number of columns for data set.
  13. Also determines number of rows in the first time series.
  14. '''
  15. source = pandas.read_csv(open(filename))
  16. date_col_label = source.columns[0]
  17. num_cols = source.shape[1] - 1
  18. num_rows = pandas.Series(source[date_col_label]).dropna().shape[0]
  19. '''
  20. Reads file again, but this time indexing on the 0th column (of dates) and only
  21. reading the first 'num_rows' number of rows. Converts to Series object and then to
  22. DataFrame object to make use of attributes unique to those types
  23. '''
  24. source = pandas.read_csv(open(filename), nrows = num_rows, index_col = 0, parse_dates=True)
  25. data_col = source.columns[0]
  26. source = pandas.Series(source[data_col]).dropna()
  27. data = pandas.DataFrame(source.dropna(), index = source.dropna().index, columns = [data_col])
  28. '''
  29. Loops through remaining time series, one at a time
  30. Each series is identified by the index column of dates, which occurs every third column
  31. Only accept rows with non-empty values
  32. '''
  33. cols = 0
  34. for i in range(3,num_cols,3):
  35. '''
  36. Open file again, recalculate num_rows for series
  37. '''
  38. source = pandas.read_csv(open(filename), header=1)
  39. date_col_label = source.columns[i]
  40. num_rows = pandas.Series(source[date_col_label]).dropna().shape[0]
  41. '''
  42. Re-open file, this time reading the actual data (up to row 'num_rows')
  43. Outer-join the series to 'data', which accumulating the data in the format we want
  44. '''
  45. source = pandas.read_csv(open(filename), nrows = num_rows + 1, index_col = i, parse_dates=True)
  46. data_col = source.columns[i]
  47. source = pandas.Series(source[data_col]).dropna()
  48. source = pandas.DataFrame(source.dropna(), index = source.dropna().index, columns = [data_col])
  49. data = data.join(source, how='outer')
  50. cols += 1
  51. '''
  52. Fill forward any empty rows (which will result from all the joins; correspond to
  53. dates that were not common to all time series
  54. '''
  55. data = data.fillna(method='ffill')
  56. self.data = data
  57. '''
  58. Returns the object's data as a dataframe
  59. '''
  60. def get_data(self):
  61. return self.data
  62. '''
  63. Returns the object's data as a numpy matrix
  64. '''
  65. def get_matrix(self):
  66. mat = numpy.matrix(self.data)
  67. return mat
  68. '''
  69. Returns data as a dataframe, but expressed as a return from the previous date
  70. Should only be used if dates are separated by the same time interval
  71. '''
  72. def get_returns(self):
  73. data = self.data
  74. for col in data.columns:
  75. for row in range(1,data.shape[0])[::-1]:
  76. data[col][row] = (data[col][row] - data[col][row-1])/data[col][row-1]
  77. return data[1:]
  78. '''
  79. Returns data as a dataframe, but expressed as a log-return from the previous date
  80. Should only be used if dates are separated by the same time interval
  81. '''
  82. def get_logreturns(self):
  83. data = self.data
  84. for col in data.columns:
  85. for row in range(1,data.shape[0])[::-1]:
  86. data[col][row] = numpy.log(data[col][row]/data[col][row-1])
  87. return data[1:]
  88. '''
  89. Returns a series, specified by the data column's label
  90. '''
  91. def get_series(self, col_label):
  92. col = pandas.DataFrame(self.data[col_label], index = self.data.index, columns = [col_label])
  93. return col
  94. '''
  95. Deletes a series, specified by the column's label
  96. '''
  97. def pull_series(self, col_label):
  98. del self.data[col_label]
  99. '''
  100. Adds a series to the dataset
  101. '''
  102. def add_series(self,column):
  103. self.data = self.data.join(column, how='outer')