PageRenderTime 455ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/AnomalyDetection/Smoother/KalmanFilters.py

https://gitlab.com/debasishk/PerformanceTest-Monitoring
Python | 310 lines | 262 code | 17 blank | 31 comment | 9 complexity | 7106dd7e99bb43f012b2a7fda7f55137 MD5 | raw file
  1. # Project: Predictive Analytics
  2. # Author: Debasish Kanhar
  3. # UserID: BC03421
  4. import numpy as np
  5. from externals.pykalman import KalmanFilter
  6. import matplotlib.pyplot as plt
  7. import pandas as pd
  8. import datetime, platform, os
  9. from numpy.random import random
  10. from utils.mkdir import mkdir
  11. class UnivariateKalmanFilter:
  12. def __init__(self, params, out_path=None, data=pd.DataFrame(), test_data=False, plot=False):
  13. """
  14. web function for UnivariateKalmanFilter class to initialize params.
  15. Args:
  16. params (Dictionary): Dictionary of parameters to be used in ARIMA, mapped by key(param name): value(param value) pairs.
  17. out_path (str): The output path, to which specific path to be appended to save results
  18. data (Pandas Dataframe): The data which needs to be fitted with ARIMA model. Defaults to Empty Dataframe
  19. test_data (bool, Optional [Default False]): Defaults to False. If model to be used for Unit Tests, explicitely specify this variable as True to bypass get_data() method.
  20. plot (bool, Optional [Default False]): Flag variable to specify weather plots are to be plotted & saved or not.
  21. Returns:
  22. None
  23. """
  24. # ----------------------------------------------------------------------------------------------------------- #
  25. # Initialize state space variables for Kalman Filters.
  26. # Initialize state space variables for Kalman Filters.
  27. self.delta = 1e-5
  28. self.n_dim_obs=1
  29. self.n_dim_state=1
  30. self.initial_state_mean=np.zeros(2)
  31. self.initial_state_covariance=np.ones((2, 2))
  32. self.transition_matrices=np.eye(2)
  33. self.observation_covariance=1.0
  34. # ----------------------------------------------------------------------------------------------------------- #
  35. self.window = params['window']
  36. self.prediction = params['prediction_type']
  37. self.optimize = params['optimize']
  38. self.tmpData = data
  39. # ----------------------------------------------------------------------------------------------------------- #
  40. if hasattr(params, 'metric'):
  41. self.metric = params['metric']
  42. # ----------------------------------------------------------------------------------------------------------- #
  43. self.plot = plot
  44. if out_path is not None and hasattr(self, 'metric'):
  45. self.plot_path = out_path + 'plot//Forecast//KalmanFilters//{}_({})//'.format(self.metric, self.prediction)
  46. elif hasattr(self, 'metric'):
  47. self.plot_path = 'Results//' + 'plot//Forecast//KalmanFilters//{}_({})//'.format(self.metric, self.prediction)
  48. else:
  49. self.plot_path = 'Results//KF//'
  50. # ----------------------------------------------------------------------------------------------------------- #
  51. mkdir(self.plot_path)
  52. if not test_data:
  53. if len(data.index):
  54. # When not calling for unit tests, and input data is passed to class initialization
  55. self.__driver__()
  56. pass
  57. def apply_(self, data=pd.DataFrame()):
  58. """
  59. Applies Kalman Filters to input dataset to find out predicted values
  60. Args:
  61. data (Pandas DataFrame): Input Dataset which needs to be smoothed out, and predicted values to be found
  62. Returns:
  63. outDF (Pandas DataFrame): The resultant dataframe, storing 'actual', 'predicted' and 'difference' columns indexed by timestamp
  64. """
  65. # Assign input dataset to self.tmpData instance
  66. self.tmpData = data
  67. # Call the driver method to start KalmanFilters
  68. self.__driver__()
  69. # Call get_forecast() method to get predictions
  70. outDF = self.get_forecast()
  71. return outDF
  72. def __driver__(self):
  73. """
  74. Driver method for UnivariateKalmanFilter class.
  75. Returns:
  76. None
  77. """
  78. self.__get_data__()
  79. self.KalmanFilter()
  80. self.__plot__()
  81. pass
  82. def test(self, data):
  83. """
  84. Test the Kalman Filters module. This method is used for unit tests.
  85. Args:
  86. data (pandas Dataframe): The data to be tested against.
  87. It can have '1' columns in DF, which corresponds to respective metrics
  88. Returns:
  89. test_output (pandas Dataframe): It has 3 columns. Returned dataframe stores, actual value, predicted value, and difference between predicted and actuals
  90. Col 1: Original value.
  91. Col 2: Predicted value
  92. Col 3: Difference
  93. """
  94. if data.shape[1] == 1:
  95. self.data = data
  96. self.data.dropna(inplace=True)
  97. self.metric = [col for col in self.data.columns.tolist() if col is not 'timestamp'][0]
  98. self.KalmanFilter()
  99. self.__plot__()
  100. else:
  101. raise TypeError('Wrong type of Pandas Dataframe passed. Kindly check input data. Passed shape is ', data.shape)
  102. test_output = self.outDF
  103. return test_output
  104. def __get_platform__(self):
  105. """ Gets the platform of machine you are running your module on, and saves it at 'self.platform'
  106. Returns:
  107. None, stores platform type in self.platform instance
  108. """
  109. plat = platform.system()
  110. self.platform = plat
  111. def __create_dirs__(self):
  112. """ Create directory to save all results. (CSV files and Plots). Modifies path name according to platform being used
  113. Returns:
  114. None
  115. """
  116. dirname = self.plot_path
  117. self.__get_platform__()
  118. if 'Windows' in self.platform:
  119. dirname = dirname
  120. elif 'Linux' in self.platform:
  121. dirname = dirname.replace("\\", "/")
  122. if not os.path.exists(dirname):
  123. os.makedirs(dirname)
  124. pass
  125. def __get_data__(self):
  126. """ Get data which was passed to class, and convert it to compirtable type for KalmanFilter class.
  127. Returns:
  128. None, Stores final dataframe object in self.data instance
  129. """
  130. self.data = self.tmpData
  131. # self.data.replace({self.metric: {0: 0.001}}, inplace=True)
  132. pass
  133. def KalmanFilter(self):
  134. """ Implements Kalman Filters to find predicted values for Data passed.
  135. Transforms data by calling data_transform method, and fits it to Kalman Filters for predicted values and calculating more state space variable.
  136. Depending on flag variable (self.optimize), it optimizes state space parameters (hyper params) or not.
  137. Returns:
  138. None, Stores results in self.state_means, self.state_covs, and depending on self.parameter, calculates smoothed or filtered predictions.
  139. """
  140. self.__data_transform__()
  141. # self.kf = KalmanFilter(transition_matrices=self.transition_matrices, observation_matrices=self.observation_matrices, transition_covariance=self.trans_cov,
  142. # observation_covariance=self.observation_covariance, n_dim_obs=self.n_dim_obs, n_dim_state=self.n_dim_state,
  143. # initial_state_covariance=self.initial_state_covariance, initial_state_mean=self.initial_state_mean)
  144. self.kf = KalmanFilter(n_dim_obs=self.n_dim_obs,
  145. n_dim_state=self.n_dim_state)
  146. if self.optimize:
  147. val = self.data.values
  148. self.kf = self.kf.em(X=val)
  149. elif self.optimize is False:
  150. pass
  151. else:
  152. raise ValueError('Optimize parameter can only be True or False (Bool). Passed type is {}'.format(type(self.optimize)))
  153. if self.prediction == 'filter':
  154. self.state_means, self.state_covs = self.kf.filter(self.data.values)
  155. elif self.prediction == 'smooth':
  156. self.state_means, self.state_covs = self.kf.smooth(self.data.values)
  157. else:
  158. raise ValueError('Type of prediction can only be either "filter" or "smooth". Kindly check "prediction_type" parameter.')
  159. pass
  160. def __data_transform__(self):
  161. """ Calculates observation_matrix and transition_matrix state space parameters.
  162. Returns:
  163. None, Stores Observation_matrix and Transition_matrix in self.observation_matrices and self.transition_covariance instances.
  164. """
  165. self.obs_mat = np.vstack([self.data.values, np.ones(self.data.shape)]).T[:, np.newaxis]
  166. self.trans_cov = self.delta / (1 - self.delta) * np.eye(2)
  167. self.transition_covariance=self.trans_cov
  168. self.observation_matrices=self.obs_mat
  169. pass
  170. def __plot__(self):
  171. """ Plots the results and saves it to file
  172. Plots the results. Plots Intercept and slope at each point, Plots original values and predicted values, and
  173. calculates difference between actual and predictions and stores in difference.csv
  174. Returns:
  175. None, Stores the output to difference.csv, and in instance variable named self.outDF
  176. """
  177. if self.plot:
  178. ax1 = pd.DataFrame(dict(slope=self.state_means[:, 0], intercept=self.state_means[:, 1]), index=self.data['timestamp']).plot(subplots=True)
  179. plt.tight_layout()
  180. plt.savefig(self.plot_path + "slope and intercept.png")
  181. plt.clf()
  182. # tmpdata = self.data.values
  183. # for i in range(len(tmpdata)):
  184. # print(tmpdata[i] - self.state_means[i], tmpdata[i], self.state_means[i])
  185. slope = self.state_means[:,0]
  186. intercept = self.state_means[:,1]
  187. X = self.data
  188. Y = []
  189. for i in range(1, X.values.shape[0]):
  190. Y.append(slope[i-1]*X.values[i-1] + intercept[i-1])
  191. # for i in range(X.values.shape[0]):
  192. # Y.append(slope[i]*X.values[i] + intercept[i])
  193. Y = np.array(Y)
  194. if self.plot:
  195. ax2 = pd.DataFrame(dict(preds=Y, actual=X.values[1:]), index=self.data['timestamp'][1:]).plot(subplots=True)
  196. plt.tight_layout()
  197. plt.savefig(self.plot_path + "prediction and actual.png")
  198. plt.clf()
  199. # self.difference = X.values - Y
  200. self.difference = X.values[1:] - Y
  201. if self.plot:
  202. ax3 = plt.plot(self.difference)
  203. plt.savefig(self.plot_path + "difference.png")
  204. plt.clf()
  205. # self.X = X.iloc[0:]
  206. self.X = X.iloc[1:]
  207. # Replace 0s in difference with very low number so that score can be calculated. This is known issue with algo.
  208. leng = self.difference.shape[0]
  209. rand = random(leng)
  210. rand *= 1e-3
  211. np.place(self.difference, self.difference == 0, rand)
  212. writeDF = pd.DataFrame(data=dict(actual=self.X.values, prediction=Y, difference=self.difference))
  213. writeDF.to_csv(self.plot_path + "difference.csv")
  214. self.outDF = writeDF
  215. pass
  216. def get_forecast(self):
  217. """ Returns final output
  218. Returns:
  219. self.outDF (Pandas Dataframe): Returns the results from Kalman Filters stored in self.outDF
  220. """
  221. return self.outDF
  222. def main():
  223. date_start = datetime.datetime(2015, 11, 20)
  224. date_end = datetime.datetime(2015, 11, 21)
  225. window = 50
  226. prediction_type = 'filter'
  227. optimize = False
  228. params = {'window': window,
  229. 'optimize': optimize,
  230. 'prediction_type': prediction_type,
  231. 'date_start': date_start,
  232. 'date_end': date_end}
  233. obj = UnivariateKalmanFilter
  234. obj(params=params)
  235. if __name__ == '__main__':
  236. main()