/AnomalyDetection/Smoother/KalmanFilters.py
Python | 310 lines | 262 code | 17 blank | 31 comment | 9 complexity | 7106dd7e99bb43f012b2a7fda7f55137 MD5 | raw file
- # Project: Predictive Analytics
- # Author: Debasish Kanhar
- # UserID: BC03421
- import numpy as np
- from externals.pykalman import KalmanFilter
- import matplotlib.pyplot as plt
- import pandas as pd
- import datetime, platform, os
- from numpy.random import random
- from utils.mkdir import mkdir
class UnivariateKalmanFilter:
    """Kalman-filter based predictor for a single metric.

    The class runs the project's ``KalmanFilter`` (imported from
    ``externals.pykalman``) over the supplied data, derives one-step-ahead
    predictions from the resulting state means and stores an
    actual / prediction / difference table in ``self.outDF`` (also written to
    ``difference.csv`` under ``self.plot_path``).

    NOTE(review): the filter is constructed with ``n_dim_state=1`` while
    ``__plot__`` reads columns 0 *and* 1 of ``self.state_means`` (slope and
    intercept). Confirm that the bundled pykalman build returns a two-column
    state for this configuration.
    """

    def __init__(self, params, out_path=None, data=None, test_data=False, plot=False):
        """Store the configuration, prepare the output directory and optionally run.

        Args:
            params (dict): Settings keyed by name. ``'window'``,
                ``'prediction_type'`` and ``'optimize'`` are required;
                ``'metric'`` is optional and, when present, is used in the
                output path.
            out_path (str, optional): Root output path to which the
                plot-specific sub-path is appended.
            data (pandas.DataFrame, optional): Data to run the filter on.
                ``None`` (the default) is treated as an empty DataFrame.
            test_data (bool, optional): When True the driver is not run from
                the constructor (used by unit tests, which call ``test()``).
            plot (bool, optional): Whether plots are drawn and saved.

        Returns:
            None
        """
        # A DataFrame default in the signature would be one object shared by
        # every call; create a fresh empty frame per call instead.
        if data is None:
            data = pd.DataFrame()

        # ------------------------------------------------------------------ #
        # State space variables for the Kalman filter.
        # Only n_dim_obs and n_dim_state are passed to the filter in
        # KalmanFilter(); the remaining values are kept as attributes.
        self.delta = 1e-5
        self.n_dim_obs = 1
        self.n_dim_state = 1
        self.initial_state_mean = np.zeros(2)
        self.initial_state_covariance = np.ones((2, 2))
        self.transition_matrices = np.eye(2)
        self.observation_covariance = 1.0
        # ------------------------------------------------------------------ #
        self._load_params(params)
        self.tmpData = data
        # ------------------------------------------------------------------ #
        self.plot = plot
        self.plot_path = self._resolve_plot_path(out_path)
        mkdir(self.plot_path)
        # ------------------------------------------------------------------ #
        # Outside unit tests, run immediately when data was supplied.
        if not test_data and len(data.index):
            self.__driver__()

    def _load_params(self, params):
        """Copy the recognised entries of ``params`` onto the instance.

        ``self.metric`` is only set when ``params`` contains a ``'metric'``
        key (a key lookup, not an attribute lookup, since ``params`` is a
        dictionary).
        """
        self.window = params['window']
        self.prediction = params['prediction_type']
        self.optimize = params['optimize']
        if 'metric' in params:
            self.metric = params['metric']

    def _resolve_plot_path(self, out_path):
        """Return the directory (as a string prefix) where results are saved.

        Without a metric the fixed path ``'Results//KF//'`` is used. With a
        metric, the path is rooted at ``out_path`` when given, else at
        ``'Results//'``.
        """
        if not hasattr(self, 'metric'):
            return 'Results//KF//'
        root = out_path if out_path is not None else 'Results//'
        return root + 'plot//Forecast//KalmanFilters//{}_({})//'.format(self.metric, self.prediction)

    def apply_(self, data=None):
        """Run the filter on ``data`` and return the prediction table.

        Args:
            data (pandas.DataFrame, optional): Input dataset. ``None`` (the
                default) is treated as an empty DataFrame.

        Returns:
            pandas.DataFrame: ``self.outDF`` with 'actual', 'prediction' and
            'difference' columns.
        """
        if data is None:
            data = pd.DataFrame()
        # Hand the dataset to the driver through self.tmpData.
        self.tmpData = data
        self.__driver__()
        return self.get_forecast()

    def __driver__(self):
        """Run the full pipeline: load data, filter, then plot/save results.

        Returns:
            None
        """
        self.__get_data__()
        self.KalmanFilter()
        self.__plot__()

    def test(self, data):
        """Run the filter on a one-column DataFrame; used by unit tests.

        Args:
            data (pandas.DataFrame): Data to test against. Must have exactly
                one column. Rows with missing values are dropped on a copy,
                so the caller's frame is left untouched.

        Returns:
            pandas.DataFrame: ``self.outDF`` holding the actual value, the
            predicted value and their difference.

        Raises:
            TypeError: If ``data`` does not have exactly one column.
        """
        if data.shape[1] != 1:
            raise TypeError(
                'Wrong type of Pandas Dataframe passed. Kindly check input data. '
                'Passed shape is {}'.format(data.shape))
        self.data = data.dropna()
        # Compare by value: identity comparison against a string literal is
        # not a reliable way to test string equality.
        self.metric = [col for col in self.data.columns.tolist() if col != 'timestamp'][0]
        self.KalmanFilter()
        self.__plot__()
        return self.outDF

    def __get_platform__(self):
        """Store ``platform.system()`` in ``self.platform``.

        Returns:
            None
        """
        self.platform = platform.system()

    def __create_dirs__(self):
        """Create the results directory ``self.plot_path`` if it is missing.

        On Linux, backslashes in the path are replaced by forward slashes
        first; on Windows the path is used unchanged.

        Returns:
            None
        """
        dirname = self.plot_path
        self.__get_platform__()
        if 'Windows' not in self.platform and 'Linux' in self.platform:
            dirname = dirname.replace("\\", "/")
        if not os.path.exists(dirname):
            os.makedirs(dirname)

    def __get_data__(self):
        """Expose the data handed to the class as ``self.data``.

        Returns:
            None
        """
        self.data = self.tmpData

    def KalmanFilter(self):
        """Fit the Kalman filter and compute the state estimates.

        Calls ``__data_transform__`` first, builds the filter from
        ``self.n_dim_obs`` / ``self.n_dim_state``, optionally tunes it with
        ``em`` when ``self.optimize`` is truthy, and then runs either
        ``filter`` or ``smooth`` depending on ``self.prediction``.

        Returns:
            None. Results are stored in ``self.state_means`` and
            ``self.state_covs``.

        Raises:
            ValueError: If ``self.optimize`` is falsy but not ``False``, or if
                ``self.prediction`` is neither 'filter' nor 'smooth'.
        """
        self.__data_transform__()
        # NOTE(review): only the dimensions are passed to the filter; the
        # matrices prepared in __init__ / __data_transform__ are not used
        # here. Confirm this is intended.
        self.kf = KalmanFilter(n_dim_obs=self.n_dim_obs,
                               n_dim_state=self.n_dim_state)
        observations = self.data.values

        if self.optimize:
            self.kf = self.kf.em(X=observations)
        elif self.optimize is not False:
            raise ValueError('Optimize parameter can only be True or False (Bool). Passed type is {}'.format(type(self.optimize)))

        if self.prediction == 'filter':
            self.state_means, self.state_covs = self.kf.filter(observations)
        elif self.prediction == 'smooth':
            self.state_means, self.state_covs = self.kf.smooth(observations)
        else:
            raise ValueError('Type of prediction can only be either "filter" or "smooth". Kindly check "prediction_type" parameter.')

    def __data_transform__(self):
        """Compute the observation matrix and transition covariance.

        The observation matrix stacks the data values on top of an array of
        ones, transposes the result and inserts a new axis at position 1. The
        transition covariance is ``delta / (1 - delta)`` times a 2x2 identity.

        Returns:
            None. Results are stored in ``self.obs_mat`` /
            ``self.observation_matrices`` and ``self.trans_cov`` /
            ``self.transition_covariance``.
        """
        self.obs_mat = np.vstack([self.data.values, np.ones(self.data.shape)]).T[:, np.newaxis]
        self.trans_cov = self.delta / (1 - self.delta) * np.eye(2)
        self.transition_covariance = self.trans_cov
        self.observation_matrices = self.obs_mat

    def __plot__(self):
        """Build the prediction table, save it, and optionally save plots.

        Column 0 of ``self.state_means`` is used as slope and column 1 as
        intercept. The prediction for row ``i`` (``i >= 1``) is
        ``slope[i-1] * x[i-1] + intercept[i-1]``; the first row has no
        prediction and is dropped from the output.

        When ``self.plot`` is set, three PNG files are saved under
        ``self.plot_path`` (the first two are indexed by
        ``self.data['timestamp']``). ``difference.csv`` is always written.

        Returns:
            None. The table is stored in ``self.outDF``.
        """
        slope = self.state_means[:, 0]
        intercept = self.state_means[:, 1]

        if self.plot:
            pd.DataFrame(dict(slope=slope, intercept=intercept), index=self.data['timestamp']).plot(subplots=True)
            plt.tight_layout()
            plt.savefig(self.plot_path + "slope and intercept.png")
            plt.clf()

        X = self.data
        values = X.values  # fetched once instead of on every loop iteration
        # zip stops at the shorter values[:-1], i.e. it pairs rows 0..n-2.
        Y = np.array([m * x + c for m, x, c in zip(slope, values[:-1], intercept)])

        if self.plot:
            pd.DataFrame(dict(preds=Y, actual=values[1:]), index=self.data['timestamp'][1:]).plot(subplots=True)
            plt.tight_layout()
            plt.savefig(self.plot_path + "prediction and actual.png")
            plt.clf()

        self.difference = values[1:] - Y

        if self.plot:
            plt.plot(self.difference)
            plt.savefig(self.plot_path + "difference.png")
            plt.clf()

        self.X = X.iloc[1:]

        # Replace exact zeros in the difference with tiny random numbers so
        # that a score can be calculated downstream (known issue with algo).
        noise = random(self.difference.shape[0]) * 1e-3
        np.place(self.difference, self.difference == 0, noise)

        writeDF = pd.DataFrame(data=dict(actual=self.X.values, prediction=Y, difference=self.difference))
        writeDF.to_csv(self.plot_path + "difference.csv")
        self.outDF = writeDF

    def get_forecast(self):
        """Return the table produced by the last run.

        Returns:
            pandas.DataFrame: ``self.outDF``.
        """
        return self.outDF
def main():
    """Instantiate UnivariateKalmanFilter with a fixed demo configuration.

    No data is passed, so the constructor only stores the settings and
    prepares the output directory.
    """
    settings = dict(
        window=50,
        optimize=False,
        prediction_type='filter',
        date_start=datetime.datetime(2015, 11, 20),
        date_end=datetime.datetime(2015, 11, 21),
    )
    UnivariateKalmanFilter(params=settings)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()