# KalmanFilters.py | searchcode
#
# /AnomalyDetection/Smoother/KalmanFilters.py
#
# https://gitlab.com/debasishk/PerformanceTest-Monitoring
# Python | 310 lines | 262 code | 17 blank | 31 comment | 9 complexity | 7106dd7e99bb43f012b2a7fda7f55137 MD5 | raw file

# Project: Predictive Analytics
# Author: Debasish Kanhar
# UserID: BC03421


import numpy as np
from externals.pykalman import KalmanFilter
import matplotlib.pyplot as plt
import pandas as pd
import datetime, platform, os
from numpy.random import random
from utils.mkdir import mkdir


class UnivariateKalmanFilter:
    """Kalman-filter based smoother/predictor for a single metric.

    The processing pipeline is ``__driver__`` -> ``__get_data__`` ->
    ``KalmanFilter`` -> ``__plot__``; the resulting frame (actual, prediction,
    difference) is stored in ``self.outDF`` and returned by ``get_forecast()``.
    """

    def __init__(self, params, out_path=None, data=None, test_data=False, plot=False):
        """
        Initialise state-space defaults, read the run parameters and prepare the output directory.

        Args:
            params (Dictionary): Run parameters. Required keys: 'window', 'prediction_type'
                ('filter' or 'smooth') and 'optimize' (bool). Optional key: 'metric'.
            out_path (str): Base output path. Only used when 'metric' is present in params;
                otherwise results go to 'Results//KF//'.
            data (Pandas Dataframe): The data to run the filter on. Defaults to None, which is
                treated as an empty DataFrame (no processing is started).
            test_data (bool, Optional [Default False]): If True, the driver is not started from
                the constructor even when data is supplied (used by unit tests, see test()).
            plot (bool, Optional [Default False]): Whether plots are drawn and saved.

        Returns:
            None
        """

        # A fresh frame per call; a DataFrame literal in the signature would be shared by all calls.
        if data is None:
            data = pd.DataFrame()

        # ----------------------------------------------------------------------------------------------------------- #

        # Initialize state space variables for Kalman Filters.
        # NOTE(review): only n_dim_obs and n_dim_state are handed to pykalman in KalmanFilter();
        # the remaining attributes are stored but not passed on. n_dim_state is 1 while the other
        # defaults are 2-dimensional and __plot__ reads two columns of state_means -- confirm the
        # intended state dimension.
        self.delta = 1e-5
        self.n_dim_obs = 1
        self.n_dim_state = 1
        self.initial_state_mean = np.zeros(2)
        self.initial_state_covariance = np.ones((2, 2))
        self.transition_matrices = np.eye(2)
        self.observation_covariance = 1.0

        # ----------------------------------------------------------------------------------------------------------- #

        self.window = params['window']  # stored only; not read anywhere else in this class
        self.prediction = params['prediction_type']
        self.optimize = params['optimize']
        self.tmpData = data

        # ----------------------------------------------------------------------------------------------------------- #

        self.plot = plot
        self._configure_output(params, out_path)

        # ----------------------------------------------------------------------------------------------------------- #
        mkdir(self.plot_path)

        # When not calling for unit tests, and input data is passed to class initialization
        if not test_data and len(data.index):
            self.__driver__()

    def _configure_output(self, params, out_path):
        """Set self.metric (when supplied in params) and derive self.plot_path from it.

        Args:
            params (Dictionary): Run parameters; the optional 'metric' key is looked up here.
            out_path (str): Base output path, or None to fall back to 'Results//'.

        Returns:
            None
        """

        # params is a dict, so the key must be tested with `in`; hasattr() on a dict
        # looks for an attribute and never finds the key.
        if 'metric' in params:
            self.metric = params['metric']

        if hasattr(self, 'metric'):
            base = out_path if out_path is not None else 'Results//'
            self.plot_path = base + 'plot//Forecast//KalmanFilters//{}_({})//'.format(self.metric, self.prediction)
        else:
            # Without a metric name the results go to a fixed folder; out_path is not used here.
            self.plot_path = 'Results//KF//'

    def apply_(self, data=None):
        """
        Applies Kalman Filters to input dataset to find out predicted values

        Args:
            data (Pandas DataFrame): Input dataset to be smoothed. Defaults to None, which is
                treated as an empty DataFrame.

        Returns:
            outDF (Pandas DataFrame): The resultant dataframe, storing 'actual', 'prediction' and 'difference' columns
        """

        # Assign input dataset to self.tmpData instance (fresh empty frame when nothing is passed)
        self.tmpData = pd.DataFrame() if data is None else data

        # Call the driver method to start KalmanFilters
        self.__driver__()

        # Call get_forecast() method to get predictions
        return self.get_forecast()

    def __driver__(self):
        """
        Driver method for UnivariateKalmanFilter class: load data, run the filter, write results.

        Returns:
            None
        """

        self.__get_data__()
        self.KalmanFilter()
        self.__plot__()

    def test(self, data):
        """
        Test the Kalman Filters module. This method is used for unit tests.

        Args:
            data (pandas Dataframe): The data to be tested against. It must have exactly one column,
                                    which holds the metric values. Rows with NaN are dropped.

        Returns:
            test_output (pandas Dataframe): Dataframe with the actual value, the predicted value and
                                            the difference between actual and predicted value.

        Raises:
            TypeError: If the passed dataframe does not have exactly one column.
        """

        if data.shape[1] != 1:
            raise TypeError('Wrong type of Pandas Dataframe passed. Kindly check input data. '
                            'Passed shape is {}'.format(data.shape))

        # Work on a NaN-free copy so the caller's dataframe is left untouched.
        self.data = data.dropna()
        # Compare column names by value; identity comparison against a string literal is unreliable.
        self.metric = [col for col in self.data.columns.tolist() if col != 'timestamp'][0]

        self.KalmanFilter()
        self.__plot__()

        return self.outDF

    def __get_platform__(self):
        """ Gets the platform of machine you are running your module on, and saves it at 'self.platform'

        Returns:
            None, stores platform type in self.platform instance
        """

        self.platform = platform.system()

    def __create_dirs__(self):
        """ Create the directory named by self.plot_path if it does not exist yet.

        On Linux, backslashes in the path are converted to forward slashes first.

        Returns:
            None
        """

        dirname = self.plot_path
        self.__get_platform__()
        if 'Linux' in self.platform:
            dirname = dirname.replace("\\", "/")

        if not os.path.exists(dirname):
            os.makedirs(dirname)

    def __get_data__(self):
        """ Make the data passed to the class (self.tmpData) the working dataset.

        Returns:
            None, Stores final dataframe object in self.data instance
        """
        self.data = self.tmpData

    def KalmanFilter(self):
        """ Implements Kalman Filters to find predicted values for Data passed.

        Calls __data_transform__ first, builds a pykalman KalmanFilter from self.n_dim_obs and
        self.n_dim_state, optionally optimizes its parameters with EM (self.optimize) and then
        runs either filtering or smoothing depending on self.prediction.

        Returns:
            None, Stores results in self.state_means and self.state_covs.

        Raises:
            ValueError: If self.optimize is falsy but not the bool False, or if self.prediction
                is neither 'filter' nor 'smooth'.
        """

        self.__data_transform__()

        # Only the dimensions are passed on; the matrices prepared in __init__ and
        # __data_transform__ are not handed to the filter.
        self.kf = KalmanFilter(n_dim_obs=self.n_dim_obs,
                               n_dim_state=self.n_dim_state)

        if self.optimize:
            self.kf = self.kf.em(X=self.data.values)
        elif self.optimize is not False:
            raise ValueError('Optimize parameter can only be True or False (Bool). Passed type is {}'.format(type(self.optimize)))

        observations = self.data.values
        if self.prediction == 'filter':
            self.state_means, self.state_covs = self.kf.filter(observations)
        elif self.prediction == 'smooth':
            self.state_means, self.state_covs = self.kf.smooth(observations)
        else:
            raise ValueError('Type of prediction can only be either "filter" or "smooth". Kindly check "prediction_type" parameter.')

    def __data_transform__(self):
        """ Calculates observation matrix and transition covariance state space parameters.

        Returns:
             None, Stores the results in self.obs_mat / self.observation_matrices and
             self.trans_cov / self.transition_covariance.
        """
        # Stack the data on top of a block of ones, transpose, and insert a new axis at position 1.
        self.obs_mat = np.vstack([self.data.values, np.ones(self.data.shape)]).T[:, np.newaxis]
        self.trans_cov = self.delta / (1 - self.delta) * np.eye(2)

        self.transition_covariance = self.trans_cov
        self.observation_matrices = self.obs_mat

    def __plot__(self):
        """ Computes predictions and differences, optionally plots them, and saves the results.

        Column 0 of self.state_means is used as slope and column 1 as intercept. For every
        position i >= 1 the prediction is slope[i-1] * value[i-1] + intercept[i-1]; the
        difference is actual minus prediction. Plots are only produced when self.plot is set,
        difference.csv is always written to self.plot_path.

        Returns:
            None, Stores the output to difference.csv, and in instance variable named self.outDF
        """

        slope = self.state_means[:, 0]
        intercept = self.state_means[:, 1]
        X = self.data
        values = X.values  # fetched once instead of on every loop iteration

        if self.plot:
            pd.DataFrame(dict(slope=slope, intercept=intercept), index=self.data['timestamp']).plot(subplots=True)
            plt.tight_layout()
            plt.savefig(self.plot_path + "slope and intercept.png")
            plt.clf()

        # Prediction for position i+1 is built from the state and the value at position i,
        # so the first observation has no prediction.
        Y = np.array([slope[i] * values[i] + intercept[i] for i in range(values.shape[0] - 1)])

        if self.plot:
            pd.DataFrame(dict(preds=Y, actual=values[1:]), index=self.data['timestamp'][1:]).plot(subplots=True)
            plt.tight_layout()
            plt.savefig(self.plot_path + "prediction and actual.png")
            plt.clf()

        self.difference = values[1:] - Y

        if self.plot:
            plt.plot(self.difference)
            plt.savefig(self.plot_path + "difference.png")
            plt.clf()

        self.X = X.iloc[1:]

        # Replace 0s in difference with very low number so that score can be calculated. This is known issue with algo.
        noise = random(self.difference.shape[0]) * 1e-3
        np.place(self.difference, self.difference == 0, noise)

        writeDF = pd.DataFrame(data=dict(actual=self.X.values, prediction=Y, difference=self.difference))
        writeDF.to_csv(self.plot_path + "difference.csv")

        self.outDF = writeDF

    def get_forecast(self):
        """ Returns final output

        Returns:
            self.outDF (Pandas Dataframe): Returns the results from Kalman Filters stored in self.outDF
        """

        return self.outDF


def main():
    """Build a sample parameter set and instantiate UnivariateKalmanFilter without data.

    No data is passed, so the constructor only sets up its configuration and
    output directory; the filter itself is not run.
    """
    sample_params = {
        'window': 50,
        'optimize': False,
        'prediction_type': 'filter',
        'date_start': datetime.datetime(2015, 11, 20),
        'date_end': datetime.datetime(2015, 11, 21),
    }

    UnivariateKalmanFilter(params=sample_params)


# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
    main()