FastFourierTransform.py

/AnomalyDetection/Univariate/FastFourierTransform.py

https://gitlab.com/debasishk/PerformanceTest-Monitoring
Python | 328 lines | 312 code | 6 blank | 10 comment | 1 complexity | ba2797317a82f08fe1e3ced296f28ae3 MD5 | raw file

# Project: Predictive Analytics
# Author: Debasish Kanhar
# UserID: BC03421

import datetime
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from utils.mkdir import mkdir


# Hex colour codes used for plotting. FastFourierTransform draws the original
# signal with the first entry and the detected outliers with the last one.
COLOR_PALETTE = [
    "#348ABD",
    "#A60628",
    "#7A68A6",
    "#467821",
    "#CF4457",
    "#188487",
    "#E24A33",
]

# Default output prefix for saved plots (relative, Windows-style separators).
# Every backslash is escaped explicitly: sequences such as "\K" or "\A" are not
# valid string escapes and only worked because Python left them untouched.
plt_pth = 'Results\\KalmanFilters\\All\\FFT\\'


class FastFourierTransform:
    """Outlier detection for a univariate signal based on the Fast Fourier Transform.

    The signal is scanned slice by slice (see ``FastFourierTransformation``) and
    each slice is tested by ``__detect_outlier_position_by_fft__``.  After a run
    the results are available as:

    * ``self.scores``   -- Pandas DataFrame with one ``<metric>_FFT_Score`` column,
      indexed by the timestamps of the input.
    * ``self.outliers`` -- Pandas DataFrame with a single ``outliers`` column
      holding the values flagged as outliers.
    """

    def __init__(self, thresh_freq, freq_amp, window=10, forecaster='KalmanFilters',
                 out_path=plt_pth, data=None, test_data=False, plot=False):
        """ __init__ method for FastFourierTransform class based Outlier Detection

        Args:
            thresh_freq (int): Index of the first FFT bin that is checked against ``freq_amp``.
                               It is used as a slice start, so non-integer values are truncated.
            freq_amp (float): Frequency amplitude hyper parameter for input signal wave
            window (int): The window size to consider. Min and Max signal are found in this window as outliers
            forecaster (str): The name of forecasting algorithm used to predict values.
                              Only used to build the plot output path.
            out_path (str): The output path prefix where plots are stored
            data (Pandas Dataframe, Optional): The input data where Outliers are to be found out.
                              Defaults to an empty DataFrame, in which case nothing is computed.
            test_data (bool): Defaults to False. If model to be used for Unit Tests, explicitly
                              specify this variable as True so the constructor does not run the detection.
            plot (bool, Optional [Default False]): Flag variable to mention whether to plot results & save it or not.

        Returns:
            None
        """
        # Build the default per call instead of sharing one DataFrame object
        # between every instance through the function signature.
        if data is None:
            data = pd.DataFrame()

        self.forecaster = forecaster
        self.window = window
        self.data = data
        self.threshold_freq = thresh_freq
        self.frequency_amplitude = freq_amp
        self.plot = plot

        self.COLOR_PALETTE = COLOR_PALETTE

        # Timestamp format expected by __get_start_end_index__.
        self.fmt = "%m-%d-%y %H:%M"

        # Nothing in this class assigns ``metric`` before this point, so the first
        # branch is only taken when the attribute is provided from outside
        # (for example as a class attribute of a subclass).
        if hasattr(self, 'metric'):
            self.plot_path = out_path + '{}\\FFT\\thresh-freq-{}-freq-amp-{}-win-{}_t\\'.format(
                self.metric, thresh_freq, freq_amp, self.window)
        else:
            self.plot_path = out_path + 'plot\\Outliers\\{}\\FFT\\test\\thresh-freq-{}-freq-amp-{}-win-{}_t\\complete\\'.format(
                self.forecaster, thresh_freq, freq_amp, self.window)

        # Run the detection right away only for real (non-test), non-empty input.
        if not test_data and len(data.index):
            self.__driver__()

    def apply_(self, data=None):
        """
        Applies Fast Fourier transform algorithm to input dataset to find out Outliers and return scores of each point.

        Args:
            data (Pandas DataFrame): Input dataset on which outlier scores based on FFT algorithm to be found out.

        Returns:
            scores (Pandas DataFrame): FFT based outlier scores for each point stored in Pandas DataFrame indexed by timestamp.

        Raises:
            ValueError: If ``data`` is missing or empty.
        """
        self.data = pd.DataFrame() if data is None else data

        self.__driver__()

        scores, _ = self.return_scores_and_outliers()
        return scores

    def __driver__(self):
        """ Driver method for FastFourierTransform class

        Extracts the metric column from ``self.data`` and runs the detection.
        The plot directory is not created here.

        Returns:
            None
        """
        self.__get_data__()
        self.FastFourierTransformation()

    def test(self, data=None, columns=None):
        """
        Test the Fast Fourier Transform module. This method is used for unit tests.

        Args:
            data (pandas Dataframe): The data to be tested against.
                                    It can either contain a 'timestamp' column next to the metric columns,
                                    or only metric columns with the timestamps as index.
            columns (list of str, Optional): Metric columns to score. Defaults to every
                                    column of ``data`` except 'timestamp'.

        Returns:
            test_output (Pandas DataFrame): One ``<column>_FFT_Score`` column per scored metric.

        Raises:
            ValueError: If ``data`` is None or there is no column to score.
        """
        if data is None:
            raise ValueError('Empty Pandas dataframe passed as input data. Kindly check your input data')

        if 'timestamp' in data.columns.tolist():
            self.data_ts = data['timestamp'].values
        else:
            self.data_ts = data.index.tolist()

        if columns is None or len(columns) == 0:
            # 'timestamp' is the time axis, not a metric, so it is never scored by default.
            columns = [col for col in data.columns.tolist() if col != 'timestamp']

        score_frames = []
        scored = set()
        for col in columns:
            if col in scored:
                # A column listed twice would only duplicate an identical score column.
                continue
            scored.add(col)

            self.metric = col
            self.data = data[self.metric].values
            self.FastFourierTransformation()

            # reset_index() returns a new frame whose former (unnamed) index becomes
            # the 'index' column; self.scores itself keeps its timestamp index.
            score_frames.append(self.scores.reset_index())

        if not score_frames:
            raise ValueError('No metric column found to score. Kindly check your input data')

        # Join the per-metric frames on the shared 'index' column.  A plain loop is
        # used because ``reduce`` is not a builtin on Python 3.
        output = score_frames[0]
        for frame in score_frames[1:]:
            output = pd.merge(output, frame, on='index')

        return output.drop('index', axis=1)

    def __create_dirs__(self, dirname):
        """ Create result directory to save results and not stop by throwing IOException.

        Args:
            dirname (str): The pathname to directory which is to be created

        Returns:
            None
        """
        if os.path.exists(dirname):
            return
        try:
            os.makedirs(dirname)
        except OSError:
            # Another process may have created the path between the check and
            # the call; only propagate the error if it is still missing.
            if not os.path.exists(dirname):
                raise

    def __get_start_end_index__(self):
        """ Fetches the start and end index of data depending on start_date and end_date timestamps.

        This method reads ``self.date_start``, ``self.date_end`` and expects
        ``self.data_ts`` to be a DataFrame with a string 'timestamp' column in
        ``self.fmt`` format.  None of these is assigned by this class, so they
        must be set on the instance before calling it.

        Returns:
            start_index (int): The start index for fetching data

            end_index (int): The end index for fetching data

        Raises:
            ValueError: If ``self.data`` is empty, or a date is not present in the timestamps.
        """
        # ``shape`` is a non-empty tuple even for empty data, so it cannot be used
        # as an emptiness test; check the length explicitly instead.
        if self.data is None or len(self.data) == 0:
            raise ValueError('Empty Pandas dataframe passed as input data. Kindly check your input data')

        ts = [datetime.datetime.strptime(elem, self.fmt)
              for elem in self.data_ts['timestamp'].tolist()]

        start_index = ts.index(self.date_start)
        end_index = ts.index(self.date_end)

        # Replace the string column by the parsed datetime objects.
        self.data_ts.drop(['timestamp'], axis=1, inplace=True)
        self.data_ts['timestamp'] = ts

        # NOTE(review): set_index() returns a new frame and the result is discarded,
        # so self.data_ts keeps its current index.  Confirm whether re-indexing by
        # timestamp was intended before changing this.
        self.data_ts.set_index(['timestamp'])

        return start_index, end_index

    def __get_data__(self):
        """ Converts input data of type Pandas Dataframe object to numpy array

        The first column that is not named 'timestamp' is used as the metric.

        Returns:
            None, stores final extracted data in instance variables.

            self.metric (str): Name of the metric column.

            self.data_ts (list): Index values of the input frame.

            self.data (numpy.array): The data on which Outliers to be found out.

        Raises:
            ValueError: If the frame is empty or has no metric column.
        """
        # Validate first so that an empty frame yields the documented ValueError
        # instead of an IndexError from the column lookup below.
        if not len(self.data.index):
            raise ValueError('Empty Pandas dataframe passed as input data. Kindly check your input data')

        # Strings must be compared by value ('!='), not by identity ('is not').
        metric_columns = [col for col in self.data.columns.tolist() if col != 'timestamp']
        if not metric_columns:
            raise ValueError('No metric column found in input data. Kindly check your input data')

        self.metric = metric_columns[0]
        self.data_ts = self.data.index.tolist()
        self.data = self.data[self.metric].values

    def __detect_outlier_position_by_fft__(self, signal):
        """ Detects the outliers and its positions in data (signal) passed to this method.

        The candidate is whichever of max/min of the signal has the larger absolute
        value.  It is reported when both conditions hold:

        * some FFT magnitude from bin ``threshold_freq`` onwards exceeds ``frequency_amplitude``
        * ``|candidate| >= (|max| - |min|) * frequency_amplitude``

        Args:
            signal (numpy array, dtype: float): The non-empty slice of the signal to inspect.

        Returns:
            None, if no outliers found. Else,

            index_of_outlier (numpy array of int): Positions inside ``signal`` that equal the candidate value.
        """
        signal = np.asarray(signal)
        fft_of_signal = np.fft.fft(signal)

        largest = np.max(signal)
        smallest = np.min(signal)
        outlier = largest if abs(largest) > abs(smallest) else smallest

        # Slice bounds have to be integers; the threshold is documented as a
        # number, so truncate it rather than fail on a float.
        first_bin = int(self.threshold_freq)
        spectrum_exceeds = np.any(np.abs(fft_of_signal[first_bin:]) > self.frequency_amplitude)
        value_exceeds = np.abs(outlier) >= (abs(largest) - abs(smallest)) * self.frequency_amplitude

        if spectrum_exceeds and value_exceeds:
            return np.where(signal == outlier)[0]
        return None

    def get_scores(self):
        """ Returns the score of each data point.

        The score of a point is the magnitude of the FFT of that single point.
        The transform of a length-1 sequence is the sequence itself, so the
        score equals the absolute value of the sample and is computed for the
        whole array at once.

        Returns:
            scores (list of floats): The score of each point. Usually no threshold, but referenced by higher value tends to be outlier
        """
        # Converting to complex first mirrors what np.fft.fft does with its input.
        samples = np.asarray(self.data).astype(complex)
        return list(np.abs(samples))

    def FastFourierTransformation(self):
        """ This method finds outliers based on FastFourierTransform algorithm.

        For every position ``ii`` from ``window`` to the end of the data, the slice
        ``data[ii - window:ii + window]`` is passed to
        ``__detect_outlier_position_by_fft__``.  The first reported position of each
        slice is translated back to an index in the full signal.  If ``self.plot`` is
        set, the results are plotted and saved on a best-effort basis.

        Requires ``self.data`` (numpy array), ``self.data_ts`` and ``self.metric`` to be set.

        Returns:
            None, stores results in following instance variables.

            self.scores (Pandas Dataframe): Score of each point, indexed by ``self.data_ts``

            self.outliers (Pandas Dataframe): Stores the outliers detected in dataset, ordered by position
        """
        scores = self.get_scores()

        positions = set()
        for ii in range(self.window, self.data.size):
            found = self.__detect_outlier_position_by_fft__(
                self.data[ii - self.window:ii + self.window])
            if found is not None and len(found):
                positions.add(int(ii + found[0] - self.window))

        # Overlapping slices report the same point several times; keep each
        # position once and in a deterministic order.
        outlier_positions = sorted(positions)

        self.scores = pd.DataFrame(data=np.array(scores), index=self.data_ts,
                                   columns=['{}_FFT_Score'.format(self.metric)])

        outliers = [self.data[ind] for ind in outlier_positions]

        if self.plot:
            self._plot_results(outlier_positions, outliers)

        self.outliers = pd.DataFrame(data=np.array(outliers), columns=['outliers'])

    def _plot_results(self, outlier_positions, outliers):
        """ Plot the signal with its outliers and save the figures under ``self.plot_path``.

        Plotting is best effort: any failure (for example a missing output
        directory) is reported as a warning and never interrupts the detection.

        Args:
            outlier_positions (list of int): Indices of the outliers in ``self.data``
            outliers (list): Values of ``self.data`` at ``outlier_positions``

        Returns:
            None
        """
        fig = plt.figure(figsize=(12, 6))
        try:
            try:
                plt.scatter(range(self.data.size), self.data,
                            c=self.COLOR_PALETTE[0], label='Original Signal')
                if outlier_positions:
                    plt.scatter(outlier_positions, outliers,
                                c=self.COLOR_PALETTE[-1], label='Outliers')
                plt.legend()
                plt.savefig(self.plot_path + "outlier_positions.png")
            except Exception as err:
                warnings.warn('Could not save outlier positions plot: {}'.format(err),
                              RuntimeWarning)
            plt.clf()

            # The second figure is only produced when at least one outlier was found.
            if outlier_positions:
                try:
                    plt.plot(self.data)
                    plt.scatter(outlier_positions, outliers)
                    plt.savefig(self.plot_path + "outliers with data.png")
                except Exception as err:
                    warnings.warn('Could not save outliers with data plot: {}'.format(err),
                                  RuntimeWarning)
        finally:
            # Release the figure so repeated runs do not accumulate open figures.
            plt.close(fig)

    def return_scores_and_outliers(self):
        """ Returns scores of each data point and outliers in dataset.

        Returns:
            self.scores (Pandas Dataframe): The score of each point. Usually no threshold, but referenced by higher value tends to be outlier

            self.outliers (Pandas Dataframe): Stores the outliers detected in dataset
        """
        return self.scores, self.outliers


def main():
    """Construct a FastFourierTransform detector with sample hyper-parameters.

    No input data is passed, so the constructor only stores the settings and
    does not run the detection.

    Returns:
        None
    """
    threshold_freq = 1
    frequency_amplitude = 2

    # The constructor signature starts with (thresh_freq, freq_amp, ...) and has
    # no date parameters; passing dates first would bind them to the
    # hyper-parameters and shift the real values onto window/forecaster.
    FastFourierTransform(threshold_freq, frequency_amplitude)