/AnomalyDetection/Univariate/FastFourierTransform.py
Python | 328 lines | 312 code | 6 blank | 10 comment | 1 complexity | ba2797317a82f08fe1e3ced296f28ae3 MD5 | raw file
- # Project: Predictive Analytics
- # Author: Debasish Kanhar
- # UserID: BC03421
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
- import datetime
- # from utils.mkdir import mkdir
- import os
COLOR_PALETTE = [
    "#348ABD",
    "#A60628",
    "#7A68A6",
    "#467821",
    "#CF4457",
    "#188487",
    "#E24A33",
]

# Default (Windows-style) output root used when no ``out_path`` is supplied.
# Every backslash is escaped explicitly so the literal contains no invalid
# escape sequences; the resulting string is 'Results\KalmanFilters\All\FFT\'.
plt_pth = 'Results\\KalmanFilters\\All\\FFT\\'


class FastFourierTransform:
    """Flag outliers in a univariate series with a sliding-window FFT heuristic.

    The series is scanned with overlapping windows. A window contributes an
    outlier when its spectrum has a large enough component at or above a
    threshold bin AND its most extreme sample is large relative to the spread
    of the window. Per-point scores are computed separately (see
    ``get_scores``).
    """

    _EMPTY_DATA_MESSAGE = 'Empty Pandas dataframe passed as input data. Kindly check your input data'

    def __init__(self, thresh_freq, freq_amp, window=10, forecaster='KalmanFilters',
                 out_path=plt_pth, data=None, test_data=False, plot=False):
        """Store the hyper parameters and, when data is supplied, run the detection.

        Args:
            thresh_freq (int or float): Index of the first FFT bin inspected in
                each window; it is truncated to an int before slicing.
            freq_amp (float): Amplitude hyper parameter. Used both as the
                magnitude an inspected FFT bin must exceed and as the factor
                applied to the window spread in the extreme-value check.
            window (int): Half-width of the sliding window, in samples.
            forecaster (str): Name of the forecasting algorithm; only used to
                build ``plot_path``.
            out_path (str): Root of the output path where plots are saved.
            data (Pandas DataFrame): Input data. ``None`` (the default) is
                treated as an empty DataFrame.
            test_data (bool): When True, the detection is not run from the
                constructor even if ``data`` is non-empty.
            plot (bool): Whether ``FastFourierTransformation`` plots and saves
                its results.
        Returns:
            None
        """
        # A fresh frame per instance instead of one default shared by all.
        if data is None:
            data = pd.DataFrame()

        self.forecaster = forecaster
        self.window = window
        self.data = data
        self.threshold_freq = thresh_freq
        self.frequency_amplitude = freq_amp
        self.plot = plot
        self.COLOR_PALETTE = COLOR_PALETTE
        self.fmt = "%m-%d-%y %H:%M"

        # ``metric`` is not assigned before this point by this class, so the
        # first branch is only taken when a subclass (or a class attribute)
        # already provides it.
        if hasattr(self, 'metric'):
            self.plot_path = out_path + '{}\\FFT\\thresh-freq-{}-freq-amp-{}-win-{}_t\\'.format(
                self.metric, thresh_freq, freq_amp, self.window)
        else:
            self.plot_path = out_path + 'plot\\Outliers\\{}\\FFT\\test\\thresh-freq-{}-freq-amp-{}-win-{}_t\\complete\\'.format(
                self.forecaster, thresh_freq, freq_amp, self.window)

        if not test_data and len(data.index):
            self.__driver__()

    def apply_(self, data=None):
        """Run the detection on ``data`` and return the per-point scores.

        Args:
            data (Pandas DataFrame): Input dataset; its first column that is
                not named 'timestamp' is analysed. ``None`` (the default) is
                treated as an empty DataFrame.
        Returns:
            scores (Pandas DataFrame): One '<metric>_FFT_Score' column indexed
                by the index of ``data``.
        Raises:
            ValueError: If ``data`` is empty or has no metric column.
        """
        self.data = pd.DataFrame() if data is None else data
        self.__driver__()
        scores, _ = self.return_scores_and_outliers()
        return scores

    def __driver__(self):
        """Extract the metric from ``self.data`` and run the detection on it.

        Returns:
            None
        """
        self.__get_data__()
        self.FastFourierTransformation()

    def test(self, data=None, columns=None):
        """Score several columns of ``data`` in one call (used by unit tests).

        Args:
            data (pandas Dataframe): The data to be scored. If it has a
                'timestamp' column, that column labels the points; otherwise
                the DataFrame index does.
            columns (list): Columns to score. When omitted or empty, every
                column except 'timestamp' is scored.
        Returns:
            output (Pandas DataFrame): One '<column>_FFT_Score' column per
                scored column, with a default integer index.
        Raises:
            ValueError: If ``data`` is None or there is no column to score.
        """
        if data is None:
            raise ValueError(self._EMPTY_DATA_MESSAGE)

        if 'timestamp' in data.columns:
            self.data_ts = data['timestamp'].values
        else:
            self.data_ts = data.index.tolist()

        if columns is None or len(columns) == 0:
            # The timestamp column labels the points; it is not a metric.
            columns = [col for col in data.columns.tolist() if col != 'timestamp']
        if len(columns) == 0:
            raise ValueError('No metric columns found in input data.')

        # Keyed by column so a column requested twice is reported once.
        frames = {}
        for col in columns:
            self.metric = col
            self.data = data[self.metric].values
            self.FastFourierTransformation()
            # Every frame shares the same point labels, so aligning on the
            # positional index is enough to place the columns side by side.
            frames[col] = self.scores.reset_index(drop=True)
        return pd.concat([frames[col] for col in frames], axis=1)

    def __create_dirs__(self, dirname):
        """Create ``dirname`` (and missing parents) unless it already exists.

        Args:
            dirname (str): The pathname to directory which is to be created
        Returns:
            None
        Raises:
            OSError: If the directory cannot be created and does not exist.
        """
        try:
            os.makedirs(dirname)
        except OSError:
            # Losing a creation race to another process is fine; anything
            # else (permissions, a file in the way) is a real error.
            if not os.path.isdir(dirname):
                raise

    def __get_start_end_index__(self):
        """Locate ``self.date_start`` and ``self.date_end`` in the timestamp column.

        NOTE(review): nothing visible in this class calls this method or
        assigns ``date_start`` / ``date_end``, and it needs ``self.data_ts``
        to be a DataFrame with a 'timestamp' column of strings in ``self.fmt``
        format, whereas the other methods store a list or array there.
        Confirm the intended caller before relying on it.

        Returns:
            start_index (int): Position of ``self.date_start`` in the parsed timestamps
            end_index (int): Position of ``self.date_end`` in the parsed timestamps
        Raises:
            ValueError: If ``self.data`` is empty, or a date is not present.
        """
        if self.data is None or len(self.data) == 0:
            raise ValueError(self._EMPTY_DATA_MESSAGE)

        parsed = [datetime.datetime.strptime(elem, self.fmt)
                  for elem in self.data_ts['timestamp'].tolist()]
        start_index = parsed.index(self.date_start)
        end_index = parsed.index(self.date_end)

        # Swap the string column for the parsed datetimes. The column stays a
        # regular column and the frame keeps its existing index.
        self.data_ts.drop(['timestamp'], axis=1, inplace=True)
        self.data_ts['timestamp'] = parsed
        return start_index, end_index

    def __get_data__(self):
        """Replace the DataFrame in ``self.data`` by the values of its metric column.

        The metric is the first column not named 'timestamp'. Point labels are
        taken from the DataFrame index (a 'timestamp' column is skipped, not
        used as labels).

        Returns:
            None, stores the results in instance variables.
            self.metric: Name of the analysed column.
            self.data_ts (list): Index labels of the input data.
            self.data (numpy.array): The data on which Outliers to be found out.
        Raises:
            ValueError: If the input is empty or has no metric column.
        """
        if self.data is None or not len(self.data.index):
            raise ValueError(self._EMPTY_DATA_MESSAGE)

        metric_columns = [col for col in self.data.columns.tolist() if col != 'timestamp']
        if not metric_columns:
            raise ValueError('No metric columns found in input data.')

        self.metric = metric_columns[0]
        self.data_ts = self.data.index.tolist()
        self.data = self.data[self.metric].values

    def __detect_outlier_position_by_fft__(self, signal):
        """Return where the most extreme sample of ``signal`` sits, if it is an outlier.

        Args:
            signal (array-like of numbers): One window of the series.
        Returns:
            None if the window is empty or holds no outlier. Otherwise a numpy
            array with every index in ``signal`` equal to the extreme sample.
        """
        signal = np.asarray(signal)
        if signal.size == 0:
            return None

        fft_of_signal = np.fft.fft(signal)
        highest = np.max(signal)
        lowest = np.min(signal)
        # The candidate is whichever extreme has the larger magnitude.
        outlier = highest if abs(highest) > abs(lowest) else lowest

        # Slice bounds must be integers even when the threshold is a float.
        first_bin = int(self.threshold_freq)
        if not np.any(np.abs(fft_of_signal[first_bin:]) > self.frequency_amplitude):
            return None
        if np.abs(outlier) < (abs(highest) - abs(lowest)) * self.frequency_amplitude:
            return None
        return np.where(signal == outlier)[0]

    def get_scores(self):
        """Return one score per data point.

        Each score is the magnitude of a one-point FFT of the sample, which is
        the absolute value of the sample itself.

        Returns:
            scores (list of floats): The score of each point, in data order.
        """
        return list(np.abs(np.asarray(self.data, dtype=np.complex128)))

    def FastFourierTransformation(self):
        """Find outliers in ``self.data`` and compute the per-point scores.

        For every position ``ii`` from ``window`` up to the end of the data,
        the samples ``[ii - window, ii + window)`` are passed to
        __detect_outlier_position_by_fft__; the first reported index of each
        window is mapped back to a position in the full series. Series no
        longer than ``window`` are therefore never inspected. When
        ``self.plot`` is set, two figures are saved under ``self.plot_path``.

        Returns:
            None, stores the results in instance variables.
            self.scores (Pandas DataFrame): '<metric>_FFT_Score' column indexed by ``self.data_ts``
            self.outliers (Pandas Dataframe): 'outliers' column with the flagged
                values, ordered by position in the series
        """
        scores = self.get_scores()

        positions = set()
        for ii in range(self.window, self.data.size):
            found = self.__detect_outlier_position_by_fft__(
                self.data[ii - self.window:ii + self.window])
            if found is not None:
                # ``found`` is relative to the window start, ii - window.
                positions.add(int(ii + found[0] - self.window))
        # Overlapping windows report the same point repeatedly; keep each
        # position once, in series order.
        outlier_positions = sorted(positions)

        self.scores = pd.DataFrame(data=np.array(scores), index=self.data_ts,
                                   columns=['{}_FFT_Score'.format(self.metric)])

        outliers = [self.data[ind] for ind in outlier_positions]
        if self.plot:
            self._plot_outliers(outlier_positions, outliers)
        self.outliers = pd.DataFrame(data=np.array(outliers), columns=['outliers'])

    def _plot_outliers(self, outlier_positions, outliers):
        """Draw and save the scatter view and the line view of the detected outliers."""
        fig = plt.figure(figsize=(12, 6))
        try:
            plt.scatter(np.arange(self.data.size), self.data,
                        c=self.COLOR_PALETTE[0], label='Original Signal')
            if outlier_positions:
                plt.scatter(outlier_positions, outliers,
                            c=self.COLOR_PALETTE[-1], label='Outliers')
            plt.legend()
            self._save_figure("outlier_positions.png")
            plt.clf()

            plt.plot(self.data)
            plt.scatter(outlier_positions, outliers)
            self._save_figure("outliers with data.png")
        finally:
            # Release the figure so repeated runs do not accumulate open figures.
            plt.close(fig)

    def _save_figure(self, filename):
        """Save the current pyplot figure under ``self.plot_path``, warning on failure."""
        import warnings  # local import: not part of the module's import header

        target = self.plot_path + filename
        try:
            plt.savefig(target)
        except (IOError, OSError) as err:
            # Saving is best effort (the output directory may not exist), but
            # the failure is reported instead of being dropped silently.
            warnings.warn('Could not save plot to {}: {}'.format(target, err), RuntimeWarning)

    def return_scores_and_outliers(self):
        """Return the results stored by the last FastFourierTransformation run.

        Returns:
            self.scores (Pandas DataFrame): The score of each point
            self.outliers (Pandas Dataframe): The outliers detected in the dataset
        """
        return self.scores, self.outliers
def main():
    """Build a FastFourierTransform detector with sample hyper parameters.

    The constructor takes ``(thresh_freq, freq_amp, ...)``; it has no date
    parameters, so only the two hyper parameters are passed. No data is
    supplied, so the constructor does not run the detection.

    Returns:
        None
    """
    threshold_freq = 1
    frequency_amplitude = 2
    FastFourierTransform(threshold_freq, frequency_amplitude)