PageRenderTime 98ms CodeModel.GetById 1ms RepoModel.GetById 1ms app.codeStats 0ms

/AnomalyDetection/Univariate/FastFourierTransform.py

https://gitlab.com/debasishk/PerformanceTest-Monitoring
Python | 328 lines | 312 code | 6 blank | 10 comment | 1 complexity | ba2797317a82f08fe1e3ced296f28ae3 MD5 | raw file
  1. # Project: Predictive Analytics
  2. # Author: Debasish Kanhar
  3. # UserID: BC03421
  4. import numpy as np
  5. import pandas as pd
  6. import matplotlib.pyplot as plt
  7. import datetime
  8. # from utils.mkdir import mkdir
  9. import os
  10. COLOR_PALETTE = [
  11. "#348ABD",
  12. "#A60628",
  13. "#7A68A6",
  14. "#467821",
  15. "#CF4457",
  16. "#188487",
  17. "#E24A33"
  18. ]
  19. plt_pth = 'Results\KalmanFilters\All\FFT\\'
  20. class FastFourierTransform:
  21. """
  22. This class is used to calculate Outliers in data based on Fast Fourier Transform algorithm
  23. """
  24. def __init__(self, thresh_freq, freq_amp, window=10, forecaster='KalmanFilters',
  25. out_path=plt_pth, data=pd.DataFrame(), test_data=False, plot=False):
  26. """ __init__method for FastFoutierTransform class based Outlier Detection
  27. Args:
  28. date_start (datetime.datetime): Start date for fetching of data
  29. date_end (datetime.datetime): End date for fetching of data
  30. thresh_freq (float): Threshold frequency for input signal wave.
  31. freq_amp (float): Frequency amplitude hyper parameter for input signal wave
  32. window (int): The window size to consider. Min and Max signal are found in this window as outliers
  33. forecaster (str): The name of forecasting algirithm used to predict values
  34. out_path (str): The output path where results are stored
  35. data (Pandas Dataframe): The input data where Outliers are to be found out
  36. test_data (bool): Defaults to False. If model to be used for Unit Tests, explicitely specify this variable as True to bypass get_data() method.
  37. plot (bool, Optional [Default False]): Flag variable to mention weather to plot results & save it or not.
  38. Returns:
  39. None
  40. """
  41. self.forecaster = forecaster
  42. self.window = window
  43. self.data = data
  44. self.threshold_freq = thresh_freq
  45. self.frequency_amplitude = freq_amp
  46. self.plot = plot
  47. self.COLOR_PALETTE = COLOR_PALETTE
  48. self.fmt = "%m-%d-%y %H:%M"
  49. if hasattr(self, 'metric'):
  50. self.plot_path = out_path + '{}\\FFT\\thresh-freq-{}-freq-amp-{}-win-{}_t\\'.format(
  51. self.metric, thresh_freq, freq_amp,self.window)
  52. else:
  53. self.plot_path = out_path + 'plot\\Outliers\\{}\\FFT\\test\\thresh-freq-{}-freq-amp-{}-win-{}_t\\complete\\'.format(
  54. self.forecaster, thresh_freq, freq_amp,self.window)
  55. if not test_data:
  56. if len(data.index):
  57. self.__driver__()
  58. pass
  59. def apply_(self, data=pd.DataFrame()):
  60. """
  61. Applies Fast Fourier transform algorithm to input dataset to find out Outliers and return scores of each point.
  62. Args:
  63. data (Pandas DataFrame): Input dataset on which outlier scores based on FFT algorithm to be found out.
  64. Returns:
  65. scores (Pandas DataFrame): FFT based outlier scores for each point stored in Pandas DataFrame indexed by tinestamp.
  66. """
  67. self.data = data
  68. self.__driver__()
  69. scores, outliers = self.return_scores_and_outliers()
  70. return scores
  71. def __driver__(self):
  72. """ Driver method for FastFourierTransform class
  73. Returns:
  74. None
  75. """
  76. # dirname = os.getcwd() + '\\' + self.plot_path
  77. # mkdir(dirname)
  78. self.__get_data__()
  79. self.FastFourierTransformation()
  80. pass
  81. def test(self, data=None, columns=None):
  82. """
  83. Test the Fast Fourier Transform module. This method is used for unit tests.
  84. Args:
  85. data (pandas Dataframe): The data to be tested against.
  86. It can have either '2' columns in DF, which corresponds to respective metrics, and timestamp,
  87. Or '1' column with index as timestamp
  88. Returns:
  89. test_output (Pandas DataFrame): Pandas Dataframe object containing 1 column which holds scores for each point
  90. """
  91. # self.__create_dirs__(self.plot_path)
  92. flag = 0
  93. for col in data.columns.tolist():
  94. if col == 'timestamp':
  95. flag = 1
  96. if flag == 1:
  97. self.data_ts = data['timestamp'].values
  98. else:
  99. self.data_ts = data.index.tolist()
  100. if not columns:
  101. columns = data.columns.tolist()
  102. else:
  103. columns = columns
  104. score = dict()
  105. for col in columns:
  106. self.metric = col
  107. self.data = data[self.metric].values
  108. self.FastFourierTransformation()
  109. score[col + '_FFT_Score'] = self.scores
  110. tmpList = []
  111. for k in score.keys():
  112. tmp = score[k]
  113. tmp.reset_index(inplace=True)
  114. tmpList.append(tmp)
  115. output = reduce(lambda left, right: pd.merge(left, right, on='index'), tmpList)
  116. output = output.drop('index', axis=1)
  117. return output
  118. def __create_dirs__(self, dirname):
  119. """ Create result directory to save results and not stop by throwing IOException.
  120. Args:
  121. dirname (str): The pathname to directory which is to be created
  122. Returns:
  123. None
  124. """
  125. if not os.path.exists(dirname):
  126. os.makedirs(dirname)
  127. pass
  128. def __get_start_end_index__(self):
  129. """ Fetches the start and end index of data depending on start_date and end_date timestamps in parameter list.
  130. Returns:
  131. start_index (int): The start index for fetching data
  132. end_index (int): The end index for fetching data
  133. """
  134. if self.data.shape or self.data:
  135. ts = self.data_ts['timestamp']
  136. ts = ts.tolist()
  137. ts = [datetime.datetime.strptime(elem, self.fmt) for elem in ts]
  138. start_index = ts.index(self.date_start)
  139. end_index = ts.index(self.date_end)
  140. self.data_ts.drop(['timestamp'], axis=1, inplace=True)
  141. self.data_ts['timestamp'] = ts
  142. self.data_ts.set_index(['timestamp'])
  143. else:
  144. raise ValueError('Empty Pandas dataframe passed as input data. Kindly check your input data')
  145. return start_index, end_index
  146. def __get_data__(self):
  147. """ Converts input data of type Pandas Dataframe object to numpy array
  148. Returns:
  149. None, stores final extracted data in instance variable.
  150. self.data (numpy.array): The data on which Outliers to be found out.
  151. """
  152. self.metric = [col for col in self.data.columns.tolist() if col is not 'timestamp'][0]
  153. if len(self.data.index):
  154. self.data_ts = self.data.index.tolist()
  155. self.data = self.data[self.metric].values
  156. else:
  157. raise ValueError('Empty Pandas dataframe passed as input data. Kindly check your input data')
  158. pass
  159. def __detect_outlier_position_by_fft__(self, signal):
  160. """ Detects the outliers and its positions in data (signal) passed to this method.
  161. Args:
  162. signal (numpy array, dtype: float, shape: 1 X window): The signal where outliers are to be found. This is same size of window.
  163. Returns:
  164. None, if no outliers found. Else,
  165. index_of_outlier[0] (int): The index of data point which was identified as outlier
  166. """
  167. fft_of_signal = np.fft.fft(signal)
  168. outlier = np.max(signal) if abs(np.max(signal)) > abs(np.min(signal)) else np.min(signal)
  169. if np.any(np.abs(fft_of_signal[self.threshold_freq:]) > self.frequency_amplitude) and \
  170. (np.abs(outlier) >= (abs(np.max(signal))-abs(np.min(signal)))*self.frequency_amplitude):
  171. index_of_outlier = np.where(signal == outlier)
  172. return index_of_outlier[0]
  173. else:
  174. return None
  175. def get_scores(self):
  176. """ Returns scores for each data point, after calculating scores based on naive algorithm.
  177. Differences current value from previous value, and uses the ratio between current value and last value to find scores.
  178. Returns:
  179. scores (list of floats): The score of each point. Usually no threshold, but referenced by higher value tends to be outlier
  180. """
  181. score = []
  182. for i in range(self.data.size):
  183. elem = self.data[i]
  184. fft = np.fft.fft([elem])
  185. fft = np.abs(fft)
  186. score.append(fft.mean())
  187. return score
  188. def FastFourierTransformation(self):
  189. """ This method finds outliers based on FastFourierTransform algorithm. Acts as driver function, which calls __detect_outlier_position_by_fft__ method by passing small signals.
  190. This method acts as driver method to find outliers in data. Slices original data in steps of size "window" and passed to __detect_outlier_position_by_fft__
  191. to detect outliers in that small part of signal. Stores all the index, plots the results, and saves it pre-defined paths.
  192. Returns:
  193. None, stores outliers in following instance variable.
  194. self.outliers (Pandas Dataframe): Stores the outliers detected in dataset as instance variable
  195. """
  196. outlier_positions = []
  197. self.scores = self.get_scores()
  198. for ii in range(self.window, self.data.size, 1):
  199. outlier_position = self.__detect_outlier_position_by_fft__(self.data[ii-self.window:ii+self.window])
  200. if outlier_position is not None:
  201. outlier_positions.append(ii + outlier_position[0] - self.window)
  202. outlier_positions = list(set(outlier_positions))
  203. score_df = pd.DataFrame(data=np.array(self.scores), index=self.data_ts, columns=['{}_FFT_Score'.format(self.metric)])
  204. self.scores = score_df
  205. if self.plot:
  206. plt.figure(figsize=(12, 6))
  207. try:
  208. plt.scatter(range(self.data.size), self.data, c=self.COLOR_PALETTE[0], label='Original Signal')
  209. plt.scatter(outlier_positions, self.data[np.asanyarray(outlier_positions)], c=self.COLOR_PALETTE[-1], label='Outliers')
  210. except:
  211. pass
  212. plt.legend()
  213. try:
  214. plt.savefig(self.plot_path + "outlier_positions.png")
  215. except:
  216. pass
  217. plt.clf()
  218. outliers = []
  219. outlier_ts = []
  220. for ind in outlier_positions:
  221. outliers.append(self.data[ind])
  222. if self.plot:
  223. outlier_ts.append(self.data_ts.iloc[ind])
  224. plt.plot(self.data)
  225. plt.scatter(outlier_positions, outliers)
  226. try:
  227. plt.savefig(self.plot_path + "outliers with data.png")
  228. except IOError:
  229. pass
  230. plt.clf()
  231. self.outliers = pd.DataFrame(data=np.array(outliers), columns=['outliers'])
  232. pass
  233. def return_scores_and_outliers(self):
  234. """ Returns scores of each data point and outliers in dataset.
  235. Returns:
  236. self.scores (list of floats): The score of each point. Usually no threshold, but referenced by higher value tends to be outlier
  237. self.outliers (Pandas Dataframe): Stores the outliers detected in dataset
  238. """
  239. return self.scores, self.outliers
  240. def main():
  241. threshold_freq=1
  242. frequency_amplitude=2
  243. date_start = datetime.datetime(2015, 11, 20)
  244. date_end = datetime.datetime(2015, 11, 21)
  245. obj = FastFourierTransform
  246. obj(date_start, date_end, threshold_freq, frequency_amplitude)