PageRenderTime 26ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/AnomalyDetection/Utils/PeerAnalysis/PeerAnalysis.py

https://gitlab.com/debasishk/PerformanceTest-Monitoring
Python | 286 lines | 277 code | 1 blank | 8 comment | 0 complexity | b1ef1eac6b20670eb395049ef66195b0 MD5 | raw file
  1. # Predictive-Analytics
  2. # Debasish Kanhar
  3. # BC03421
  4. # 06-06-2016
  5. """
  6. This is core module in Peer Analysis. The Engine shall fetch data from DataPreparation Sub-module, and then
  7. correlation matrix is fetched from Correlation class. Once the correlation matrix is fetched, it becomes input data for this class.
  8. The input data is of format:
  9. Dictionary: { 'server_name_1': { 'server_name_2': { 'pearsonr': pearsonr_correlation,
  10. 'normal': Normal correlation method },
  11. 'server_name_3': { 'pearsonr': pearsonr_correlation,
  12. 'normal': Normal correlation method },
  13. .. so on for all remaining servers. }
  14. 'server_name_2': { 'server_name_1': { 'pearsonr': pearsonr_correlation,
  15. 'normal': Normal correlation method },
  16. 'server_name_3': { 'pearsonr': pearsonr_correlation,
  17. 'normal': Normal correlation method },
  18. .. so on for all remaining servers. }
  19. .. so on for all remaining servers. }
  20. Logic:
  21. Get server wise correlation outliers. Get the most anomalous server w.r.t. 'original server' i.e. w.r.t. server 1/2...
  22. Get overall correlation outliers. i.e. Total 22 * 23 correlation points. Outliers among those. Keep a threshold i.e. max 5 outliers etc.
  23. Find common anomalous points from above 2 points. Those correlation pairs are most anomalous ones.
  24. If any of those anomalous point-pairs have common server sharing between them, then that common node is anomalous.
  25. If there are no common node,
  26. If anomalous point is fetched from 2nd method for fetching outliers, and there are no common nodes, search whether the outlier
  27. points from method 2, also fall as outliers under method 1. If yes, find the parent server for those outliers (Probably ?)
  28. If no, for now consider as noise. Can think better way to reduce correlation to original server names.
  29. """
  30. from collections import OrderedDict
  31. import pandas as pd
  32. import datetime as dt
  33. import numpy as np
  34. import matplotlib.pyplot as plt
  35. import sys
  36. from AnomalyDetection.Univariate.LOF import LOF
  37. from ...Utils.InitializeOutlierDetectionParams import ModelParams
  38. from AnomalyDetection import MedianAbsoluteDeviation
  39. from AnomalyDetection import FastFourierTransform
  40. from ...Utils.convert_scale_scores import ConvertScores
  41. class PeerAnalysis(object):
  42. def __init__(self):
  43. self.models = ['LOF', 'FFT', 'MAD']
  44. model_params = ModelParams()
  45. self.LOF_params = model_params.LOF_params
  46. self.FFT_params = model_params.FTT_params
  47. self.MAD_params = model_params.MAD_params
  48. self.good_correlation_algo = 'pearsonr' # Should be either 'pearsonr', 'correlation', or both
  49. self.f = open("output.txt", "a+")
  50. pass
  51. def apply_(self, data):
  52. self.dataset = self.format_input_set_to_readable_set(data)
  53. self.dataset.to_csv("test_output_correlation.csv")
  54. print("CSV created")
  55. self.outliers = self.find_outliers(self.dataset)
  56. anomalous_server = self.reduce_correlation_score_to_anomalous_server(self.outliers)
  57. for od_model in anomalous_server.keys():
  58. print("Results according to {} model".format(od_model))
  59. result = anomalous_server[od_model]
  60. self.print_output(result, file=self.f)
  61. return self.dataset
  62. def print_output(self, anomalous_server, file=None):
  63. if file is None:
  64. output = sys.stderr
  65. else:
  66. output = file
  67. sys.stdout = output
  68. print("_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-")
  69. print("\nFinal results\n")
  70. for k in anomalous_server.keys():
  71. print("Method: ", k)
  72. print("Description of method: ", anomalous_server[k]['descr'])
  73. try:
  74. strong = anomalous_server[k]['strong outlier']
  75. weak = anomalous_server[k]['moderately strong outliers']
  76. print("Strong outlier (Votes in brackets): {} ({})".format(strong, anomalous_server[k]['outliers'][strong]))
  77. weak_outlier_string = ""
  78. try:
  79. for each_server in weak:
  80. string = "{} ({})".format(each_server, anomalous_server[k]['outliers'][each_server])
  81. weak_outlier_string += string
  82. except:
  83. pass
  84. finally:
  85. print("Moderately strong outlier (Votes in brackets) ", weak_outlier_string)
  86. except KeyError:
  87. server = anomalous_server[k]['outliers'].keys()[0]
  88. vote = anomalous_server[k]['outliers'][server]
  89. print("Outlier (Vote in brackets): {} ({})".format(server, vote))
  90. def format_input_set_to_readable_set(self, data):
  91. dataset = pd.DataFrame()
  92. for k in data.keys():
  93. for key in data[k].keys():
  94. col = "{}_and_{}".format(k, key)
  95. pearsonr = data[k][key]['pearsonr']
  96. correlation = data[k][key]['normal']
  97. tmpDict = {'relationship': col, 'pearsonr': pearsonr, 'correlation': correlation}
  98. dataset = dataset.append(tmpDict, ignore_index=True)
  99. return dataset
  100. def scale_columns(self, x, oldMin, oldMax):
  101. oldRange = oldMax - oldMin
  102. newRange = 100 - 0
  103. scaledValue = (((x - oldMin) * newRange) / oldRange) + 0
  104. return scaledValue
  105. def flag_outliers(self, x, threshold=1.5):
  106. if x > threshold:
  107. return True
  108. else:
  109. return False
  110. def find_outliers(self, data):
  111. outliers = dict()
  112. for od_model in self.models:
  113. if od_model == 'LOF':
  114. model = LOF(nbrs=self.LOF_params['neighbours'], minPts=self.LOF_params['minimum_points'],
  115. threshold=self.LOF_params['threshold'])
  116. elif od_model == 'FFT':
  117. model = FastFourierTransform(self.FFT_params['thresh_freq'], self.FFT_params['freq_amp'])
  118. elif od_model == 'MAD':
  119. model = MedianAbsoluteDeviation(self.MAD_params['threshold'])
  120. try:
  121. # When 'relationship' column present
  122. data.set_index('relationship', inplace=True, verify_integrity=False)
  123. except KeyError:
  124. # When not present, it implies that relationship column is index.
  125. pass
  126. finally:
  127. data.dropna(inplace=True)
  128. for col in data.columns.tolist():
  129. if col == self.good_correlation_algo:
  130. min = data[col].min()
  131. max = data[col].max()
  132. # data[col] = data[col].apply(self.scale_columns, args=(min, max))
  133. result = model.apply_(data[[col]])
  134. self.result_col = result.columns.tolist()[0]
  135. conversion_obj = ConvertScores()
  136. z_score = conversion_obj.apply_(result, [self.result_col])
  137. result['flag_anomalies'] = result[self.result_col].apply(self.flag_outliers)
  138. outliers[od_model] = result[result['flag_anomalies'] == True]
  139. result.plot()
  140. plt.show()
  141. return outliers
  142. def __split_index__(self, od_idx):
  143. first_server = []
  144. second_server = []
  145. for idx in od_idx:
  146. split_idx = idx.split("_")
  147. first_server.append(split_idx[0])
  148. second_server.append(split_idx[-1])
  149. return first_server, second_server
  150. def assign_votes_to_most_occuring_server(self, common_servers, od_ids):
  151. assign_counts = dict()
  152. for server in common_servers:
  153. assign_counts[server] = 0
  154. for single_correlation_row in od_ids:
  155. if server in single_correlation_row:
  156. assign_counts[server] += 1
  157. return assign_counts
  158. def reduce_correlation_score_to_anomalous_server(self, od_data):
  159. outlier_servers_return_set = dict()
  160. for od_model in od_data.keys():
  161. outlier_data = od_data[od_model]
  162. outlier_scores = outlier_data[self.result_col]
  163. outlier_indexes = outlier_data.index.tolist()
  164. first_server, second_server = self.__split_index__(outlier_indexes)
  165. common_elems = set(first_server) & set(second_server)
  166. assign_counts = self.assign_votes_to_most_occuring_server(common_elems, outlier_indexes)
  167. """------------------------------------------------------------------------------------------------------"""
  168. # Method 1 to find common server
  169. outlier_server = max(assign_counts.keys(), key=(lambda key: assign_counts[key]))
  170. """------------------------------------------------------------------------------------------------------"""
  171. # Method 2 to find common servers
  172. newVal = assign_counts[outlier_server] - .3*assign_counts[outlier_server]
  173. outlier_servers = []
  174. for k in assign_counts.keys():
  175. if k != outlier_server:
  176. if assign_counts[k] >= newVal:
  177. outlier_servers.append(k)
  178. outlier_servers = [outlier_server] + outlier_servers
  179. """------------------------------------------------------------------------------------------------------"""
  180. # Method 3 to get common servers
  181. maxScore = outlier_scores.max()
  182. threshold = .5 * maxScore
  183. newData = outlier_data[outlier_data[self.result_col] >= threshold]
  184. first_server, second_server = self.__split_index__(newData.index.tolist())
  185. common_servers = set(first_server) & set(second_server)
  186. new_votes = self.assign_votes_to_most_occuring_server(common_servers, newData.index.tolist())
  187. new_strong_outlier = max(new_votes.keys(), key=(lambda key: new_votes[key]))
  188. new_weak_outliers = new_votes.keys().remove(new_strong_outlier)
  189. """------------------------------------------------------------------------------------------------------"""
  190. # Prepare return value
  191. method_2_dict = OrderedDict()
  192. method_3_dict = OrderedDict()
  193. for s in outlier_servers:
  194. method_2_dict[s] = assign_counts[s]
  195. for s in new_votes.keys():
  196. method_3_dict[s] = new_votes[s]
  197. outlier_servers_return_set[od_model] = OrderedDict()
  198. outlier_servers_return_set[od_model]['method_1'] = {'descr': "This is the server with highest votes",
  199. 'outliers': {outlier_server: assign_counts[outlier_server]}}
  200. outlier_servers_return_set[od_model]['method_2'] = \
  201. {'descr': "This is the server with highest votes, and servers in its vicinity of 30%",
  202. 'outliers': method_2_dict,
  203. 'strong outlier': outlier_server,
  204. 'moderately strong outliers': outlier_servers.remove(outlier_server)}
  205. outlier_servers_return_set[od_model]['method_3'] = \
  206. {'descr': "These servers are highest contributing servers by highest scores",
  207. 'outliers': method_3_dict,
  208. 'strong outlier': new_strong_outlier,
  209. 'moderately strong outliers': new_weak_outliers}
  210. return outlier_servers_return_set