# /AnomalyDetection/Utils/PeerAnalysis/PeerAnalysis.py
# Predictive-Analytics
# Debasish Kanhar
# BC03421
# 06-06-2016
"""
This is the core module in Peer Analysis. The engine fetches data from the DataPreparation sub-module, and the
correlation matrix is fetched from the Correlation class. Once the correlation matrix is fetched, it becomes the
input data for this class.
The input data is of the format:
Dictionary: { 'server_name_1': { 'server_name_2': { 'pearsonr': pearsonr_correlation,
                                                    'normal': normal correlation method },
                                 'server_name_3': { 'pearsonr': pearsonr_correlation,
                                                    'normal': normal correlation method },
                                 ... and so on for all remaining servers. },
              'server_name_2': { 'server_name_1': { 'pearsonr': pearsonr_correlation,
                                                    'normal': normal correlation method },
                                 'server_name_3': { 'pearsonr': pearsonr_correlation,
                                                    'normal': normal correlation method },
                                 ... and so on for all remaining servers. },
              ... and so on for all remaining servers. }
Logic:
Get server-wise correlation outliers, i.e. the most anomalous server w.r.t. the 'original server' (server 1/2/...).
Get overall correlation outliers, i.e. outliers among all 22 * 23 correlation points, keeping a threshold (e.g. max 5 outliers).
Find the common anomalous points from the above two steps. Those correlation pairs are the most anomalous ones.
If any of those anomalous point-pairs share a common server, then that common node is anomalous.
If there is no common node:
    If an anomalous point was fetched by the 2nd outlier method and there are no common nodes, check whether the
    outlier points from method 2 also fall as outliers under method 1. If yes, find the parent server for those
    outliers (probably?).
    If not, consider it noise for now. A better way to reduce correlations to original server names can be devised.
"""
from collections import OrderedDict
import datetime as dt
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from AnomalyDetection import FastFourierTransform
from AnomalyDetection import MedianAbsoluteDeviation
from AnomalyDetection.Univariate.LOF import LOF
from ...Utils.InitializeOutlierDetectionParams import ModelParams
from ...Utils.convert_scale_scores import ConvertScores
class PeerAnalysis(object):
    """Reduce a server-pair correlation matrix to the most anomalous server(s).

    The engine receives a nested dict of pairwise correlations
    (``{server_a: {server_b: {'pearsonr': ..., 'normal': ...}}}``), flattens it
    into a DataFrame (one row per ordered pair), runs one-dimensional outlier
    detectors (LOF / FFT / MAD) over the chosen correlation column, and then
    votes the outlying correlation pairs down to individual servers.
    """

    def __init__(self):
        # Outlier-detection models applied to the correlation scores.
        self.models = ['LOF', 'FFT', 'MAD']
        model_params = ModelParams()
        self.LOF_params = model_params.LOF_params
        # NOTE(review): upstream attribute is spelled 'FTT_params' -- looks like
        # a typo for FFT; confirm against ModelParams before renaming there.
        self.FFT_params = model_params.FTT_params
        self.MAD_params = model_params.MAD_params
        # Which correlation column to analyse: 'pearsonr', 'correlation', or both.
        self.good_correlation_algo = 'pearsonr'
        # Results are appended to this report file; handle lives as long as the object.
        self.f = open("output.txt", "a+")

    def apply_(self, data):
        """Run the full pipeline on the raw correlation dict.

        Flattens *data*, dumps it to CSV, finds outliers per model, reduces
        them to anomalous servers and prints the per-model report to ``self.f``.
        Returns the flattened dataset (DataFrame).
        """
        self.dataset = self.format_input_set_to_readable_set(data)
        self.dataset.to_csv("test_output_correlation.csv")
        print("CSV created")
        self.outliers = self.find_outliers(self.dataset)
        anomalous_server = self.reduce_correlation_score_to_anomalous_server(self.outliers)
        for od_model in anomalous_server:
            print("Results according to {} model".format(od_model))
            result = anomalous_server[od_model]
            self.print_output(result, file=self.f)
        return self.dataset

    def print_output(self, anomalous_server, file=None):
        """Pretty-print the per-method outlier summary.

        Writes to *file* when given, otherwise to stderr. Bug fix: the
        original rebound ``sys.stdout`` globally (and never restored it);
        we now pass ``file=`` to ``print`` instead.
        """
        output = sys.stderr if file is None else file
        print("_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-", file=output)
        print("\nFinal results\n", file=output)
        for k in anomalous_server:
            print("Method: ", k, file=output)
            print("Description of method: ", anomalous_server[k]['descr'], file=output)
            try:
                strong = anomalous_server[k]['strong outlier']
                weak = anomalous_server[k]['moderately strong outliers']
                print("Strong outlier (Votes in brackets): {} ({})".format(
                    strong, anomalous_server[k]['outliers'][strong]), file=output)
                weak_outlier_string = ""
                try:
                    for each_server in weak:
                        weak_outlier_string += "{} ({})".format(
                            each_server, anomalous_server[k]['outliers'][each_server])
                except (TypeError, KeyError):
                    # 'weak' may be None / a server may be missing from the
                    # vote dict; report whatever was accumulated so far.
                    pass
                finally:
                    print("Moderately strong outlier (Votes in brackets) ", weak_outlier_string, file=output)
            except KeyError:
                # Method 1 carries a single outlier and no strong/weak split.
                # Bug fix: dict.keys() is not subscriptable in Python 3.
                server = next(iter(anomalous_server[k]['outliers']))
                vote = anomalous_server[k]['outliers'][server]
                print("Outlier (Vote in brackets): {} ({})".format(server, vote), file=output)

    def format_input_set_to_readable_set(self, data):
        """Flatten the nested correlation dict into a DataFrame.

        One row per ordered server pair with columns
        'relationship' ("A_and_B"), 'pearsonr' and 'correlation'.
        """
        rows = []
        for k in data:
            for key in data[k]:
                rows.append({'relationship': "{}_and_{}".format(k, key),
                             'pearsonr': data[k][key]['pearsonr'],
                             'correlation': data[k][key]['normal']})
        # Bug fix: row-by-row DataFrame.append is quadratic and was removed
        # in pandas 2.0; build the frame in one shot instead.
        return pd.DataFrame(rows)

    def scale_columns(self, x, oldMin, oldMax):
        """Linearly rescale *x* from [oldMin, oldMax] onto [0, 100]."""
        oldRange = oldMax - oldMin
        newRange = 100 - 0
        return (((x - oldMin) * newRange) / oldRange) + 0

    def flag_outliers(self, x, threshold=1.5):
        """Return True when the outlier score *x* exceeds *threshold*."""
        return x > threshold

    def find_outliers(self, data):
        """Run each configured detector over the chosen correlation column.

        Returns ``{model_name: DataFrame of rows flagged as outliers}``.
        Side effects: sets ``self.result_col`` to the detector's score column
        and shows a plot of each detector's result.
        """
        outliers = dict()
        try:
            # When the 'relationship' column is present, promote it to index.
            data.set_index('relationship', inplace=True, verify_integrity=False)
        except KeyError:
            # Not present: 'relationship' is already the index.
            pass
        finally:
            data.dropna(inplace=True)
        for od_model in self.models:
            if od_model == 'LOF':
                model = LOF(nbrs=self.LOF_params['neighbours'],
                            minPts=self.LOF_params['minimum_points'],
                            threshold=self.LOF_params['threshold'])
            elif od_model == 'FFT':
                model = FastFourierTransform(self.FFT_params['thresh_freq'], self.FFT_params['freq_amp'])
            elif od_model == 'MAD':
                model = MedianAbsoluteDeviation(self.MAD_params['threshold'])
            else:
                # Bug fix: an unknown model name used to silently reuse the
                # previous iteration's model (or crash on the first); skip it.
                continue
            for col in data.columns.tolist():
                if col != self.good_correlation_algo:
                    continue
                # Renamed from min/max to avoid shadowing the builtins;
                # only needed if the rescaling below is re-enabled.
                col_min = data[col].min()
                col_max = data[col].max()
                # data[col] = data[col].apply(self.scale_columns, args=(col_min, col_max))
                result = model.apply_(data[[col]])
                self.result_col = result.columns.tolist()[0]
                # Return value unused; call kept in case ConvertScores.apply_
                # mutates 'result' in place -- TODO confirm and drop if not.
                ConvertScores().apply_(result, [self.result_col])
                result['flag_anomalies'] = result[self.result_col].apply(self.flag_outliers)
                outliers[od_model] = result[result['flag_anomalies'] == True]
                result.plot()
                plt.show()
        return outliers

    def __split_index__(self, od_idx):
        """Split 'A_and_B' relationship labels into ([A, ...], [B, ...]).

        Bug fix: splitting on the full '_and_' separator (instead of every
        '_') keeps server names that themselves contain underscores intact.
        """
        first_server = []
        second_server = []
        for idx in od_idx:
            first, _, second = idx.partition("_and_")
            first_server.append(first)
            second_server.append(second)
        return first_server, second_server

    def assign_votes_to_most_occuring_server(self, common_servers, od_ids):
        """Count, per server, how many outlying relationship labels mention it."""
        assign_counts = dict()
        for server in common_servers:
            assign_counts[server] = sum(1 for row in od_ids if server in row)
        return assign_counts

    def reduce_correlation_score_to_anomalous_server(self, od_data):
        """Reduce per-model outlier pairs to anomalous servers, three ways.

        Method 1: single server with the highest vote count.
        Method 2: that server plus any server within 30% of its votes.
        Method 3: re-vote on pairs scoring >= 50% of the max outlier score.
        Returns {model: OrderedDict of the three methods' result dicts}.
        """
        outlier_servers_return_set = dict()
        for od_model in od_data:
            outlier_data = od_data[od_model]
            outlier_scores = outlier_data[self.result_col]
            outlier_indexes = outlier_data.index.tolist()
            first_server, second_server = self.__split_index__(outlier_indexes)
            common_elems = set(first_server) & set(second_server)
            assign_counts = self.assign_votes_to_most_occuring_server(common_elems, outlier_indexes)
            if not assign_counts:
                # No server appears on both sides of any outlying pair --
                # nothing to vote on (the original crashed in max() here).
                continue
            """------------------------------------------------------------------------------------------------------"""
            # Method 1 to find common server
            outlier_server = max(assign_counts, key=assign_counts.get)
            """------------------------------------------------------------------------------------------------------"""
            # Method 2 to find common servers
            vote_floor = assign_counts[outlier_server] - .3 * assign_counts[outlier_server]
            weak_outlier_servers = [k for k in assign_counts
                                    if k != outlier_server and assign_counts[k] >= vote_floor]
            outlier_servers = [outlier_server] + weak_outlier_servers
            """------------------------------------------------------------------------------------------------------"""
            # Method 3 to get common servers
            threshold = .5 * outlier_scores.max()
            newData = outlier_data[outlier_data[self.result_col] >= threshold]
            first_server, second_server = self.__split_index__(newData.index.tolist())
            common_servers = set(first_server) & set(second_server)
            new_votes = self.assign_votes_to_most_occuring_server(common_servers, newData.index.tolist())
            if new_votes:
                new_strong_outlier = max(new_votes, key=new_votes.get)
                # Bug fix: dict.keys().remove(...) is invalid in Python 3 (and
                # remove returns None); build the weak list explicitly.
                new_weak_outliers = [k for k in new_votes if k != new_strong_outlier]
            else:
                new_strong_outlier = None
                new_weak_outliers = []
            """------------------------------------------------------------------------------------------------------"""
            # Prepare return value
            method_2_dict = OrderedDict((s, assign_counts[s]) for s in outlier_servers)
            method_3_dict = OrderedDict(new_votes)
            outlier_servers_return_set[od_model] = OrderedDict()
            outlier_servers_return_set[od_model]['method_1'] = {
                'descr': "This is the server with highest votes",
                'outliers': {outlier_server: assign_counts[outlier_server]}}
            outlier_servers_return_set[od_model]['method_2'] = {
                'descr': "This is the server with highest votes, and servers in its vicinity of 30%",
                'outliers': method_2_dict,
                'strong outlier': outlier_server,
                # Bug fix: the original stored list.remove()'s None here.
                'moderately strong outliers': weak_outlier_servers}
            outlier_servers_return_set[od_model]['method_3'] = {
                'descr': "These servers are highest contributing servers by highest scores",
                'outliers': method_3_dict,
                'strong outlier': new_strong_outlier,
                'moderately strong outliers': new_weak_outliers}
        return outlier_servers_return_set