# /AnomalyDetection/Utils/PeerAnalysis/PeerAnalysis.py
# Predictive-Analytics
# Debasish Kanhar
# BC03421
# 06-06-2016
"""
This is the core module in Peer Analysis. The engine fetches data from the DataPreparation sub-module, and the
correlation matrix is fetched from the Correlation class. Once the correlation matrix is fetched, it becomes the
input data for this class.
The input data is of the format:
Dictionary: { 'server_name_1': { 'server_name_2': { 'pearsonr': pearsonr_correlation,
                                                    'normal': normal correlation method },
                                 'server_name_3': { 'pearsonr': pearsonr_correlation,
                                                    'normal': normal correlation method },
                                 ... and so on for all remaining servers. },
              'server_name_2': { 'server_name_1': { 'pearsonr': pearsonr_correlation,
                                                    'normal': normal correlation method },
                                 'server_name_3': { 'pearsonr': pearsonr_correlation,
                                                    'normal': normal correlation method },
                                 ... and so on for all remaining servers. },
              ... and so on for all remaining servers. }
Logic:
Get server-wise correlation outliers, i.e. the most anomalous server w.r.t. the 'original server' (server 1/2/...).
Get overall correlation outliers, i.e. outliers among all 22 * 23 correlation points, keeping a threshold (e.g. max 5 outliers).
Find the common anomalous points from the above two steps. Those correlation pairs are the most anomalous ones.
If any of those anomalous point-pairs share a common server, then that common node is anomalous.
If there is no common node:
    If an anomalous point was fetched by the 2nd outlier method and there are no common nodes, check whether the
    outlier points from method 2 also fall as outliers under method 1. If yes, find the parent server for those
    outliers (probably?).
    If not, consider it noise for now. A better way to reduce correlations to original server names can be devised.
"""
from collections import OrderedDict
import datetime as dt
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from AnomalyDetection import FastFourierTransform
from AnomalyDetection import MedianAbsoluteDeviation
from AnomalyDetection.Univariate.LOF import LOF
from ...Utils.InitializeOutlierDetectionParams import ModelParams
from ...Utils.convert_scale_scores import ConvertScores
class PeerAnalysis(object):
    """Reduce a server-pair correlation matrix to the most anomalous server(s).

    The engine receives a nested dict of pairwise correlations
    (``{server_a: {server_b: {'pearsonr': ..., 'normal': ...}}}``), flattens it
    into a DataFrame (one row per ordered pair), runs one-dimensional outlier
    detectors (LOF / FFT / MAD) over the chosen correlation column, and then
    votes the outlying correlation pairs down to individual servers.
    """

    def __init__(self):
        # Outlier-detection models applied to the correlation scores.
        self.models = ['LOF', 'FFT', 'MAD']
        model_params = ModelParams()
        self.LOF_params = model_params.LOF_params
        # NOTE(review): upstream attribute is spelled 'FTT_params' -- looks like
        # a typo for FFT; confirm against ModelParams before renaming there.
        self.FFT_params = model_params.FTT_params
        self.MAD_params = model_params.MAD_params
        # Which correlation column to analyse: 'pearsonr', 'correlation', or both.
        self.good_correlation_algo = 'pearsonr'
        # Results are appended to this report file; handle lives as long as the object.
        self.f = open("output.txt", "a+")

    def apply_(self, data):
        """Run the full pipeline on the raw correlation dict.

        Flattens *data*, dumps it to CSV, finds outliers per model, reduces
        them to anomalous servers and prints the per-model report to ``self.f``.
        Returns the flattened dataset (DataFrame).
        """
        self.dataset = self.format_input_set_to_readable_set(data)
        self.dataset.to_csv("test_output_correlation.csv")
        print("CSV created")
        self.outliers = self.find_outliers(self.dataset)
        anomalous_server = self.reduce_correlation_score_to_anomalous_server(self.outliers)
        for od_model in anomalous_server:
            print("Results according to {} model".format(od_model))
            result = anomalous_server[od_model]
            self.print_output(result, file=self.f)
        return self.dataset

    def print_output(self, anomalous_server, file=None):
        """Pretty-print the per-method outlier summary.

        Writes to *file* when given, otherwise to stderr. Bug fix: the
        original rebound ``sys.stdout`` globally (and never restored it);
        we now pass ``file=`` to ``print`` instead.
        """
        output = sys.stderr if file is None else file
        print("_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-", file=output)
        print("\nFinal results\n", file=output)
        for k in anomalous_server:
            print("Method: ", k, file=output)
            print("Description of method: ", anomalous_server[k]['descr'], file=output)
            try:
                strong = anomalous_server[k]['strong outlier']
                weak = anomalous_server[k]['moderately strong outliers']
                print("Strong outlier (Votes in brackets): {} ({})".format(
                    strong, anomalous_server[k]['outliers'][strong]), file=output)
                weak_outlier_string = ""
                try:
                    for each_server in weak:
                        weak_outlier_string += "{} ({})".format(
                            each_server, anomalous_server[k]['outliers'][each_server])
                except (TypeError, KeyError):
                    # 'weak' may be None / a server may be missing from the
                    # vote dict; report whatever was accumulated so far.
                    pass
                finally:
                    print("Moderately strong outlier (Votes in brackets) ", weak_outlier_string, file=output)
            except KeyError:
                # Method 1 carries a single outlier and no strong/weak split.
                # Bug fix: dict.keys() is not subscriptable in Python 3.
                server = next(iter(anomalous_server[k]['outliers']))
                vote = anomalous_server[k]['outliers'][server]
                print("Outlier (Vote in brackets): {} ({})".format(server, vote), file=output)

    def format_input_set_to_readable_set(self, data):
        """Flatten the nested correlation dict into a DataFrame.

        One row per ordered server pair with columns
        'relationship' ("A_and_B"), 'pearsonr' and 'correlation'.
        """
        rows = []
        for k in data:
            for key in data[k]:
                rows.append({'relationship': "{}_and_{}".format(k, key),
                             'pearsonr': data[k][key]['pearsonr'],
                             'correlation': data[k][key]['normal']})
        # Bug fix: row-by-row DataFrame.append is quadratic and was removed
        # in pandas 2.0; build the frame in one shot instead.
        return pd.DataFrame(rows)

    def scale_columns(self, x, oldMin, oldMax):
        """Linearly rescale *x* from [oldMin, oldMax] onto [0, 100]."""
        oldRange = oldMax - oldMin
        newRange = 100 - 0
        return (((x - oldMin) * newRange) / oldRange) + 0

    def flag_outliers(self, x, threshold=1.5):
        """Return True when the outlier score *x* exceeds *threshold*."""
        return x > threshold

    def find_outliers(self, data):
        """Run each configured detector over the chosen correlation column.

        Returns ``{model_name: DataFrame of rows flagged as outliers}``.
        Side effects: sets ``self.result_col`` to the detector's score column
        and shows a plot of each detector's result.
        """
        outliers = dict()
        try:
            # When the 'relationship' column is present, promote it to index.
            data.set_index('relationship', inplace=True, verify_integrity=False)
        except KeyError:
            # Not present: 'relationship' is already the index.
            pass
        finally:
            data.dropna(inplace=True)
        for od_model in self.models:
            if od_model == 'LOF':
                model = LOF(nbrs=self.LOF_params['neighbours'],
                            minPts=self.LOF_params['minimum_points'],
                            threshold=self.LOF_params['threshold'])
            elif od_model == 'FFT':
                model = FastFourierTransform(self.FFT_params['thresh_freq'], self.FFT_params['freq_amp'])
            elif od_model == 'MAD':
                model = MedianAbsoluteDeviation(self.MAD_params['threshold'])
            else:
                # Bug fix: an unknown model name used to silently reuse the
                # previous iteration's model (or crash on the first); skip it.
                continue
            for col in data.columns.tolist():
                if col != self.good_correlation_algo:
                    continue
                # Renamed from min/max to avoid shadowing the builtins;
                # only needed if the rescaling below is re-enabled.
                col_min = data[col].min()
                col_max = data[col].max()
                # data[col] = data[col].apply(self.scale_columns, args=(col_min, col_max))
                result = model.apply_(data[[col]])
                self.result_col = result.columns.tolist()[0]
                # Return value unused; call kept in case ConvertScores.apply_
                # mutates 'result' in place -- TODO confirm and drop if not.
                ConvertScores().apply_(result, [self.result_col])
                result['flag_anomalies'] = result[self.result_col].apply(self.flag_outliers)
                outliers[od_model] = result[result['flag_anomalies'] == True]
                result.plot()
                plt.show()
        return outliers

    def __split_index__(self, od_idx):
        """Split 'A_and_B' relationship labels into ([A, ...], [B, ...]).

        Bug fix: splitting on the full '_and_' separator (instead of every
        '_') keeps server names that themselves contain underscores intact.
        """
        first_server = []
        second_server = []
        for idx in od_idx:
            first, _, second = idx.partition("_and_")
            first_server.append(first)
            second_server.append(second)
        return first_server, second_server

    def assign_votes_to_most_occuring_server(self, common_servers, od_ids):
        """Count, per server, how many outlying relationship labels mention it."""
        assign_counts = dict()
        for server in common_servers:
            assign_counts[server] = sum(1 for row in od_ids if server in row)
        return assign_counts

    def reduce_correlation_score_to_anomalous_server(self, od_data):
        """Reduce per-model outlier pairs to anomalous servers, three ways.

        Method 1: single server with the highest vote count.
        Method 2: that server plus any server within 30% of its votes.
        Method 3: re-vote on pairs scoring >= 50% of the max outlier score.
        Returns {model: OrderedDict of the three methods' result dicts}.
        """
        outlier_servers_return_set = dict()
        for od_model in od_data:
            outlier_data = od_data[od_model]
            outlier_scores = outlier_data[self.result_col]
            outlier_indexes = outlier_data.index.tolist()
            first_server, second_server = self.__split_index__(outlier_indexes)
            common_elems = set(first_server) & set(second_server)
            assign_counts = self.assign_votes_to_most_occuring_server(common_elems, outlier_indexes)
            if not assign_counts:
                # No server appears on both sides of any outlying pair --
                # nothing to vote on (the original crashed in max() here).
                continue
            """------------------------------------------------------------------------------------------------------"""
            # Method 1 to find common server
            outlier_server = max(assign_counts, key=assign_counts.get)
            """------------------------------------------------------------------------------------------------------"""
            # Method 2 to find common servers
            vote_floor = assign_counts[outlier_server] - .3 * assign_counts[outlier_server]
            weak_outlier_servers = [k for k in assign_counts
                                    if k != outlier_server and assign_counts[k] >= vote_floor]
            outlier_servers = [outlier_server] + weak_outlier_servers
            """------------------------------------------------------------------------------------------------------"""
            # Method 3 to get common servers
            threshold = .5 * outlier_scores.max()
            newData = outlier_data[outlier_data[self.result_col] >= threshold]
            first_server, second_server = self.__split_index__(newData.index.tolist())
            common_servers = set(first_server) & set(second_server)
            new_votes = self.assign_votes_to_most_occuring_server(common_servers, newData.index.tolist())
            if new_votes:
                new_strong_outlier = max(new_votes, key=new_votes.get)
                # Bug fix: dict.keys().remove(...) is invalid in Python 3 (and
                # remove returns None); build the weak list explicitly.
                new_weak_outliers = [k for k in new_votes if k != new_strong_outlier]
            else:
                new_strong_outlier = None
                new_weak_outliers = []
            """------------------------------------------------------------------------------------------------------"""
            # Prepare return value
            method_2_dict = OrderedDict((s, assign_counts[s]) for s in outlier_servers)
            method_3_dict = OrderedDict(new_votes)
            outlier_servers_return_set[od_model] = OrderedDict()
            outlier_servers_return_set[od_model]['method_1'] = {
                'descr': "This is the server with highest votes",
                'outliers': {outlier_server: assign_counts[outlier_server]}}
            outlier_servers_return_set[od_model]['method_2'] = {
                'descr': "This is the server with highest votes, and servers in its vicinity of 30%",
                'outliers': method_2_dict,
                'strong outlier': outlier_server,
                # Bug fix: the original stored list.remove()'s None here.
                'moderately strong outliers': weak_outlier_servers}
            outlier_servers_return_set[od_model]['method_3'] = {
                'descr': "These servers are highest contributing servers by highest scores",
                'outliers': method_3_dict,
                'strong outlier': new_strong_outlier,
                'moderately strong outliers': new_weak_outliers}
        return outlier_servers_return_set