PageRenderTime 43ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/src/analyzer/algorithms.py

https://github.com/which07/skyline
Python | 221 lines | 133 code | 6 blank | 82 comment | 6 complexity | 33767cfefaeb939caf00737b2b4403a7 MD5 | raw file
Possible License(s): MIT
  1. import pandas
  2. import numpy as np
  3. import scipy
  4. import statsmodels.api as sm
  5. from time import time
  6. import traceback
  7. import logging
  8. from settings import (
  9. ALGORITHMS,
  10. CONSENSUS,
  11. FULL_DURATION,
  12. MAX_TOLERABLE_BOREDOM,
  13. MIN_TOLERABLE_LENGTH,
  14. STALE_PERIOD
  15. )
  16. from algorithm_exceptions import *
  17. logger = logging.getLogger("AnalyzerLog")
  18. """
  19. This is no man's land. Do anything you want in here,
  20. as long as you return a boolean that determines whether the input
  21. timeseries is anomalous or not.
  22. To add an algorithm, define it here, and add its name to settings.ALGORITHMS.
  23. """
  24. def tail_avg(timeseries):
  25. """
  26. This is a utility function used to calculate the average of the last three
  27. datapoints in the series as a measure, instead of just the last datapoint.
  28. It reduces noise, but it also reduces sensitivity and increases the delay
  29. to detection.
  30. """
  31. try:
  32. t = (timeseries[-1][1] + timeseries[-2][1] + timeseries[-3][1]) / 3
  33. return t
  34. except IndexError:
  35. return timeseries[-1][1]
  36. def median_absolute_deviation(timeseries):
  37. """
  38. A timeseries is anomalous if the deviation of its latest datapoint with
  39. respect to the median is X times larger than the median of deviations.
  40. """
  41. series = pandas.Series([x[1] for x in timeseries])
  42. median = series.median()
  43. demedianed = np.abs(series - median)
  44. median_deviation = demedianed.median()
  45. # The test statistic is infinite when the median is zero,
  46. # so it becomes super sensitive. We play it safe and skip when this happens.
  47. if median_deviation == 0:
  48. return False
  49. test_statistic = demedianed.iget(-1) / median_deviation
  50. # Completely arbitary...triggers if the median deviation is
  51. # 6 times bigger than the median
  52. if test_statistic > 6:
  53. return True
  54. def grubbs(timeseries):
  55. """
  56. A timeseries is anomalous if the Z score is greater than the Grubb's score.
  57. """
  58. series = scipy.array([x[1] for x in timeseries])
  59. stdDev = scipy.std(series)
  60. mean = np.mean(series)
  61. tail_average = tail_avg(timeseries)
  62. z_score = (tail_average - mean) / stdDev
  63. len_series = len(series)
  64. threshold = scipy.stats.t.isf(.05 / (2 * len_series) , len_series - 2)
  65. threshold_squared = threshold * threshold
  66. grubbs_score = ((len_series - 1) / np.sqrt(len_series)) * np.sqrt(threshold_squared / (len_series - 2 + threshold_squared))
  67. return z_score > grubbs_score
  68. def first_hour_average(timeseries):
  69. """
  70. Calcuate the simple average over one hour, FULL_DURATION seconds ago.
  71. A timeseries is anomalous if the average of the last three datapoints
  72. are outside of three standard deviations of this value.
  73. """
  74. last_hour_threshold = time() - (FULL_DURATION - 3600)
  75. series = pandas.Series([x[1] for x in timeseries if x[0] < last_hour_threshold])
  76. mean = (series).mean()
  77. stdDev = (series).std()
  78. t = tail_avg(timeseries)
  79. return abs(t - mean) > 3 * stdDev
  80. def simple_stddev_from_moving_average(timeseries):
  81. """
  82. A timeseries is anomalous if the absolute value of the average of the latest
  83. three datapoint minus the moving average is greater than one standard
  84. deviation of the average. This does not exponentially weight the MA and so
  85. is better for detecting anomalies with respect to the entire series.
  86. """
  87. series = pandas.Series([x[1] for x in timeseries])
  88. mean = series.mean()
  89. stdDev = series.std()
  90. t = tail_avg(timeseries)
  91. return abs(t - mean) > 3 * stdDev
  92. def stddev_from_moving_average(timeseries):
  93. """
  94. A timeseries is anomalous if the absolute value of the average of the latest
  95. three datapoint minus the moving average is greater than one standard
  96. deviation of the moving average. This is better for finding anomalies with
  97. respect to the short term trends.
  98. """
  99. series = pandas.Series([x[1] for x in timeseries])
  100. expAverage = pandas.stats.moments.ewma(series, com=50)
  101. stdDev = pandas.stats.moments.ewmstd(series, com=50)
  102. return abs(series.iget(-1) - expAverage.iget(-1)) > 3 * stdDev.iget(-1)
  103. def mean_subtraction_cumulation(timeseries):
  104. """
  105. A timeseries is anomalous if the value of the next datapoint in the
  106. series is farther than a standard deviation out in cumulative terms
  107. after subtracting the mean from each data point.
  108. """
  109. series = pandas.Series([x[1] if x[1] else 0 for x in timeseries])
  110. series = series - series[0:len(series) - 1].mean()
  111. stdDev = series[0:len(series) - 1].std()
  112. expAverage = pandas.stats.moments.ewma(series, com=15)
  113. return abs(series.iget(-1)) > 3 * stdDev
  114. def least_squares(timeseries):
  115. """
  116. A timeseries is anomalous if the average of the last three datapoints
  117. on a projected least squares model is greater than three sigma.
  118. """
  119. x = np.array([t[0] for t in timeseries])
  120. y = np.array([t[1] for t in timeseries])
  121. A = np.vstack([x, np.ones(len(x))]).T
  122. results = np.linalg.lstsq(A, y)
  123. residual = results[1]
  124. m, c = np.linalg.lstsq(A, y)[0]
  125. errors = []
  126. for i, value in enumerate(y):
  127. projected = m * x[i] + c
  128. error = value - projected
  129. errors.append(error)
  130. if len(errors) < 3:
  131. return False
  132. std_dev = scipy.std(errors)
  133. t = (errors[-1] + errors[-2] + errors[-3]) / 3
  134. return abs(t) > std_dev * 3 and round(std_dev) != 0 and round(t) != 0
  135. def histogram_bins(timeseries):
  136. """
  137. A timeseries is anomalous if the average of the last three datapoints falls
  138. into a histogram bin with less than 20 other datapoints (you'll need to tweak
  139. that number depending on your data)
  140. Returns: the size of the bin which contains the tail_avg. Smaller bin size
  141. means more anomalous.
  142. """
  143. series = scipy.array([x[1] for x in timeseries])
  144. t = tail_avg(timeseries)
  145. h = np.histogram(series, bins=15)
  146. bins = h[1]
  147. for index, bin_size in enumerate(h[0]):
  148. if bin_size <= 20:
  149. # Is it in the first bin?
  150. if index == 0:
  151. if t <= bins[0]:
  152. return True
  153. # Is it in the current bin?
  154. elif t >= bins[index] and t < bins[index + 1]:
  155. return True
  156. return False
  157. def run_selected_algorithm(timeseries):
  158. """
  159. Filter timeseries and run selected algorithm.
  160. """
  161. # Get rid of short series
  162. if len(timeseries) < MIN_TOLERABLE_LENGTH:
  163. raise TooShort()
  164. # Get rid of stale series
  165. if time() - timeseries[-1][0] > STALE_PERIOD:
  166. raise Stale()
  167. # Get rid of incomplete series
  168. duration = timeseries[-1][0] - timeseries[0][0]
  169. if duration < FULL_DURATION:
  170. raise Incomplete()
  171. # Get rid of boring series
  172. if len(set(item[1] for item in timeseries[-MAX_TOLERABLE_BOREDOM:])) == 1:
  173. raise Boring()
  174. try:
  175. ensemble = [globals()[algorithm](timeseries) for algorithm in ALGORITHMS]
  176. threshold = len(ensemble) - CONSENSUS
  177. if ensemble.count(False) <= threshold:
  178. return True, ensemble, tail_avg(timeseries)
  179. return False, ensemble, timeseries[-1][1]
  180. except:
  181. logging.error("Algorithm error: " + traceback.format_exc())
  182. return False, [], 1