PageRenderTime 28ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 1ms

/monitor_batch/pymodules/python2.7/lib/python/cnvlib/smoothing.py

https://gitlab.com/pooja043/Globus_Docker_4
Python | 226 lines | 199 code | 8 blank | 19 comment | 6 complexity | 514497b1524948061f2b33abe1c6057a MD5 | raw file
  1. """Signal smoothing functions."""
  2. from __future__ import absolute_import, division
  3. from bisect import insort, bisect_left
  4. from collections import deque
  5. import math
  6. import numpy as np
  7. import pandas as pd
  8. from . import core, metrics
  9. def check_inputs(x, width):
  10. """Transform width into a half-window size.
  11. `width` is either a fraction of the length of `x` or an integer size of the
  12. whole window. The output half-window size is truncated to the length of `x`
  13. if needed.
  14. """
  15. x = np.asfarray(x)
  16. if 0 < width < 1:
  17. wing = int(math.ceil(len(x) * width * 0.5))
  18. elif width >= 2 and int(width) == width:
  19. wing = width // 2
  20. else:
  21. raise ValueError("width must be between 0 and 1 (got %s)" % width)
  22. if wing > len(x):
  23. wing = len(x) - 1
  24. assert wing > 0, "Wing must be greater than 0 (got %s)" % wing
  25. return x, wing
  26. def rolling_median(x, width):
  27. """Rolling median with mirrored edges.
  28. Contributed by Peter Otten to comp.lang.python.
  29. This is (somehow) faster than pandas' Cythonized skip-list implementation
  30. for arrays smaller than ~100,000 elements.
  31. Source:
  32. https://bitbucket.org/janto/snippets/src/tip/running_median.py
  33. https://groups.google.com/d/msg/comp.lang.python/0OARyHF0wtA/SEs-glW4t6gJ
  34. """
  35. x, wing = check_inputs(x, width)
  36. # Pad the edges of the original array with mirror copies
  37. signal = np.concatenate((x[wing-1::-1], x, x[:-wing-1:-1]))
  38. rolled = pd.rolling_median(signal, 2 * wing + 1, center=True)
  39. return rolled[wing:-wing]
  40. def rolling_quantile(x, width, quantile):
  41. """Rolling quantile (0--1) with mirrored edges."""
  42. x, wing = check_inputs(x, width)
  43. # Pad the edges of the original array with mirror copies
  44. signal = np.concatenate((x[wing-1::-1], x, x[:-wing-1:-1]))
  45. rolled = pd.rolling_quantile(signal, 2 * wing + 1, quantile, center=True)
  46. return rolled[wing:-wing]
  47. def rolling_std(x, width):
  48. """Rolling quantile (0--1) with mirrored edges."""
  49. x, wing = check_inputs(x, width)
  50. # Pad the edges of the original array with mirror copies
  51. signal = np.concatenate((x[wing-1::-1], x, x[:-wing-1:-1]))
  52. rolled = pd.rolling_std(signal, 2 * wing + 1, center=True)
  53. return rolled[wing:-wing]
  54. def smoothed(x, width, do_fit_edges=False):
  55. """Smooth the values in `x` with the Kaiser windowed filter.
  56. See: https://en.wikipedia.org/wiki/Kaiser_window
  57. Parameters:
  58. x : array-like
  59. 1-dimensional numeric data set.
  60. width : float
  61. Fraction of x's total length to include in the rolling window (i.e. the
  62. proportional window width), or the integer size of the window.
  63. """
  64. x, wing = check_inputs(x, width)
  65. # Pad the edges with mirror-image copies of the array
  66. signal = np.concatenate((x[wing-1::-1], x, x[:-wing-1:-1]))
  67. # Apply signal smoothing
  68. window = np.kaiser(2 * wing + 1, 14)
  69. y = np.convolve(window / window.sum(), signal, mode='same')
  70. # Chop off the ends of the result so it has the original size
  71. y = y[wing:-wing]
  72. if do_fit_edges:
  73. fit_edges(x, y, wing) # In-place
  74. return y
  75. def fit_edges(x, y, wing, polyorder=3):
  76. """Apply polynomial interpolation to the edges of y, in-place.
  77. Calculates a polynomial fit (of order `polyorder`) of `x` within a window of
  78. width twice `wing`, then updates the smoothed values `y` in the half of the
  79. window closest to the edge.
  80. """
  81. window_length = 2 * wing + 1
  82. n = len(x)
  83. # Fit each of the two array edges (start and end)
  84. _fit_edge(x, y, 0, window_length, 0, wing, polyorder)
  85. _fit_edge(x, y, n - window_length, n, n - wing, n, polyorder)
  86. # TODO - fix the discontinuities at wing, n-wing
  87. def _fit_edge(x, y, window_start, window_stop, interp_start, interp_stop,
  88. polyorder):
  89. """
  90. Given a 1-D array `x` and the specification of a slice of `x` from
  91. `window_start` to `window_stop`, create an interpolating polynomial of the
  92. sliced sub-array, and evaluate that polynomial from `interp_start` to
  93. `interp_stop`. Put the result into the corresponding slice of `y`.
  94. """
  95. # Get the edge into a (window_length, -1) array.
  96. x_edge = x[window_start:window_stop]
  97. # Fit the edges. poly_coeffs has shape (polyorder + 1, -1),
  98. # where '-1' is the same as in x_edge.
  99. poly_coeffs = np.polyfit(np.arange(0, window_stop - window_start),
  100. x_edge, polyorder)
  101. # Compute the interpolated values for the edge.
  102. i = np.arange(interp_start - window_start, interp_stop - window_start)
  103. values = np.polyval(poly_coeffs, i)
  104. # Put the values into the appropriate slice of y.
  105. y[interp_start:interp_stop] = values
  106. # Outlier detection
  107. def outlier_iqr(a, c=3.0):
  108. """Detect outliers as a multiple of the IQR from the median.
  109. By convention, "outliers" are points more than 1.5 * IQR from the median,
  110. and "extremes" or extreme outliers are those more than 3.0 * IQR.
  111. """
  112. a = np.asarray(a)
  113. dists = np.abs(a - np.median(a))
  114. iqr = metrics.interquartile_range(a)
  115. return dists > (c * iqr)
  116. def outlier_mad_median(a):
  117. """MAD-Median rule for detecting outliers.
  118. Returns: a boolean array of the same size, where outlier indices are True.
  119. X_i is an outlier if::
  120. | X_i - M |
  121. _____________ > K ~= 2.24
  122. MAD / 0.6745
  123. where $K = sqrt( X^2_{0.975,1} )$,
  124. the square root of the 0.975 quantile of a chi-squared distribution with 1
  125. degree of freedom.
  126. This is a very robust rule with the highest possible breakdown point of 0.5.
  127. See:
  128. - Davies & Gather (1993) The Identification of Multiple Outliers.
  129. - Rand R. Wilcox (2012) Introduction to robust estimation and hypothesis
  130. testing. Ch.3: Estimating measures of location and scale.
  131. """
  132. K = 2.24
  133. a = np.asarray(a)
  134. dists = np.abs(a - np.median(a))
  135. mad = metrics.median_absolute_deviation(a)
  136. return (dists / mad) > K
  137. def rolling_outlier_iqr(x, width, c=3.0):
  138. """Detect outliers as a multiple of the IQR from the median.
  139. By convention, "outliers" are points more than 1.5 * IQR from the median (~2
  140. SD if values are normally distributed), and "extremes" or extreme outliers
  141. are those more than 3.0 * IQR (~4 SD).
  142. """
  143. if len(x) <= width:
  144. return np.zeros(len(x), dtype=np.bool_)
  145. dists = x - smoothed(x, width)
  146. q_hi = rolling_quantile(dists, width, .75)
  147. q_lo = rolling_quantile(dists, width, .25)
  148. iqr = q_hi - q_lo
  149. outliers = (np.abs(dists) > iqr * c)
  150. return outliers
  151. def rolling_outlier_quantile(x, width, q, m):
  152. """Return a boolean mask of outliers by multiples of a quantile in a window.
  153. Outliers are the array elements outside `m` times the `q`'th
  154. quantile of deviations from the smoothed trend line, as calculated from
  155. the trend line residuals. (For example, take the magnitude of the 95th
  156. quantile times 5, and mark any elements greater than that value as
  157. outliers.)
  158. This is the smoothing method used in BIC-seq (doi:10.1073/pnas.1110574108)
  159. with the parameters width=200, q=.95, m=5 for WGS.
  160. """
  161. if len(x) <= width:
  162. return np.zeros(len(x), dtype=np.bool_)
  163. dists = np.abs(x - smoothed(x, width))
  164. quants = rolling_quantile(dists, width, q)
  165. outliers = (dists > quants * m)
  166. return outliers
  167. def rolling_outlier_std(x, width, stdevs):
  168. """Return a boolean mask of outliers by stdev within a rolling window.
  169. Outliers are the array elements outside `stdevs` standard deviations from
  170. the smoothed trend line, as calculated from the trend line residuals.
  171. """
  172. if len(x) <= width:
  173. return np.zeros(len(x), dtype=np.bool_)
  174. dists = x - smoothed(x, width)
  175. x_std = rolling_std(dists, width)
  176. outliers = (np.abs(dists) > x_std * stdevs)
  177. return outliers