PageRenderTime 60ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 0ms

/scipy/stats/mstats_extras.py

https://github.com/stefanv/scipy3
Python | 388 lines | 179 code | 24 blank | 185 comment | 22 complexity | dad7d5a55a4651958afc662881b25dc1 MD5 | raw file
  1. """
  2. Additional statistics functions, with support to MA.
  3. :author: Pierre GF Gerard-Marchant
  4. :contact: pierregm_at_uga_edu
  5. :date: $Date: 2007-10-29 17:18:13 +0200 (Mon, 29 Oct 2007) $
  6. :version: $Id: morestats.py 3473 2007-10-29 15:18:13Z jarrod.millman $
  7. """
  8. __author__ = "Pierre GF Gerard-Marchant"
  9. __docformat__ = "restructuredtext en"
  10. __all__ = ['compare_medians_ms',
  11. 'hdquantiles', 'hdmedian', 'hdquantiles_sd',
  12. 'idealfourths',
  13. 'median_cihs','mjci','mquantiles_cimj',
  14. 'rsh',
  15. 'trimmed_mean_ci',]
  16. import numpy as np
  17. from numpy import float_, int_, ndarray
  18. import numpy.ma as ma
  19. from numpy.ma import MaskedArray
  20. import mstats_basic as mstats
  21. from scipy.stats.distributions import norm, beta, t, binom
  22. #####--------------------------------------------------------------------------
  23. #---- --- Quantiles ---
  24. #####--------------------------------------------------------------------------
  25. def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
  26. """Computes quantile estimates with the Harrell-Davis method, where the estimates
  27. are calculated as a weighted linear combination of order statistics.
  28. Parameters
  29. ----------
  30. data: ndarray
  31. Data array.
  32. prob: sequence
  33. Sequence of quantiles to compute.
  34. axis : int
  35. Axis along which to compute the quantiles. If None, use a flattened array.
  36. var : boolean
  37. Whether to return the variance of the estimate.
  38. Returns
  39. -------
  40. A (p,) array of quantiles (if ``var`` is False), or a (2,p) array of quantiles
  41. and variances (if ``var`` is True), where ``p`` is the number of quantiles.
  42. Notes
  43. -----
  44. The function is restricted to 2D arrays.
  45. """
  46. def _hd_1D(data,prob,var):
  47. "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
  48. xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
  49. # Don't use length here, in case we have a numpy scalar
  50. n = xsorted.size
  51. #.........
  52. hd = np.empty((2,len(prob)), float_)
  53. if n < 2:
  54. hd.flat = np.nan
  55. if var:
  56. return hd
  57. return hd[0]
  58. #.........
  59. v = np.arange(n+1) / float(n)
  60. betacdf = beta.cdf
  61. for (i,p) in enumerate(prob):
  62. _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
  63. w = _w[1:] - _w[:-1]
  64. hd_mean = np.dot(w, xsorted)
  65. hd[0,i] = hd_mean
  66. #
  67. hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
  68. #
  69. hd[0, prob == 0] = xsorted[0]
  70. hd[0, prob == 1] = xsorted[-1]
  71. if var:
  72. hd[1, prob == 0] = hd[1, prob == 1] = np.nan
  73. return hd
  74. return hd[0]
  75. # Initialization & checks ---------
  76. data = ma.array(data, copy=False, dtype=float_)
  77. p = np.array(prob, copy=False, ndmin=1)
  78. # Computes quantiles along axis (or globally)
  79. if (axis is None) or (data.ndim == 1):
  80. result = _hd_1D(data, p, var)
  81. else:
  82. assert data.ndim <= 2, "Array should be 2D at most !"
  83. result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
  84. #
  85. return ma.fix_invalid(result, copy=False)
  86. #..............................................................................
  87. def hdmedian(data, axis=-1, var=False):
  88. """Returns the Harrell-Davis estimate of the median along the given axis.
  89. Parameters
  90. ----------
  91. data: ndarray
  92. Data array.
  93. axis : int
  94. Axis along which to compute the quantiles. If None, use a flattened array.
  95. var : boolean
  96. Whether to return the variance of the estimate.
  97. """
  98. result = hdquantiles(data,[0.5], axis=axis, var=var)
  99. return result.squeeze()
  100. #..............................................................................
  101. def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
  102. """Computes the standard error of the Harrell-Davis quantile estimates by jackknife.
  103. Parameters
  104. ----------
  105. data: ndarray
  106. Data array.
  107. prob: sequence
  108. Sequence of quantiles to compute.
  109. axis : int
  110. Axis along which to compute the quantiles. If None, use a flattened array.
  111. Notes
  112. -----
  113. The function is restricted to 2D arrays.
  114. """
  115. def _hdsd_1D(data,prob):
  116. "Computes the std error for 1D arrays."
  117. xsorted = np.sort(data.compressed())
  118. n = len(xsorted)
  119. #.........
  120. hdsd = np.empty(len(prob), float_)
  121. if n < 2:
  122. hdsd.flat = np.nan
  123. #.........
  124. vv = np.arange(n) / float(n-1)
  125. betacdf = beta.cdf
  126. #
  127. for (i,p) in enumerate(prob):
  128. _w = betacdf(vv, (n+1)*p, (n+1)*(1-p))
  129. w = _w[1:] - _w[:-1]
  130. mx_ = np.fromiter([np.dot(w,xsorted[np.r_[range(0,k),
  131. range(k+1,n)].astype(int_)])
  132. for k in range(n)], dtype=float_)
  133. mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n-1)
  134. hdsd[i] = float(n-1) * np.sqrt(np.diag(mx_var).diagonal() / float(n))
  135. return hdsd
  136. # Initialization & checks ---------
  137. data = ma.array(data, copy=False, dtype=float_)
  138. p = np.array(prob, copy=False, ndmin=1)
  139. # Computes quantiles along axis (or globally)
  140. if (axis is None):
  141. result = _hdsd_1D(data, p)
  142. else:
  143. assert data.ndim <= 2, "Array should be 2D at most !"
  144. result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
  145. #
  146. return ma.fix_invalid(result, copy=False).ravel()
  147. #####--------------------------------------------------------------------------
  148. #---- --- Confidence intervals ---
  149. #####--------------------------------------------------------------------------
  150. def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
  151. alpha=0.05, axis=None):
  152. """Returns the selected confidence interval of the trimmed mean along the
  153. given axis.
  154. Parameters
  155. ----------
  156. data : sequence
  157. Input data. The data is transformed to a masked array
  158. proportiontocut : float
  159. Proportion of the data to cut from each side of the data .
  160. As a result, (2*proportiontocut*n) values are actually trimmed.
  161. alpha : float
  162. Confidence level of the intervals.
  163. inclusive : tuple of boolean
  164. If relative==False, tuple indicating whether values exactly equal to the
  165. absolute limits are allowed.
  166. If relative==True, tuple indicating whether the number of data being masked
  167. on each side should be rounded (True) or truncated (False).
  168. axis : int
  169. Axis along which to cut. If None, uses a flattened version of the input.
  170. """
  171. data = ma.array(data, copy=False)
  172. trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
  173. tmean = trimmed.mean(axis)
  174. tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
  175. df = trimmed.count(axis) - 1
  176. tppf = t.ppf(1-alpha/2.,df)
  177. return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
  178. #..............................................................................
  179. def mjci(data, prob=[0.25,0.5,0.75], axis=None):
  180. """Returns the Maritz-Jarrett estimators of the standard error of selected
  181. experimental quantiles of the data.
  182. Parameters
  183. -----------
  184. data: ndarray
  185. Data array.
  186. prob: sequence
  187. Sequence of quantiles to compute.
  188. axis : int
  189. Axis along which to compute the quantiles. If None, use a flattened array.
  190. """
  191. def _mjci_1D(data, p):
  192. data = np.sort(data.compressed())
  193. n = data.size
  194. prob = (np.array(p) * n + 0.5).astype(int_)
  195. betacdf = beta.cdf
  196. #
  197. mj = np.empty(len(prob), float_)
  198. x = np.arange(1,n+1, dtype=float_) / n
  199. y = x - 1./n
  200. for (i,m) in enumerate(prob):
  201. (m1,m2) = (m-1, n-m)
  202. W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
  203. C1 = np.dot(W,data)
  204. C2 = np.dot(W,data**2)
  205. mj[i] = np.sqrt(C2 - C1**2)
  206. return mj
  207. #
  208. data = ma.array(data, copy=False)
  209. assert data.ndim <= 2, "Array should be 2D at most !"
  210. p = np.array(prob, copy=False, ndmin=1)
  211. # Computes quantiles along axis (or globally)
  212. if (axis is None):
  213. return _mjci_1D(data, p)
  214. else:
  215. return ma.apply_along_axis(_mjci_1D, axis, data, p)
  216. #..............................................................................
  217. def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
  218. """
  219. Computes the alpha confidence interval for the selected quantiles of the
  220. data, with Maritz-Jarrett estimators.
  221. Parameters
  222. ----------
  223. data: ndarray
  224. Data array.
  225. prob: sequence
  226. Sequence of quantiles to compute.
  227. alpha : float
  228. Confidence level of the intervals.
  229. axis : integer
  230. Axis along which to compute the quantiles.
  231. If None, use a flattened array.
  232. """
  233. alpha = min(alpha, 1-alpha)
  234. z = norm.ppf(1-alpha/2.)
  235. xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
  236. smj = mjci(data, prob, axis=axis)
  237. return (xq - z * smj, xq + z * smj)
  238. #.............................................................................
  239. def median_cihs(data, alpha=0.05, axis=None):
  240. """Computes the alpha-level confidence interval for the median of the data,
  241. following the Hettmasperger-Sheather method.
  242. Parameters
  243. ----------
  244. data : sequence
  245. Input data. Masked values are discarded. The input should be 1D only, or
  246. axis should be set to None.
  247. alpha : float
  248. Confidence level of the intervals.
  249. axis : integer
  250. Axis along which to compute the quantiles. If None, use a flattened array.
  251. """
  252. def _cihs_1D(data, alpha):
  253. data = np.sort(data.compressed())
  254. n = len(data)
  255. alpha = min(alpha, 1-alpha)
  256. k = int(binom._ppf(alpha/2., n, 0.5))
  257. gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
  258. if gk < 1-alpha:
  259. k -= 1
  260. gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
  261. gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
  262. I = (gk - 1 + alpha)/(gk - gkk)
  263. lambd = (n-k) * I / float(k + (n-2*k)*I)
  264. lims = (lambd*data[k] + (1-lambd)*data[k-1],
  265. lambd*data[n-k-1] + (1-lambd)*data[n-k])
  266. return lims
  267. data = ma.rray(data, copy=False)
  268. # Computes quantiles along axis (or globally)
  269. if (axis is None):
  270. result = _cihs_1D(data.compressed(), alpha)
  271. else:
  272. assert data.ndim <= 2, "Array should be 2D at most !"
  273. result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
  274. #
  275. return result
  276. #..............................................................................
  277. def compare_medians_ms(group_1, group_2, axis=None):
  278. """Compares the medians from two independent groups along the given axis.
  279. The comparison is performed using the McKean-Schrader estimate of the standard
  280. error of the medians.
  281. Parameters
  282. ----------
  283. group_1 : {sequence}
  284. First dataset.
  285. group_2 : {sequence}
  286. Second dataset.
  287. axis : {integer}
  288. Axis along which the medians are estimated. If None, the arrays are flattened.
  289. Returns
  290. -------
  291. A (p,) array of comparison values.
  292. """
  293. (med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
  294. (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
  295. mstats.stde_median(group_2, axis=axis))
  296. W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
  297. return 1 - norm.cdf(W)
  298. def idealfourths(data, axis=None):
  299. """Returns an estimate of the lower and upper quartiles of the data along
  300. the given axis, as computed with the ideal fourths.
  301. """
  302. def _idf(data):
  303. x = data.compressed()
  304. n = len(x)
  305. if n < 3:
  306. return [np.nan,np.nan]
  307. (j,h) = divmod(n/4. + 5/12.,1)
  308. qlo = (1-h)*x[j-1] + h*x[j]
  309. k = n - j
  310. qup = (1-h)*x[k] + h*x[k-1]
  311. return [qlo, qup]
  312. data = ma.sort(data, axis=axis).view(MaskedArray)
  313. if (axis is None):
  314. return _idf(data)
  315. else:
  316. return ma.apply_along_axis(_idf, axis, data)
  317. def rsh(data, points=None):
  318. """Evaluates Rosenblatt's shifted histogram estimators for each point
  319. on the dataset 'data'.
  320. Parameters
  321. data : sequence
  322. Input data. Masked values are ignored.
  323. points : sequence
  324. Sequence of points where to evaluate Rosenblatt shifted histogram.
  325. If None, use the data.
  326. """
  327. data = ma.array(data, copy=False)
  328. if points is None:
  329. points = data
  330. else:
  331. points = np.array(points, copy=False, ndmin=1)
  332. if data.ndim != 1:
  333. raise AttributeError("The input array should be 1D only !")
  334. n = data.count()
  335. r = idealfourths(data, axis=None)
  336. h = 1.2 * (r[-1]-r[0]) / n**(1./5)
  337. nhi = (data[:,None] <= points[None,:] + h).sum(0)
  338. nlo = (data[:,None] < points[None,:] - h).sum(0)
  339. return (nhi-nlo) / (2.*n*h)
  340. ###############################################################################