PageRenderTime 97ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/stats/misc.py

https://github.com/hoffstein/pandas
Python | 389 lines | 354 code | 13 blank | 22 comment | 4 complexity | dea92f0affa885f10c549938ca74154b MD5 | raw file
  1. from numpy import NaN
  2. from pandas import compat
  3. import numpy as np
  4. from pandas.core.api import Series, DataFrame
  5. from pandas.core.series import remove_na
  6. from pandas.compat import zip, lrange
  7. import pandas.core.common as com
  8. def zscore(series):
  9. return (series - series.mean()) / np.std(series, ddof=0)
  10. def correl_ts(frame1, frame2):
  11. """
  12. Pairwise correlation of columns of two DataFrame objects
  13. Parameters
  14. ----------
  15. Returns
  16. -------
  17. y : Series
  18. """
  19. results = {}
  20. for col, series in compat.iteritems(frame1):
  21. if col in frame2:
  22. other = frame2[col]
  23. idx1 = series.valid().index
  24. idx2 = other.valid().index
  25. common_index = idx1.intersection(idx2)
  26. seriesStand = zscore(series.reindex(common_index))
  27. otherStand = zscore(other.reindex(common_index))
  28. results[col] = (seriesStand * otherStand).mean()
  29. return Series(results)
  30. def correl_xs(frame1, frame2):
  31. return correl_ts(frame1.T, frame2.T)
  32. def percentileofscore(a, score, kind='rank'):
  33. """The percentile rank of a score relative to a list of scores.
  34. A `percentileofscore` of, for example, 80% means that 80% of the
  35. scores in `a` are below the given score. In the case of gaps or
  36. ties, the exact definition depends on the optional keyword, `kind`.
  37. Parameters
  38. ----------
  39. a: array like
  40. Array of scores to which `score` is compared.
  41. score: int or float
  42. Score that is compared to the elements in `a`.
  43. kind: {'rank', 'weak', 'strict', 'mean'}, optional
  44. This optional parameter specifies the interpretation of the
  45. resulting score:
  46. - "rank": Average percentage ranking of score. In case of
  47. multiple matches, average the percentage rankings of
  48. all matching scores.
  49. - "weak": This kind corresponds to the definition of a cumulative
  50. distribution function. A percentileofscore of 80%
  51. means that 80% of values are less than or equal
  52. to the provided score.
  53. - "strict": Similar to "weak", except that only values that are
  54. strictly less than the given score are counted.
  55. - "mean": The average of the "weak" and "strict" scores, often used in
  56. testing. See
  57. http://en.wikipedia.org/wiki/Percentile_rank
  58. Returns
  59. -------
  60. pcos : float
  61. Percentile-position of score (0-100) relative to `a`.
  62. Examples
  63. --------
  64. Three-quarters of the given values lie below a given score:
  65. >>> percentileofscore([1, 2, 3, 4], 3)
  66. 75.0
  67. With multiple matches, note how the scores of the two matches, 0.6
  68. and 0.8 respectively, are averaged:
  69. >>> percentileofscore([1, 2, 3, 3, 4], 3)
  70. 70.0
  71. Only 2/5 values are strictly less than 3:
  72. >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='strict')
  73. 40.0
  74. But 4/5 values are less than or equal to 3:
  75. >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='weak')
  76. 80.0
  77. The average between the weak and the strict scores is
  78. >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='mean')
  79. 60.0
  80. """
  81. a = np.array(a)
  82. n = len(a)
  83. if kind == 'rank':
  84. if not(np.any(a == score)):
  85. a = np.append(a, score)
  86. a_len = np.array(lrange(len(a)))
  87. else:
  88. a_len = np.array(lrange(len(a))) + 1.0
  89. a = np.sort(a)
  90. idx = [a == score]
  91. pct = (np.mean(a_len[idx]) / n) * 100.0
  92. return pct
  93. elif kind == 'strict':
  94. return sum(a < score) / float(n) * 100
  95. elif kind == 'weak':
  96. return sum(a <= score) / float(n) * 100
  97. elif kind == 'mean':
  98. return (sum(a < score) + sum(a <= score)) * 50 / float(n)
  99. else:
  100. raise ValueError("kind can only be 'rank', 'strict', 'weak' or 'mean'")
  101. def percentileRank(frame, column=None, kind='mean'):
  102. """
  103. Return score at percentile for each point in time (cross-section)
  104. Parameters
  105. ----------
  106. frame: DataFrame
  107. column: string or Series, optional
  108. Column name or specific Series to compute percentiles for.
  109. If not provided, percentiles are computed for all values at each
  110. point in time. Note that this can take a LONG time.
  111. kind: {'rank', 'weak', 'strict', 'mean'}, optional
  112. This optional parameter specifies the interpretation of the
  113. resulting score:
  114. - "rank": Average percentage ranking of score. In case of
  115. multiple matches, average the percentage rankings of
  116. all matching scores.
  117. - "weak": This kind corresponds to the definition of a cumulative
  118. distribution function. A percentileofscore of 80%
  119. means that 80% of values are less than or equal
  120. to the provided score.
  121. - "strict": Similar to "weak", except that only values that are
  122. strictly less than the given score are counted.
  123. - "mean": The average of the "weak" and "strict" scores, often used in
  124. testing. See
  125. http://en.wikipedia.org/wiki/Percentile_rank
  126. Returns
  127. -------
  128. TimeSeries or DataFrame, depending on input
  129. """
  130. fun = lambda xs, score: percentileofscore(remove_na(xs),
  131. score, kind=kind)
  132. results = {}
  133. framet = frame.T
  134. if column is not None:
  135. if isinstance(column, Series):
  136. for date, xs in compat.iteritems(frame.T):
  137. results[date] = fun(xs, column.get(date, NaN))
  138. else:
  139. for date, xs in compat.iteritems(frame.T):
  140. results[date] = fun(xs, xs[column])
  141. results = Series(results)
  142. else:
  143. for column in frame.columns:
  144. for date, xs in compat.iteritems(framet):
  145. results.setdefault(date, {})[column] = fun(xs, xs[column])
  146. results = DataFrame(results).T
  147. return results
  148. def bucket(series, k, by=None):
  149. """
  150. Produce DataFrame representing quantiles of a Series
  151. Parameters
  152. ----------
  153. series : Series
  154. k : int
  155. number of quantiles
  156. by : Series or same-length array
  157. bucket by value
  158. Returns
  159. -------
  160. DataFrame
  161. """
  162. if by is None:
  163. by = series
  164. else:
  165. by = by.reindex(series.index)
  166. split = _split_quantile(by, k)
  167. mat = np.empty((len(series), k), dtype=float) * np.NaN
  168. for i, v in enumerate(split):
  169. mat[:, i][v] = series.take(v)
  170. return DataFrame(mat, index=series.index, columns=np.arange(k) + 1)
  171. def _split_quantile(arr, k):
  172. arr = np.asarray(arr)
  173. mask = np.isfinite(arr)
  174. order = arr[mask].argsort()
  175. n = len(arr)
  176. return np.array_split(np.arange(n)[mask].take(order), k)
  177. def bucketcat(series, cats):
  178. """
  179. Produce DataFrame representing quantiles of a Series
  180. Parameters
  181. ----------
  182. series : Series
  183. cat : Series or same-length array
  184. bucket by category; mutually exclusive with 'by'
  185. Returns
  186. -------
  187. DataFrame
  188. """
  189. if not isinstance(series, Series):
  190. series = Series(series, index=np.arange(len(series)))
  191. cats = np.asarray(cats)
  192. unique_labels = np.unique(cats)
  193. unique_labels = unique_labels[com.notnull(unique_labels)]
  194. # group by
  195. data = {}
  196. for label in unique_labels:
  197. data[label] = series[cats == label]
  198. return DataFrame(data, columns=unique_labels)
  199. def bucketpanel(series, bins=None, by=None, cat=None):
  200. """
  201. Bucket data by two Series to create summary panel
  202. Parameters
  203. ----------
  204. series : Series
  205. bins : tuple (length-2)
  206. e.g. (2, 2)
  207. by : tuple of Series
  208. bucket by value
  209. cat : tuple of Series
  210. bucket by category; mutually exclusive with 'by'
  211. Returns
  212. -------
  213. DataFrame
  214. """
  215. use_by = by is not None
  216. use_cat = cat is not None
  217. if use_by and use_cat:
  218. raise Exception('must specify by or cat, but not both')
  219. elif use_by:
  220. if len(by) != 2:
  221. raise Exception('must provide two bucketing series')
  222. xby, yby = by
  223. xbins, ybins = bins
  224. return _bucketpanel_by(series, xby, yby, xbins, ybins)
  225. elif use_cat:
  226. xcat, ycat = cat
  227. return _bucketpanel_cat(series, xcat, ycat)
  228. else:
  229. raise Exception('must specify either values or categories '
  230. 'to bucket by')
  231. def _bucketpanel_by(series, xby, yby, xbins, ybins):
  232. xby = xby.reindex(series.index)
  233. yby = yby.reindex(series.index)
  234. xlabels = _bucket_labels(xby.reindex(series.index), xbins)
  235. ylabels = _bucket_labels(yby.reindex(series.index), ybins)
  236. labels = _uniquify(xlabels, ylabels, xbins, ybins)
  237. mask = com.isnull(labels)
  238. labels[mask] = -1
  239. unique_labels = np.unique(labels)
  240. bucketed = bucketcat(series, labels)
  241. _ulist = list(labels)
  242. index_map = dict((x, _ulist.index(x)) for x in unique_labels)
  243. def relabel(key):
  244. pos = index_map[key]
  245. xlab = xlabels[pos]
  246. ylab = ylabels[pos]
  247. return '%sx%s' % (int(xlab) if com.notnull(xlab) else 'NULL',
  248. int(ylab) if com.notnull(ylab) else 'NULL')
  249. return bucketed.rename(columns=relabel)
  250. def _bucketpanel_cat(series, xcat, ycat):
  251. xlabels, xmapping = _intern(xcat)
  252. ylabels, ymapping = _intern(ycat)
  253. shift = 10 ** (np.ceil(np.log10(ylabels.max())))
  254. labels = xlabels * shift + ylabels
  255. sorter = labels.argsort()
  256. sorted_labels = labels.take(sorter)
  257. sorted_xlabels = xlabels.take(sorter)
  258. sorted_ylabels = ylabels.take(sorter)
  259. unique_labels = np.unique(labels)
  260. unique_labels = unique_labels[com.notnull(unique_labels)]
  261. locs = sorted_labels.searchsorted(unique_labels)
  262. xkeys = sorted_xlabels.take(locs)
  263. ykeys = sorted_ylabels.take(locs)
  264. stringified = ['(%s, %s)' % arg
  265. for arg in zip(xmapping.take(xkeys), ymapping.take(ykeys))]
  266. result = bucketcat(series, labels)
  267. result.columns = stringified
  268. return result
  269. def _intern(values):
  270. # assumed no NaN values
  271. values = np.asarray(values)
  272. uniqued = np.unique(values)
  273. labels = uniqued.searchsorted(values)
  274. return labels, uniqued
  275. def _uniquify(xlabels, ylabels, xbins, ybins):
  276. # encode the stuff, create unique label
  277. shifter = 10 ** max(xbins, ybins)
  278. _xpiece = xlabels * shifter
  279. _ypiece = ylabels
  280. return _xpiece + _ypiece
  281. def _bucket_labels(series, k):
  282. arr = np.asarray(series)
  283. mask = np.isfinite(arr)
  284. order = arr[mask].argsort()
  285. n = len(series)
  286. split = np.array_split(np.arange(n)[mask].take(order), k)
  287. mat = np.empty(n, dtype=float) * np.NaN
  288. for i, v in enumerate(split):
  289. mat[v] = i
  290. return mat + 1