PageRenderTime 41ms CodeModel.GetById 12ms RepoModel.GetById 1ms app.codeStats 0ms

/create_batch_for_workflow/pymodules/python2.7/lib/python/pandas-0.17.1-py2.7-linux-x86_64.egg/pandas/stats/misc.py

https://gitlab.com/pooja043/Globus_Docker_2
Python | 386 lines | 352 code | 12 blank | 22 comment | 4 complexity | e40fc64905792b95961e59ded55fd96d MD5 | raw file
  1. from numpy import NaN
  2. from pandas import compat
  3. import numpy as np
  4. from pandas.core.api import Series, DataFrame, isnull, notnull
  5. from pandas.core.series import remove_na
  6. from pandas.compat import zip
  7. def zscore(series):
  8. return (series - series.mean()) / np.std(series, ddof=0)
  9. def correl_ts(frame1, frame2):
  10. """
  11. Pairwise correlation of columns of two DataFrame objects
  12. Parameters
  13. ----------
  14. Returns
  15. -------
  16. y : Series
  17. """
  18. results = {}
  19. for col, series in compat.iteritems(frame1):
  20. if col in frame2:
  21. other = frame2[col]
  22. idx1 = series.valid().index
  23. idx2 = other.valid().index
  24. common_index = idx1.intersection(idx2)
  25. seriesStand = zscore(series.reindex(common_index))
  26. otherStand = zscore(other.reindex(common_index))
  27. results[col] = (seriesStand * otherStand).mean()
  28. return Series(results)
  29. def correl_xs(frame1, frame2):
  30. return correl_ts(frame1.T, frame2.T)
  31. def percentileofscore(a, score, kind='rank'):
  32. """The percentile rank of a score relative to a list of scores.
  33. A `percentileofscore` of, for example, 80% means that 80% of the
  34. scores in `a` are below the given score. In the case of gaps or
  35. ties, the exact definition depends on the optional keyword, `kind`.
  36. Parameters
  37. ----------
  38. a: array like
  39. Array of scores to which `score` is compared.
  40. score: int or float
  41. Score that is compared to the elements in `a`.
  42. kind: {'rank', 'weak', 'strict', 'mean'}, optional
  43. This optional parameter specifies the interpretation of the
  44. resulting score:
  45. - "rank": Average percentage ranking of score. In case of
  46. multiple matches, average the percentage rankings of
  47. all matching scores.
  48. - "weak": This kind corresponds to the definition of a cumulative
  49. distribution function. A percentileofscore of 80%
  50. means that 80% of values are less than or equal
  51. to the provided score.
  52. - "strict": Similar to "weak", except that only values that are
  53. strictly less than the given score are counted.
  54. - "mean": The average of the "weak" and "strict" scores, often used in
  55. testing. See
  56. http://en.wikipedia.org/wiki/Percentile_rank
  57. Returns
  58. -------
  59. pcos : float
  60. Percentile-position of score (0-100) relative to `a`.
  61. Examples
  62. --------
  63. Three-quarters of the given values lie below a given score:
  64. >>> percentileofscore([1, 2, 3, 4], 3)
  65. 75.0
  66. With multiple matches, note how the scores of the two matches, 0.6
  67. and 0.8 respectively, are averaged:
  68. >>> percentileofscore([1, 2, 3, 3, 4], 3)
  69. 70.0
  70. Only 2/5 values are strictly less than 3:
  71. >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='strict')
  72. 40.0
  73. But 4/5 values are less than or equal to 3:
  74. >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='weak')
  75. 80.0
  76. The average between the weak and the strict scores is
  77. >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='mean')
  78. 60.0
  79. """
  80. a = np.array(a)
  81. n = len(a)
  82. if kind == 'rank':
  83. if not(np.any(a == score)):
  84. a = np.append(a, score)
  85. a_len = np.array(lrange(len(a)))
  86. else:
  87. a_len = np.array(lrange(len(a))) + 1.0
  88. a = np.sort(a)
  89. idx = [a == score]
  90. pct = (np.mean(a_len[idx]) / n) * 100.0
  91. return pct
  92. elif kind == 'strict':
  93. return sum(a < score) / float(n) * 100
  94. elif kind == 'weak':
  95. return sum(a <= score) / float(n) * 100
  96. elif kind == 'mean':
  97. return (sum(a < score) + sum(a <= score)) * 50 / float(n)
  98. else:
  99. raise ValueError("kind can only be 'rank', 'strict', 'weak' or 'mean'")
  100. def percentileRank(frame, column=None, kind='mean'):
  101. """
  102. Return score at percentile for each point in time (cross-section)
  103. Parameters
  104. ----------
  105. frame: DataFrame
  106. column: string or Series, optional
  107. Column name or specific Series to compute percentiles for.
  108. If not provided, percentiles are computed for all values at each
  109. point in time. Note that this can take a LONG time.
  110. kind: {'rank', 'weak', 'strict', 'mean'}, optional
  111. This optional parameter specifies the interpretation of the
  112. resulting score:
  113. - "rank": Average percentage ranking of score. In case of
  114. multiple matches, average the percentage rankings of
  115. all matching scores.
  116. - "weak": This kind corresponds to the definition of a cumulative
  117. distribution function. A percentileofscore of 80%
  118. means that 80% of values are less than or equal
  119. to the provided score.
  120. - "strict": Similar to "weak", except that only values that are
  121. strictly less than the given score are counted.
  122. - "mean": The average of the "weak" and "strict" scores, often used in
  123. testing. See
  124. http://en.wikipedia.org/wiki/Percentile_rank
  125. Returns
  126. -------
  127. TimeSeries or DataFrame, depending on input
  128. """
  129. fun = lambda xs, score: percentileofscore(remove_na(xs),
  130. score, kind=kind)
  131. results = {}
  132. framet = frame.T
  133. if column is not None:
  134. if isinstance(column, Series):
  135. for date, xs in compat.iteritems(frame.T):
  136. results[date] = fun(xs, column.get(date, NaN))
  137. else:
  138. for date, xs in compat.iteritems(frame.T):
  139. results[date] = fun(xs, xs[column])
  140. results = Series(results)
  141. else:
  142. for column in frame.columns:
  143. for date, xs in compat.iteritems(framet):
  144. results.setdefault(date, {})[column] = fun(xs, xs[column])
  145. results = DataFrame(results).T
  146. return results
  147. def bucket(series, k, by=None):
  148. """
  149. Produce DataFrame representing quantiles of a Series
  150. Parameters
  151. ----------
  152. series : Series
  153. k : int
  154. number of quantiles
  155. by : Series or same-length array
  156. bucket by value
  157. Returns
  158. -------
  159. DataFrame
  160. """
  161. if by is None:
  162. by = series
  163. else:
  164. by = by.reindex(series.index)
  165. split = _split_quantile(by, k)
  166. mat = np.empty((len(series), k), dtype=float) * np.NaN
  167. for i, v in enumerate(split):
  168. mat[:, i][v] = series.take(v)
  169. return DataFrame(mat, index=series.index, columns=np.arange(k) + 1)
  170. def _split_quantile(arr, k):
  171. arr = np.asarray(arr)
  172. mask = np.isfinite(arr)
  173. order = arr[mask].argsort()
  174. n = len(arr)
  175. return np.array_split(np.arange(n)[mask].take(order), k)
  176. def bucketcat(series, cats):
  177. """
  178. Produce DataFrame representing quantiles of a Series
  179. Parameters
  180. ----------
  181. series : Series
  182. cat : Series or same-length array
  183. bucket by category; mutually exclusive with 'by'
  184. Returns
  185. -------
  186. DataFrame
  187. """
  188. if not isinstance(series, Series):
  189. series = Series(series, index=np.arange(len(series)))
  190. cats = np.asarray(cats)
  191. unique_labels = np.unique(cats)
  192. unique_labels = unique_labels[com.notnull(unique_labels)]
  193. # group by
  194. data = {}
  195. for label in unique_labels:
  196. data[label] = series[cats == label]
  197. return DataFrame(data, columns=unique_labels)
  198. def bucketpanel(series, bins=None, by=None, cat=None):
  199. """
  200. Bucket data by two Series to create summary panel
  201. Parameters
  202. ----------
  203. series : Series
  204. bins : tuple (length-2)
  205. e.g. (2, 2)
  206. by : tuple of Series
  207. bucket by value
  208. cat : tuple of Series
  209. bucket by category; mutually exclusive with 'by'
  210. Returns
  211. -------
  212. DataFrame
  213. """
  214. use_by = by is not None
  215. use_cat = cat is not None
  216. if use_by and use_cat:
  217. raise Exception('must specify by or cat, but not both')
  218. elif use_by:
  219. if len(by) != 2:
  220. raise Exception('must provide two bucketing series')
  221. xby, yby = by
  222. xbins, ybins = bins
  223. return _bucketpanel_by(series, xby, yby, xbins, ybins)
  224. elif use_cat:
  225. xcat, ycat = cat
  226. return _bucketpanel_cat(series, xcat, ycat)
  227. else:
  228. raise Exception('must specify either values or categories '
  229. 'to bucket by')
  230. def _bucketpanel_by(series, xby, yby, xbins, ybins):
  231. xby = xby.reindex(series.index)
  232. yby = yby.reindex(series.index)
  233. xlabels = _bucket_labels(xby.reindex(series.index), xbins)
  234. ylabels = _bucket_labels(yby.reindex(series.index), ybins)
  235. labels = _uniquify(xlabels, ylabels, xbins, ybins)
  236. mask = com.isnull(labels)
  237. labels[mask] = -1
  238. unique_labels = np.unique(labels)
  239. bucketed = bucketcat(series, labels)
  240. _ulist = list(labels)
  241. index_map = dict((x, _ulist.index(x)) for x in unique_labels)
  242. def relabel(key):
  243. pos = index_map[key]
  244. xlab = xlabels[pos]
  245. ylab = ylabels[pos]
  246. return '%sx%s' % (int(xlab) if com.notnull(xlab) else 'NULL',
  247. int(ylab) if com.notnull(ylab) else 'NULL')
  248. return bucketed.rename(columns=relabel)
  249. def _bucketpanel_cat(series, xcat, ycat):
  250. xlabels, xmapping = _intern(xcat)
  251. ylabels, ymapping = _intern(ycat)
  252. shift = 10 ** (np.ceil(np.log10(ylabels.max())))
  253. labels = xlabels * shift + ylabels
  254. sorter = labels.argsort()
  255. sorted_labels = labels.take(sorter)
  256. sorted_xlabels = xlabels.take(sorter)
  257. sorted_ylabels = ylabels.take(sorter)
  258. unique_labels = np.unique(labels)
  259. unique_labels = unique_labels[com.notnull(unique_labels)]
  260. locs = sorted_labels.searchsorted(unique_labels)
  261. xkeys = sorted_xlabels.take(locs)
  262. ykeys = sorted_ylabels.take(locs)
  263. stringified = ['(%s, %s)' % arg
  264. for arg in zip(xmapping.take(xkeys), ymapping.take(ykeys))]
  265. result = bucketcat(series, labels)
  266. result.columns = stringified
  267. return result
  268. def _intern(values):
  269. # assumed no NaN values
  270. values = np.asarray(values)
  271. uniqued = np.unique(values)
  272. labels = uniqued.searchsorted(values)
  273. return labels, uniqued
  274. def _uniquify(xlabels, ylabels, xbins, ybins):
  275. # encode the stuff, create unique label
  276. shifter = 10 ** max(xbins, ybins)
  277. _xpiece = xlabels * shifter
  278. _ypiece = ylabels
  279. return _xpiece + _ypiece
  280. def _bucket_labels(series, k):
  281. arr = np.asarray(series)
  282. mask = np.isfinite(arr)
  283. order = arr[mask].argsort()
  284. n = len(series)
  285. split = np.array_split(np.arange(n)[mask].take(order), k)
  286. mat = np.empty(n, dtype=float) * np.NaN
  287. for i, v in enumerate(split):
  288. mat[v] = i
  289. return mat + 1