PageRenderTime 37ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/tools/tile.py

http://github.com/pydata/pandas
Python | 266 lines | 147 code | 29 blank | 90 comment | 45 complexity | bf3cd049e6c9a639586031931c13c060 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. """
  2. Quantilization functions and related stuff
  3. """
  4. from pandas.core.api import DataFrame, Series
  5. from pandas.core.categorical import Categorical
  6. from pandas.core.index import _ensure_index
  7. import pandas.core.algorithms as algos
  8. import pandas.core.common as com
  9. import pandas.core.nanops as nanops
  10. from pandas.compat import zip
  11. import numpy as np
  12. def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
  13. include_lowest=False):
  14. """
  15. Return indices of half-open bins to which each value of `x` belongs.
  16. Parameters
  17. ----------
  18. x : array-like
  19. Input array to be binned. It has to be 1-dimensional.
  20. bins : int or sequence of scalars
  21. If `bins` is an int, it defines the number of equal-width bins in the
  22. range of `x`. However, in this case, the range of `x` is extended
  23. by .1% on each side to include the min or max values of `x`. If
  24. `bins` is a sequence it defines the bin edges allowing for
  25. non-uniform bin width. No extension of the range of `x` is done in
  26. this case.
  27. right : bool, optional
  28. Indicates whether the bins include the rightmost edge or not. If
  29. right == True (the default), then the bins [1,2,3,4] indicate
  30. (1,2], (2,3], (3,4].
  31. labels : array or boolean, default None
  32. Labels to use for bin edges, or False to return integer bin labels
  33. retbins : bool, optional
  34. Whether to return the bins or not. Can be useful if bins is given
  35. as a scalar.
  36. precision : int
  37. The precision at which to store and display the bins labels
  38. include_lowest : bool
  39. Whether the first interval should be left-inclusive or not.
  40. Returns
  41. -------
  42. out : Categorical or array of integers if labels is False
  43. bins : ndarray of floats
  44. Returned only if `retbins` is True.
  45. Notes
  46. -----
  47. The `cut` function can be useful for going from a continuous variable to
  48. a categorical variable. For example, `cut` could convert ages to groups
  49. of age ranges.
  50. Any NA values will be NA in the result. Out of bounds values will be NA in
  51. the resulting Categorical object
  52. Examples
  53. --------
  54. >>> cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True)
  55. (array([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533],
  56. (6.533, 9.7], (0.191, 3.367]], dtype=object),
  57. array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ]))
  58. >>> cut(np.ones(5), 4, labels=False)
  59. array([2, 2, 2, 2, 2])
  60. """
  61. # NOTE: this binning code is changed a bit from histogram for var(x) == 0
  62. if not np.iterable(bins):
  63. if np.isscalar(bins) and bins < 1:
  64. raise ValueError("`bins` should be a positive integer.")
  65. try: # for array-like
  66. sz = x.size
  67. except AttributeError:
  68. x = np.asarray(x)
  69. sz = x.size
  70. if sz == 0:
  71. raise ValueError('Cannot cut empty array')
  72. # handle empty arrays. Can't determine range, so use 0-1.
  73. # rng = (0, 1)
  74. else:
  75. rng = (nanops.nanmin(x), nanops.nanmax(x))
  76. mn, mx = [mi + 0.0 for mi in rng]
  77. if mn == mx: # adjust end points before binning
  78. mn -= .001 * mn
  79. mx += .001 * mx
  80. bins = np.linspace(mn, mx, bins + 1, endpoint=True)
  81. else: # adjust end points after binning
  82. bins = np.linspace(mn, mx, bins + 1, endpoint=True)
  83. adj = (mx - mn) * 0.001 # 0.1% of the range
  84. if right:
  85. bins[0] -= adj
  86. else:
  87. bins[-1] += adj
  88. else:
  89. bins = np.asarray(bins)
  90. if (np.diff(bins) < 0).any():
  91. raise ValueError('bins must increase monotonically.')
  92. return _bins_to_cuts(x, bins, right=right, labels=labels,
  93. retbins=retbins, precision=precision,
  94. include_lowest=include_lowest)
  95. def qcut(x, q, labels=None, retbins=False, precision=3):
  96. """
  97. Quantile-based discretization function. Discretize variable into
  98. equal-sized buckets based on rank or based on sample quantiles. For example
  99. 1000 values for 10 quantiles would produce a Categorical object indicating
  100. quantile membership for each data point.
  101. Parameters
  102. ----------
  103. x : ndarray or Series
  104. q : integer or array of quantiles
  105. Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
  106. array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
  107. labels : array or boolean, default None
  108. Labels to use for bin edges, or False to return integer bin labels
  109. retbins : bool, optional
  110. Whether to return the bins or not. Can be useful if bins is given
  111. as a scalar.
  112. precision : int
  113. The precision at which to store and display the bins labels
  114. Returns
  115. -------
  116. cat : Categorical
  117. Notes
  118. -----
  119. Out of bounds values will be NA in the resulting Categorical object
  120. Examples
  121. --------
  122. """
  123. if com.is_integer(q):
  124. quantiles = np.linspace(0, 1, q + 1)
  125. else:
  126. quantiles = q
  127. bins = algos.quantile(x, quantiles)
  128. return _bins_to_cuts(x, bins, labels=labels, retbins=retbins,
  129. precision=precision, include_lowest=True)
  130. def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
  131. precision=3, name=None, include_lowest=False):
  132. if name is None and isinstance(x, Series):
  133. name = x.name
  134. x = np.asarray(x)
  135. side = 'left' if right else 'right'
  136. ids = bins.searchsorted(x, side=side)
  137. if len(algos.unique(bins)) < len(bins):
  138. raise ValueError('Bin edges must be unique: %s' % repr(bins))
  139. if include_lowest:
  140. ids[x == bins[0]] = 1
  141. na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
  142. has_nas = na_mask.any()
  143. if labels is not False:
  144. if labels is None:
  145. increases = 0
  146. while True:
  147. try:
  148. levels = _format_levels(bins, precision, right=right,
  149. include_lowest=include_lowest)
  150. except ValueError:
  151. increases += 1
  152. precision += 1
  153. if increases >= 20:
  154. raise
  155. else:
  156. break
  157. else:
  158. if len(labels) != len(bins) - 1:
  159. raise ValueError('Bin labels must be one fewer than '
  160. 'the number of bin edges')
  161. levels = labels
  162. levels = np.asarray(levels, dtype=object)
  163. np.putmask(ids, na_mask, 0)
  164. fac = Categorical(ids - 1, levels, name=name)
  165. else:
  166. fac = ids - 1
  167. if has_nas:
  168. fac = fac.astype(np.float64)
  169. np.putmask(fac, na_mask, np.nan)
  170. if not retbins:
  171. return fac
  172. return fac, bins
  173. def _format_levels(bins, prec, right=True,
  174. include_lowest=False):
  175. fmt = lambda v: _format_label(v, precision=prec)
  176. if right:
  177. levels = []
  178. for a, b in zip(bins, bins[1:]):
  179. fa, fb = fmt(a), fmt(b)
  180. if a != b and fa == fb:
  181. raise ValueError('precision too low')
  182. formatted = '(%s, %s]' % (fa, fb)
  183. levels.append(formatted)
  184. if include_lowest:
  185. levels[0] = '[' + levels[0][1:]
  186. else:
  187. levels = ['[%s, %s)' % (fmt(a), fmt(b))
  188. for a, b in zip(bins, bins[1:])]
  189. return levels
  190. def _format_label(x, precision=3):
  191. fmt_str = '%%.%dg' % precision
  192. if np.isinf(x):
  193. return str(x)
  194. elif com.is_float(x):
  195. frac, whole = np.modf(x)
  196. sgn = '-' if x < 0 else ''
  197. whole = abs(whole)
  198. if frac != 0.0:
  199. val = fmt_str % frac
  200. # rounded up or down
  201. if '.' not in val:
  202. if x < 0:
  203. return '%d' % (-whole - 1)
  204. else:
  205. return '%d' % (whole + 1)
  206. if 'e' in val:
  207. return _trim_zeros(fmt_str % x)
  208. else:
  209. val = _trim_zeros(val)
  210. if '.' in val:
  211. return sgn + '.'.join(('%d' % whole, val.split('.')[1]))
  212. else: # pragma: no cover
  213. return sgn + '.'.join(('%d' % whole, val))
  214. else:
  215. return sgn + '%0.f' % whole
  216. else:
  217. return str(x)
  218. def _trim_zeros(x):
  219. while len(x) > 1 and x[-1] == '0':
  220. x = x[:-1]
  221. if len(x) > 1 and x[-1] == '.':
  222. x = x[:-1]
  223. return x