PageRenderTime 562ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/NU_JAE_dnaComplete_optimized/pymodules/python2.7/lib/python/pandas-0.17.1-py2.7-linux-x86_64.egg/pandas/tools/tile.py

https://gitlab.com/pooja043/Globus_Docker_4
Python | 297 lines | 236 code | 4 blank | 57 comment | 0 complexity | 326484ba2e37fe5801a624eced3a6678 MD5 | raw file
  1. """
  2. Quantilization functions and related stuff
  3. """
  4. from pandas.core.api import DataFrame, Series
  5. from pandas.core.categorical import Categorical
  6. from pandas.core.index import _ensure_index
  7. import pandas.core.algorithms as algos
  8. import pandas.core.common as com
  9. import pandas.core.nanops as nanops
  10. from pandas.compat import zip
  11. import numpy as np
  12. def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
  13. include_lowest=False):
  14. """
  15. Return indices of half-open bins to which each value of `x` belongs.
  16. Parameters
  17. ----------
  18. x : array-like
  19. Input array to be binned. It has to be 1-dimensional.
  20. bins : int or sequence of scalars
  21. If `bins` is an int, it defines the number of equal-width bins in the
  22. range of `x`. However, in this case, the range of `x` is extended
  23. by .1% on each side to include the min or max values of `x`. If
  24. `bins` is a sequence it defines the bin edges allowing for
  25. non-uniform bin width. No extension of the range of `x` is done in
  26. this case.
  27. right : bool, optional
  28. Indicates whether the bins include the rightmost edge or not. If
  29. right == True (the default), then the bins [1,2,3,4] indicate
  30. (1,2], (2,3], (3,4].
  31. labels : array or boolean, default None
  32. Used as labels for the resulting bins. Must be of the same length as the resulting
  33. bins. If False, return only integer indicators of the bins.
  34. retbins : bool, optional
  35. Whether to return the bins or not. Can be useful if bins is given
  36. as a scalar.
  37. precision : int
  38. The precision at which to store and display the bins labels
  39. include_lowest : bool
  40. Whether the first interval should be left-inclusive or not.
  41. Returns
  42. -------
  43. out : Categorical or Series or array of integers if labels is False
  44. The return type (Categorical or Series) depends on the input: a Series of type category if
  45. input is a Series else Categorical. Bins are represented as categories when categorical
  46. data is returned.
  47. bins : ndarray of floats
  48. Returned only if `retbins` is True.
  49. Notes
  50. -----
  51. The `cut` function can be useful for going from a continuous variable to
  52. a categorical variable. For example, `cut` could convert ages to groups
  53. of age ranges.
  54. Any NA values will be NA in the result. Out of bounds values will be NA in
  55. the resulting Categorical object
  56. Examples
  57. --------
  58. >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True)
  59. ([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533], (6.533, 9.7], (0.191, 3.367]]
  60. Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]],
  61. array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ]))
  62. >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, labels=["good","medium","bad"])
  63. [good, good, good, medium, bad, good]
  64. Categories (3, object): [good < medium < bad]
  65. >>> pd.cut(np.ones(5), 4, labels=False)
  66. array([1, 1, 1, 1, 1], dtype=int64)
  67. """
  68. # NOTE: this binning code is changed a bit from histogram for var(x) == 0
  69. if not np.iterable(bins):
  70. if np.isscalar(bins) and bins < 1:
  71. raise ValueError("`bins` should be a positive integer.")
  72. try: # for array-like
  73. sz = x.size
  74. except AttributeError:
  75. x = np.asarray(x)
  76. sz = x.size
  77. if sz == 0:
  78. raise ValueError('Cannot cut empty array')
  79. # handle empty arrays. Can't determine range, so use 0-1.
  80. # rng = (0, 1)
  81. else:
  82. rng = (nanops.nanmin(x), nanops.nanmax(x))
  83. mn, mx = [mi + 0.0 for mi in rng]
  84. if mn == mx: # adjust end points before binning
  85. mn -= .001 * mn
  86. mx += .001 * mx
  87. bins = np.linspace(mn, mx, bins + 1, endpoint=True)
  88. else: # adjust end points after binning
  89. bins = np.linspace(mn, mx, bins + 1, endpoint=True)
  90. adj = (mx - mn) * 0.001 # 0.1% of the range
  91. if right:
  92. bins[0] -= adj
  93. else:
  94. bins[-1] += adj
  95. else:
  96. bins = np.asarray(bins)
  97. if (np.diff(bins) < 0).any():
  98. raise ValueError('bins must increase monotonically.')
  99. return _bins_to_cuts(x, bins, right=right, labels=labels,retbins=retbins, precision=precision,
  100. include_lowest=include_lowest)
  101. def qcut(x, q, labels=None, retbins=False, precision=3):
  102. """
  103. Quantile-based discretization function. Discretize variable into
  104. equal-sized buckets based on rank or based on sample quantiles. For example
  105. 1000 values for 10 quantiles would produce a Categorical object indicating
  106. quantile membership for each data point.
  107. Parameters
  108. ----------
  109. x : ndarray or Series
  110. q : integer or array of quantiles
  111. Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
  112. array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
  113. labels : array or boolean, default None
  114. Used as labels for the resulting bins. Must be of the same length as the resulting
  115. bins. If False, return only integer indicators of the bins.
  116. retbins : bool, optional
  117. Whether to return the bins or not. Can be useful if bins is given
  118. as a scalar.
  119. precision : int
  120. The precision at which to store and display the bins labels
  121. Returns
  122. -------
  123. out : Categorical or Series or array of integers if labels is False
  124. The return type (Categorical or Series) depends on the input: a Series of type category if
  125. input is a Series else Categorical. Bins are represented as categories when categorical
  126. data is returned.
  127. bins : ndarray of floats
  128. Returned only if `retbins` is True.
  129. Notes
  130. -----
  131. Out of bounds values will be NA in the resulting Categorical object
  132. Examples
  133. --------
  134. >>> pd.qcut(range(5), 4)
  135. [[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]]
  136. Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]]
  137. >>> pd.qcut(range(5), 3, labels=["good","medium","bad"])
  138. [good, good, medium, bad, bad]
  139. Categories (3, object): [good < medium < bad]
  140. >>> pd.qcut(range(5), 4, labels=False)
  141. array([0, 0, 1, 2, 3], dtype=int64)
  142. """
  143. if com.is_integer(q):
  144. quantiles = np.linspace(0, 1, q + 1)
  145. else:
  146. quantiles = q
  147. bins = algos.quantile(x, quantiles)
  148. return _bins_to_cuts(x, bins, labels=labels, retbins=retbins,precision=precision,
  149. include_lowest=True)
  150. def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
  151. precision=3, name=None, include_lowest=False):
  152. x_is_series = isinstance(x, Series)
  153. series_index = None
  154. if x_is_series:
  155. series_index = x.index
  156. if name is None:
  157. name = x.name
  158. x = np.asarray(x)
  159. side = 'left' if right else 'right'
  160. ids = bins.searchsorted(x, side=side)
  161. if len(algos.unique(bins)) < len(bins):
  162. raise ValueError('Bin edges must be unique: %s' % repr(bins))
  163. if include_lowest:
  164. ids[x == bins[0]] = 1
  165. na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
  166. has_nas = na_mask.any()
  167. if labels is not False:
  168. if labels is None:
  169. increases = 0
  170. while True:
  171. try:
  172. levels = _format_levels(bins, precision, right=right,
  173. include_lowest=include_lowest)
  174. except ValueError:
  175. increases += 1
  176. precision += 1
  177. if increases >= 20:
  178. raise
  179. else:
  180. break
  181. else:
  182. if len(labels) != len(bins) - 1:
  183. raise ValueError('Bin labels must be one fewer than '
  184. 'the number of bin edges')
  185. levels = labels
  186. levels = np.asarray(levels, dtype=object)
  187. np.putmask(ids, na_mask, 0)
  188. fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
  189. else:
  190. fac = ids - 1
  191. if has_nas:
  192. fac = fac.astype(np.float64)
  193. np.putmask(fac, na_mask, np.nan)
  194. if x_is_series:
  195. fac = Series(fac, index=series_index, name=name)
  196. if not retbins:
  197. return fac
  198. return fac, bins
  199. def _format_levels(bins, prec, right=True,
  200. include_lowest=False):
  201. fmt = lambda v: _format_label(v, precision=prec)
  202. if right:
  203. levels = []
  204. for a, b in zip(bins, bins[1:]):
  205. fa, fb = fmt(a), fmt(b)
  206. if a != b and fa == fb:
  207. raise ValueError('precision too low')
  208. formatted = '(%s, %s]' % (fa, fb)
  209. levels.append(formatted)
  210. if include_lowest:
  211. levels[0] = '[' + levels[0][1:]
  212. else:
  213. levels = ['[%s, %s)' % (fmt(a), fmt(b))
  214. for a, b in zip(bins, bins[1:])]
  215. return levels
  216. def _format_label(x, precision=3):
  217. fmt_str = '%%.%dg' % precision
  218. if np.isinf(x):
  219. return str(x)
  220. elif com.is_float(x):
  221. frac, whole = np.modf(x)
  222. sgn = '-' if x < 0 else ''
  223. whole = abs(whole)
  224. if frac != 0.0:
  225. val = fmt_str % frac
  226. # rounded up or down
  227. if '.' not in val:
  228. if x < 0:
  229. return '%d' % (-whole - 1)
  230. else:
  231. return '%d' % (whole + 1)
  232. if 'e' in val:
  233. return _trim_zeros(fmt_str % x)
  234. else:
  235. val = _trim_zeros(val)
  236. if '.' in val:
  237. return sgn + '.'.join(('%d' % whole, val.split('.')[1]))
  238. else: # pragma: no cover
  239. return sgn + '.'.join(('%d' % whole, val))
  240. else:
  241. return sgn + '%0.f' % whole
  242. else:
  243. return str(x)
  244. def _trim_zeros(x):
  245. while len(x) > 1 and x[-1] == '0':
  246. x = x[:-1]
  247. if len(x) > 1 and x[-1] == '.':
  248. x = x[:-1]
  249. return x