
/pandas/tools/util.py

http://github.com/wesm/pandas
Possible License(s): BSD-3-Clause, Apache-2.0
import numpy as np
import pandas.lib as lib

from pandas.types.common import (is_number,
                                 is_numeric_dtype,
                                 is_datetime_or_timedelta_dtype,
                                 _ensure_object)
from pandas.types.cast import _possibly_downcast_to_dtype

import pandas as pd
from pandas.compat import reduce
from pandas.core.index import Index
from pandas.core import common as com


def match(needles, haystack):
    haystack = Index(haystack)
    needles = Index(needles)
    return haystack.get_indexer(needles)
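

# --- Illustrative usage sketch (added for this listing, not part of the
# original pandas module): match() mirrors R's match -- for each needle it
# returns that value's position in ``haystack``, or -1 when it is absent,
# by delegating to Index.get_indexer.
def _demo_match():
    # expected: array([ 0,  2, -1])
    return match([1, 3, 7], [1, 2, 3, 4])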


def cartesian_product(X):
    """
    Numpy version of itertools.product or pandas.compat.product.
    Sometimes faster (for large inputs)...

    Examples
    --------
    >>> cartesian_product([list('ABC'), [1, 2]])
    [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'),
    array([1, 2, 1, 2, 1, 2])]

    """

    lenX = np.fromiter((len(x) for x in X), dtype=int)
    cumprodX = np.cumproduct(lenX)

    a = np.roll(cumprodX, 1)
    a[0] = 1

    b = cumprodX[-1] / cumprodX

    return [np.tile(np.repeat(np.asarray(com._values_from_object(x)), b[i]),
                    np.product(a[i]))
            for i, x in enumerate(X)]
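

# --- Illustrative usage sketch (added for this listing, not part of the
# original pandas module): the arrays returned by cartesian_product() hold the
# same tuples itertools.product would yield, stored column-wise, so zipping
# them back together reproduces the product order.
def _demo_cartesian_product():
    cols = cartesian_product([['A', 'B'], [1, 2]])
    # expected: [array(['A', 'A', 'B', 'B'], ...), array([1, 2, 1, 2])]
    # list(zip(*cols)) matches list(itertools.product(['A', 'B'], [1, 2]))
    return cols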


def _compose2(f, g):
    """Compose 2 callables"""
    return lambda *args, **kwargs: f(g(*args, **kwargs))


def compose(*funcs):
    """Compose 2 or more callables"""
    assert len(funcs) > 1, 'At least 2 callables must be passed to compose'
    return reduce(_compose2, funcs)
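

# --- Illustrative usage sketch (added for this listing, not part of the
# original pandas module): compose() applies the right-most callable first,
# so compose(f, g)(x) is f(g(x)).
def _demo_compose():
    shout = compose(str.upper, str.strip)
    # expected: 'HELLO'
    return shout('  hello  ')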


def to_numeric(arg, errors='raise', downcast=None):
    """
    Convert argument to a numeric type.

    Parameters
    ----------
    arg : list, tuple, 1-d array, or Series
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input
    downcast : {'integer', 'signed', 'unsigned', 'float'}, default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.

        .. versionadded:: 0.19.0

    Returns
    -------
    ret : numeric if parsing succeeded.
        Return type depends on input.  Series if Series, otherwise ndarray

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> import pandas as pd
    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64
    """
    if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
        raise ValueError('invalid downcasting method provided')

    is_series = False
    is_index = False
    is_scalar = False

    if isinstance(arg, pd.Series):
        is_series = True
        values = arg.values
    elif isinstance(arg, pd.Index):
        is_index = True
        values = arg.asi8
        if values is None:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype='O')
    elif np.isscalar(arg):
        if is_number(arg):
            return arg
        is_scalar = True
        values = np.array([arg], dtype='O')
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    else:
        values = arg

    try:
        if is_numeric_dtype(values):
            pass
        elif is_datetime_or_timedelta_dtype(values):
            values = values.astype(np.int64)
        else:
            values = _ensure_object(values)
            coerce_numeric = False if errors in ('ignore', 'raise') else True
            values = lib.maybe_convert_numeric(values, set(),
                                               coerce_numeric=coerce_numeric)

    except Exception:
        if errors == 'raise':
            raise

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and is_numeric_dtype(values):
        typecodes = None

        if downcast in ('integer', 'signed'):
            typecodes = np.typecodes['Integer']
        elif downcast == 'unsigned' and np.min(values) > 0:
            typecodes = np.typecodes['UnsignedInteger']
        elif downcast == 'float':
            typecodes = np.typecodes['Float']

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for dtype in typecodes:
                if np.dtype(dtype).itemsize < values.dtype.itemsize:
                    values = _possibly_downcast_to_dtype(
                        values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    if is_series:
        return pd.Series(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy_with_infer
        return Index(values, name=arg.name)
    elif is_scalar:
        return values[0]
    else:
        return values
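

# --- Illustrative usage sketch (added for this listing, not part of the
# original pandas module): errors='coerce' turns unparseable entries into NaN,
# while downcast only fires after a successful conversion and only moves to a
# strictly smaller dtype.
def _demo_to_numeric():
    coerced = to_numeric(pd.Series(['1', '2', 'spam']), errors='coerce')
    # expected: float64 Series [1.0, 2.0, NaN]
    small = to_numeric(pd.Series([1, 2, 3]), downcast='unsigned')
    # expected: uint8 Series [1, 2, 3]
    return coerced, small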