PageRenderTime 934ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/core/nanops.py

https://github.com/smc77/pandas
Python | 382 lines | 298 code | 66 blank | 18 comment | 107 complexity | 01fa37eae97ee24a5fdb579521c4f150 MD5 | raw file
  1. import sys
  2. import numpy as np
  3. from pandas.core.common import isnull, notnull
  4. import pandas.core.common as com
  5. import pandas._tseries as lib
  6. try:
  7. import bottleneck as bn
  8. _USE_BOTTLENECK = True
  9. except ImportError: # pragma: no cover
  10. _USE_BOTTLENECK = False
  11. def _bottleneck_switch(bn_name, alt, zero_value=None, **kwargs):
  12. try:
  13. bn_func = getattr(bn, bn_name)
  14. except (AttributeError, NameError): # pragma: no cover
  15. bn_func = None
  16. def f(values, axis=None, skipna=True, **kwds):
  17. if len(kwargs) > 0:
  18. for k, v in kwargs.iteritems():
  19. kwds[k] = v
  20. try:
  21. if zero_value is not None and values.size == 0:
  22. if values.ndim == 1:
  23. return 0
  24. else:
  25. result_shape = values.shape[:axis] + values.shape[axis + 1:]
  26. result = np.empty(result_shape)
  27. result.fill(0)
  28. return result
  29. if _USE_BOTTLENECK and skipna and values.dtype != np.object_:
  30. result = bn_func(values, axis=axis, **kwds)
  31. # prefer to treat inf/-inf as NA
  32. if _has_infs(result):
  33. result = alt(values, axis=axis, skipna=skipna, **kwds)
  34. else:
  35. result = alt(values, axis=axis, skipna=skipna, **kwds)
  36. except Exception:
  37. result = alt(values, axis=axis, skipna=skipna, **kwds)
  38. return result
  39. return f
  40. def _has_infs(result):
  41. if isinstance(result, np.ndarray):
  42. if result.dtype == 'f8':
  43. return lib.has_infs_f8(result)
  44. elif result.dtype == 'f4':
  45. return lib.has_infs_f4(result)
  46. else: # pragma: no cover
  47. raise TypeError('Only suppose float32/64 here')
  48. else:
  49. return np.isinf(result) or np.isneginf(result)
  50. def _nansum(values, axis=None, skipna=True):
  51. mask = isnull(values)
  52. if skipna and not issubclass(values.dtype.type, np.integer):
  53. values = values.copy()
  54. np.putmask(values, mask, 0)
  55. the_sum = values.sum(axis)
  56. the_sum = _maybe_null_out(the_sum, axis, mask)
  57. return the_sum
  58. def _nanmean(values, axis=None, skipna=True):
  59. mask = isnull(values)
  60. if skipna and not issubclass(values.dtype.type, np.integer):
  61. values = values.copy()
  62. np.putmask(values, mask, 0)
  63. the_sum = _ensure_numeric(values.sum(axis))
  64. count = _get_counts(mask, axis)
  65. if axis is not None:
  66. the_mean = the_sum / count
  67. ct_mask = count == 0
  68. if ct_mask.any():
  69. the_mean[ct_mask] = np.nan
  70. else:
  71. the_mean = the_sum / count if count > 0 else np.nan
  72. return the_mean
  73. def _nanmedian(values, axis=None, skipna=True):
  74. def get_median(x):
  75. mask = notnull(x)
  76. if not skipna and not mask.all():
  77. return np.nan
  78. return lib.median(x[mask])
  79. if values.dtype != np.float64:
  80. values = values.astype('f8')
  81. if values.ndim > 1:
  82. return np.apply_along_axis(get_median, axis, values)
  83. else:
  84. return get_median(values)
  85. def _nanvar(values, axis=None, skipna=True, ddof=1):
  86. mask = isnull(values)
  87. if axis is not None:
  88. count = (values.shape[axis] - mask.sum(axis)).astype(float)
  89. else:
  90. count = float(values.size - mask.sum())
  91. if skipna:
  92. values = values.copy()
  93. np.putmask(values, mask, 0)
  94. X = _ensure_numeric(values.sum(axis))
  95. XX = _ensure_numeric((values ** 2).sum(axis))
  96. return (XX - X ** 2 / count) / (count - ddof)
  97. def _nanmin(values, axis=None, skipna=True):
  98. mask = isnull(values)
  99. if skipna and not issubclass(values.dtype.type, np.integer):
  100. values = values.copy()
  101. np.putmask(values, mask, np.inf)
  102. # numpy 1.6.1 workaround in Python 3.x
  103. if (values.dtype == np.object_
  104. and sys.version_info[0] >= 3): # pragma: no cover
  105. import __builtin__
  106. if values.ndim > 1:
  107. apply_ax = axis if axis is not None else 0
  108. result = np.apply_along_axis(__builtin__.min, apply_ax, values)
  109. else:
  110. result = __builtin__.min(values)
  111. else:
  112. result = values.min(axis)
  113. return _maybe_null_out(result, axis, mask)
  114. def _nanmax(values, axis=None, skipna=True):
  115. mask = isnull(values)
  116. if skipna and not issubclass(values.dtype.type, np.integer):
  117. values = values.copy()
  118. np.putmask(values, mask, -np.inf)
  119. # numpy 1.6.1 workaround in Python 3.x
  120. if (values.dtype == np.object_
  121. and sys.version_info[0] >= 3): # pragma: no cover
  122. import __builtin__
  123. if values.ndim > 1:
  124. apply_ax = axis if axis is not None else 0
  125. result = np.apply_along_axis(__builtin__.max, apply_ax, values)
  126. else:
  127. result = __builtin__.max(values)
  128. else:
  129. result = values.max(axis)
  130. return _maybe_null_out(result, axis, mask)
  131. def nanargmax(values, axis=None, skipna=True):
  132. """
  133. Returns -1 in the NA case
  134. """
  135. mask = -np.isfinite(values)
  136. if not issubclass(values.dtype.type, np.integer):
  137. values = values.copy()
  138. np.putmask(values, mask, -np.inf)
  139. result = values.argmax(axis)
  140. result = _maybe_arg_null_out(result, axis, mask, skipna)
  141. return result
  142. def nanargmin(values, axis=None, skipna=True):
  143. """
  144. Returns -1 in the NA case
  145. """
  146. mask = -np.isfinite(values)
  147. if not issubclass(values.dtype.type, np.integer):
  148. values = values.copy()
  149. np.putmask(values, mask, np.inf)
  150. result = values.argmin(axis)
  151. result = _maybe_arg_null_out(result, axis, mask, skipna)
  152. return result
  153. nansum = _bottleneck_switch('nansum', _nansum, zero_value=0)
  154. nanmean = _bottleneck_switch('nanmean', _nanmean)
  155. nanmedian = _bottleneck_switch('nanmedian', _nanmedian)
  156. nanvar = _bottleneck_switch('nanvar', _nanvar)
  157. nanmin = _bottleneck_switch('nanmin', _nanmin)
  158. nanmax = _bottleneck_switch('nanmax', _nanmax)
  159. def nanskew(values, axis=None, skipna=True):
  160. if not isinstance(values.dtype.type, np.floating):
  161. values = values.astype('f8')
  162. mask = isnull(values)
  163. count = _get_counts(mask, axis)
  164. if skipna:
  165. values = values.copy()
  166. np.putmask(values, mask, 0)
  167. A = values.sum(axis) / count
  168. B = (values ** 2).sum(axis) / count - A ** 2
  169. C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B
  170. # floating point error
  171. B = _zero_out_fperr(B)
  172. C = _zero_out_fperr(C)
  173. result = ((np.sqrt((count ** 2 - count)) * C) /
  174. ((count - 2) * np.sqrt(B) ** 3))
  175. if isinstance(result, np.ndarray):
  176. result = np.where(B == 0, 0, result)
  177. result[count < 3] = np.nan
  178. return result
  179. else:
  180. result = 0 if B == 0 else result
  181. if count < 3:
  182. return np.nan
  183. return result
  184. def nanprod(values, axis=None, skipna=True):
  185. mask = isnull(values)
  186. if skipna and not issubclass(values.dtype.type, np.integer):
  187. values = values.copy()
  188. values[mask] = 1
  189. result = values.prod(axis)
  190. return _maybe_null_out(result, axis, mask)
  191. def _maybe_arg_null_out(result, axis, mask, skipna):
  192. # helper function for nanargmin/nanargmax
  193. if axis is None:
  194. if skipna:
  195. if mask.all():
  196. result = -1
  197. else:
  198. if mask.any():
  199. result = -1
  200. else:
  201. if skipna:
  202. na_mask = mask.all(axis)
  203. else:
  204. na_mask = mask.any(axis)
  205. if na_mask.any():
  206. result[na_mask] = -1
  207. return result
  208. def _get_counts(mask, axis):
  209. if axis is not None:
  210. count = (mask.shape[axis] - mask.sum(axis)).astype(float)
  211. else:
  212. count = float(mask.size - mask.sum())
  213. return count
  214. def _maybe_null_out(result, axis, mask):
  215. if axis is not None:
  216. null_mask = (mask.shape[axis] - mask.sum(axis)) == 0
  217. if null_mask.any():
  218. result = result.astype('f8')
  219. result[null_mask] = np.nan
  220. else:
  221. null_mask = mask.size - mask.sum()
  222. if null_mask == 0:
  223. result = np.nan
  224. return result
  225. def _zero_out_fperr(arg):
  226. if isinstance(arg, np.ndarray):
  227. return np.where(np.abs(arg) < 1e-14, 0, arg)
  228. else:
  229. return 0 if np.abs(arg) < 1e-14 else arg
  230. def nancorr(a, b, method='pearson'):
  231. """
  232. a, b: ndarrays
  233. """
  234. assert(len(a) == len(b))
  235. valid = notnull(a) & notnull(b)
  236. if not valid.all():
  237. a = a[valid]
  238. b = b[valid]
  239. if len(a) == 0:
  240. return np.nan
  241. f = get_corr_func(method)
  242. return f(a, b)
  243. def get_corr_func(method):
  244. if method in ['kendall', 'spearman']:
  245. from scipy.stats import kendalltau, spearmanr
  246. def _pearson(a, b):
  247. return np.corrcoef(a, b)[0, 1]
  248. def _kendall(a, b):
  249. return kendalltau(a, b)[0]
  250. def _spearman(a, b):
  251. return spearmanr(a, b)[0]
  252. _cor_methods = {
  253. 'pearson' : _pearson,
  254. 'kendall' : _kendall,
  255. 'spearman' : _spearman
  256. }
  257. return _cor_methods[method]
  258. def nancov(a, b):
  259. assert(len(a) == len(b))
  260. valid = notnull(a) & notnull(b)
  261. if not valid.all():
  262. a = a[valid]
  263. b = b[valid]
  264. if len(a) == 0:
  265. return np.nan
  266. return np.cov(a, b)[0, 1]
  267. def _ensure_numeric(x):
  268. if isinstance(x, np.ndarray):
  269. if x.dtype == np.object_:
  270. x = x.astype(np.float64)
  271. elif not (com.is_float(x) or com.is_integer(x)):
  272. try:
  273. x = float(x)
  274. except Exception:
  275. raise TypeError('Could not convert %s to numeric' % str(x))
  276. return x
  277. # NA-friendly array comparisons
  278. import operator
  279. def make_nancomp(op):
  280. def f(x, y):
  281. xmask = isnull(x)
  282. ymask = isnull(y)
  283. mask = xmask | ymask
  284. result = op(x, y)
  285. if mask.any():
  286. if result.dtype == np.bool_:
  287. result = result.astype('O')
  288. np.putmask(result, mask, np.nan)
  289. return result
  290. return f
  291. nangt = make_nancomp(operator.gt)
  292. nange = make_nancomp(operator.ge)
  293. nanlt = make_nancomp(operator.lt)
  294. nanle = make_nancomp(operator.le)
  295. naneq = make_nancomp(operator.eq)
  296. nanne = make_nancomp(operator.ne)
  297. def unique1d(values):
  298. """
  299. Hash table-based unique
  300. """
  301. if issubclass(values.dtype.type, np.floating):
  302. if values.dtype != np.float64:
  303. values = values.astype(np.float64)
  304. table = lib.Float64HashTable(len(values))
  305. uniques = np.array(table.unique(values), dtype=np.float64)
  306. elif issubclass(values.dtype.type, np.integer):
  307. if values.dtype != np.int64:
  308. values = values.astype(np.int64)
  309. table = lib.Int64HashTable(len(values))
  310. uniques = np.array(table.unique(values), dtype=np.int64)
  311. else:
  312. if not values.dtype == np.object_:
  313. values = values.astype(np.object_)
  314. table = lib.PyObjectHashTable(len(values))
  315. uniques = lib.list_to_object_array(table.unique(values))
  316. return uniques