PageRenderTime 60ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/core/nanops.py

https://github.com/thouis/pandas
Python | 513 lines | 377 code | 117 blank | 19 comment | 137 complexity | fb7ec7ca7852b9760d9f68279bfe3de7 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. import sys
  2. import numpy as np
  3. from pandas.core.common import isnull, notnull
  4. import pandas.core.common as com
  5. import pandas.core.config as cf
  6. import pandas.lib as lib
  7. import pandas.algos as algos
  8. import pandas.hashtable as _hash
  9. try:
  10. import bottleneck as bn
  11. _USE_BOTTLENECK = True
  12. except ImportError: # pragma: no cover
  13. _USE_BOTTLENECK = False
  14. def _bottleneck_switch(bn_name, alt, zero_value=None, **kwargs):
  15. try:
  16. bn_func = getattr(bn, bn_name)
  17. except (AttributeError, NameError): # pragma: no cover
  18. bn_func = None
  19. def f(values, axis=None, skipna=True, **kwds):
  20. if len(kwargs) > 0:
  21. for k, v in kwargs.iteritems():
  22. if k not in kwds:
  23. kwds[k] = v
  24. try:
  25. if zero_value is not None and values.size == 0:
  26. if values.ndim == 1:
  27. return 0
  28. else:
  29. result_shape = values.shape[:axis] + values.shape[axis + 1:]
  30. result = np.empty(result_shape)
  31. result.fill(0)
  32. return result
  33. if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype):
  34. result = bn_func(values, axis=axis, **kwds)
  35. # prefer to treat inf/-inf as NA
  36. if _has_infs(result):
  37. result = alt(values, axis=axis, skipna=skipna, **kwds)
  38. else:
  39. result = alt(values, axis=axis, skipna=skipna, **kwds)
  40. except Exception:
  41. result = alt(values, axis=axis, skipna=skipna, **kwds)
  42. return result
  43. return f
  44. def _bn_ok_dtype(dt):
  45. # Bottleneck chokes on datetime64
  46. return dt != np.object_ and not issubclass(dt.type, np.datetime64)
  47. def _has_infs(result):
  48. if isinstance(result, np.ndarray):
  49. if result.dtype == 'f8':
  50. return lib.has_infs_f8(result)
  51. elif result.dtype == 'f4':
  52. return lib.has_infs_f4(result)
  53. else: # pragma: no cover
  54. raise TypeError('Only suppose float32/64 here')
  55. else:
  56. return np.isinf(result) or np.isneginf(result)
  57. def nanany(values, axis=None, skipna=True):
  58. mask = isnull(values)
  59. if skipna:
  60. values = values.copy()
  61. np.putmask(values, mask, False)
  62. return values.any(axis)
  63. def nanall(values, axis=None, skipna=True):
  64. mask = isnull(values)
  65. if skipna:
  66. values = values.copy()
  67. np.putmask(values, mask, True)
  68. return values.all(axis)
  69. def _nansum(values, axis=None, skipna=True):
  70. mask = isnull(values)
  71. if skipna and not issubclass(values.dtype.type, np.integer):
  72. values = values.copy()
  73. np.putmask(values, mask, 0)
  74. the_sum = values.sum(axis)
  75. the_sum = _maybe_null_out(the_sum, axis, mask)
  76. return the_sum
  77. def _nanmean(values, axis=None, skipna=True):
  78. mask = isnull(values)
  79. if skipna and not issubclass(values.dtype.type, np.integer):
  80. values = values.copy()
  81. np.putmask(values, mask, 0)
  82. the_sum = _ensure_numeric(values.sum(axis))
  83. count = _get_counts(mask, axis)
  84. if axis is not None:
  85. the_mean = the_sum / count
  86. ct_mask = count == 0
  87. if ct_mask.any():
  88. the_mean[ct_mask] = np.nan
  89. else:
  90. the_mean = the_sum / count if count > 0 else np.nan
  91. return the_mean
  92. def _nanmedian(values, axis=None, skipna=True):
  93. def get_median(x):
  94. mask = notnull(x)
  95. if not skipna and not mask.all():
  96. return np.nan
  97. return algos.median(x[mask])
  98. if values.dtype != np.float64:
  99. values = values.astype('f8')
  100. if values.ndim > 1:
  101. return np.apply_along_axis(get_median, axis, values)
  102. else:
  103. return get_median(values)
  104. def _nanvar(values, axis=None, skipna=True, ddof=1):
  105. mask = isnull(values)
  106. if axis is not None:
  107. count = (values.shape[axis] - mask.sum(axis)).astype(float)
  108. else:
  109. count = float(values.size - mask.sum())
  110. if skipna:
  111. values = values.copy()
  112. np.putmask(values, mask, 0)
  113. X = _ensure_numeric(values.sum(axis))
  114. XX = _ensure_numeric((values ** 2).sum(axis))
  115. return np.fabs((XX - X ** 2 / count) / (count - ddof))
  116. def _nanmin(values, axis=None, skipna=True):
  117. mask = isnull(values)
  118. dtype = values.dtype
  119. if skipna and not issubclass(dtype.type,
  120. (np.integer, np.datetime64)):
  121. values = values.copy()
  122. np.putmask(values, mask, np.inf)
  123. if issubclass(dtype.type, np.datetime64):
  124. values = values.view(np.int64)
  125. # numpy 1.6.1 workaround in Python 3.x
  126. if (values.dtype == np.object_
  127. and sys.version_info[0] >= 3): # pragma: no cover
  128. import __builtin__
  129. if values.ndim > 1:
  130. apply_ax = axis if axis is not None else 0
  131. result = np.apply_along_axis(__builtin__.min, apply_ax, values)
  132. else:
  133. result = __builtin__.min(values)
  134. else:
  135. if ((axis is not None and values.shape[axis] == 0)
  136. or values.size == 0):
  137. result = values.sum(axis)
  138. result.fill(np.nan)
  139. else:
  140. result = values.min(axis)
  141. if issubclass(dtype.type, np.datetime64):
  142. if not isinstance(result, np.ndarray):
  143. result = lib.Timestamp(result)
  144. else:
  145. result = result.view(dtype)
  146. return _maybe_null_out(result, axis, mask)
  147. def _nanmax(values, axis=None, skipna=True):
  148. mask = isnull(values)
  149. dtype = values.dtype
  150. if skipna and not issubclass(dtype.type, (np.integer, np.datetime64)):
  151. values = values.copy()
  152. np.putmask(values, mask, -np.inf)
  153. if issubclass(dtype.type, np.datetime64):
  154. values = values.view(np.int64)
  155. # numpy 1.6.1 workaround in Python 3.x
  156. if (values.dtype == np.object_
  157. and sys.version_info[0] >= 3): # pragma: no cover
  158. import __builtin__
  159. if values.ndim > 1:
  160. apply_ax = axis if axis is not None else 0
  161. result = np.apply_along_axis(__builtin__.max, apply_ax, values)
  162. else:
  163. result = __builtin__.max(values)
  164. else:
  165. if ((axis is not None and values.shape[axis] == 0)
  166. or values.size == 0):
  167. result = values.sum(axis)
  168. result.fill(np.nan)
  169. else:
  170. result = values.max(axis)
  171. if issubclass(dtype.type, np.datetime64):
  172. if not isinstance(result, np.ndarray):
  173. result = lib.Timestamp(result)
  174. else:
  175. result = result.view(dtype)
  176. return _maybe_null_out(result, axis, mask)
  177. def nanargmax(values, axis=None, skipna=True):
  178. """
  179. Returns -1 in the NA case
  180. """
  181. mask = -np.isfinite(values)
  182. if not issubclass(values.dtype.type, np.integer):
  183. values = values.copy()
  184. np.putmask(values, mask, -np.inf)
  185. result = values.argmax(axis)
  186. result = _maybe_arg_null_out(result, axis, mask, skipna)
  187. return result
  188. def nanargmin(values, axis=None, skipna=True):
  189. """
  190. Returns -1 in the NA case
  191. """
  192. mask = -np.isfinite(values)
  193. if not issubclass(values.dtype.type, np.integer):
  194. values = values.copy()
  195. np.putmask(values, mask, np.inf)
  196. result = values.argmin(axis)
  197. result = _maybe_arg_null_out(result, axis, mask, skipna)
  198. return result
# Public reductions. Each name is the wrapper returned by
# ``_bottleneck_switch``: the first argument is the bottleneck function name,
# the second is the numpy-based fallback defined in this module.
# ``nansum`` passes ``zero_value=0`` to enable the empty-input shortcut;
# ``nanvar`` passes ``ddof=1`` as a default keyword argument.
nansum = _bottleneck_switch('nansum', _nansum, zero_value=0)
nanmean = _bottleneck_switch('nanmean', _nanmean)
nanmedian = _bottleneck_switch('nanmedian', _nanmedian)
nanvar = _bottleneck_switch('nanvar', _nanvar, ddof=1)
nanmin = _bottleneck_switch('nanmin', _nanmin)
nanmax = _bottleneck_switch('nanmax', _nanmax)
  205. def nanskew(values, axis=None, skipna=True):
  206. if not isinstance(values.dtype.type, np.floating):
  207. values = values.astype('f8')
  208. mask = isnull(values)
  209. count = _get_counts(mask, axis)
  210. if skipna:
  211. values = values.copy()
  212. np.putmask(values, mask, 0)
  213. A = values.sum(axis) / count
  214. B = (values ** 2).sum(axis) / count - A ** 2
  215. C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B
  216. # floating point error
  217. B = _zero_out_fperr(B)
  218. C = _zero_out_fperr(C)
  219. result = ((np.sqrt((count ** 2 - count)) * C) /
  220. ((count - 2) * np.sqrt(B) ** 3))
  221. if isinstance(result, np.ndarray):
  222. result = np.where(B == 0, 0, result)
  223. result[count < 3] = np.nan
  224. return result
  225. else:
  226. result = 0 if B == 0 else result
  227. if count < 3:
  228. return np.nan
  229. return result
  230. def nankurt(values, axis=None, skipna=True):
  231. if not isinstance(values.dtype.type, np.floating):
  232. values = values.astype('f8')
  233. mask = isnull(values)
  234. count = _get_counts(mask, axis)
  235. if skipna:
  236. values = values.copy()
  237. np.putmask(values, mask, 0)
  238. A = values.sum(axis) / count
  239. B = (values ** 2).sum(axis) / count - A ** 2
  240. C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B
  241. D = (values ** 4).sum(axis) / count - A ** 4 - 6 * B * A * A - 4 * C * A
  242. B = _zero_out_fperr(B)
  243. C = _zero_out_fperr(C)
  244. D = _zero_out_fperr(D)
  245. result = (((count * count - 1.) * D / (B * B) - 3 * ((count - 1.) ** 2)) /
  246. ((count - 2.) * (count - 3.)))
  247. if isinstance(result, np.ndarray):
  248. result = np.where(B == 0, 0, result)
  249. result[count < 4] = np.nan
  250. return result
  251. else:
  252. result = 0 if B == 0 else result
  253. if count < 4:
  254. return np.nan
  255. return result
  256. def nanprod(values, axis=None, skipna=True):
  257. mask = isnull(values)
  258. if skipna and not issubclass(values.dtype.type, np.integer):
  259. values = values.copy()
  260. values[mask] = 1
  261. result = values.prod(axis)
  262. return _maybe_null_out(result, axis, mask)
  263. def _maybe_arg_null_out(result, axis, mask, skipna):
  264. # helper function for nanargmin/nanargmax
  265. if axis is None:
  266. if skipna:
  267. if mask.all():
  268. result = -1
  269. else:
  270. if mask.any():
  271. result = -1
  272. else:
  273. if skipna:
  274. na_mask = mask.all(axis)
  275. else:
  276. na_mask = mask.any(axis)
  277. if na_mask.any():
  278. result[na_mask] = -1
  279. return result
  280. def _get_counts(mask, axis):
  281. if axis is not None:
  282. count = (mask.shape[axis] - mask.sum(axis)).astype(float)
  283. else:
  284. count = float(mask.size - mask.sum())
  285. return count
  286. def _maybe_null_out(result, axis, mask):
  287. if axis is not None:
  288. null_mask = (mask.shape[axis] - mask.sum(axis)) == 0
  289. if null_mask.any():
  290. result = result.astype('f8')
  291. result[null_mask] = np.nan
  292. else:
  293. null_mask = mask.size - mask.sum()
  294. if null_mask == 0:
  295. result = np.nan
  296. return result
  297. def _zero_out_fperr(arg):
  298. if isinstance(arg, np.ndarray):
  299. return np.where(np.abs(arg) < 1e-14, 0, arg)
  300. else:
  301. return 0 if np.abs(arg) < 1e-14 else arg
  302. def nancorr(a, b, method='pearson', min_periods=None):
  303. """
  304. a, b: ndarrays
  305. """
  306. if len(a) != len(b):
  307. raise AssertionError('Operands to nancorr must have same size')
  308. if min_periods is None:
  309. min_periods = 1
  310. valid = notnull(a) & notnull(b)
  311. if not valid.all():
  312. a = a[valid]
  313. b = b[valid]
  314. if len(a) < min_periods:
  315. return np.nan
  316. f = get_corr_func(method)
  317. return f(a, b)
  318. def get_corr_func(method):
  319. if method in ['kendall', 'spearman']:
  320. from scipy.stats import kendalltau, spearmanr
  321. def _pearson(a, b):
  322. return np.corrcoef(a, b)[0, 1]
  323. def _kendall(a, b):
  324. rs = kendalltau(a, b)
  325. if isinstance(rs, tuple):
  326. return rs[0]
  327. return rs
  328. def _spearman(a, b):
  329. return spearmanr(a, b)[0]
  330. _cor_methods = {
  331. 'pearson': _pearson,
  332. 'kendall': _kendall,
  333. 'spearman': _spearman
  334. }
  335. return _cor_methods[method]
  336. def nancov(a, b, min_periods=None):
  337. if len(a) != len(b):
  338. raise AssertionError('Operands to nancov must have same size')
  339. if min_periods is None:
  340. min_periods = 1
  341. valid = notnull(a) & notnull(b)
  342. if not valid.all():
  343. a = a[valid]
  344. b = b[valid]
  345. if len(a) < min_periods:
  346. return np.nan
  347. return np.cov(a, b)[0, 1]
  348. def _ensure_numeric(x):
  349. if isinstance(x, np.ndarray):
  350. if x.dtype == np.object_:
  351. x = x.astype(np.float64)
  352. elif not (com.is_float(x) or com.is_integer(x)):
  353. try:
  354. x = float(x)
  355. except Exception:
  356. raise TypeError('Could not convert %s to numeric' % str(x))
  357. return x
  358. # NA-friendly array comparisons
  359. import operator
  360. def make_nancomp(op):
  361. def f(x, y):
  362. xmask = isnull(x)
  363. ymask = isnull(y)
  364. mask = xmask | ymask
  365. result = op(x, y)
  366. if mask.any():
  367. if result.dtype == np.bool_:
  368. result = result.astype('O')
  369. np.putmask(result, mask, np.nan)
  370. return result
  371. return f
# NA-propagating versions of the six rich comparisons, each built by
# wrapping the matching ``operator`` function with ``make_nancomp``.
nangt = make_nancomp(operator.gt)
nange = make_nancomp(operator.ge)
nanlt = make_nancomp(operator.lt)
nanle = make_nancomp(operator.le)
naneq = make_nancomp(operator.eq)
nanne = make_nancomp(operator.ne)
  378. def unique1d(values):
  379. """
  380. Hash table-based unique
  381. """
  382. if np.issubdtype(values.dtype, np.floating):
  383. table = _hash.Float64HashTable(len(values))
  384. uniques = np.array(table.unique(com._ensure_float64(values)),
  385. dtype=np.float64)
  386. elif np.issubdtype(values.dtype, np.datetime64):
  387. table = _hash.Int64HashTable(len(values))
  388. uniques = table.unique(com._ensure_int64(values))
  389. uniques = uniques.view('M8[ns]')
  390. elif np.issubdtype(values.dtype, np.integer):
  391. table = _hash.Int64HashTable(len(values))
  392. uniques = table.unique(com._ensure_int64(values))
  393. else:
  394. table = _hash.PyObjectHashTable(len(values))
  395. uniques = table.unique(com._ensure_object(values))
  396. return uniques