PageRenderTime 26ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/core/util/hashing.py

https://github.com/neurodebian/pandas
Python | 332 lines | 253 code | 20 blank | 59 comment | 16 complexity | c10c579bd336cb56d22198007062dcad MD5 | raw file
  1. """
  2. data hash pandas / numpy objects
  3. """
  4. import itertools
  5. import numpy as np
  6. from pandas._libs import hashing, tslib
  7. from pandas.core.dtypes.generic import (
  8. ABCMultiIndex,
  9. ABCIndexClass,
  10. ABCSeries,
  11. ABCDataFrame)
  12. from pandas.core.dtypes.common import (
  13. is_categorical_dtype, is_list_like)
  14. from pandas.core.dtypes.missing import isna
  15. from pandas.core.dtypes.cast import infer_dtype_from_scalar
# 16 byte long hashing key.
# Used as the fallback key by the hashing functions in this module whenever
# a caller passes ``hash_key=None``.
_default_hash_key = '0123456789123456'
  18. def _combine_hash_arrays(arrays, num_items):
  19. """
  20. Parameters
  21. ----------
  22. arrays : generator
  23. num_items : int
  24. Should be the same as CPython's tupleobject.c
  25. """
  26. try:
  27. first = next(arrays)
  28. except StopIteration:
  29. return np.array([], dtype=np.uint64)
  30. arrays = itertools.chain([first], arrays)
  31. mult = np.uint64(1000003)
  32. out = np.zeros_like(first) + np.uint64(0x345678)
  33. for i, a in enumerate(arrays):
  34. inverse_i = num_items - i
  35. out ^= a
  36. out *= mult
  37. mult += np.uint64(82520 + inverse_i + inverse_i)
  38. assert i + 1 == num_items, 'Fed in wrong num_items'
  39. out += np.uint64(97531)
  40. return out
  41. def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
  42. categorize=True):
  43. """
  44. Return a data hash of the Index/Series/DataFrame
  45. .. versionadded:: 0.19.2
  46. Parameters
  47. ----------
  48. index : boolean, default True
  49. include the index in the hash (if Series/DataFrame)
  50. encoding : string, default 'utf8'
  51. encoding for data & key when strings
  52. hash_key : string key to encode, default to _default_hash_key
  53. categorize : bool, default True
  54. Whether to first categorize object arrays before hashing. This is more
  55. efficient when the array contains duplicate values.
  56. .. versionadded:: 0.20.0
  57. Returns
  58. -------
  59. Series of uint64, same length as the object
  60. """
  61. from pandas import Series
  62. if hash_key is None:
  63. hash_key = _default_hash_key
  64. if isinstance(obj, ABCMultiIndex):
  65. return Series(hash_tuples(obj, encoding, hash_key),
  66. dtype='uint64', copy=False)
  67. if isinstance(obj, ABCIndexClass):
  68. h = hash_array(obj.values, encoding, hash_key,
  69. categorize).astype('uint64', copy=False)
  70. h = Series(h, index=obj, dtype='uint64', copy=False)
  71. elif isinstance(obj, ABCSeries):
  72. h = hash_array(obj.values, encoding, hash_key,
  73. categorize).astype('uint64', copy=False)
  74. if index:
  75. index_iter = (hash_pandas_object(obj.index,
  76. index=False,
  77. encoding=encoding,
  78. hash_key=hash_key,
  79. categorize=categorize).values
  80. for _ in [None])
  81. arrays = itertools.chain([h], index_iter)
  82. h = _combine_hash_arrays(arrays, 2)
  83. h = Series(h, index=obj.index, dtype='uint64', copy=False)
  84. elif isinstance(obj, ABCDataFrame):
  85. hashes = (hash_array(series.values) for _, series in obj.iteritems())
  86. num_items = len(obj.columns)
  87. if index:
  88. index_hash_generator = (hash_pandas_object(obj.index,
  89. index=False,
  90. encoding=encoding,
  91. hash_key=hash_key,
  92. categorize=categorize).values # noqa
  93. for _ in [None])
  94. num_items += 1
  95. hashes = itertools.chain(hashes, index_hash_generator)
  96. h = _combine_hash_arrays(hashes, num_items)
  97. h = Series(h, index=obj.index, dtype='uint64', copy=False)
  98. else:
  99. raise TypeError("Unexpected type for hashing %s" % type(obj))
  100. return h
  101. def hash_tuples(vals, encoding='utf8', hash_key=None):
  102. """
  103. Hash an MultiIndex / list-of-tuples efficiently
  104. .. versionadded:: 0.20.0
  105. Parameters
  106. ----------
  107. vals : MultiIndex, list-of-tuples, or single tuple
  108. encoding : string, default 'utf8'
  109. hash_key : string key to encode, default to _default_hash_key
  110. Returns
  111. -------
  112. ndarray of hashed values array
  113. """
  114. is_tuple = False
  115. if isinstance(vals, tuple):
  116. vals = [vals]
  117. is_tuple = True
  118. elif not is_list_like(vals):
  119. raise TypeError("must be convertible to a list-of-tuples")
  120. from pandas import Categorical, MultiIndex
  121. if not isinstance(vals, ABCMultiIndex):
  122. vals = MultiIndex.from_tuples(vals)
  123. # create a list-of-Categoricals
  124. vals = [Categorical(vals.labels[level],
  125. vals.levels[level],
  126. ordered=False,
  127. fastpath=True)
  128. for level in range(vals.nlevels)]
  129. # hash the list-of-ndarrays
  130. hashes = (_hash_categorical(cat,
  131. encoding=encoding,
  132. hash_key=hash_key)
  133. for cat in vals)
  134. h = _combine_hash_arrays(hashes, len(vals))
  135. if is_tuple:
  136. h = h[0]
  137. return h
  138. def hash_tuple(val, encoding='utf8', hash_key=None):
  139. """
  140. Hash a single tuple efficiently
  141. Parameters
  142. ----------
  143. val : single tuple
  144. encoding : string, default 'utf8'
  145. hash_key : string key to encode, default to _default_hash_key
  146. Returns
  147. -------
  148. hash
  149. """
  150. hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key)
  151. for v in val)
  152. h = _combine_hash_arrays(hashes, len(val))[0]
  153. return h
  154. def _hash_categorical(c, encoding, hash_key):
  155. """
  156. Hash a Categorical by hashing its categories, and then mapping the codes
  157. to the hashes
  158. Parameters
  159. ----------
  160. c : Categorical
  161. encoding : string, default 'utf8'
  162. hash_key : string key to encode, default to _default_hash_key
  163. Returns
  164. -------
  165. ndarray of hashed values array, same size as len(c)
  166. """
  167. hashed = hash_array(c.categories.values, encoding, hash_key,
  168. categorize=False)
  169. # we have uint64, as we don't directly support missing values
  170. # we don't want to use take_nd which will coerce to float
  171. # instead, directly construt the result with a
  172. # max(np.uint64) as the missing value indicator
  173. #
  174. # TODO: GH 15362
  175. mask = c.isna()
  176. if len(hashed):
  177. result = hashed.take(c.codes)
  178. else:
  179. result = np.zeros(len(mask), dtype='uint64')
  180. if mask.any():
  181. result[mask] = np.iinfo(np.uint64).max
  182. return result
  183. def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
  184. """
  185. Given a 1d array, return an array of deterministic integers.
  186. .. versionadded:: 0.19.2
  187. Parameters
  188. ----------
  189. vals : ndarray, Categorical
  190. encoding : string, default 'utf8'
  191. encoding for data & key when strings
  192. hash_key : string key to encode, default to _default_hash_key
  193. categorize : bool, default True
  194. Whether to first categorize object arrays before hashing. This is more
  195. efficient when the array contains duplicate values.
  196. .. versionadded:: 0.20.0
  197. Returns
  198. -------
  199. 1d uint64 numpy array of hash values, same length as the vals
  200. """
  201. if not hasattr(vals, 'dtype'):
  202. raise TypeError("must pass a ndarray-like")
  203. dtype = vals.dtype
  204. if hash_key is None:
  205. hash_key = _default_hash_key
  206. # For categoricals, we hash the categories, then remap the codes to the
  207. # hash values. (This check is above the complex check so that we don't ask
  208. # numpy if categorical is a subdtype of complex, as it will choke).
  209. if is_categorical_dtype(dtype):
  210. return _hash_categorical(vals, encoding, hash_key)
  211. # we'll be working with everything as 64-bit values, so handle this
  212. # 128-bit value early
  213. elif np.issubdtype(dtype, np.complex128):
  214. return hash_array(vals.real) + 23 * hash_array(vals.imag)
  215. # First, turn whatever array this is into unsigned 64-bit ints, if we can
  216. # manage it.
  217. elif isinstance(dtype, np.bool):
  218. vals = vals.astype('u8')
  219. elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
  220. vals = vals.view('i8').astype('u8', copy=False)
  221. elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
  222. vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
  223. else:
  224. # With repeated values, its MUCH faster to categorize object dtypes,
  225. # then hash and rename categories. We allow skipping the categorization
  226. # when the values are known/likely to be unique.
  227. if categorize:
  228. from pandas import factorize, Categorical, Index
  229. codes, categories = factorize(vals, sort=False)
  230. cat = Categorical(codes, Index(categories),
  231. ordered=False, fastpath=True)
  232. return _hash_categorical(cat, encoding, hash_key)
  233. try:
  234. vals = hashing.hash_object_array(vals, hash_key, encoding)
  235. except TypeError:
  236. # we have mixed types
  237. vals = hashing.hash_object_array(vals.astype(str).astype(object),
  238. hash_key, encoding)
  239. # Then, redistribute these 64-bit ints within the space of 64-bit ints
  240. vals ^= vals >> 30
  241. vals *= np.uint64(0xbf58476d1ce4e5b9)
  242. vals ^= vals >> 27
  243. vals *= np.uint64(0x94d049bb133111eb)
  244. vals ^= vals >> 31
  245. return vals
  246. def _hash_scalar(val, encoding='utf8', hash_key=None):
  247. """
  248. Hash scalar value
  249. Returns
  250. -------
  251. 1d uint64 numpy array of hash value, of length 1
  252. """
  253. if isna(val):
  254. # this is to be consistent with the _hash_categorical implementation
  255. return np.array([np.iinfo(np.uint64).max], dtype='u8')
  256. if getattr(val, 'tzinfo', None) is not None:
  257. # for tz-aware datetimes, we need the underlying naive UTC value and
  258. # not the tz aware object or pd extension type (as
  259. # infer_dtype_from_scalar would do)
  260. if not isinstance(val, tslib.Timestamp):
  261. val = tslib.Timestamp(val)
  262. val = val.tz_convert(None)
  263. dtype, val = infer_dtype_from_scalar(val)
  264. vals = np.array([val], dtype=dtype)
  265. return hash_array(vals, hash_key=hash_key, encoding=encoding,
  266. categorize=False)