PageRenderTime 43ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/indexes/category.py

http://github.com/wesm/pandas
Python | 653 lines | 496 code | 50 blank | 107 comment | 45 complexity | ecc44f00cd26cc0b53c35497e635fbe8 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
  1. import numpy as np
  2. import pandas.index as _index
  3. from pandas import compat
  4. from pandas.compat.numpy import function as nv
  5. from pandas.types.generic import ABCCategorical, ABCSeries
  6. from pandas.types.common import (is_categorical_dtype,
  7. _ensure_platform_int,
  8. is_list_like,
  9. is_scalar)
  10. from pandas.types.missing import array_equivalent
  11. from pandas.util.decorators import (Appender, cache_readonly,
  12. deprecate_kwarg)
  13. from pandas.core.config import get_option
  14. from pandas.indexes.base import Index, _index_shared_docs
  15. import pandas.core.base as base
  16. import pandas.core.missing as missing
  17. import pandas.indexes.base as ibase
  class CategoricalIndex(Index, base.PandasDelegate):
      """
      Immutable Index implementing an ordered, sliceable set. CategoricalIndex
      represents a sparsely populated Index with an underlying Categorical.

      .. versionadded:: 0.16.1

      Parameters
      ----------
      data : array-like or Categorical, (1-dimensional)
      categories : optional, array-like
          categories for the CategoricalIndex
      ordered : boolean,
          designating if the categories are ordered
      copy : bool
          Make a copy of input ndarray
      name : object
          Name to be stored in the index
      """

      _typ = 'categoricalindex'
      _engine_type = _index.Int64Engine
      _attributes = ['name']

      def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
                  copy=False, name=None, fastpath=False, **kwargs):
          # ``dtype`` and ``**kwargs`` are accepted for signature
          # compatibility but are not consulted anywhere in this body.

          if fastpath:
              return cls._simple_new(data, name=name)

          # inherit the name of the incoming object when none was given
          if name is None and hasattr(data, 'name'):
              name = data.name

          if not isinstance(data, ABCCategorical):
              if isinstance(data, CategoricalIndex):
                  # unwrap to the backing Categorical
                  data = data._data
              elif is_scalar(data):
                  # don't allow scalars
                  # if data is None, then categories must be provided
                  if data is not None or categories is None:
                      cls._scalar_data_error(data)
                  data = []

          data = cls._create_categorical(cls, data, categories, ordered)

          if copy:
              data = data.copy()

          return cls._simple_new(data, name=name)

      def _create_from_codes(self, codes, categories=None, ordered=None,
                             name=None):
          """
          *this is an internal non-public method*

          create the correct categorical from codes

          Parameters
          ----------
          codes : new codes
          categories : optional categories, defaults to existing
          ordered : optional ordered attribute, defaults to existing
          name : optional name attribute, defaults to existing

          Returns
          -------
          CategoricalIndex
          """

          from pandas.core.categorical import Categorical
          if categories is None:
              categories = self.categories
          if ordered is None:
              ordered = self.ordered
          if name is None:
              name = self.name
          # pass the resolved ``ordered`` (not unconditionally self.ordered)
          # so that an explicitly supplied value is honoured, as documented
          cat = Categorical.from_codes(codes, categories=categories,
                                       ordered=ordered)
          return CategoricalIndex(cat, name=name)

      @staticmethod
      def _create_categorical(self, data, categories=None, ordered=None):
          """
          *this is an internal non-public method*

          create the correct categorical from data and the properties

          This is a staticmethod whose first positional parameter happens to
          be named ``self``; callers pass the class or instance explicitly
          and the body does not use it.

          Parameters
          ----------
          data : data for new Categorical
          categories : optional categories, defaults to existing
          ordered : optional ordered attribute, defaults to existing

          Returns
          -------
          Categorical
          """
          if not isinstance(data, ABCCategorical):
              from pandas.core.categorical import Categorical
              data = Categorical(data, categories=categories, ordered=ordered)
          else:
              # already a Categorical: only adjust what was explicitly given
              if categories is not None:
                  data = data.set_categories(categories)
              if ordered is not None:
                  data = data.set_ordered(ordered)
          return data

      @classmethod
      def _simple_new(cls, values, name=None, categories=None, ordered=None,
                      **kwargs):
          """
          Build an instance directly around ``values`` without going through
          ``__new__``; extra keyword arguments are set as attributes on the
          result.
          """
          result = object.__new__(cls)

          values = cls._create_categorical(cls, values, categories, ordered)
          result._data = values
          result.name = name
          for k, v in compat.iteritems(kwargs):
              setattr(result, k, v)

          result._reset_identity()
          return result

      @Appender(_index_shared_docs['_shallow_copy'])
      def _shallow_copy(self, values=None, categories=None, ordered=None,
                        **kwargs):
          # categories and ordered can't be part of attributes,
          # as these are properties
          if categories is None:
              categories = self.categories
          if ordered is None:
              ordered = self.ordered
          return super(CategoricalIndex,
                       self)._shallow_copy(values=values, categories=categories,
                                           ordered=ordered, **kwargs)

      def _is_dtype_compat(self, other):
          """
          *this is an internal non-public method*

          provide a comparison between the dtype of self and other (coercing if
          needed)

          Returns
          -------
          other, as the underlying values of a CategoricalIndex when it was
          already categorical, otherwise coerced to a CategoricalIndex with
          the categories and ordered attribute of self

          Raises
          ------
          TypeError if the dtypes are not compatible
          """
          if is_categorical_dtype(other):
              if isinstance(other, CategoricalIndex):
                  other = other._values
              if not other.is_dtype_equal(self):
                  raise TypeError("categories must match existing categories "
                                  "when appending")
          else:
              values = other
              if not is_list_like(values):
                  values = [values]
              other = CategoricalIndex(self._create_categorical(
                  self, other, categories=self.categories,
                  ordered=self.ordered))
              # every original value must have survived the coercion
              if not other.isin(values).all():
                  raise TypeError("cannot append a non-category item to a "
                                  "CategoricalIndex")

          return other

      def equals(self, other):
          """
          Determines if two CategoricalIndex objects contain the same elements.

          Returns False (rather than raising) when ``other`` cannot be made
          dtype-compatible with self.
          """
          if self.is_(other):
              return True

          try:
              other = self._is_dtype_compat(other)
              return array_equivalent(self._data, other)
          except (TypeError, ValueError):
              # incompatible objects are simply "not equal"
              pass

          return False

      @property
      def _formatter_func(self):
          """ formatting is delegated to the categories """
          return self.categories._formatter_func

      def _format_attrs(self):
          """
          Return a list of tuples of the (attr,formatted_value)
          """
          # an option value of 0 falls back to showing 10 categories
          max_categories = get_option("display.max_categories")
          if max_categories == 0:
              max_categories = 10
          attrs = [
              ('categories',
               ibase.default_pprint(self.categories,
                                    max_seq_items=max_categories)),
              ('ordered', self.ordered)]
          if self.name is not None:
              attrs.append(('name', ibase.default_pprint(self.name)))
          attrs.append(('dtype', "'%s'" % self.dtype))
          # only report the length when the index is longer than the
          # display limit
          max_seq_items = get_option('display.max_seq_items') or len(self)
          if len(self) > max_seq_items:
              attrs.append(('length', len(self)))
          return attrs

      @property
      def inferred_type(self):
          """ always 'categorical' """
          return 'categorical'

      @property
      def values(self):
          """ return the underlying data, which is a Categorical """
          return self._data

      def get_values(self):
          """ return the underlying data as an ndarray """
          return self._data.get_values()

      @property
      def codes(self):
          """ the codes of the underlying Categorical """
          return self._data.codes

      @property
      def categories(self):
          """ the categories of the underlying Categorical """
          return self._data.categories

      @property
      def ordered(self):
          """ the ordered attribute of the underlying Categorical """
          return self._data.ordered

      def __contains__(self, key):
          # hash() is called for its side effect: it raises on unhashable keys
          hash(key)
          return key in self.values

      def __array__(self, dtype=None):
          """ the array interface, return my values """
          return np.array(self._data, dtype=dtype)

      @cache_readonly
      def _isnan(self):
          """ return if each value is nan (a code of -1) """
          return self._data.codes == -1

      @Appender(ibase._index_shared_docs['fillna'])
      def fillna(self, value, downcast=None):
          # ``downcast`` is accepted but not used in this body
          self._assert_can_do_op(value)
          return CategoricalIndex(self._data.fillna(value), name=self.name)

      def argsort(self, *args, **kwargs):
          """ delegate to the argsort of the underlying Categorical """
          return self.values.argsort(*args, **kwargs)

      @cache_readonly
      def _engine(self):
          # we are going to look things up with the codes themselves
          return self._engine_type(lambda: self.codes.astype('i8'), len(self))

      @cache_readonly
      def is_unique(self):
          """ True when no value is duplicated """
          return not self.duplicated().any()

      @deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
                                                     False: 'first'})
      @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs)
      def duplicated(self, keep='first'):
          # duplicates are detected on the integer codes
          from pandas.hashtable import duplicated_int64
          codes = self.codes.astype('i8')
          return duplicated_int64(codes, keep)

      def _to_safe_for_reshape(self):
          """ convert to object if we are a categorical """
          return self.astype('object')

      def get_loc(self, key, method=None):
          """
          Get integer location for requested label

          Parameters
          ----------
          key : label
          method : {None}
              * default: exact matches only.
              (not consulted in this body)

          Returns
          -------
          loc : int if unique index, possibly slice or mask if not

          Raises
          ------
          KeyError if the category lookup yields -1
          """
          # translate the label into its category code, then look the code up
          codes = self.categories.get_loc(key)
          if (codes == -1):
              raise KeyError(key)
          return self._engine.get_loc(codes)

      def _can_reindex(self, indexer):
          """ always allow reindexing """
          pass

      def where(self, cond, other=None):
          """
          .. versionadded:: 0.19.0

          Return an Index of same shape as self and whose corresponding
          entries are from self where cond is True and otherwise are from
          other.

          Parameters
          ----------
          cond : boolean same length as self
          other : scalar, or array-like
              defaults to the NA value of this index
          """
          if other is None:
              other = self._na_value
          values = np.where(cond, self.values, other)

          # rebuild with the existing categories / ordered attribute
          from pandas.core.categorical import Categorical
          cat = Categorical(values,
                            categories=self.categories,
                            ordered=self.ordered)
          return self._shallow_copy(cat, **self._get_attributes_dict())

      def reindex(self, target, method=None, level=None, limit=None,
                  tolerance=None):
          """
          Create index with target's values (move/add/delete values as necessary)

          ``method``, ``level`` and ``limit`` must be None; ``tolerance`` is
          accepted but not consulted in this body.

          Returns
          -------
          new_index : pd.Index
              Resulting index
          indexer : np.ndarray or None
              Indices of output values in original index

          Raises
          ------
          NotImplementedError if method, level or limit is given
          ValueError if target is non-categorical and not unique
          """

          if method is not None:
              raise NotImplementedError("argument method is not implemented for "
                                        "CategoricalIndex.reindex")
          if level is not None:
              raise NotImplementedError("argument level is not implemented for "
                                        "CategoricalIndex.reindex")
          if limit is not None:
              raise NotImplementedError("argument limit is not implemented for "
                                        "CategoricalIndex.reindex")

          target = ibase._ensure_index(target)

          if not is_categorical_dtype(target) and not target.is_unique:
              raise ValueError("cannot reindex with a non-unique indexer")

          # named ``missing_locs`` so the module-level ``missing`` import is
          # not shadowed inside this method
          indexer, missing_locs = self.get_indexer_non_unique(np.array(target))
          new_target = self.take(indexer)

          # filling in missing if needed
          if len(missing_locs):
              cats = self.categories.get_indexer(target)

              if (cats == -1).any():
                  # coerce to a regular index here!
                  result = Index(np.array(self), name=self.name)
                  new_target, indexer, _ = result._reindex_non_unique(
                      np.array(target))

              else:
                  # every missing label is a known category: patch its code in
                  codes = new_target.codes.copy()
                  codes[indexer == -1] = cats[missing_locs]
                  new_target = self._create_from_codes(codes)

          # we always want to return an Index type here
          # to be consistent with .reindex for other index types (e.g. they don't
          # coerce based on the actual values, only on the dtype)
          # unless we had an initial Categorical to begin with
          # in which case we are going to conform to the passed Categorical
          new_target = np.asarray(new_target)
          if is_categorical_dtype(target):
              new_target = target._shallow_copy(new_target, name=self.name)
          else:
              new_target = Index(new_target, name=self.name)

          return new_target, indexer

      def _reindex_non_unique(self, target):
          """ reindex from a non-unique; which CategoricalIndex's are almost
          always

          Returns
          -------
          (new_target, indexer, new_indexer) where new_indexer is None unless
          some entry of indexer is -1
          """
          new_target, indexer = self.reindex(target)
          new_indexer = None

          check = indexer == -1
          if check.any():
              new_indexer = np.arange(len(self.take(indexer)))
              new_indexer[check] = -1

          cats = self.categories.get_indexer(target)
          if not (cats == -1).any():
              # .reindex returns normal Index. Revert to CategoricalIndex if
              # all targets are included in my categories
              new_target = self._shallow_copy(new_target)

          return new_target, indexer, new_indexer

      def get_indexer(self, target, method=None, limit=None, tolerance=None):
          """
          Compute indexer for new index given the current index. The
          indexer should be then used as an input to ndarray.take to align the
          current data to the new index.

          Parameters
          ----------
          target : Index or array-like
          method : {None}
              'pad' / 'ffill', 'backfill' / 'bfill' and 'nearest' are not
              implemented for CategoricalIndex
          limit, tolerance : accepted but not consulted in this body

          Notes
          -----
          This is a low-level method and probably should be used at your own risk

          Examples
          --------
          >>> indexer = index.get_indexer(new_index)
          >>> new_values = cur_values.take(indexer)

          Returns
          -------
          indexer : ndarray
              result of the engine lookup on the category codes of target,
              converted to the platform integer type

          Raises
          ------
          NotImplementedError if a fill method is requested
          """
          method = missing.clean_reindex_fill_method(method)
          target = ibase._ensure_index(target)

          # NOTE(review): a CategoricalIndex target is replaced by its
          # *categories*, not its values, so the lookup below is per category
          # rather than per element -- confirm this is intended
          if isinstance(target, CategoricalIndex):
              target = target.categories

          if method == 'pad' or method == 'backfill':
              raise NotImplementedError("method='pad' and method='backfill' not "
                                        "implemented yet for CategoricalIndex")
          elif method == 'nearest':
              raise NotImplementedError("method='nearest' not implemented yet "
                                        'for CategoricalIndex')
          else:

              codes = self.categories.get_indexer(target)
              indexer, _ = self._engine.get_indexer_non_unique(codes)

          return _ensure_platform_int(indexer)

      def get_indexer_non_unique(self, target):
          """ this is the same for a CategoricalIndex for get_indexer; the API
          returns the missing values as well
          """
          target = ibase._ensure_index(target)

          # NOTE(review): same categories-for-values substitution as in
          # get_indexer -- confirm this is intended
          if isinstance(target, CategoricalIndex):
              target = target.categories

          codes = self.categories.get_indexer(target)
          return self._engine.get_indexer_non_unique(codes)

      def _convert_list_indexer(self, keyarr, kind=None):
          """
          we are passed a list indexer.
          Return our indexer or raise if all of the values are not included in
          the categories

          Returns None when every key is a known category.

          Raises
          ------
          KeyError if any key is not in the categories
          """
          codes = self.categories.get_indexer(keyarr)
          if (codes == -1).any():
              raise KeyError("a list-indexer must only include values that are "
                             "in the categories")

          return None

      @Appender(_index_shared_docs['take'])
      def take(self, indices, axis=0, allow_fill=True,
               fill_value=None, **kwargs):
          nv.validate_take(tuple(), kwargs)
          indices = _ensure_platform_int(indices)
          # take on the codes; -1 is the code used for filled positions
          taken = self._assert_take_fillable(self.codes, indices,
                                             allow_fill=allow_fill,
                                             fill_value=fill_value,
                                             na_value=-1)
          return self._create_from_codes(taken)

      def map(self, mapper):
          """
          Apply mapper function to its categories (not codes).

          Parameters
          ----------
          mapper : callable
              Function to be applied. When all categories are mapped
              to different categories, the result will be Categorical which has
              the same order property as the original. Otherwise, the result will
              be np.ndarray.

          Returns
          -------
          applied : Categorical or np.ndarray.
          """
          return self.values.map(mapper)

      def delete(self, loc):
          """
          Make new Index with passed location(-s) deleted

          Returns
          -------
          new_index : Index
          """
          return self._create_from_codes(np.delete(self.codes, loc))

      def insert(self, loc, item):
          """
          Make new Index inserting new item at location. Follows
          Python list.append semantics for negative values

          Parameters
          ----------
          loc : int
          item : object

          Returns
          -------
          new_index : Index

          Raises
          ------
          TypeError if the item is not in the categories
          """
          code = self.categories.get_indexer([item])
          if (code == -1):
              raise TypeError("cannot insert an item into a CategoricalIndex "
                              "that is not already an existing category")

          # splice the (length-1) code array into the existing codes
          codes = self.codes
          codes = np.concatenate((codes[:loc], code, codes[loc:]))
          return self._create_from_codes(codes)

      def append(self, other):
          """
          Append a collection of CategoricalIndex options together

          Parameters
          ----------
          other : Index or list/tuple of indices

          Returns
          -------
          appended : Index

          Raises
          ------
          TypeError if other is not compatible with the categories
          (raised by ``_is_dtype_compat``)
          """
          to_concat, name = self._ensure_compat_append(other)
          to_concat = [self._is_dtype_compat(c) for c in to_concat]
          codes = np.concatenate([c.codes for c in to_concat])
          return self._create_from_codes(codes, name=name)

      @classmethod
      def _add_comparison_methods(cls):
          """ add in comparison methods """

          def _make_compare(op):
              # build one comparison dunder that forwards to the same-named
              # method on the underlying Categorical

              def _evaluate_compare(self, other):

                  # if we have a Categorical type, then must have the same
                  # categories
                  if isinstance(other, CategoricalIndex):
                      other = other._values
                  elif isinstance(other, Index):
                      other = self._create_categorical(
                          self, other._values, categories=self.categories,
                          ordered=self.ordered)

                  if isinstance(other, (ABCCategorical, np.ndarray,
                                        ABCSeries)):
                      if len(self.values) != len(other):
                          raise ValueError("Lengths must match to compare")

                  if isinstance(other, ABCCategorical):
                      if not self.values.is_dtype_equal(other):
                          raise TypeError("categorical index comparisions must "
                                          "have the same categories and ordered "
                                          "attributes")

                  return getattr(self.values, op)(other)

              return _evaluate_compare

          cls.__eq__ = _make_compare('__eq__')
          cls.__ne__ = _make_compare('__ne__')
          cls.__lt__ = _make_compare('__lt__')
          cls.__gt__ = _make_compare('__gt__')
          cls.__le__ = _make_compare('__le__')
          cls.__ge__ = _make_compare('__ge__')

      def _delegate_method(self, name, *args, **kwargs):
          """ method delegation to the ._values

          Scalar results are returned as-is; anything else is wrapped in a
          CategoricalIndex carrying the name of self.

          Raises
          ------
          ValueError if ``inplace`` is passed as a keyword
          """
          method = getattr(self._values, name)
          if 'inplace' in kwargs:
              raise ValueError("cannot use inplace with CategoricalIndex")
          res = method(*args, **kwargs)
          if is_scalar(res):
              return res
          return CategoricalIndex(res, name=self.name)

      @classmethod
      def _add_accessors(cls):
          """ add in Categorical accessor methods """

          from pandas.core.categorical import Categorical
          CategoricalIndex._add_delegate_accessors(
              delegate=Categorical, accessors=["rename_categories",
                                               "reorder_categories",
                                               "add_categories",
                                               "remove_categories",
                                               "remove_unused_categories",
                                               "set_categories",
                                               "as_ordered", "as_unordered",
                                               "min", "max"],
              typ='method', overwrite=True)


  # Class-level setup, executed once when the module is imported.
  # The three ``*_disabled`` helpers are not defined in this class (presumably
  # inherited from Index -- confirm); the last two are the classmethods defined
  # above. The original call order is preserved.
  CategoricalIndex._add_numericlike_set_methods_disabled()
  CategoricalIndex._add_numeric_methods_disabled()
  CategoricalIndex._add_logical_methods_disabled()
  CategoricalIndex._add_comparison_methods()
  CategoricalIndex._add_accessors()