/CHAID/column.py
https://github.com/Rambatino/CHAID · Python · 309 lines · 196 code · 48 blank · 65 comment · 60 complexity · 59891d432f6286f6b4dcc1bb5181bd0a MD5 · raw file
- import numpy as np
- from math import isnan
- from itertools import combinations
- from .mapping_dict import MappingDict
def is_sorted(ndarr, nan_val=None):
    """
    Check that every group in a collection is a run of consecutive values.

    Only the first and last element of each group are compared, so the
    groups are expected to be in ascending order already.

    Parameters
    ----------
    ndarr : iterable of sequences
        The groups to check
    nan_val : object, optional
        Placeholder used for the missing value. When given, one occurrence
        of it is ignored in every group that contains it.

    Returns
    -------
    bool
        True when every group holding more than one element satisfies
        ``last - first == len(group) - 1``. Groups with zero or one element
        are skipped.
    """
    for arr in ndarr:
        if len(arr) <= 1:
            continue
        if nan_val is not None and nan_val in arr:
            # Work on a copy so the caller's group keeps its placeholder.
            arr = list(arr)
            arr.remove(nan_val)
        if arr[-1] - arr[0] != len(arr) - 1:
            return False
    return True
class Column(object):
    """
    Base class pairing a numpy array with its metadata

    Parameters
    ----------
    arr : iterable object
        The values of the column, stored as a numpy array
    metadata : dict
        The substitutions of the vector
    missing_id : string
        An identifier for the missing value to be associated
    substitute : bool
        Whether the objects in the given array need to be substituted for
        integers (accepted here, but not used by this base class)
    weights : object
        Stored on the column as given
    name : object
        Stored on the column as given
    """

    def __init__(self, arr=None, metadata=None, missing_id='<missing>',
                 substitute=True, weights=None, name=None):
        # A fresh dict is built so the caller's mapping is not shared.
        self.metadata = dict(metadata or {})
        self.arr = np.array(arr)
        self.name = name
        self.weights = weights
        self._missing_id = missing_id

    def __iter__(self):
        return iter(self.arr)

    def __getitem__(self, key):
        raise NotImplementedError

    def __setitem__(self, key, value):
        raise NotImplementedError

    def possible_groupings(self):
        raise NotImplementedError

    @property
    def type(self):
        """
        String naming the kind of column; subclasses must provide it
        """
        raise NotImplementedError

    def deep_copy(self):
        """
        Copy of the column; subclasses must provide it
        """
        raise NotImplementedError

    def bell_set(self, collection, ordinal=False):
        """
        Generates the partitions of `collection` into groups

        Parameters
        ----------
        collection : list
            The items to partition; must hold at least one item
        ordinal : bool
            When True, a partition is only produced if `is_sorted` accepts
            it, using `self._nan` as the placeholder to ignore. The flag is
            not forwarded to the recursive calls, so the filter is applied
            once, to the complete partitions.
        """
        if len(collection) == 1:
            yield [collection]
            return

        head = collection[0]
        for partition in self.bell_set(collection[1:]):
            # Add the head item to each existing group in turn ...
            for idx, group in enumerate(partition):
                if ordinal:
                    probe = partition[:idx] + [[head] + group] + partition[idx + 1:]
                    if not is_sorted(probe, self._nan):
                        continue
                # The yielded partition is built separately from the one
                # handed to is_sorted, so the two never share the new group.
                yield partition[:idx] + [[head] + group] + partition[idx + 1:]

            # ... and finally give it a group of its own.
            if ordinal and not is_sorted([[head]] + partition, self._nan):
                continue
            yield [[head]] + partition
class NominalColumn(Column):
    """
    A column containing numerical values that are unrelated to
    one another (i.e. do not follow a progression)

    Parameters
    ----------
    arr : iterable object
        The values of the column
    metadata : dict
        The substitutions of the vector. When given, `arr` is taken as it
        is and no substitution is performed.
    missing_id : string
        An identifier for the missing value to be associated
    substitute : bool
        Whether the objects in the given array need to be substituted for
        integers
    weights : object
        Stored on the column; indexed with the same key in `__getitem__`
    name : object
        Stored on the column as given
    """

    def __init__(self, arr=None, metadata=None, missing_id='<missing>',
                 substitute=True, weights=None, name=None):
        # The class is named explicitly: super(self.__class__, self) would
        # recurse without end as soon as this class is subclassed.
        super(NominalColumn, self).__init__(
            arr, metadata=metadata, missing_id=missing_id, weights=weights,
            name=name
        )

        if substitute and metadata is None:
            self.substitute_values(arr)

        # Every distinct value starts out as a group of its own.
        self._groupings = MappingDict()
        for x in np.unique(self.arr):
            self._groupings[x] = [x]

    def deep_copy(self):
        """
        Returns a new NominalColumn built from this column's array,
        metadata, name, missing id and weights, without substituting again.
        """
        return NominalColumn(self.arr, metadata=self.metadata, name=self.name,
                             missing_id=self._missing_id, substitute=False,
                             weights=self.weights)

    def substitute_values(self, vect):
        """
        Internal method to substitute integers into the vector, and construct
        metadata to convert back to the original vector.

        np.nan is always given -1, all other objects are given integers in
        the order in which np.unique returns them (or in set iteration order
        when np.unique cannot process the values).

        The result is stored in `self.arr` as a float array and
        `self.metadata` is updated in place.

        Parameters
        ----------
        vect : np.array
            the vector in which to substitute values in
        """
        try:
            unique = np.unique(vect)
        except (TypeError, ValueError):
            # Fall back to a set when the values cannot be handled by
            # np.unique (e.g. mixed types that cannot be ordered).
            unique = set(vect)

        unique = [
            x for x in unique if not isinstance(x, float) or not isnan(x)
        ]

        # The codes are written into a separate array and every mask is
        # computed against the untouched input. Substituting in place lets
        # an id that was just written match a later original value
        # (e.g. [-5, 0] would collapse into a single code).
        source = np.asarray(vect)
        codes = np.full(source.shape, -1.0)
        for new_id, value in enumerate(unique):
            codes[source == value] = new_id
            self.metadata[new_id] = value

        # Anything left unmatched (the NaNs) keeps the initial -1.
        self.arr = codes
        if -1 in codes:
            self.metadata[-1] = self._missing_id

    def __getitem__(self, key):
        new_weights = None if self.weights is None else self.weights[key]
        return NominalColumn(self.arr[key], metadata=self.metadata,
                             missing_id=self._missing_id, substitute=False,
                             weights=new_weights, name=self.name)

    def __setitem__(self, key, value):
        self.arr[key] = value
        return self

    def groups(self):
        """
        Returns the member lists of the current groups
        """
        return list(self._groupings.values())

    def possible_groupings(self):
        """
        Returns every pair of current group keys
        """
        return combinations(self._groupings.keys(), 2)

    def all_combinations(self):
        """
        Returns a generator over the partitions of the sorted group keys,
        with the first partition produced by `bell_set` discarded.
        """
        bell_set = self.bell_set(sorted(self._groupings.keys()))
        next(bell_set)
        return bell_set

    def group(self, x, y):
        """
        Merges group `y` into group `x`: the members of `y` are appended to
        those of `x`, the entry for `y` is removed and every element of the
        array equal to `y` is set to `x`.
        """
        self._groupings[x] += self._groupings[y]
        del self._groupings[y]
        self.arr[self.arr == y] = x

    @property
    def type(self):
        """
        Returns a string representing the type
        """
        return 'nominal'
class OrdinalColumn(Column):
    """
    A column containing integer values that have an order

    Parameters
    ----------
    arr : iterable object
        The values of the column
    metadata : dict
        The substitutions of the vector
    missing_id : string
        An identifier for the missing value to be associated
    groupings : dict
        Existing groups to copy, keyed by the values found in `arr`. Each
        entry is `[min, max, has_missing]`, where `max` is exclusive and
        `has_missing` tells whether the missing value belongs to the group.
    substitute : bool
        Whether the values in the given array need to be converted to
        integers
    weights : object
        Stored on the column; indexed with the same key in `__getitem__`
    name : object
        Stored on the column as given
    """

    def __init__(self, arr=None, metadata=None, missing_id='<missing>',
                 groupings=None, substitute=True, weights=None, name=None):
        # The class is named explicitly: super(self.__class__, self) would
        # recurse without end as soon as this class is subclassed.
        super(OrdinalColumn, self).__init__(
            arr, metadata, missing_id=missing_id, weights=weights, name=name
        )

        # Integer that stands in for NaN: whatever casting NaN to int gives.
        # NOTE(review): the outcome of this cast looks platform dependent and
        # recent NumPy versions warn about it - confirm before relying on a
        # particular value.
        self._nan = np.array([np.nan]).astype(int)[0]

        if substitute and metadata is None:
            self.arr, self.orig_type = self.substitute_values(self.arr)
        elif substitute and metadata and not np.issubdtype(self.arr.dtype, np.integer):
            # custom metadata has been passed in from external source, and must be converted to int
            self.arr = self.arr.astype(int)
            self.metadata = {int(k): v for k, v in metadata.items()}
        self.metadata[self._nan] = missing_id

        self._groupings = {}
        unique_values = np.unique(self.arr)
        if groupings is None:
            # Every distinct value starts as its own one-wide range.
            for x in unique_values:
                self._groupings[x] = [x, x + 1, False]
        else:
            # Copy the entries so the source column's groups stay untouched.
            for x in unique_values:
                self._groupings[x] = list(groupings[x])
        self._possible_groups = None

    def substitute_values(self, vect):
        """
        Converts the column to integers.

        When `vect` is not of an integer dtype, `self.metadata` is rebuilt
        to map each integer back to the value it came from (NaN maps to the
        missing id) and `self.arr` is converted to float first.

        Parameters
        ----------
        vect : np.array
            the vector whose dtype and values drive the conversion

        Returns
        -------
        tuple
            `self.arr` cast to int, and the scalar type of `self.arr` at
            the time of the cast
        """
        if not np.issubdtype(vect.dtype, np.integer):
            uniq = set(vect)
            uniq_floats = np.array(list(uniq), dtype=float)
            uniq_ints = uniq_floats.astype(int)
            nan = self._missing_id
            self.metadata = {
                new: nan if isnan(as_float) else old
                for old, as_float, new in zip(uniq, uniq_floats, uniq_ints)
            }
            self.arr = self.arr.astype(float)
        return self.arr.astype(int), self.arr.dtype.type

    def deep_copy(self):
        """
        Returns a new OrdinalColumn built from this column's array,
        metadata, name, missing id, groupings and weights.
        """
        return OrdinalColumn(self.arr, metadata=self.metadata, name=self.name,
                             missing_id=self._missing_id, substitute=True,
                             groupings=self._groupings, weights=self.weights)

    def __getitem__(self, key):
        new_weights = None if self.weights is None else self.weights[key]
        return OrdinalColumn(self.arr[key], metadata=self.metadata, name=self.name,
                             missing_id=self._missing_id, substitute=True,
                             groupings=self._groupings, weights=new_weights)

    def __setitem__(self, key, value):
        self.arr[key] = value
        return self

    def groups(self):
        """
        Returns, for every group, the integers of its range followed by the
        NaN placeholder when the group includes the missing value.
        """
        return [
            list(range(minmax[0], minmax[1])) + ([self._nan] if minmax[2] else [])
            for minmax in self._groupings.values()
        ]

    def possible_groupings(self):
        """
        Returns an iterator over the pairs of group keys that may be merged:
        neighbouring groups whose ranges touch and, when the NaN placeholder
        occurs in the array, every other group paired with the placeholder.

        The list is cached until `group` is called.
        """
        if self._possible_groups is None:
            ranges = sorted(self._groupings.items())
            candidates = zip(ranges[0:], ranges[1:])
            self._possible_groups = [
                (k1, k2) for (k1, minmax1), (k2, minmax2) in candidates
                if minmax1[1] == minmax2[0]
            ]
            if self._nan in self.arr:
                self._possible_groups += [
                    (key, self._nan) for key in self._groupings.keys() if key != self._nan
                ]
        return iter(self._possible_groups)

    def all_combinations(self):
        """
        Returns a generator over the ordinal partitions of the sorted group
        keys, with the first partition produced by `bell_set` discarded.
        """
        bell_set = self.bell_set(sorted(self._groupings.keys()), True)
        next(bell_set)
        return bell_set

    def group(self, x, y):
        """
        Merges group `y` into group `x`, removes the entry for `y` and sets
        every element of the array equal to `y` to `x`.
        """
        # The cached pairs no longer describe the groups after a merge.
        self._possible_groups = None
        if y != self._nan:
            x = int(x)
            y = int(y)
            x_max = self._groupings[x][1]
            y_min = self._groupings[y][0]
            if y_min >= x_max:
                # y lies above x: extend the upper bound of x.
                self._groupings[x][1] = self._groupings[y][1]
            else:
                # y lies below x: extend the lower bound of x.
                self._groupings[x][0] = y_min
            self._groupings[x][2] = self._groupings[x][2] or self._groupings[y][2]
        else:
            # Merging the missing value only flags x as containing it.
            self._groupings[x][2] = True
        del self._groupings[y]
        self.arr[self.arr == y] = x

    @property
    def type(self):
        """
        Returns a string representing the type
        """
        return 'ordinal'
class ContinuousColumn(Column):
    """
    A column containing numerical values on a continuous scale

    Parameters
    ----------
    arr : iterable object
        The numerical values of the column. NaN and infinite entries are
        replaced through np.nan_to_num before being stored.
    metadata : dict
        Stored on the column as given
    missing_id : string
        An identifier for the missing value to be associated
    weights : object
        Stored on the column; indexed with the same key in `__getitem__`

    Raises
    ------
    ValueError
        When the values are not of a numerical dtype
    """

    def __init__(self, arr=None, metadata=None, missing_id='<missing>',
                 weights=None):
        # Convert first so that plain sequences, scalars and the default
        # None reach the dtype check instead of failing on a missing
        # `.dtype` attribute.
        arr = np.asarray(arr)
        if not np.issubdtype(arr.dtype, np.number):
            raise ValueError('Must only pass numerical values to create continuous column')

        # The class is named explicitly: super(self.__class__, self) would
        # recurse without end as soon as this class is subclassed.
        super(ContinuousColumn, self).__init__(
            np.nan_to_num(arr), metadata, missing_id=missing_id, weights=weights
        )

    def deep_copy(self):
        """
        Returns a new ContinuousColumn built from this column's array,
        metadata, missing id and weights.
        """
        return ContinuousColumn(self.arr, metadata=self.metadata,
                                missing_id=self._missing_id, weights=self.weights)

    def __getitem__(self, key):
        new_weights = None if self.weights is None else self.weights[key]
        return ContinuousColumn(self.arr[key], metadata=self.metadata,
                                missing_id=self._missing_id, weights=new_weights)

    def __setitem__(self, key, value):
        self.arr[key] = value
        return self

    @property
    def type(self):
        """
        Returns a string representing the type
        """
        return 'continuous'