PageRenderTime 48ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/core/common.py

https://github.com/rkabir/pandas
Python | 353 lines | 268 code | 41 blank | 44 comment | 56 complexity | 6f66d7b70bc430edf8e26da8efd7f469 MD5 | raw file
  1. """
  2. Misc tools for implementing data structures
  3. """
  4. from cStringIO import StringIO
  5. import itertools
  6. from numpy.lib.format import read_array, write_array
  7. import numpy as np
  8. import pandas._tseries as _tseries
# XXX: HACK for NumPy 1.5.1 to suppress warnings.
# Runs at import time and switches NumPy floating-point error handling to
# 'ignore' for every error category; any failure while doing so is
# deliberately swallowed so that importing this module never fails.
try:
    np.seterr(all='ignore')
except Exception: # pragma: no cover
    pass
  14. class PandasError(Exception):
  15. pass
  16. def isnull(input):
  17. '''
  18. Replacement for numpy.isnan / -numpy.isfinite which is suitable
  19. for use on object arrays.
  20. Parameters
  21. ----------
  22. arr: ndarray or object value
  23. Returns
  24. -------
  25. boolean ndarray or boolean
  26. '''
  27. from pandas.core.generic import PandasObject
  28. from pandas import Series
  29. if isinstance(input, np.ndarray):
  30. if input.dtype.kind in ('O', 'S'):
  31. # Working around NumPy ticket 1542
  32. shape = input.shape
  33. result = np.empty(shape, dtype=bool)
  34. vec = _tseries.isnullobj(input.ravel())
  35. result[:] = vec.reshape(shape)
  36. if isinstance(input, Series):
  37. result = Series(result, index=input.index, copy=False)
  38. else:
  39. result = -np.isfinite(input)
  40. elif isinstance(input, PandasObject):
  41. # TODO: optimize for DataFrame, etc.
  42. return input.apply(isnull)
  43. else:
  44. result = _tseries.checknull(input)
  45. return result
  46. def notnull(input):
  47. '''
  48. Replacement for numpy.isfinite / -numpy.isnan which is suitable
  49. for use on object arrays.
  50. Parameters
  51. ----------
  52. arr: ndarray or object value
  53. Returns
  54. -------
  55. boolean ndarray or boolean
  56. '''
  57. return np.negative(isnull(input))
  58. def _pickle_array(arr):
  59. arr = arr.view(np.ndarray)
  60. buf = StringIO()
  61. write_array(buf, arr)
  62. return buf.getvalue()
  63. def _unpickle_array(bytes):
  64. arr = read_array(StringIO(bytes))
  65. return arr
  66. def null_out_axis(arr, mask, axis):
  67. indexer = [slice(None)] * arr.ndim
  68. indexer[axis] = mask
  69. arr[tuple(indexer)] = np.NaN
  70. #-------------------------------------------------------------------------------
  71. # Lots of little utilities
  72. def _infer_dtype(value):
  73. if isinstance(value, (float, np.floating)):
  74. return float
  75. elif isinstance(value, (bool, np.bool_)):
  76. return bool
  77. elif isinstance(value, (int, np.integer)):
  78. return int
  79. else:
  80. return object
  81. def _is_bool_indexer(key):
  82. if isinstance(key, np.ndarray) and key.dtype == np.object_:
  83. mask = isnull(key)
  84. if mask.any():
  85. raise ValueError('cannot index with vector containing '
  86. 'NA / NaN values')
  87. return set([True, False]).issubset(set(key))
  88. elif isinstance(key, np.ndarray) and key.dtype == np.bool_:
  89. return True
  90. elif isinstance(key, list):
  91. try:
  92. return np.asarray(key).dtype == np.bool_
  93. except TypeError: # pragma: no cover
  94. return False
  95. return False
  96. def _default_index(n):
  97. from pandas.core.index import NULL_INDEX
  98. if n == 0:
  99. return NULL_INDEX
  100. else:
  101. return np.arange(n)
  102. def ensure_float(arr):
  103. if issubclass(arr.dtype.type, np.integer):
  104. arr = arr.astype(float)
  105. return arr
  106. def _mut_exclusive(arg1, arg2):
  107. if arg1 is not None and arg2 is not None:
  108. raise Exception('mutually exclusive arguments')
  109. elif arg1 is not None:
  110. return arg1
  111. else:
  112. return arg2
  113. def _ensure_index(index_like):
  114. from pandas.core.index import Index
  115. if not isinstance(index_like, Index):
  116. index_like = Index(index_like)
  117. return index_like
  118. def _any_none(*args):
  119. for arg in args:
  120. if arg is None:
  121. return True
  122. return False
  123. def _all_not_none(*args):
  124. for arg in args:
  125. if arg is None:
  126. return False
  127. return True
  128. def _try_sort(iterable):
  129. listed = list(iterable)
  130. try:
  131. return sorted(listed)
  132. except Exception:
  133. return listed
  134. def set_printoptions(precision=None, column_space=None):
  135. """
  136. Alter default behavior of DataFrame.toString
  137. precision : int
  138. Floating point output precision
  139. column_space : int
  140. Default space for DataFrame columns, defaults to 12
  141. """
  142. global _float_format, _column_space
  143. if precision is not None:
  144. float_format = '%.' + '%d' % precision + 'g'
  145. _float_format = lambda x: float_format % x
  146. if column_space is not None:
  147. _column_space = column_space
  148. _float_format = lambda x: '%.4g' % x
  149. _column_space = 12
  150. def _pfixed(s, space, nanRep=None, float_format=None):
  151. if isinstance(s, float):
  152. if nanRep is not None and isnull(s):
  153. if np.isnan(s):
  154. s = nanRep
  155. return (' %s' % s).ljust(space)
  156. if float_format:
  157. formatted = float_format(s)
  158. else:
  159. is_neg = s < 0
  160. formatted = _float_format(np.abs(s))
  161. if is_neg:
  162. formatted = '-' + formatted
  163. else:
  164. formatted = ' ' + formatted
  165. return formatted.ljust(space)
  166. else:
  167. return (' %s' % s)[:space].ljust(space)
  168. def _stringify(col):
  169. # unicode workaround
  170. if isinstance(col, tuple):
  171. return str(col)
  172. else:
  173. return '%s' % col
  174. def _format(s, nanRep=None, float_format=None):
  175. if isinstance(s, float):
  176. if nanRep is not None and isnull(s):
  177. if np.isnan(s):
  178. s = nanRep
  179. return (' %s' % str(s))
  180. if float_format:
  181. formatted = float_format(s)
  182. else:
  183. is_neg = s < 0
  184. formatted = _float_format(np.abs(s))
  185. if is_neg:
  186. formatted = '-' + formatted
  187. else:
  188. formatted = ' ' + formatted
  189. return formatted
  190. else:
  191. return ' %s' % str(s)
  192. #-------------------------------------------------------------------------------
  193. # miscellaneous python tools
  194. def rands(n):
  195. """Generates a random alphanumeric string of length *n*"""
  196. from random import Random
  197. import string
  198. return ''.join(Random().sample(string.letters+string.digits, n))
  199. def adjoin(space, *lists):
  200. """
  201. Glues together two sets of strings using the amount of space requested.
  202. The idea is to prettify.
  203. """
  204. outLines = []
  205. newLists = []
  206. lengths = [max(map(len, x)) + space for x in lists[:-1]]
  207. # not the last one
  208. lengths.append(max(map(len, lists[-1])))
  209. maxLen = max(map(len, lists))
  210. for i, lst in enumerate(lists):
  211. nl = [x.ljust(lengths[i]) for x in lst]
  212. nl.extend([' ' * lengths[i]] * (maxLen - len(lst)))
  213. newLists.append(nl)
  214. toJoin = zip(*newLists)
  215. for lines in toJoin:
  216. outLines.append(''.join(lines))
  217. return '\n'.join(outLines)
  218. def iterpairs(seq):
  219. """
  220. Parameters
  221. ----------
  222. seq: sequence
  223. Returns
  224. -------
  225. iterator returning overlapping pairs of elements
  226. Example
  227. -------
  228. >>> iterpairs([1, 2, 3, 4])
  229. [(1, 2), (2, 3), (3, 4)
  230. """
  231. # input may not be sliceable
  232. seq_it = iter(seq)
  233. seq_it_next = iter(seq)
  234. _ = seq_it_next.next()
  235. return itertools.izip(seq_it, seq_it_next)
  236. def indent(string, spaces=4):
  237. dent = ' ' * spaces
  238. return '\n'.join([dent + x for x in string.split('\n')])
  239. def banner(message):
  240. """
  241. Return 80-char width message declaration with = bars on top and bottom.
  242. """
  243. bar = '=' * 80
  244. return '%s\n%s\n%s' % (bar, message, bar)
  245. class groupby(dict):
  246. """
  247. A simple groupby different from the one in itertools.
  248. Does not require the sequence elements to be sorted by keys,
  249. however it is slower.
  250. """
  251. def __init__(self, seq, key=lambda x:x):
  252. for value in seq:
  253. k = key(value)
  254. self.setdefault(k, []).append(value)
  255. __iter__ = dict.iteritems
  256. def map_indices_py(arr):
  257. """
  258. Returns a dictionary with (element, index) pairs for each element in the
  259. given array/list
  260. """
  261. return dict([(x, i) for i, x in enumerate(arr)])
  262. def union(*seqs):
  263. result = set([])
  264. for seq in seqs:
  265. if not isinstance(seq, set):
  266. seq = set(seq)
  267. result |= seq
  268. return type(seqs[0])(list(result))
  269. def difference(a, b):
  270. return type(a)(list(set(a) - set(b)))
  271. def intersection(*seqs):
  272. result = set(seqs[0])
  273. for seq in seqs:
  274. if not isinstance(seq, set):
  275. seq = set(seq)
  276. result &= seq
  277. return type(seqs[0])(list(result))
  278. def _asarray_tuplesafe(values):
  279. if not isinstance(values, (list, np.ndarray)):
  280. values = list(values)
  281. result = np.asarray(values)
  282. if issubclass(result.dtype.type, basestring):
  283. result = np.asarray(values, dtype=object)
  284. if result.ndim == 2:
  285. result = np.empty(len(values), dtype=object)
  286. result[:] = values
  287. return result