PageRenderTime 51ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/core/strings.py

https://github.com/kljensen/pandas
Python | 774 lines | 723 code | 23 blank | 28 comment | 21 complexity | 13294c0cd79a436bc4953bd837d3b3dd MD5 | raw file
Possible License(s): BSD-3-Clause
  1. import numpy as np
  2. from functools import wraps
  3. from itertools import izip
  4. from pandas.core.common import isnull
  5. from pandas.core.series import Series
  6. import re
  7. import pandas.lib as lib
  8. import pandas.core.common as com
  9. import operator
  10. class repeat(object):
  11. def __init__(self, obj):
  12. self.obj = obj
  13. def __getitem__(self, i):
  14. return self.obj
  15. class azip(object):
  16. def __init__(self, *args):
  17. self.cols = []
  18. for a in args:
  19. if np.isscalar(a):
  20. self.cols.append(repeat(a))
  21. else:
  22. self.cols.append(a)
  23. def __getitem__(self, i):
  24. return [col[i] for col in self.cols]
  25. def map_iter_args(arr, f, otherargs, n_otherargs, required, n_results):
  26. '''
  27. Substitute for np.vectorize with pandas-friendly dtype inference
  28. Parameters
  29. ----------
  30. arr : ndarray
  31. f : function
  32. Returns
  33. -------
  34. mapped : ndarray
  35. '''
  36. notnull = com.notnull
  37. n = len(arr)
  38. result = np.empty((n, n_results), dtype=object)
  39. for i, val in enumerate(arr):
  40. args = otherargs[i]
  41. if notnull(val) and all(notnull(args[r]) for r in required):
  42. result[i] = f(val, *args)
  43. else:
  44. result[i] = [np.nan] * n_results
  45. return [lib.maybe_convert_objects(col, try_float=0) for col in result.T]
  46. def auto_map(arr, f, otherargs, n_results=1, required='all'):
  47. from pandas.core.series import Series
  48. if all(np.isscalar(a) for a in otherargs):
  49. res = lib.map_infer(arr, lambda v: f(v, *otherargs))
  50. return Series(res, index=arr.index, copy=False)
  51. n_otherargs = len(otherargs)
  52. if required == 'all':
  53. required = list(range(n_otherargs))
  54. res = map_iter_args(arr, f, azip(*otherargs), n_otherargs,
  55. required, n_results)
  56. res = [Series(col, index=arr.index, copy=False) for col in res]
  57. if n_results == 1:
  58. return res[0]
  59. return res
  60. def mapwrap(f, n_results_default=1, required='all'):
  61. # @wraps(f)
  62. def wrapped(arr, n_results=None, *otherargs):
  63. n_results = n_results or n_results_default
  64. return auto_map(arr, f, otherargs, n_results, required)
  65. return wrapped
  66. startswith = mapwrap(lambda x, p: x.startswith(p))
  67. contains = mapwrap(lambda x, p: x.__contains__(p))
  68. upper = mapwrap(lambda x: x.upper())
  69. lower = mapwrap(lambda x: x.lower())
  70. def _get_array_list(arr, others):
  71. if isinstance(others[0], (list, np.ndarray)):
  72. arrays = [arr] + list(others)
  73. else:
  74. arrays = [arr, others]
  75. return [np.asarray(x, dtype=object) for x in arrays]
  76. def str_cat(arr, others=None, sep=None, na_rep=None):
  77. """
  78. Concatenate arrays of strings with given separator
  79. Parameters
  80. ----------
  81. arr : list or array-like
  82. others : list or array, or list of arrays
  83. sep : string or None, default None
  84. na_rep : string or None, default None
  85. If None, an NA in any array will propagate
  86. Returns
  87. -------
  88. concat : array
  89. """
  90. if sep is None:
  91. sep = ''
  92. if others is not None:
  93. arrays = _get_array_list(arr, others)
  94. n = _length_check(arrays)
  95. masks = np.array([isnull(x) for x in arrays])
  96. cats = None
  97. if na_rep is None:
  98. na_mask = np.logical_or.reduce(masks, axis=0)
  99. result = np.empty(n, dtype=object)
  100. np.putmask(result, na_mask, np.nan)
  101. notmask = -na_mask
  102. if sep is None:
  103. for x in arrays:
  104. x = x[notmask]
  105. if cats is None:
  106. cats = x
  107. else:
  108. cats = cats + x[notmask]
  109. else:
  110. tuples = izip(*[x[notmask] for x in arrays])
  111. cats = [sep.join(tup) for tup in tuples]
  112. result[notmask] = cats
  113. else:
  114. for i, x in enumerate(arrays):
  115. x = np.where(masks[i], na_rep, x)
  116. if cats is None:
  117. cats = x
  118. else:
  119. cats = cats + sep + x
  120. result = cats
  121. return result
  122. else:
  123. arr = np.asarray(arr, dtype=object)
  124. mask = isnull(arr)
  125. if na_rep is None and mask.any():
  126. return np.nan
  127. return sep.join(np.where(mask, na_rep, arr))
  128. def _length_check(others):
  129. n = None
  130. for x in others:
  131. if n is None:
  132. n = len(x)
  133. elif len(x) != n:
  134. raise ValueError('All arrays must be same length')
  135. return n
  136. def _na_map(f, arr, na_result=np.nan):
  137. # should really _check_ for NA
  138. def g(x):
  139. try:
  140. return f(x)
  141. except (TypeError, AttributeError):
  142. return na_result
  143. return _map(g, arr)
  144. def _map(f, arr):
  145. if not isinstance(arr, np.ndarray):
  146. arr = np.asarray(arr, dtype=object)
  147. return lib.map_infer(arr, f)
  148. def str_count(arr, pat, flags=0):
  149. """
  150. Count occurrences of pattern in each string
  151. Parameters
  152. ----------
  153. arr : list or array-like
  154. pat : string, valid regular expression
  155. flags : int, default 0 (no flags)
  156. re module flags, e.g. re.IGNORECASE
  157. Returns
  158. -------
  159. counts : arrays
  160. """
  161. regex = re.compile(pat, flags=flags)
  162. f = lambda x: len(regex.findall(x))
  163. return _na_map(f, arr)
  164. def str_contains(arr, pat, case=True, flags=0, na=np.nan):
  165. """
  166. Check whether given pattern is contained in each string in the array
  167. Parameters
  168. ----------
  169. pat : string
  170. Character sequence or regular expression
  171. case : boolean, default True
  172. If True, case sensitive
  173. flags : int, default 0 (no flags)
  174. re module flags, e.g. re.IGNORECASE
  175. na : bool, default NaN
  176. Returns
  177. -------
  178. """
  179. if not case:
  180. flags |= re.IGNORECASE
  181. regex = re.compile(pat, flags=flags)
  182. f = lambda x: bool(regex.search(x))
  183. return _na_map(f, arr, na)
  184. def str_startswith(arr, pat, na=np.nan):
  185. """
  186. Return boolean array indicating whether each string starts with passed
  187. pattern
  188. Parameters
  189. ----------
  190. pat : string
  191. Character sequence
  192. na : bool, default NaN
  193. Returns
  194. -------
  195. startswith : array (boolean)
  196. """
  197. f = lambda x: x.startswith(pat)
  198. return _na_map(f, arr, na)
  199. def str_endswith(arr, pat, na=np.nan):
  200. """
  201. Return boolean array indicating whether each string ends with passed
  202. pattern
  203. Parameters
  204. ----------
  205. pat : string
  206. Character sequence
  207. na : bool, default NaN
  208. Returns
  209. -------
  210. endswith : array (boolean)
  211. """
  212. f = lambda x: x.endswith(pat)
  213. return _na_map(f, arr, na)
  214. def str_lower(arr):
  215. """
  216. Convert strings in array to lowercase
  217. Returns
  218. -------
  219. lowercase : array
  220. """
  221. return _na_map(lambda x: x.lower(), arr)
  222. def str_upper(arr):
  223. """
  224. Convert strings in array to uppercase
  225. Returns
  226. -------
  227. uppercase : array
  228. """
  229. return _na_map(lambda x: x.upper(), arr)
  230. def str_replace(arr, pat, repl, n=0, case=True, flags=0):
  231. """
  232. Replace
  233. Parameters
  234. ----------
  235. pat : string
  236. Character sequence or regular expression
  237. repl : string
  238. Replacement sequence
  239. n : int, default 0 (all)
  240. Number of replacements to make from start
  241. case : boolean, default True
  242. If True, case sensitive
  243. flags : int, default 0 (no flags)
  244. re module flags, e.g. re.IGNORECASE
  245. Returns
  246. -------
  247. replaced : array
  248. """
  249. if not case:
  250. flags |= re.IGNORECASE
  251. regex = re.compile(pat, flags=flags)
  252. def f(x):
  253. return regex.sub(repl, x, count=n)
  254. return _na_map(f, arr)
  255. def str_repeat(arr, repeats):
  256. """
  257. Duplicate each string in the array by indicated number of times
  258. Parameters
  259. ----------
  260. repeats : int or array
  261. Same value for all (int) or different value per (array)
  262. Returns
  263. -------
  264. repeated : array
  265. """
  266. if np.isscalar(repeats):
  267. def rep(x):
  268. try:
  269. return str.__mul__(x, repeats)
  270. except TypeError:
  271. return unicode.__mul__(x, repeats)
  272. return _na_map(rep, arr)
  273. else:
  274. def rep(x, r):
  275. try:
  276. return str.__mul__(x, r)
  277. except TypeError:
  278. return unicode.__mul__(x, r)
  279. repeats = np.asarray(repeats, dtype=object)
  280. result = lib.vec_binop(arr, repeats, rep)
  281. return result
  282. def str_match(arr, pat, flags=0):
  283. """
  284. Find groups in each string (from beginning) using passed regular expression
  285. Parameters
  286. ----------
  287. pat : string
  288. Pattern or regular expression
  289. flags : int, default 0 (no flags)
  290. re module flags, e.g. re.IGNORECASE
  291. Returns
  292. -------
  293. matches : array
  294. """
  295. regex = re.compile(pat, flags=flags)
  296. def f(x):
  297. m = regex.match(x)
  298. if m:
  299. return m.groups()
  300. else:
  301. return []
  302. return _na_map(f, arr)
  303. def str_join(arr, sep):
  304. """
  305. Join lists contained as elements in array, a la str.join
  306. Parameters
  307. ----------
  308. sep : string
  309. Delimiter
  310. Returns
  311. -------
  312. joined : array
  313. """
  314. return _na_map(sep.join, arr)
  315. def str_len(arr):
  316. """
  317. Compute length of each string in array.
  318. Returns
  319. -------
  320. lengths : array
  321. """
  322. return _na_map(len, arr)
  323. def str_findall(arr, pat, flags=0):
  324. """
  325. Find all occurrences of pattern or regular expression
  326. Parameters
  327. ----------
  328. pat : string
  329. Pattern or regular expression
  330. flags : int, default 0 (no flags)
  331. re module flags, e.g. re.IGNORECASE
  332. Returns
  333. -------
  334. matches : array
  335. """
  336. regex = re.compile(pat, flags=flags)
  337. return _na_map(regex.findall, arr)
  338. def str_pad(arr, width, side='left'):
  339. """
  340. Pad strings with whitespace
  341. Parameters
  342. ----------
  343. arr : list or array-like
  344. width : int
  345. Minimum width of resulting string; additional characters will be filled
  346. with spaces
  347. side : {'left', 'right', 'both'}, default 'left'
  348. Returns
  349. -------
  350. padded : array
  351. """
  352. if side == 'left':
  353. f = lambda x: x.rjust(width)
  354. elif side == 'right':
  355. f = lambda x: x.ljust(width)
  356. elif side == 'both':
  357. f = lambda x: x.center(width)
  358. else: # pragma: no cover
  359. raise ValueError('Invalid side')
  360. return _na_map(f, arr)
  361. def str_center(arr, width):
  362. """
  363. "Center" strings, filling left and right side with additional whitespace
  364. Parameters
  365. ----------
  366. width : int
  367. Minimum width of resulting string; additional characters will be filled
  368. with spaces
  369. Returns
  370. -------
  371. centered : array
  372. """
  373. return str_pad(arr, width, side='both')
  374. def str_split(arr, pat=None, n=0):
  375. """
  376. Split each string (a la re.split) in array by given pattern, propagating NA
  377. values
  378. Parameters
  379. ----------
  380. pat : string, default None
  381. String or regular expression to split on. If None, splits on whitespace
  382. n : int, default 0 (all)
  383. Returns
  384. -------
  385. split : array
  386. """
  387. if pat is None:
  388. f = lambda x: x.split()
  389. else:
  390. regex = re.compile(pat)
  391. f = lambda x: regex.split(x, maxsplit=n)
  392. return _na_map(f, arr)
  393. def str_slice(arr, start=None, stop=None, step=1):
  394. """
  395. Slice substrings from each element in array
  396. Parameters
  397. ----------
  398. start : int or None
  399. stop : int or None
  400. Returns
  401. -------
  402. sliced : array
  403. """
  404. obj = slice(start, stop, step)
  405. f = lambda x: x[obj]
  406. return _na_map(f, arr)
  407. def str_slice_replace(arr, start=None, stop=None, repl=None):
  408. """
  409. Parameters
  410. ----------
  411. Returns
  412. -------
  413. replaced : array
  414. """
  415. raise NotImplementedError
  416. def str_strip(arr):
  417. """
  418. Strip whitespace (including newlines) from each string in the array
  419. Returns
  420. -------
  421. stripped : array
  422. """
  423. return _na_map(lambda x: x.strip(), arr)
  424. def str_lstrip(arr):
  425. """
  426. Strip whitespace (including newlines) from left side of each string in the
  427. array
  428. Returns
  429. -------
  430. stripped : array
  431. """
  432. return _na_map(lambda x: x.lstrip(), arr)
  433. def str_rstrip(arr):
  434. """
  435. Strip whitespace (including newlines) from right side of each string in the
  436. array
  437. Returns
  438. -------
  439. stripped : array
  440. """
  441. return _na_map(lambda x: x.rstrip(), arr)
  442. def str_wrap(arr, width=80):
  443. """
  444. Wrap long strings to be formatted in paragraphs
  445. Parameters
  446. ----------
  447. width : int
  448. Maximum line-width
  449. Returns
  450. -------
  451. wrapped : array
  452. """
  453. raise NotImplementedError
  454. def str_get(arr, i):
  455. """
  456. Extract element from lists, tuples, or strings in each element in the array
  457. Parameters
  458. ----------
  459. i : int
  460. Integer index (location)
  461. Returns
  462. -------
  463. items : array
  464. """
  465. f = lambda x: x[i]
  466. return _na_map(f, arr)
  467. def str_decode(arr, encoding):
  468. """
  469. Decode character string to unicode using indicated encoding
  470. Parameters
  471. ----------
  472. encoding : string
  473. Returns
  474. -------
  475. decoded : array
  476. """
  477. f = lambda x: x.decode(encoding)
  478. return _na_map(f, arr)
  479. def str_encode(arr, encoding):
  480. """
  481. Encode character string to unicode using indicated encoding
  482. Parameters
  483. ----------
  484. encoding : string
  485. Returns
  486. -------
  487. encoded : array
  488. """
  489. f = lambda x: x.encode(encoding)
  490. return _na_map(f, arr)
  491. def _noarg_wrapper(f):
  492. def wrapper(self):
  493. result = f(self.series)
  494. return self._wrap_result(result)
  495. wrapper.__name__ = f.__name__
  496. if f.__doc__:
  497. wrapper.__doc__ = f.__doc__
  498. return wrapper
  499. def _pat_wrapper(f, flags=False, na=False):
  500. def wrapper1(self, pat):
  501. result = f(self.series, pat)
  502. return self._wrap_result(result)
  503. def wrapper2(self, pat, flags=0):
  504. result = f(self.series, pat, flags=flags)
  505. return self._wrap_result(result)
  506. def wrapper3(self, pat, na=np.nan):
  507. result = f(self.series, pat, na=na)
  508. return self._wrap_result(result)
  509. wrapper = wrapper3 if na else wrapper2 if flags else wrapper1
  510. wrapper.__name__ = f.__name__
  511. if f.__doc__:
  512. wrapper.__doc__ = f.__doc__
  513. return wrapper
  514. def copy(source):
  515. "Copy a docstring from another source function (if present)"
  516. def do_copy(target):
  517. if source.__doc__:
  518. target.__doc__ = source.__doc__
  519. return target
  520. return do_copy
  521. class StringMethods(object):
  522. """
  523. Vectorized string functions for Series. NAs stay NA unless handled
  524. otherwise by a particular method. Patterned after Python's string methods,
  525. with some inspiration from R's stringr package.
  526. Examples
  527. --------
  528. >>> s.str.split('_')
  529. >>> s.str.replace('_', '')
  530. """
  531. def __init__(self, series):
  532. self.series = series
  533. def __getitem__(self, key):
  534. if isinstance(key, slice):
  535. return self.slice(start=key.start, stop=key.stop,
  536. step=key.step)
  537. else:
  538. return self.get(key)
  539. def _wrap_result(self, result):
  540. return Series(result, index=self.series.index,
  541. name=self.series.name)
  542. @copy(str_cat)
  543. def cat(self, others=None, sep=None, na_rep=None):
  544. result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep)
  545. return self._wrap_result(result)
  546. @copy(str_split)
  547. def split(self, pat=None, n=0):
  548. result = str_split(self.series, pat, n=n)
  549. return self._wrap_result(result)
  550. @copy(str_get)
  551. def get(self, i):
  552. result = str_get(self.series, i)
  553. return self._wrap_result(result)
  554. @copy(str_join)
  555. def join(self, sep):
  556. result = str_join(self.series, sep)
  557. return self._wrap_result(result)
  558. @copy(str_contains)
  559. def contains(self, pat, case=True, flags=0, na=np.nan):
  560. result = str_contains(self.series, pat, case=case, flags=flags,
  561. na=np.nan)
  562. return self._wrap_result(result)
  563. @copy(str_replace)
  564. def replace(self, pat, repl, n=0, case=True):
  565. result = str_replace(self.series, pat, repl, n=n, case=case)
  566. return self._wrap_result(result)
  567. @copy(str_repeat)
  568. def repeat(self, repeats):
  569. result = str_repeat(self.series, repeats)
  570. return self._wrap_result(result)
  571. @copy(str_pad)
  572. def pad(self, width, side='left'):
  573. result = str_pad(self.series, width, side=side)
  574. return self._wrap_result(result)
  575. @copy(str_center)
  576. def center(self, width):
  577. result = str_center(self.series, width)
  578. return self._wrap_result(result)
  579. @copy(str_slice)
  580. def slice(self, start=None, stop=None, step=1):
  581. result = str_slice(self.series, start, stop)
  582. return self._wrap_result(result)
  583. @copy(str_slice)
  584. def slice_replace(self, i=None, j=None):
  585. raise NotImplementedError
  586. @copy(str_decode)
  587. def decode(self, encoding):
  588. result = str_decode(self.series, encoding)
  589. return self._wrap_result(result)
  590. @copy(str_encode)
  591. def encode(self, encoding):
  592. result = str_encode(self.series, encoding)
  593. return self._wrap_result(result)
  594. count = _pat_wrapper(str_count, flags=True)
  595. startswith = _pat_wrapper(str_startswith, na=True)
  596. endswith = _pat_wrapper(str_endswith, na=True)
  597. findall = _pat_wrapper(str_findall, flags=True)
  598. match = _pat_wrapper(str_match, flags=True)
  599. len = _noarg_wrapper(str_len)
  600. strip = _noarg_wrapper(str_strip)
  601. rstrip = _noarg_wrapper(str_rstrip)
  602. lstrip = _noarg_wrapper(str_lstrip)
  603. lower = _noarg_wrapper(str_lower)
  604. upper = _noarg_wrapper(str_upper)