/pandas/core/strings.py

http://github.com/wesm/pandas · Python · 3201 lines (truncated in this view)

# -*- coding: utf-8 -*-
import codecs
import re
import textwrap
import warnings

import numpy as np

import pandas._libs.lib as lib
import pandas._libs.ops as libops
import pandas.compat as compat
from pandas.compat import zip
from pandas.util._decorators import Appender, deprecate_kwarg

from pandas.core.dtypes.common import (
    ensure_object, is_bool_dtype, is_categorical_dtype, is_integer,
    is_list_like, is_object_dtype, is_re, is_scalar, is_string_like)
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna

from pandas.core.algorithms import take_1d
from pandas.core.base import NoNewAttributesMixin
import pandas.core.common as com

_cpython_optimized_encoders = (
    "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii"
)
_cpython_optimized_decoders = _cpython_optimized_encoders + (
    "utf-16", "utf-32"
)

_shared_docs = dict()


def cat_core(list_of_columns, sep):
    """
    Auxiliary function for :meth:`str.cat`

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns

    Returns
    -------
    np.ndarray
        The concatenation of list_of_columns with sep
    """
    list_with_sep = [sep] * (2 * len(list_of_columns) - 1)
    list_with_sep[::2] = list_of_columns
    return np.sum(list_with_sep, axis=0)
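
# Illustrative note (not part of the original file): ``cat_core`` builds a
# list that interleaves the separator between the columns and then lets
# ``np.sum`` concatenate element-wise, since ``+`` on object-dtype string
# arrays is string concatenation. A rough sketch of what that looks like:
#
#   >>> cols = [np.array(['a', 'b'], dtype=object),
#   ...         np.array(['1', '2'], dtype=object)]
#   >>> cat_core(cols, '-')    # intermediate list: [cols[0], '-', cols[1]]
#   array(['a-1', 'b-2'], dtype=object)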


def _na_map(f, arr, na_result=np.nan, dtype=object):
    # should really _check_ for NA
    return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)


def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
    if not len(arr):
        return np.ndarray(0, dtype=dtype)

    if isinstance(arr, ABCSeries):
        arr = arr.values
    if not isinstance(arr, np.ndarray):
        arr = np.asarray(arr, dtype=object)
    if na_mask:
        mask = isna(arr)
        try:
            convert = not all(mask)
            result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
        except (TypeError, AttributeError) as e:
            # Reraise the exception if callable `f` got wrong number of args.
            # The user may want to be warned by this, instead of getting NaN
            if compat.PY2:
                p_err = r'takes (no|(exactly|at (least|most)) ?\d+) arguments?'
            else:
                p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ '
                         r'(?(3)required )positional arguments?')

            if len(e.args) >= 1 and re.search(p_err, e.args[0]):
                raise e

            def g(x):
                try:
                    return f(x)
                except (TypeError, AttributeError):
                    return na_value

            return _map(g, arr, dtype=dtype)
        if na_value is not np.nan:
            np.putmask(result, mask, na_value)
            if result.dtype == object:
                result = lib.maybe_convert_objects(result)
        return result
    else:
        return lib.map_infer(arr, f)
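
# Illustrative note (not part of the original file): ``_na_map`` is the
# workhorse used by most functions below -- it applies ``f`` element-wise but
# skips missing values (optionally filling them with ``na_value``). A rough
# sketch of the behaviour, assuming an object-dtype input array:
#
#   >>> arr = np.array(['a', np.nan, 'b'], dtype=object)
#   >>> _na_map(lambda x: x.upper(), arr)
#   array(['A', nan, 'B'], dtype=object)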


def str_count(arr, pat, flags=0):
    """
    Count occurrences of pattern in each string of the Series/Index.

    This function is used to count the number of times a particular regex
    pattern is repeated in each of the string elements of the
    :class:`~pandas.Series`.

    Parameters
    ----------
    pat : str
        Valid regular expression.
    flags : int, default 0, meaning no flags
        Flags for the `re` module. For a complete list, `see here
        <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
    **kwargs
        For compatibility with other string methods. Not used.

    Returns
    -------
    Series or Index
        Same type as the calling object containing the integer counts.

    See Also
    --------
    re : Standard library module for regular expressions.
    str.count : Standard library version, without regular expression support.

    Notes
    -----
    Some characters need to be escaped when passing in `pat`.
    e.g. ``'$'`` has a special meaning in regex and must be escaped when
    finding this literal character.

    Examples
    --------
    >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
    >>> s.str.count('a')
    0    0.0
    1    0.0
    2    2.0
    3    2.0
    4    NaN
    5    0.0
    6    1.0
    dtype: float64

    Escape ``'$'`` to find the literal dollar sign.

    >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
    >>> s.str.count('\\$')
    0    1
    1    0
    2    1
    3    2
    4    2
    5    0
    dtype: int64

    This is also available on Index

    >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
    Int64Index([0, 0, 2, 1], dtype='int64')
    """
    regex = re.compile(pat, flags=flags)
    f = lambda x: len(regex.findall(x))
    return _na_map(f, arr, dtype=int)


def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
    """
    Test if pattern or regex is contained within a string of a Series or Index.

    Return boolean Series or Index based on whether a given pattern or regex is
    contained within a string of a Series or Index.

    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        Flags to pass through to the re module, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.
    regex : bool, default True
        If True, assumes the pat is a regular expression.
        If False, treats the pat as a literal string.

    Returns
    -------
    Series or Index of boolean values
        A Series or Index of boolean values indicating whether the
        given pattern is contained within the string of each element
        of the Series or Index.

    See Also
    --------
    match : Analogous, but stricter, relying on re.match instead of re.search.
    Series.str.startswith : Test if the start of each string element matches a
        pattern.
    Series.str.endswith : Same as startswith, but tests the end of string.

    Examples
    --------
    Returning a Series of booleans using only a literal pattern.

    >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
    >>> s1.str.contains('og', regex=False)
    0    False
    1     True
    2    False
    3    False
    4      NaN
    dtype: object

    Returning an Index of booleans using only a literal pattern.

    >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN])
    >>> ind.str.contains('23', regex=False)
    Index([False, False, False, True, nan], dtype='object')

    Specifying case sensitivity using `case`.

    >>> s1.str.contains('oG', case=True, regex=True)
    0    False
    1    False
    2    False
    3    False
    4      NaN
    dtype: object

    Specifying `na` to be `False` instead of `NaN` replaces NaN values
    with `False`. If Series or Index does not contain NaN values
    the resultant dtype will be `bool`, otherwise, an `object` dtype.

    >>> s1.str.contains('og', na=False, regex=True)
    0    False
    1     True
    2    False
    3    False
    4    False
    dtype: bool

    Returning 'house' or 'dog' when either expression occurs in a string.

    >>> s1.str.contains('house|dog', regex=True)
    0    False
    1     True
    2     True
    3    False
    4      NaN
    dtype: object

    Ignoring case sensitivity using `flags` with regex.

    >>> import re
    >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
    0    False
    1    False
    2     True
    3    False
    4      NaN
    dtype: object

    Returning any digit using regular expression.

    >>> s1.str.contains('\\d', regex=True)
    0    False
    1    False
    2    False
    3     True
    4      NaN
    dtype: object

    Ensure `pat` is not a literal pattern when `regex` is set to True.
    Note in the following example one might expect only `s2[1]` and `s2[3]` to
    return `True`. However, '.0' as a regex matches any character
    followed by a 0.

    >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])
    >>> s2.str.contains('.0', regex=True)
    0     True
    1     True
    2    False
    3     True
    4    False
    dtype: bool
    """
    if regex:
        if not case:
            flags |= re.IGNORECASE

        regex = re.compile(pat, flags=flags)

        if regex.groups > 0:
            warnings.warn("This pattern has match groups. To actually get the"
                          " groups, use str.extract.", UserWarning,
                          stacklevel=3)

        f = lambda x: bool(regex.search(x))
    else:
        if case:
            f = lambda x: pat in x
        else:
            upper_pat = pat.upper()
            f = lambda x: upper_pat in x
            uppered = _na_map(lambda x: x.upper(), arr)
            return _na_map(f, uppered, na, dtype=bool)
    return _na_map(f, arr, na, dtype=bool)
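
# Illustrative note (not part of the original file): on the literal
# (``regex=False``) case-insensitive path above, both the pattern and the
# values are upper-cased before the ``in`` membership test, so no regex
# machinery is involved. A rough sketch, assuming a plain object-dtype Series:
#
#   >>> pd.Series(['Dog', 'cat']).str.contains('DOG', case=False, regex=False)
#   0     True
#   1    False
#   dtype: bool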


def str_startswith(arr, pat, na=np.nan):
    """
    Test if the start of each string element matches a pattern.

    Equivalent to :meth:`str.startswith`.

    Parameters
    ----------
    pat : str
        Character sequence. Regular expressions are not accepted.
    na : object, default NaN
        Object shown if element tested is not a string.

    Returns
    -------
    Series or Index of bool
        A Series of booleans indicating whether the given pattern matches
        the start of each string element.

    See Also
    --------
    str.startswith : Python standard library string method.
    Series.str.endswith : Same as startswith, but tests the end of string.
    Series.str.contains : Tests if string element contains a pattern.

    Examples
    --------
    >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
    >>> s
    0     bat
    1    Bear
    2     cat
    3     NaN
    dtype: object

    >>> s.str.startswith('b')
    0     True
    1    False
    2    False
    3      NaN
    dtype: object

    Specifying `na` to be `False` instead of `NaN`.

    >>> s.str.startswith('b', na=False)
    0     True
    1    False
    2    False
    3    False
    dtype: bool
    """
    f = lambda x: x.startswith(pat)
    return _na_map(f, arr, na, dtype=bool)


def str_endswith(arr, pat, na=np.nan):
    """
    Test if the end of each string element matches a pattern.

    Equivalent to :meth:`str.endswith`.

    Parameters
    ----------
    pat : str
        Character sequence. Regular expressions are not accepted.
    na : object, default NaN
        Object shown if element tested is not a string.

    Returns
    -------
    Series or Index of bool
        A Series of booleans indicating whether the given pattern matches
        the end of each string element.

    See Also
    --------
    str.endswith : Python standard library string method.
    Series.str.startswith : Same as endswith, but tests the start of string.
    Series.str.contains : Tests if string element contains a pattern.

    Examples
    --------
    >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
    >>> s
    0     bat
    1    bear
    2     caT
    3     NaN
    dtype: object

    >>> s.str.endswith('t')
    0     True
    1    False
    2    False
    3      NaN
    dtype: object

    Specifying `na` to be `False` instead of `NaN`.

    >>> s.str.endswith('t', na=False)
    0     True
    1    False
    2    False
    3    False
    dtype: bool
    """
    f = lambda x: x.endswith(pat)
    return _na_map(f, arr, na, dtype=bool)


def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
    r"""
    Replace occurrences of pattern/regex in the Series/Index with
    some other string. Equivalent to :meth:`str.replace` or
    :func:`re.sub`.

    Parameters
    ----------
    pat : str or compiled regex
        String can be a character sequence or regular expression.

        .. versionadded:: 0.20.0
            `pat` also accepts a compiled regex.

    repl : str or callable
        Replacement string or a callable. The callable is passed the regex
        match object and must return a replacement string to be used.
        See :func:`re.sub`.

        .. versionadded:: 0.20.0
            `repl` also accepts a callable.

    n : int, default -1 (all)
        Number of replacements to make from start.
    case : bool, default None
        - If True, case sensitive (the default if `pat` is a string)
        - Set to False for case insensitive
        - Cannot be set if `pat` is a compiled regex
    flags : int, default 0 (no flags)
        - re module flags, e.g. re.IGNORECASE
        - Cannot be set if `pat` is a compiled regex
    regex : bool, default True
        - If True, assumes the passed-in pattern is a regular expression.
        - If False, treats the pattern as a literal string
        - Cannot be set to False if `pat` is a compiled regex or `repl` is
          a callable.

        .. versionadded:: 0.23.0

    Returns
    -------
    Series or Index of object
        A copy of the object with all matching occurrences of `pat` replaced by
        `repl`.

    Raises
    ------
    ValueError
        * if `regex` is False and `repl` is a callable or `pat` is a compiled
          regex
        * if `pat` is a compiled regex and `case` or `flags` is set

    Notes
    -----
    When `pat` is a compiled regex, all flags should be included in the
    compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
    regex will raise an error.

    Examples
    --------
    When `pat` is a string and `regex` is True (the default), the given `pat`
    is compiled as a regex. When `repl` is a string, it replaces matching
    regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
    left as is:

    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
    0    bao
    1    baz
    2    NaN
    dtype: object

    When `pat` is a string and `regex` is False, every `pat` is replaced with
    `repl` as with :meth:`str.replace`:

    >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
    0    bao
    1    fuz
    2    NaN
    dtype: object

    When `repl` is a callable, it is called on every `pat` using
    :func:`re.sub`. The callable should expect one positional argument
    (a regex object) and return a string.

    To get the idea:

    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)
    0    <_sre.SRE_Match object; span=(0, 1), match='f'>oo
    1    <_sre.SRE_Match object; span=(0, 1), match='f'>uz
    2                                                  NaN
    dtype: object

    Reverse every lowercase alphabetic word:

    >>> repl = lambda m: m.group(0)[::-1]
    >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
    0    oof 123
    1    rab zab
    2        NaN
    dtype: object

    Using regex groups (extract second group and swap case):

    >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
    >>> repl = lambda m: m.group('two').swapcase()
    >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
    0    tWO
    1    bAR
    dtype: object

    Using a compiled regex with flags

    >>> import re
    >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
    0    foo
    1    bar
    2    NaN
    dtype: object
    """

    # Check whether repl is valid (GH 13438, GH 15055)
    if not (is_string_like(repl) or callable(repl)):
        raise TypeError("repl must be a string or callable")

    is_compiled_re = is_re(pat)
    if regex:
        if is_compiled_re:
            if (case is not None) or (flags != 0):
                raise ValueError("case and flags cannot be set"
                                 " when pat is a compiled regex")
        else:
            # not a compiled regex
            # set default case
            if case is None:
                case = True

            # add case flag, if provided
            if case is False:
                flags |= re.IGNORECASE

        if is_compiled_re or len(pat) > 1 or flags or callable(repl):
            n = n if n >= 0 else 0
            compiled = re.compile(pat, flags=flags)
            f = lambda x: compiled.sub(repl=repl, string=x, count=n)
        else:
            f = lambda x: x.replace(pat, repl, n)
    else:
        if is_compiled_re:
            raise ValueError("Cannot use a compiled regex as replacement "
                             "pattern with regex=False")
        if callable(repl):
            raise ValueError("Cannot use a callable replacement when "
                             "regex=False")
        f = lambda x: x.replace(pat, repl, n)

    return _na_map(f, arr)
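
# Illustrative note (not part of the original file): the dispatch above only
# compiles a regex when the pattern is a compiled regex, longer than one
# character, uses flags, or has a callable replacement; a one-character string
# pattern with no flags falls through to plain ``str.replace`` and is treated
# literally. A rough sketch of the consequence:
#
#   >>> pd.Series(['a.c']).str.replace('.', 'X')   # single char -> literal
#   0    aXc
#   dtype: object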


def str_repeat(arr, repeats):
    """
    Duplicate each string in the Series or Index.

    Parameters
    ----------
    repeats : int or sequence of int
        Same value for all (int) or different value per (sequence).

    Returns
    -------
    Series or Index of object
        Series or Index of repeated string objects specified by
        input parameter repeats.

    Examples
    --------
    >>> s = pd.Series(['a', 'b', 'c'])
    >>> s
    0    a
    1    b
    2    c
    dtype: object

    Single int repeats string in Series

    >>> s.str.repeat(repeats=2)
    0    aa
    1    bb
    2    cc
    dtype: object

    Sequence of int repeats corresponding string in Series

    >>> s.str.repeat(repeats=[1, 2, 3])
    0      a
    1     bb
    2    ccc
    dtype: object
    """
    if is_scalar(repeats):
        def rep(x):
            try:
                return compat.binary_type.__mul__(x, repeats)
            except TypeError:
                return compat.text_type.__mul__(x, repeats)

        return _na_map(rep, arr)
    else:
        def rep(x, r):
            try:
                return compat.binary_type.__mul__(x, r)
            except TypeError:
                return compat.text_type.__mul__(x, r)

        repeats = np.asarray(repeats, dtype=object)
        result = libops.vec_binop(com.values_from_object(arr), repeats, rep)
        return result


def str_match(arr, pat, case=True, flags=0, na=np.nan):
    """
    Determine if each string matches a regular expression.

    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.

    Returns
    -------
    Series/array of boolean values

    See Also
    --------
    contains : Analogous, but less strict, relying on re.search instead of
        re.match.
    extract : Extract matched groups.
    """
    if not case:
        flags |= re.IGNORECASE

    regex = re.compile(pat, flags=flags)

    dtype = bool
    f = lambda x: bool(regex.match(x))

    return _na_map(f, arr, na, dtype=dtype)


def _get_single_group_name(rx):
    try:
        return list(rx.groupindex.keys()).pop()
    except IndexError:
        return None


def _groups_or_na_fun(regex):
    """Used in both extract_noexpand and extract_frame"""
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")
    empty_row = [np.nan] * regex.groups

    def f(x):
        if not isinstance(x, compat.string_types):
            return empty_row
        m = regex.search(x)
        if m:
            return [np.nan if item is None else item for item in m.groups()]
        else:
            return empty_row
    return f
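
# Illustrative note (not part of the original file): the helper above turns a
# compiled regex into a row-building function -- non-strings and non-matches
# yield a row of NaNs, and unmatched optional groups become NaN. A rough
# sketch:
#
#   >>> f = _groups_or_na_fun(re.compile(r'([ab])(\d)?'))
#   >>> f('a1'), f('b'), f(3.14)
#   (['a', '1'], ['b', nan], [nan, nan])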


def _str_extract_noexpand(arr, pat, flags=0):
    """
    Find groups in each string in the Series using passed regular
    expression. This function is called from
    str_extract(expand=False), and can return Series, DataFrame, or
    Index.
    """
    from pandas import DataFrame, Index

    regex = re.compile(pat, flags=flags)
    groups_or_na = _groups_or_na_fun(regex)

    if regex.groups == 1:
        result = np.array([groups_or_na(val)[0] for val in arr], dtype=object)
        name = _get_single_group_name(regex)
    else:
        if isinstance(arr, Index):
            raise ValueError("only one regex group is supported with Index")
        name = None
        names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
        columns = [names.get(1 + i, i) for i in range(regex.groups)]
        if arr.empty:
            result = DataFrame(columns=columns, dtype=object)
        else:
            result = DataFrame(
                [groups_or_na(val) for val in arr],
                columns=columns,
                index=arr.index,
                dtype=object)
    return result, name


def _str_extract_frame(arr, pat, flags=0):
    """
    For each subject string in the Series, extract groups from the
    first match of regular expression pat. This function is called from
    str_extract(expand=True), and always returns a DataFrame.
    """
    from pandas import DataFrame

    regex = re.compile(pat, flags=flags)
    groups_or_na = _groups_or_na_fun(regex)
    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
    columns = [names.get(1 + i, i) for i in range(regex.groups)]

    if len(arr) == 0:
        return DataFrame(columns=columns, dtype=object)
    try:
        result_index = arr.index
    except AttributeError:
        result_index = None
    return DataFrame(
        [groups_or_na(val) for val in arr],
        columns=columns,
        index=result_index,
        dtype=object)


def str_extract(arr, pat, flags=0, expand=True):
    r"""
    Extract capture groups in the regex `pat` as columns in a DataFrame.

    For each subject string in the Series, extract groups from the
    first match of regular expression `pat`.

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
        modify regular expression matching for things like case,
        spaces, etc. For more details, see :mod:`re`.
    expand : bool, default True
        If True, return DataFrame with one column per capture group.
        If False, return a Series/Index if there is one capture group
        or DataFrame if there are multiple capture groups.

        .. versionadded:: 0.18.0

    Returns
    -------
    DataFrame or Series or Index
        A DataFrame with one row for each subject string, and one
        column for each group. Any capture group names in regular
        expression pat will be used for column names; otherwise
        capture group numbers will be used. The dtype of each result
        column is always object, even when no match is found. If
        ``expand=False`` and pat has only one capture group, then
        return a Series (if subject is a Series) or Index (if subject
        is an Index).

    See Also
    --------
    extractall : Returns all matches (not just the first match).

    Examples
    --------
    A pattern with two groups will return a DataFrame with two columns.
    Non-matches will be NaN.

    >>> s = pd.Series(['a1', 'b2', 'c3'])
    >>> s.str.extract(r'([ab])(\d)')
         0    1
    0    a    1
    1    b    2
    2  NaN  NaN

    A pattern may contain optional groups.

    >>> s.str.extract(r'([ab])?(\d)')
         0  1
    0    a  1
    1    b  2
    2  NaN  3

    Named groups will become column names in the result.

    >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
      letter digit
    0      a     1
    1      b     2
    2    NaN   NaN

    A pattern with one group will return a DataFrame with one column
    if expand=True.

    >>> s.str.extract(r'[ab](\d)', expand=True)
         0
    0    1
    1    2
    2  NaN

    A pattern with one group will return a Series if expand=False.

    >>> s.str.extract(r'[ab](\d)', expand=False)
    0      1
    1      2
    2    NaN
    dtype: object
    """
    if not isinstance(expand, bool):
        raise ValueError("expand must be True or False")
    if expand:
        return _str_extract_frame(arr._orig, pat, flags=flags)
    else:
        result, name = _str_extract_noexpand(arr._parent, pat, flags=flags)
        return arr._wrap_result(result, name=name, expand=expand)


def str_extractall(arr, pat, flags=0):
    r"""
    For each subject string in the Series, extract groups from all
    matches of regular expression pat. When each subject string in the
    Series has exactly one match, extractall(pat).xs(0, level='match')
    is the same as extract(pat).

    .. versionadded:: 0.18.0

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        A ``re`` module flag, for example ``re.IGNORECASE``. These allow
        to modify regular expression matching for things like case, spaces,
        etc. Multiple flags can be combined with the bitwise OR operator,
        for example ``re.IGNORECASE | re.MULTILINE``.

    Returns
    -------
    DataFrame
        A ``DataFrame`` with one row for each match, and one column for each
        group. Its rows have a ``MultiIndex`` with first levels that come from
        the subject ``Series``. The last level is named 'match' and indexes the
        matches in each item of the ``Series``. Any capture group names in
        regular expression pat will be used for column names; otherwise capture
        group numbers will be used.

    See Also
    --------
    extract : Returns first match only (not all matches).

    Examples
    --------
    A pattern with one group will return a DataFrame with one column.
    Indices with no matches will not appear in the result.

    >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
    >>> s.str.extractall(r"[ab](\d)")
             0
      match
    A 0      1
      1      2
    B 0      1

    Capture group names are used for column names of the result.

    >>> s.str.extractall(r"[ab](?P<digit>\d)")
            digit
      match
    A 0         1
      1         2
    B 0         1

    A pattern with two groups will return a DataFrame with two columns.

    >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
            letter digit
      match
    A 0          a     1
      1          a     2
    B 0          b     1

    Optional groups that do not match are NaN in the result.

    >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
            letter digit
      match
    A 0          a     1
      1          a     2
    B 0          b     1
    C 0        NaN     1
    """
    regex = re.compile(pat, flags=flags)
    # the regex must contain capture groups.
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndexClass):
        arr = arr.to_series().reset_index(drop=True)

    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
    columns = [names.get(1 + i, i) for i in range(regex.groups)]
    match_list = []
    index_list = []
    is_mi = arr.index.nlevels > 1

    for subject_key, subject in arr.iteritems():
        if isinstance(subject, compat.string_types):

            if not is_mi:
                subject_key = (subject_key, )

            for match_i, match_tuple in enumerate(regex.findall(subject)):
                if isinstance(match_tuple, compat.string_types):
                    match_tuple = (match_tuple,)
                na_tuple = [np.NaN if group == "" else group
                            for group in match_tuple]
                match_list.append(na_tuple)
                result_key = tuple(subject_key + (match_i, ))
                index_list.append(result_key)

    from pandas import MultiIndex
    index = MultiIndex.from_tuples(
        index_list, names=arr.index.names + ["match"])

    result = arr._constructor_expanddim(match_list, index=index,
                                        columns=columns)
    return result
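
# Illustrative note (not part of the original file): the loop above relies on
# ``re.findall``, which returns '' for optional groups that did not
# participate in a match; those empty strings are mapped to NaN before the
# rows are assembled, and each row's index key is the subject's index key
# plus the running match number (the 'match' level of the MultiIndex).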


def str_get_dummies(arr, sep='|'):
    """
    Split each string in the Series by sep and return a DataFrame
    of dummy/indicator variables.

    Parameters
    ----------
    sep : str, default "|"
        String to split on.

    Returns
    -------
    DataFrame
        Dummy variables corresponding to values of the Series.

    See Also
    --------
    get_dummies : Convert categorical variable into dummy/indicator
        variables.

    Examples
    --------
    >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  1  0  0
    2  1  0  1

    >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  0  0  0
    2  1  0  1
    """
    arr = arr.fillna('')
    try:
        arr = sep + arr + sep
    except TypeError:
        arr = sep + arr.astype(str) + sep

    tags = set()
    for ts in arr.str.split(sep):
        tags.update(ts)
    tags = sorted(tags - {""})

    dummies = np.empty((len(arr), len(tags)), dtype=np.int64)
    for i, t in enumerate(tags):
        pat = sep + t + sep
        dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x)
    return dummies, tags
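
# Illustrative note (not part of the original file): wrapping every value and
# every tag in the separator before the membership test is what prevents
# partial matches -- e.g. for the value 'ab' the padded string is '|ab|', and
# the tag 'a' is looked up as '|a|', which does not occur, so 'ab' does not
# spuriously set the 'a' dummy.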


def str_join(arr, sep):
    """
    Join lists contained as elements in the Series/Index with passed delimiter.

    If the elements of a Series are lists themselves, join the content of these
    lists using the delimiter passed to the function.
    This function is an equivalent to :meth:`str.join`.

    Parameters
    ----------
    sep : str
        Delimiter to use between list entries.

    Returns
    -------
    Series/Index: object
        The list entries concatenated by intervening occurrences of the
        delimiter.

    Raises
    ------
    AttributeError
        If the supplied Series contains neither strings nor lists.

    See Also
    --------
    str.join : Standard library version of this method.
    Series.str.split : Split strings around given separator/delimiter.

    Notes
    -----
    If any of the list items is not a string object, the result of the join
    will be `NaN`.

    Examples
    --------
    Example with a list that contains non-string elements.

    >>> s = pd.Series([['lion', 'elephant', 'zebra'],
    ...                [1.1, 2.2, 3.3],
    ...                ['cat', np.nan, 'dog'],
    ...                ['cow', 4.5, 'goat'],
    ...                ['duck', ['swan', 'fish'], 'guppy']])
    >>> s
    0        [lion, elephant, zebra]
    1                [1.1, 2.2, 3.3]
    2                [cat, nan, dog]
    3               [cow, 4.5, goat]
    4    [duck, [swan, fish], guppy]
    dtype: object

    Join all lists using a '-'. The lists containing object(s) of types other
    than str will produce a NaN.

    >>> s.str.join('-')
    0    lion-elephant-zebra
    1                    NaN
    2                    NaN
    3                    NaN
    4                    NaN
    dtype: object
    """
    return _na_map(sep.join, arr)


def str_findall(arr, pat, flags=0):
    """
    Find all occurrences of pattern or regular expression in the Series/Index.

    Equivalent to applying :func:`re.findall` to all the elements in the
    Series/Index.

    Parameters
    ----------
    pat : str
        Pattern or regular expression.
    flags : int, default 0
        Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which
        means no flags).

    Returns
    -------
    Series/Index of lists of strings
        All non-overlapping matches of pattern or regular expression in each
        string of this Series/Index.

    See Also
    --------
    count : Count occurrences of pattern or regular expression in each string
        of the Series/Index.
    extractall : For each string in the Series, extract groups from all matches
        of regular expression and return a DataFrame with one row for each
        match and one column for each group.
    re.findall : The equivalent ``re`` function to all non-overlapping matches
        of pattern or regular expression in string, as a list of strings.

    Examples
    --------
    >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])

    The search for the pattern 'Monkey' returns one match:

    >>> s.str.findall('Monkey')
    0          []
    1    [Monkey]
    2          []
    dtype: object

    On the other hand, the search for the pattern 'MONKEY' doesn't return any
    match:

    >>> s.str.findall('MONKEY')
    0    []
    1    []
    2    []
    dtype: object

    Flags can be added to the pattern or regular expression. For instance,
    to find the pattern 'MONKEY' ignoring the case:

    >>> import re
    >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
    0          []
    1    [Monkey]
    2          []
    dtype: object

    When the pattern matches more than one string in the Series, all matches
    are returned:

    >>> s.str.findall('on')
    0    [on]
    1    [on]
    2      []
    dtype: object

    Regular expressions are supported too. For instance, the search for all the
    strings ending with the word 'on' is shown next:

    >>> s.str.findall('on$')
    0    [on]
    1      []
    2      []
    dtype: object

    If the pattern is found more than once in the same string, then a list of
    multiple strings is returned:

    >>> s.str.findall('b')
    0        []
    1        []
    2    [b, b]
    dtype: object
    """
    regex = re.compile(pat, flags=flags)
    return _na_map(regex.findall, arr)


def str_find(arr, sub, start=0, end=None, side='left'):
    """
    Return indexes in each string in the Series/Index where the
    substring is fully contained between [start:end]. Return -1 on failure.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.
    side : {'left', 'right'}, default 'left'
        Specifies a starting side, equivalent to ``find`` or ``rfind``.

    Returns
    -------
    Series or Index
        Indexes where substring is found.
    """

    if not isinstance(sub, compat.string_types):
        msg = 'expected a string object, not {0}'
        raise TypeError(msg.format(type(sub).__name__))

    if side == 'left':
        method = 'find'
    elif side == 'right':
        method = 'rfind'
    else:  # pragma: no cover
        raise ValueError('Invalid side')

    if end is None:
        f = lambda x: getattr(x, method)(sub, start)
    else:
        f = lambda x: getattr(x, method)(sub, start, end)

    return _na_map(f, arr, dtype=int)


def str_index(arr, sub, start=0, end=None, side='left'):
    if not isinstance(sub, compat.string_types):
        msg = 'expected a string object, not {0}'
        raise TypeError(msg.format(type(sub).__name__))

    if side == 'left':
        method = 'index'
    elif side == 'right':
        method = 'rindex'
    else:  # pragma: no cover
        raise ValueError('Invalid side')

    if end is None:
        f = lambda x: getattr(x, method)(sub, start)
    else:
        f = lambda x: getattr(x, method)(sub, start, end)

    return _na_map(f, arr, dtype=int)


def str_pad(arr, width, side='left', fillchar=' '):
    """
    Pad strings in the Series/Index up to width.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be filled
        with character defined in `fillchar`.
    side : {'left', 'right', 'both'}, default 'left'
        Side from which to fill resulting string.
    fillchar : str, default ' '
        Additional character for filling, default is whitespace.

    Returns
    -------
    Series or Index of object
        Returns Series or Index with minimum number of char in object.

    See Also
    --------
    Series.str.rjust : Fills the left side of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='left')``.
    Series.str.ljust : Fills the right side of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='right')``.
    Series.str.center : Fills both sides of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='both')``.
    Series.str.zfill : Pad strings in the Series/Index by prepending '0'
        character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.

    Examples
    --------
    >>> s = pd.Series(["caribou", "tiger"])
    >>> s
    0    caribou
    1      tiger
    dtype: object

    >>> s.str.pad(width=10)
    0       caribou
    1         tiger
    dtype: object

    >>> s.str.pad(width=10, side='right', fillchar='-')
    0    caribou---
    1    tiger-----
    dtype: object

    >>> s.str.pad(width=10, side='both', fillchar='-')
    0    -caribou--
    1    --tiger---
    dtype: object
    """
    if not isinstance(fillchar, compat.string_types):
        msg = 'fillchar must be a character, not {0}'
        raise TypeError(msg.format(type(fillchar).__name__))

    if len(fillchar) != 1:
        raise TypeError('fillchar must be a character, not str')

    if not is_integer(width):
        msg = 'width must be of integer type, not {0}'
        raise TypeError(msg.format(type(width).__name__))

    if side == 'left':
        f = lambda x: x.rjust(width, fillchar)
    elif side == 'right':
        f = lambda x: x.ljust(width, fillchar)
    elif side == 'both':
        f = lambda x: x.center(width, fillchar)
    else:  # pragma: no cover
        raise ValueError('Invalid side')

    return _na_map(f, arr)


def str_split(arr, pat=None, n=None):

    if pat is None:
        if n is None or n == 0:
            n = -1
        f = lambda x: x.split(pat, n)
    else:
        if len(pat) == 1:
            if n is None or n == 0:
                n = -1
            f = lambda x: x.split(pat, n)
        else:
            if n is None or n == -1:
                n = 0
            regex = re.compile(pat)
            f = lambda x: regex.split(x, maxsplit=n)
    res = _na_map(f, arr)
    return res
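
# Illustrative note (not part of the original file): ``str_split`` only
# compiles a regex when the separator is longer than one character; ``None``
# or a single-character ``pat`` goes through plain ``str.split``. The two code
# paths also encode "split on every occurrence" differently: -1 for
# ``str.split`` versus maxsplit=0 for ``re.split``, which is why ``n`` is
# normalised separately in each branch.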


def str_rsplit(arr, pat=None, n=None):

    if n is None or n == 0:
        n = -1
    f = lambda x: x.rsplit(pat, n)
    res = _na_map(f, arr)
    return res


def str_slice(arr, start=None, stop=None, step=None):
    """
    Slice substrings from each element in the Series or Index.

    Parameters
    ----------
    start : int, optional
        Start position for slice operation.
    stop : int, optional
        Stop position for slice operation.
    step : int, optional
        Step size for slice operation.

    Returns
    -------
    Series or Index of object
        Series or Index from sliced substring from original string object.

    See Also
    --------
    Series.str.slice_replace : Replace a slice with a string.
    Series.str.get : Return element at position.
        Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
        being the position.

    Examples
    --------
    >>> s = pd.Series(["koala", "fox", "chameleon"])
    >>> s
    0        koala
    1          fox
    2    chameleon
    dtype: object

    >>> s.str.slice(start=1)
    0        oala
    1          ox
    2    hameleon
    dtype: object

    >>> s.str.slice(stop=2)
    0    ko
    1    fo
    2    ch
    dtype: object

    >>> s.str.slice(step=2)
    0      kaa
    1       fx
    2    caeen
    dtype: object

    >>> s.str.slice(start=0, stop=5, step=3)
    0    kl
    1     f
    2    cm
    dtype: object

    Equivalent behaviour to:

    >>> s.str[0:5:3]
    0    kl
    1     f
    2    cm
    dtype: object
    """
    obj = slice(start, stop, step)
    f = lambda x: x[obj]
    return _na_map(f, arr)


def str_slice_replace(arr, start=None, stop=None, repl=None):
    """
    Replace a positional slice of a string with another value.

    Parameters
    ----------
    start : int, optional
        Left index position to use for the slice. If not specified (None),
        the slice is unbounded on the left, i.e. slice from the start
        of the string.
    stop : int, optional
        Right index position to use for the slice. If not specified (None),
        the slice is unbounded on the right, i.e. slice until the
        end of the string.
    repl : str, optional
        String for replacement. If not specified (None), the sliced region
        is replaced with an empty string.

    Returns
    -------
    Series or Index
        Same type as the original object.

    See Also
    --------
    Series.str.slice : Just slicing without replacement.

    Examples
    --------
    >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
    >>> s
    0        a
    1       ab
    2      abc
    3     abdc
    4    abcde
    dtype: object

    Specify just `start`, meaning replace `start` until the end of the
    string with `repl`.

    >>> s.str.slice_replace(1, repl='X')
    0    aX
    1    aX
    2    aX
    3    aX
    4    aX
    dtype: object

    Specify just `stop`, meaning the start of the string to `stop` is replaced
    with `repl`, and the rest of the string is included.

    >>> s.str.slice_replace(stop=2, repl='X')
    0       X
    1       X
    2      Xc
    3     Xdc
    4    Xcde
    dtype: object

    Specify `start` and `stop`, meaning the slice from `start` to `stop` is
    replaced with `repl`. Everything before or after `start` and `stop` is
    included as is.

    >>> s.str.slice_replace(start=1, stop=3, repl='X')
    0      aX
    1      aX
    2      aX
    3     aXc
    4    aXde
    dtype: object
    """
    if repl is None:
        repl = ''

    def f(x):
        if x[start:stop] == '':
            local_stop = start
        else:
            local_stop = stop
        y = ''
        if start is not None:
            y += x[:start]
        y += repl
        if stop is not None:
            y += x[local_stop:]
        return y

    return _na_map(f, arr)
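
# Illustrative note (not part of the original file): the ``local_stop``
# adjustment in ``f`` above handles slices that select nothing
# (``x[start:stop] == ''``, e.g. because the string is shorter than ``start``
# or because ``stop`` lies before ``start``); in that case the tail is taken
# from ``start`` rather than ``stop``, so ``repl`` is inserted at position
# ``start`` without duplicating any characters in the result.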


def str_strip(arr, to_strip=None, side='both'):
    """
    Strip whitespace (including newlines) from each string in the
    Series/Index.

    Parameters
    ----------
    to_strip : str or unicode
    side : {'left', 'right', 'both'}, default 'both'

    Returns
    -------
    Series or Index
    """
    if side == 'both':
        f = lambda x: x.strip(to_strip)
    elif side == 'left':
        f = lambda x: x.lstrip(to_strip)
    elif side == 'right':
        f = lambda x: x.rstrip(to_strip)
    else:  # pragma: no cover
        raise ValueError('Invalid side')
    return _na_map(f, arr)


def str_wrap(arr, width, **kwargs):
    r"""
    Wrap long strings in the Series/Index to be formatted in
    paragraphs with length less than a given width.

    This method has the same keyword parameters and defaults as
    :class:`textwrap.TextWrapper`.

    Parameters
    ----------
    width : int
        Maximum line width.
    expand_tabs : bool, optional
        If True, tab characters will be expanded to spaces (default: True).
    replace_whitespace : bool, optional
        If True, each whitespace character (as defined by string.whitespace)
        remaining after tab expansion will be replaced by a single space
        (default: True).
    drop_whitespace : bool, optional
        If True, whitespace that, after wrapping, happens to end up at the
        beginning or end of a line is dropped (default: True).
    break_long_words : bool, optional
        If True, then words longer than width will be broken in order to ensure
        that no lines are longer than width. If it is false, long words will
        not be broken, and some lines may be longer than width (default: True).
    break_on_hyphens : bool, optional
        If True, wrapping will occur preferably on whitespace and right after
        hyphens in compound words, as it is customary in English. If false,
        only whitespaces will be considered as potentially good places for line
        breaks, but you need to set break_long_words to false if you want truly
        insecable words (default: True).

    Returns
    -------
    Series or Index

    Notes
    -----
    Internally, this method uses a :class:`textwrap.TextWrapper` instance with
    default settings. To achieve behavior matching R's stringr library str_wrap
    function, use the arguments:

    - expand_tabs = False
    - replace_whitespace = True
    - drop_whitespace = True
    - break_long_words = False
    - break_on_hyphens = False

    Examples
    --------
    >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
    >>> s.str.wrap(12)
    0             line to be\nwrapped
    1    another line\nto be\nwrapped
    dtype: object
    """
    kwargs['width'] = width

    tw = textwrap.TextWrapper(**kwargs)

    return _na_map(lambda s: '\n'.join(tw.wrap(s)), arr)


def str_translate(arr, table, deletechars=None):
    """
    Map all characters in the string through the given mapping table.
    Equivalent to standard :meth:`str.translate`. Note that the optional
    argument deletechars is only valid if you are using python 2. For python 3,
    character deletion should be specified via the table argument.

    Parameters
    ----------
    table : dict (python 3), str or None (python 2)
        In python 3, table is a mapping of Unicode ordinals to Unicode
        ordinals, strings, or None. Unmapped characters are left untouched.
        Characters mapped to None are deleted. :meth:`str.maketrans` is a
        helper function for making translation tables.

        In python 2, table is either a string of length 256 or None. If the
        table argument is None, no translation is applied and the operation
        simply removes the characters in deletechars. :func:`string.maketrans`
        is a helper function for making translation tables.
    deletechars : str, optional (python 2)
        A string of characters to delete. This argument is only valid
        in python 2.

    Returns
    -------
    Series or Index
    """
    if deletechars is None:
        f = lambda x: x.translate(table)
    else:
        if compat.PY3:
            raise ValueError("deletechars is not a valid argument for "
                             "str.translate in python 3. You should simply "
                             "specify character deletions in the table "
                             "argument")
        f = lambda x: x.translate(table, deletechars)
    return _na_map(f, arr)


def str_get(arr, i):
    """
    Extract element from each component at specified position.

    Extract element from lists, tuples, or strings in each element in the
    Series/Index.

    Parameters
    ----------
    i : int
        Position of element to extract.

    Returns
    -------
    Series or Index

    Examples
    --------
    >>> s = pd.Series(["String",
    ...                (1, 2, 3),
    ...                ["a", "b", "c"],
    ...                123,
    ...                -456,
    ...                {1: "Hello", "2": "World"}])
    >>> s
    0                        String
    1                     (1, 2, 3)
    2                     [a, b, c]
    3                           123
    4                          -456
    5    {1: 'Hello', '2': 'World'}
    dtype: object

    >>> s.str.get(1)
    0        t
    1        2
    2        b
    3      NaN
    4      NaN
    5    Hello
    dtype: object

    >>> s.str.get(-1)
    0       g
    1       3
    2       c
    3     NaN
    4     NaN
    5    None
    dtype: object
    """
    def f(x):
        if isinstance(x, dict):
            return x.get(i)
        elif len(x) > i >= -len(x):
            return x[i]
        return np.nan
    return _na_map(f, arr)


def str_decode(arr, encoding, errors="strict"):
    """
    Decode character string in the Series/Index using indicated encoding.
    Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in
    python3.

    Parameters
    ----------
    encoding : str
    errors : str, optional

    Returns
    -------
    Series or Index
    """
    if encoding in _cpython_optimized_decoders:
        # CPython optimized implementation
        f = lambda x: x.decode(encoding, errors)
    else:
        decoder = codecs.getdecoder(encoding)
        f = lambda x: decoder(x, errors)[0]
    return _na_map(f, arr)


def str_encode(arr, encoding, errors="strict"):
    """
    Encode character string in the Series/Index using indicated encoding.
    Equivalent to :meth:`str.encode`.

    Parameters
    ----------
    encoding : str
    errors : str, optional

    Returns
    -------
    encoded : Series/Index of objects
    """
    if encoding in _cpython_optimized_encoders:
        # CPython optimized implementation
        f = lambda x: x.encode(encoding, errors)
    else:
        encoder = codecs.getencoder(encoding)
        f = lambda x: encoder(x, errors)[0]
    return _na_map(f, arr)


def _noarg_wrapper(f, docstring=None, **kargs):
    def wrapper(self):
        result = _na_map(f, self._parent, **kargs)
        return self._wrap_result(result)

    wrapper.__name__ = f.__name__
    if docstring is not None:
        wrapper.__doc__ = docstring
    else:
        raise ValueError('Provide docstring')

    return wrapper
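
# Illustrative note (not part of the original file): ``_noarg_wrapper`` turns
# a plain element-wise function into a no-argument accessor method; this is
# presumably how methods such as ``Series.str.lower`` are built further down
# in this module, roughly along the lines of:
#
#   lower = _noarg_wrapper(lambda x: x.lower(),
#                          docstring='Convert strings to lowercase.')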


def _pat_wrapper(f, flags=False, na=False, **kwargs):
    def wrapper1(self, pat):
        result = f(self._parent, pat)
        return self._wrap_result(result)

    def wrapper2(self, pat, flags=0, **kwargs):
        result = f(self._parent, pat, flags=flags, **kwargs)
        return self._wrap_result(result)

    def wrapper3(self, pat, na=np.nan):
        result = f(self._parent, pat, na=na)
        return self._wrap_result(result)

    wrapper = wrapper3 if na else wrapper2 if flags else wrapper1

    wrapper.__name__ = f.__name__
    if f.__doc__:
        wrapper.__doc__ = f.__doc__

    return wrapper


def copy(source):
    "Copy a docstring from another source function (if…