
/pandas/core/strings.py

http://github.com/wesm/pandas
# -*- coding: utf-8 -*-
import codecs
import re
import textwrap
import warnings

import numpy as np

import pandas._libs.lib as lib
import pandas._libs.ops as libops
import pandas.compat as compat
from pandas.compat import zip
from pandas.util._decorators import Appender, deprecate_kwarg

from pandas.core.dtypes.common import (
    ensure_object, is_bool_dtype, is_categorical_dtype, is_integer,
    is_list_like, is_object_dtype, is_re, is_scalar, is_string_like)
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna

from pandas.core.algorithms import take_1d
from pandas.core.base import NoNewAttributesMixin
import pandas.core.common as com

_cpython_optimized_encoders = (
    "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii"
)
_cpython_optimized_decoders = _cpython_optimized_encoders + (
    "utf-16", "utf-32"
)

_shared_docs = dict()


def cat_core(list_of_columns, sep):
    """
    Auxiliary function for :meth:`str.cat`

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns

    Returns
    -------
    np.ndarray
        The concatenation of list_of_columns with sep
    """
    list_with_sep = [sep] * (2 * len(list_of_columns) - 1)
    list_with_sep[::2] = list_of_columns
    return np.sum(list_with_sep, axis=0)
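
# Illustrative sketch (not part of the pandas source): cat_core interleaves
# the separator between the columns and reduces with np.sum, which for
# object-dtype string arrays concatenates elementwise. Assuming two
# equal-length object arrays:
#
#   >>> a = np.array(['a', 'b'], dtype=object)
#   >>> b = np.array(['x', 'y'], dtype=object)
#   >>> cat_core([a, b], '-')
#   array(['a-x', 'b-y'], dtype=object)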


def _na_map(f, arr, na_result=np.nan, dtype=object):
    # should really _check_ for NA
    return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)


def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
    if not len(arr):
        return np.ndarray(0, dtype=dtype)

    if isinstance(arr, ABCSeries):
        arr = arr.values
    if not isinstance(arr, np.ndarray):
        arr = np.asarray(arr, dtype=object)
    if na_mask:
        mask = isna(arr)
        try:
            convert = not all(mask)
            result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
        except (TypeError, AttributeError) as e:
            # Reraise the exception if callable `f` got wrong number of args.
            # The user may want to be warned by this, instead of getting NaN
            if compat.PY2:
                p_err = r'takes (no|(exactly|at (least|most)) ?\d+) arguments?'
            else:
                p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ '
                         r'(?(3)required )positional arguments?')

            if len(e.args) >= 1 and re.search(p_err, e.args[0]):
                raise e

            def g(x):
                try:
                    return f(x)
                except (TypeError, AttributeError):
                    return na_value

            return _map(g, arr, dtype=dtype)
        if na_value is not np.nan:
            np.putmask(result, mask, na_value)
            if result.dtype == object:
                result = lib.maybe_convert_objects(result)
        return result
    else:
        return lib.map_infer(arr, f)
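
# Illustrative sketch (not part of the pandas source): _na_map applies `f`
# elementwise while masking out NA positions, so missing values propagate
# instead of raising inside `f`. Assuming an object array with a NaN:
#
#   >>> _na_map(len, np.array(['ab', np.nan, 'c'], dtype=object))
#   # -> roughly array([2., nan, 1.]); the masked slot stays NaN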


def str_count(arr, pat, flags=0):
    """
    Count occurrences of pattern in each string of the Series/Index.

    This function is used to count the number of times a particular regex
    pattern is repeated in each of the string elements of the
    :class:`~pandas.Series`.

    Parameters
    ----------
    pat : str
        Valid regular expression.
    flags : int, default 0, meaning no flags
        Flags for the `re` module. For a complete list, `see here
        <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
    **kwargs
        For compatibility with other string methods. Not used.

    Returns
    -------
    Series or Index
        Same type as the calling object containing the integer counts.

    See Also
    --------
    re : Standard library module for regular expressions.
    str.count : Standard library version, without regular expression support.

    Notes
    -----
    Some characters need to be escaped when passing in `pat`.
    e.g. ``'$'`` has a special meaning in regex and must be escaped when
    finding this literal character.

    Examples
    --------
    >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
    >>> s.str.count('a')
    0    0.0
    1    0.0
    2    2.0
    3    2.0
    4    NaN
    5    0.0
    6    1.0
    dtype: float64

    Escape ``'$'`` to find the literal dollar sign.

    >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
    >>> s.str.count('\\$')
    0    1
    1    0
    2    1
    3    2
    4    2
    5    0
    dtype: int64

    This is also available on Index

    >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
    Int64Index([0, 0, 2, 1], dtype='int64')
    """
    regex = re.compile(pat, flags=flags)
    f = lambda x: len(regex.findall(x))
    return _na_map(f, arr, dtype=int)


def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
    """
    Test if pattern or regex is contained within a string of a Series or Index.

    Return boolean Series or Index based on whether a given pattern or regex is
    contained within a string of a Series or Index.

    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        Flags to pass through to the re module, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.
    regex : bool, default True
        If True, assumes the pat is a regular expression.
        If False, treats the pat as a literal string.

    Returns
    -------
    Series or Index of boolean values
        A Series or Index of boolean values indicating whether the
        given pattern is contained within the string of each element
        of the Series or Index.

    See Also
    --------
    match : Analogous, but stricter, relying on re.match instead of re.search.
    Series.str.startswith : Test if the start of each string element matches a
        pattern.
    Series.str.endswith : Same as startswith, but tests the end of string.

    Examples
    --------
    Returning a Series of booleans using only a literal pattern.

    >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
    >>> s1.str.contains('og', regex=False)
    0    False
    1     True
    2    False
    3    False
    4      NaN
    dtype: object

    Returning an Index of booleans using only a literal pattern.

    >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN])
    >>> ind.str.contains('23', regex=False)
    Index([False, False, False, True, nan], dtype='object')

    Specifying case sensitivity using `case`.

    >>> s1.str.contains('oG', case=True, regex=True)
    0    False
    1    False
    2    False
    3    False
    4      NaN
    dtype: object

    Specifying `na` to be `False` instead of `NaN` replaces NaN values
    with `False`. If Series or Index does not contain NaN values
    the resultant dtype will be `bool`, otherwise, an `object` dtype.

    >>> s1.str.contains('og', na=False, regex=True)
    0    False
    1     True
    2    False
    3    False
    4    False
    dtype: bool

    Returning 'house' or 'dog' when either expression occurs in a string.

    >>> s1.str.contains('house|dog', regex=True)
    0    False
    1     True
    2     True
    3    False
    4      NaN
    dtype: object

    Ignoring case sensitivity using `flags` with regex.

    >>> import re
    >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
    0    False
    1    False
    2     True
    3    False
    4      NaN
    dtype: object

    Returning any digit using regular expression.

    >>> s1.str.contains('\\d', regex=True)
    0    False
    1    False
    2    False
    3     True
    4      NaN
    dtype: object

    Ensure `pat` is not a literal pattern when `regex` is set to True.
    Note in the following example one might expect only `s2[1]` and `s2[3]` to
    return `True`. However, '.0' as a regex matches any character
    followed by a 0.

    >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])
    >>> s2.str.contains('.0', regex=True)
    0     True
    1     True
    2    False
    3     True
    4    False
    dtype: bool
    """
    if regex:
        if not case:
            flags |= re.IGNORECASE

        regex = re.compile(pat, flags=flags)

        if regex.groups > 0:
            warnings.warn("This pattern has match groups. To actually get the"
                          " groups, use str.extract.", UserWarning,
                          stacklevel=3)

        f = lambda x: bool(regex.search(x))
    else:
        if case:
            f = lambda x: pat in x
        else:
            upper_pat = pat.upper()
            f = lambda x: upper_pat in x
            uppered = _na_map(lambda x: x.upper(), arr)
            return _na_map(f, uppered, na, dtype=bool)
    return _na_map(f, arr, na, dtype=bool)


def str_startswith(arr, pat, na=np.nan):
    """
    Test if the start of each string element matches a pattern.

    Equivalent to :meth:`str.startswith`.

    Parameters
    ----------
    pat : str
        Character sequence. Regular expressions are not accepted.
    na : object, default NaN
        Object shown if element tested is not a string.

    Returns
    -------
    Series or Index of bool
        A Series of booleans indicating whether the given pattern matches
        the start of each string element.

    See Also
    --------
    str.startswith : Python standard library string method.
    Series.str.endswith : Same as startswith, but tests the end of string.
    Series.str.contains : Tests if string element contains a pattern.

    Examples
    --------
    >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
    >>> s
    0     bat
    1    Bear
    2     cat
    3     NaN
    dtype: object

    >>> s.str.startswith('b')
    0     True
    1    False
    2    False
    3      NaN
    dtype: object

    Specifying `na` to be `False` instead of `NaN`.

    >>> s.str.startswith('b', na=False)
    0     True
    1    False
    2    False
    3    False
    dtype: bool
    """
    f = lambda x: x.startswith(pat)
    return _na_map(f, arr, na, dtype=bool)


def str_endswith(arr, pat, na=np.nan):
    """
    Test if the end of each string element matches a pattern.

    Equivalent to :meth:`str.endswith`.

    Parameters
    ----------
    pat : str
        Character sequence. Regular expressions are not accepted.
    na : object, default NaN
        Object shown if element tested is not a string.

    Returns
    -------
    Series or Index of bool
        A Series of booleans indicating whether the given pattern matches
        the end of each string element.

    See Also
    --------
    str.endswith : Python standard library string method.
    Series.str.startswith : Same as endswith, but tests the start of string.
    Series.str.contains : Tests if string element contains a pattern.

    Examples
    --------
    >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
    >>> s
    0     bat
    1    bear
    2     caT
    3     NaN
    dtype: object

    >>> s.str.endswith('t')
    0     True
    1    False
    2    False
    3      NaN
    dtype: object

    Specifying `na` to be `False` instead of `NaN`.

    >>> s.str.endswith('t', na=False)
    0     True
    1    False
    2    False
    3    False
    dtype: bool
    """
    f = lambda x: x.endswith(pat)
    return _na_map(f, arr, na, dtype=bool)


def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
    r"""
    Replace occurrences of pattern/regex in the Series/Index with
    some other string. Equivalent to :meth:`str.replace` or
    :func:`re.sub`.

    Parameters
    ----------
    pat : str or compiled regex
        String can be a character sequence or regular expression.

        .. versionadded:: 0.20.0
            `pat` also accepts a compiled regex.

    repl : str or callable
        Replacement string or a callable. The callable is passed the regex
        match object and must return a replacement string to be used.
        See :func:`re.sub`.

        .. versionadded:: 0.20.0
            `repl` also accepts a callable.

    n : int, default -1 (all)
        Number of replacements to make from start.
    case : bool, default None
        - If True, case sensitive (the default if `pat` is a string)
        - Set to False for case insensitive
        - Cannot be set if `pat` is a compiled regex
    flags : int, default 0 (no flags)
        - re module flags, e.g. re.IGNORECASE
        - Cannot be set if `pat` is a compiled regex
    regex : bool, default True
        - If True, assumes the passed-in pattern is a regular expression.
        - If False, treats the pattern as a literal string
        - Cannot be set to False if `pat` is a compiled regex or `repl` is
          a callable.

        .. versionadded:: 0.23.0

    Returns
    -------
    Series or Index of object
        A copy of the object with all matching occurrences of `pat` replaced by
        `repl`.

    Raises
    ------
    ValueError
        * if `regex` is False and `repl` is a callable or `pat` is a compiled
          regex
        * if `pat` is a compiled regex and `case` or `flags` is set

    Notes
    -----
    When `pat` is a compiled regex, all flags should be included in the
    compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
    regex will raise an error.

    Examples
    --------
    When `pat` is a string and `regex` is True (the default), the given `pat`
    is compiled as a regex. When `repl` is a string, it replaces matching
    regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
    left as is:

    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
    0    bao
    1    baz
    2    NaN
    dtype: object

    When `pat` is a string and `regex` is False, every `pat` is replaced with
    `repl` as with :meth:`str.replace`:

    >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
    0    bao
    1    fuz
    2    NaN
    dtype: object

    When `repl` is a callable, it is called on every `pat` using
    :func:`re.sub`. The callable should expect one positional argument
    (a regex object) and return a string.

    To get the idea:

    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)
    0    <_sre.SRE_Match object; span=(0, 1), match='f'>oo
    1    <_sre.SRE_Match object; span=(0, 1), match='f'>uz
    2                                                  NaN
    dtype: object

    Reverse every lowercase alphabetic word:

    >>> repl = lambda m: m.group(0)[::-1]
    >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
    0    oof 123
    1    rab zab
    2        NaN
    dtype: object

    Using regex groups (extract second group and swap case):

    >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
    >>> repl = lambda m: m.group('two').swapcase()
    >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
    0    tWO
    1    bAR
    dtype: object

    Using a compiled regex with flags

    >>> import re
    >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
    0    foo
    1    bar
    2    NaN
    dtype: object
    """
    # Check whether repl is valid (GH 13438, GH 15055)
    if not (is_string_like(repl) or callable(repl)):
        raise TypeError("repl must be a string or callable")

    is_compiled_re = is_re(pat)
    if regex:
        if is_compiled_re:
            if (case is not None) or (flags != 0):
                raise ValueError("case and flags cannot be set"
                                 " when pat is a compiled regex")
        else:
            # not a compiled regex
            # set default case
            if case is None:
                case = True

            # add case flag, if provided
            if case is False:
                flags |= re.IGNORECASE

        if is_compiled_re or len(pat) > 1 or flags or callable(repl):
            n = n if n >= 0 else 0
            compiled = re.compile(pat, flags=flags)
            f = lambda x: compiled.sub(repl=repl, string=x, count=n)
        else:
            f = lambda x: x.replace(pat, repl, n)
    else:
        if is_compiled_re:
            raise ValueError("Cannot use a compiled regex as replacement "
                             "pattern with regex=False")
        if callable(repl):
            raise ValueError("Cannot use a callable replacement when "
                             "regex=False")
        f = lambda x: x.replace(pat, repl, n)

    return _na_map(f, arr)


def str_repeat(arr, repeats):
    """
    Duplicate each string in the Series or Index.

    Parameters
    ----------
    repeats : int or sequence of int
        Same value for all (int) or different value per (sequence).

    Returns
    -------
    Series or Index of object
        Series or Index of repeated string objects specified by
        input parameter repeats.

    Examples
    --------
    >>> s = pd.Series(['a', 'b', 'c'])
    >>> s
    0    a
    1    b
    2    c
    dtype: object

    Single int repeats string in Series

    >>> s.str.repeat(repeats=2)
    0    aa
    1    bb
    2    cc
    dtype: object

    Sequence of int repeats corresponding string in Series

    >>> s.str.repeat(repeats=[1, 2, 3])
    0      a
    1     bb
    2    ccc
    dtype: object
    """
    if is_scalar(repeats):
        def rep(x):
            try:
                return compat.binary_type.__mul__(x, repeats)
            except TypeError:
                return compat.text_type.__mul__(x, repeats)

        return _na_map(rep, arr)
    else:
        def rep(x, r):
            try:
                return compat.binary_type.__mul__(x, r)
            except TypeError:
                return compat.text_type.__mul__(x, r)

        repeats = np.asarray(repeats, dtype=object)
        result = libops.vec_binop(com.values_from_object(arr), repeats, rep)
        return result


def str_match(arr, pat, case=True, flags=0, na=np.nan):
    """
    Determine if each string matches a regular expression.

    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.

    Returns
    -------
    Series/array of boolean values

    See Also
    --------
    contains : Analogous, but less strict, relying on re.search instead of
        re.match.
    extract : Extract matched groups.
    """
    if not case:
        flags |= re.IGNORECASE

    regex = re.compile(pat, flags=flags)

    dtype = bool
    f = lambda x: bool(regex.match(x))

    return _na_map(f, arr, na, dtype=dtype)
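
# Illustrative sketch (not part of the pandas source): str_match anchors at
# the start of each string via re.match, unlike str_contains, which uses
# re.search anywhere in the string.
#
#   >>> s = pd.Series(['cat', 'scatter', np.nan])
#   >>> s.str.match('cat')
#   0     True
#   1    False
#   2      NaN
#   dtype: object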


def _get_single_group_name(rx):
    try:
        return list(rx.groupindex.keys()).pop()
    except IndexError:
        return None


def _groups_or_na_fun(regex):
    """Used in both extract_noexpand and extract_frame"""
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")
    empty_row = [np.nan] * regex.groups

    def f(x):
        if not isinstance(x, compat.string_types):
            return empty_row
        m = regex.search(x)
        if m:
            return [np.nan if item is None else item for item in m.groups()]
        else:
            return empty_row
    return f


def _str_extract_noexpand(arr, pat, flags=0):
    """
    Find groups in each string in the Series using passed regular
    expression. This function is called from
    str_extract(expand=False), and can return Series, DataFrame, or
    Index.
    """
    from pandas import DataFrame, Index

    regex = re.compile(pat, flags=flags)
    groups_or_na = _groups_or_na_fun(regex)

    if regex.groups == 1:
        result = np.array([groups_or_na(val)[0] for val in arr], dtype=object)
        name = _get_single_group_name(regex)
    else:
        if isinstance(arr, Index):
            raise ValueError("only one regex group is supported with Index")
        name = None
        names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
        columns = [names.get(1 + i, i) for i in range(regex.groups)]
        if arr.empty:
            result = DataFrame(columns=columns, dtype=object)
        else:
            result = DataFrame(
                [groups_or_na(val) for val in arr],
                columns=columns,
                index=arr.index,
                dtype=object)
    return result, name


def _str_extract_frame(arr, pat, flags=0):
    """
    For each subject string in the Series, extract groups from the
    first match of regular expression pat. This function is called from
    str_extract(expand=True), and always returns a DataFrame.
    """
    from pandas import DataFrame

    regex = re.compile(pat, flags=flags)
    groups_or_na = _groups_or_na_fun(regex)
    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
    columns = [names.get(1 + i, i) for i in range(regex.groups)]

    if len(arr) == 0:
        return DataFrame(columns=columns, dtype=object)
    try:
        result_index = arr.index
    except AttributeError:
        result_index = None
    return DataFrame(
        [groups_or_na(val) for val in arr],
        columns=columns,
        index=result_index,
        dtype=object)


def str_extract(arr, pat, flags=0, expand=True):
    r"""
    Extract capture groups in the regex `pat` as columns in a DataFrame.

    For each subject string in the Series, extract groups from the
    first match of regular expression `pat`.

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
        modify regular expression matching for things like case,
        spaces, etc. For more details, see :mod:`re`.
    expand : bool, default True
        If True, return DataFrame with one column per capture group.
        If False, return a Series/Index if there is one capture group
        or DataFrame if there are multiple capture groups.

        .. versionadded:: 0.18.0

    Returns
    -------
    DataFrame or Series or Index
        A DataFrame with one row for each subject string, and one
        column for each group. Any capture group names in regular
        expression pat will be used for column names; otherwise
        capture group numbers will be used. The dtype of each result
        column is always object, even when no match is found. If
        ``expand=False`` and pat has only one capture group, then
        return a Series (if subject is a Series) or Index (if subject
        is an Index).

    See Also
    --------
    extractall : Returns all matches (not just the first match).

    Examples
    --------
    A pattern with two groups will return a DataFrame with two columns.
    Non-matches will be NaN.

    >>> s = pd.Series(['a1', 'b2', 'c3'])
    >>> s.str.extract(r'([ab])(\d)')
         0    1
    0    a    1
    1    b    2
    2  NaN  NaN

    A pattern may contain optional groups.

    >>> s.str.extract(r'([ab])?(\d)')
         0  1
    0    a  1
    1    b  2
    2  NaN  3

    Named groups will become column names in the result.

    >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
      letter digit
    0      a     1
    1      b     2
    2    NaN   NaN

    A pattern with one group will return a DataFrame with one column
    if expand=True.

    >>> s.str.extract(r'[ab](\d)', expand=True)
         0
    0    1
    1    2
    2  NaN

    A pattern with one group will return a Series if expand=False.

    >>> s.str.extract(r'[ab](\d)', expand=False)
    0      1
    1      2
    2    NaN
    dtype: object
    """
    if not isinstance(expand, bool):
        raise ValueError("expand must be True or False")
    if expand:
        return _str_extract_frame(arr._orig, pat, flags=flags)
    else:
        result, name = _str_extract_noexpand(arr._parent, pat, flags=flags)
        return arr._wrap_result(result, name=name, expand=expand)


def str_extractall(arr, pat, flags=0):
    r"""
    For each subject string in the Series, extract groups from all
    matches of regular expression pat. When each subject string in the
    Series has exactly one match, extractall(pat).xs(0, level='match')
    is the same as extract(pat).

    .. versionadded:: 0.18.0

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        A ``re`` module flag, for example ``re.IGNORECASE``. These allow
        to modify regular expression matching for things like case, spaces,
        etc. Multiple flags can be combined with the bitwise OR operator,
        for example ``re.IGNORECASE | re.MULTILINE``.

    Returns
    -------
    DataFrame
        A ``DataFrame`` with one row for each match, and one column for each
        group. Its rows have a ``MultiIndex`` with first levels that come from
        the subject ``Series``. The last level is named 'match' and indexes the
        matches in each item of the ``Series``. Any capture group names in
        regular expression pat will be used for column names; otherwise capture
        group numbers will be used.

    See Also
    --------
    extract : Returns first match only (not all matches).

    Examples
    --------
    A pattern with one group will return a DataFrame with one column.
    Indices with no matches will not appear in the result.

    >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
    >>> s.str.extractall(r"[ab](\d)")
             0
      match
    A 0      1
      1      2
    B 0      1

    Capture group names are used for column names of the result.

    >>> s.str.extractall(r"[ab](?P<digit>\d)")
            digit
      match
    A 0         1
      1         2
    B 0         1

    A pattern with two groups will return a DataFrame with two columns.

    >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
            letter digit
      match
    A 0          a     1
      1          a     2
    B 0          b     1

    Optional groups that do not match are NaN in the result.

    >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
            letter digit
      match
    A 0          a     1
      1          a     2
    B 0          b     1
    C 0        NaN     1
    """
    regex = re.compile(pat, flags=flags)
    # the regex must contain capture groups.
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndexClass):
        arr = arr.to_series().reset_index(drop=True)

    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
    columns = [names.get(1 + i, i) for i in range(regex.groups)]
    match_list = []
    index_list = []
    is_mi = arr.index.nlevels > 1

    for subject_key, subject in arr.iteritems():
        if isinstance(subject, compat.string_types):

            if not is_mi:
                subject_key = (subject_key, )

            for match_i, match_tuple in enumerate(regex.findall(subject)):
                if isinstance(match_tuple, compat.string_types):
                    match_tuple = (match_tuple,)
                na_tuple = [np.NaN if group == "" else group
                            for group in match_tuple]
                match_list.append(na_tuple)
                result_key = tuple(subject_key + (match_i, ))
                index_list.append(result_key)

    from pandas import MultiIndex
    index = MultiIndex.from_tuples(
        index_list, names=arr.index.names + ["match"])

    result = arr._constructor_expanddim(match_list, index=index,
                                        columns=columns)
    return result


def str_get_dummies(arr, sep='|'):
    """
    Split each string in the Series by sep and return a DataFrame
    of dummy/indicator variables.

    Parameters
    ----------
    sep : str, default "|"
        String to split on.

    Returns
    -------
    DataFrame
        Dummy variables corresponding to values of the Series.

    See Also
    --------
    get_dummies : Convert categorical variable into dummy/indicator
        variables.

    Examples
    --------
    >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  1  0  0
    2  1  0  1

    >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  0  0  0
    2  1  0  1
    """
    arr = arr.fillna('')
    try:
        arr = sep + arr + sep
    except TypeError:
        arr = sep + arr.astype(str) + sep

    tags = set()
    for ts in arr.str.split(sep):
        tags.update(ts)
    tags = sorted(tags - {""})

    dummies = np.empty((len(arr), len(tags)), dtype=np.int64)

    for i, t in enumerate(tags):
        pat = sep + t + sep
        dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x)
    return dummies, tags


def str_join(arr, sep):
    """
    Join lists contained as elements in the Series/Index with passed delimiter.

    If the elements of a Series are lists themselves, join the content of these
    lists using the delimiter passed to the function.
    This function is an equivalent to :meth:`str.join`.

    Parameters
    ----------
    sep : str
        Delimiter to use between list entries.

    Returns
    -------
    Series/Index: object
        The list entries concatenated by intervening occurrences of the
        delimiter.

    Raises
    ------
    AttributeError
        If the supplied Series contains neither strings nor lists.

    See Also
    --------
    str.join : Standard library version of this method.
    Series.str.split : Split strings around given separator/delimiter.

    Notes
    -----
    If any of the list items is not a string object, the result of the join
    will be `NaN`.

    Examples
    --------
    Example with a list that contains non-string elements.

    >>> s = pd.Series([['lion', 'elephant', 'zebra'],
    ...                [1.1, 2.2, 3.3],
    ...                ['cat', np.nan, 'dog'],
    ...                ['cow', 4.5, 'goat'],
    ...                ['duck', ['swan', 'fish'], 'guppy']])
    >>> s
    0        [lion, elephant, zebra]
    1                [1.1, 2.2, 3.3]
    2                [cat, nan, dog]
    3               [cow, 4.5, goat]
    4    [duck, [swan, fish], guppy]
    dtype: object

    Join all lists using a '-'. The lists containing object(s) of types other
    than str will produce a NaN.

    >>> s.str.join('-')
    0    lion-elephant-zebra
    1                    NaN
    2                    NaN
    3                    NaN
    4                    NaN
    dtype: object
    """
    return _na_map(sep.join, arr)


def str_findall(arr, pat, flags=0):
    """
    Find all occurrences of pattern or regular expression in the Series/Index.

    Equivalent to applying :func:`re.findall` to all the elements in the
    Series/Index.

    Parameters
    ----------
    pat : str
        Pattern or regular expression.
    flags : int, default 0
        Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which
        means no flags).

    Returns
    -------
    Series/Index of lists of strings
        All non-overlapping matches of pattern or regular expression in each
        string of this Series/Index.

    See Also
    --------
    count : Count occurrences of pattern or regular expression in each string
        of the Series/Index.
    extractall : For each string in the Series, extract groups from all matches
        of regular expression and return a DataFrame with one row for each
        match and one column for each group.
    re.findall : The equivalent ``re`` function to all non-overlapping matches
        of pattern or regular expression in string, as a list of strings.

    Examples
    --------
    >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])

    The search for the pattern 'Monkey' returns one match:

    >>> s.str.findall('Monkey')
    0          []
    1    [Monkey]
    2          []
    dtype: object

    On the other hand, the search for the pattern 'MONKEY' doesn't return any
    match:

    >>> s.str.findall('MONKEY')
    0    []
    1    []
    2    []
    dtype: object

    Flags can be added to the pattern or regular expression. For instance,
    to find the pattern 'MONKEY' ignoring the case:

    >>> import re
    >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
    0          []
    1    [Monkey]
    2          []
    dtype: object

    When the pattern matches more than one string in the Series, all matches
    are returned:

    >>> s.str.findall('on')
    0    [on]
    1    [on]
    2      []
    dtype: object

    Regular expressions are supported too. For instance, the search for all the
    strings ending with the word 'on' is shown next:

    >>> s.str.findall('on$')
    0    [on]
    1      []
    2      []
    dtype: object

    If the pattern is found more than once in the same string, then a list of
    multiple strings is returned:

    >>> s.str.findall('b')
    0        []
    1        []
    2    [b, b]
    dtype: object
    """
    regex = re.compile(pat, flags=flags)
    return _na_map(regex.findall, arr)


def str_find(arr, sub, start=0, end=None, side='left'):
    """
    Return indexes in each string in the Series/Index where the
    substring is fully contained between [start:end]. Return -1 on failure.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.
    side : {'left', 'right'}, default 'left'
        Specifies a starting side, equivalent to ``find`` or ``rfind``.

    Returns
    -------
    Series or Index
        Indexes where substring is found.
    """
    if not isinstance(sub, compat.string_types):
        msg = 'expected a string object, not {0}'
        raise TypeError(msg.format(type(sub).__name__))

    if side == 'left':
        method = 'find'
    elif side == 'right':
        method = 'rfind'
    else:  # pragma: no cover
        raise ValueError('Invalid side')

    if end is None:
        f = lambda x: getattr(x, method)(sub, start)
    else:
        f = lambda x: getattr(x, method)(sub, start, end)
    return _na_map(f, arr, dtype=int)
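
# Illustrative sketch (not part of the pandas source): str_find returns the
# lowest (side='left') or highest (side='right') index of `sub`, or -1 when
# it is absent, mirroring str.find/str.rfind.
#
#   >>> pd.Series(['abcabc', 'xyz']).str.find('bc')
#   0    1
#   1   -1
#   dtype: int64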


def str_index(arr, sub, start=0, end=None, side='left'):
    if not isinstance(sub, compat.string_types):
        msg = 'expected a string object, not {0}'
        raise TypeError(msg.format(type(sub).__name__))

    if side == 'left':
        method = 'index'
    elif side == 'right':
        method = 'rindex'
    else:  # pragma: no cover
        raise ValueError('Invalid side')

    if end is None:
        f = lambda x: getattr(x, method)(sub, start)
    else:
        f = lambda x: getattr(x, method)(sub, start, end)
    return _na_map(f, arr, dtype=int)
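
# Note (added for clarity, not in the pandas source): str_index mirrors
# str_find but wraps str.index/str.rindex, so a missing substring raises
# ValueError instead of returning -1.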


def str_pad(arr, width, side='left', fillchar=' '):
    """
    Pad strings in the Series/Index up to width.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be filled
        with character defined in `fillchar`.
    side : {'left', 'right', 'both'}, default 'left'
        Side from which to fill resulting string.
    fillchar : str, default ' '
        Additional character for filling, default is whitespace.

    Returns
    -------
    Series or Index of object
        Returns Series or Index with minimum number of char in object.

    See Also
    --------
    Series.str.rjust : Fills the left side of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='left')``.
    Series.str.ljust : Fills the right side of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='right')``.
    Series.str.center : Fills both sides of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='both')``.
    Series.str.zfill : Pad strings in the Series/Index by prepending '0'
        character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.

    Examples
    --------
    >>> s = pd.Series(["caribou", "tiger"])
    >>> s
    0    caribou
    1      tiger
    dtype: object

    >>> s.str.pad(width=10)
    0       caribou
    1         tiger
    dtype: object

    >>> s.str.pad(width=10, side='right', fillchar='-')
    0    caribou---
    1    tiger-----
    dtype: object

    >>> s.str.pad(width=10, side='both', fillchar='-')
    0    -caribou--
    1    --tiger---
    dtype: object
    """
    if not isinstance(fillchar, compat.string_types):
        msg = 'fillchar must be a character, not {0}'
        raise TypeError(msg.format(type(fillchar).__name__))

    if len(fillchar) != 1:
        raise TypeError('fillchar must be a character, not str')

    if not is_integer(width):
        msg = 'width must be of integer type, not {0}'
        raise TypeError(msg.format(type(width).__name__))

    if side == 'left':
        f = lambda x: x.rjust(width, fillchar)
    elif side == 'right':
        f = lambda x: x.ljust(width, fillchar)
    elif side == 'both':
        f = lambda x: x.center(width, fillchar)
    else:  # pragma: no cover
        raise ValueError('Invalid side')

    return _na_map(f, arr)


def str_split(arr, pat=None, n=None):
    if pat is None:
        if n is None or n == 0:
            n = -1
        f = lambda x: x.split(pat, n)
    else:
        if len(pat) == 1:
            if n is None or n == 0:
                n = -1
            f = lambda x: x.split(pat, n)
        else:
            if n is None or n == -1:
                n = 0
            regex = re.compile(pat)
            f = lambda x: regex.split(x, maxsplit=n)
    res = _na_map(f, arr)
    return res


def str_rsplit(arr, pat=None, n=None):
    if n is None or n == 0:
        n = -1
    f = lambda x: x.rsplit(pat, n)
    res = _na_map(f, arr)
    return res
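
# Note (added for clarity, not in the pandas source): str_split treats a
# single-character `pat` (or None) as a plain str.split, and compiles longer
# patterns as regular expressions; str_rsplit always uses str.rsplit, which
# has no regex counterpart.
#
#   >>> pd.Series(['a_b_c']).str.split('_', n=1)
#   0    [a, b_c]
#   dtype: object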


def str_slice(arr, start=None, stop=None, step=None):
    """
    Slice substrings from each element in the Series or Index.

    Parameters
    ----------
    start : int, optional
        Start position for slice operation.
    stop : int, optional
        Stop position for slice operation.
    step : int, optional
        Step size for slice operation.

    Returns
    -------
    Series or Index of object
        Series or Index from sliced substring from original string object.

    See Also
    --------
    Series.str.slice_replace : Replace a slice with a string.
    Series.str.get : Return element at position.
        Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
        being the position.

    Examples
    --------
    >>> s = pd.Series(["koala", "fox", "chameleon"])
    >>> s
    0        koala
    1          fox
    2    chameleon
    dtype: object

    >>> s.str.slice(start=1)
    0        oala
    1          ox
    2    hameleon
    dtype: object

    >>> s.str.slice(stop=2)
    0    ko
    1    fo
    2    ch
    dtype: object

    >>> s.str.slice(step=2)
    0      kaa
    1       fx
    2    caeen
    dtype: object

    >>> s.str.slice(start=0, stop=5, step=3)
    0    kl
    1     f
    2    cm
    dtype: object

    Equivalent behaviour to:

    >>> s.str[0:5:3]
    0    kl
    1     f
    2    cm
    dtype: object
    """
    obj = slice(start, stop, step)
    f = lambda x: x[obj]
    return _na_map(f, arr)


def str_slice_replace(arr, start=None, stop=None, repl=None):
    """
    Replace a positional slice of a string with another value.

    Parameters
    ----------
    start : int, optional
        Left index position to use for the slice. If not specified (None),
        the slice is unbounded on the left, i.e. slice from the start
        of the string.
    stop : int, optional
        Right index position to use for the slice. If not specified (None),
        the slice is unbounded on the right, i.e. slice until the
        end of the string.
    repl : str, optional
        String for replacement. If not specified (None), the sliced region
        is replaced with an empty string.

    Returns
    -------
    Series or Index
        Same type as the original object.

    See Also
    --------
    Series.str.slice : Just slicing without replacement.

    Examples
    --------
    >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
    >>> s
    0        a
    1       ab
    2      abc
    3     abdc
    4    abcde
    dtype: object

    Specify just `start`, meaning replace `start` until the end of the
    string with `repl`.

    >>> s.str.slice_replace(1, repl='X')
    0    aX
    1    aX
    2    aX
    3    aX
    4    aX
    dtype: object

    Specify just `stop`, meaning the start of the string to `stop` is replaced
    with `repl`, and the rest of the string is included.

    >>> s.str.slice_replace(stop=2, repl='X')
    0       X
    1       X
    2      Xc
    3     Xdc
    4    Xcde
    dtype: object

    Specify `start` and `stop`, meaning the slice from `start` to `stop` is
    replaced with `repl`. Everything before or after `start` and `stop` is
    included as is.

    >>> s.str.slice_replace(start=1, stop=3, repl='X')
    0      aX
    1      aX
    2      aX
    3     aXc
    4    aXde
    dtype: object
    """
    if repl is None:
        repl = ''

    def f(x):
        if x[start:stop] == '':
            local_stop = start
        else:
            local_stop = stop
        y = ''
        if start is not None:
            y += x[:start]
        y += repl
        if stop is not None:
            y += x[local_stop:]
        return y

    return _na_map(f, arr)
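
# Note (added for clarity, not in the pandas source): the `local_stop`
# adjustment above handles the case where x[start:stop] is empty (e.g.
# `start` past the end of a short string), so `repl` is inserted at `start`
# and the tail of the string is kept rather than dropped.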


def str_strip(arr, to_strip=None, side='both'):
    """
    Strip whitespace (including newlines) from each string in the
    Series/Index.

    Parameters
    ----------
    to_strip : str or unicode
    side : {'left', 'right', 'both'}, default 'both'

    Returns
    -------
    Series or Index
    """
    if side == 'both':
        f = lambda x: x.strip(to_strip)
    elif side == 'left':
        f = lambda x: x.lstrip(to_strip)
    elif side == 'right':
        f = lambda x: x.rstrip(to_strip)
    else:  # pragma: no cover
        raise ValueError('Invalid side')
    return _na_map(f, arr)
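
# Illustrative sketch (not part of the pandas source): side='both' maps to
# str.strip, 'left'/'right' map to lstrip/rstrip; to_strip=None removes
# whitespace, including newlines.
#
#   >>> pd.Series(['  ab\n', 'cd  ']).str.strip()
#   0    ab
#   1    cd
#   dtype: object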


def str_wrap(arr, width, **kwargs):
    r"""
    Wrap long strings in the Series/Index to be formatted in
    paragraphs with length less than a given width.

    This method has the same keyword parameters and defaults as
    :class:`textwrap.TextWrapper`.

    Parameters
    ----------
    width : int
        Maximum line width.
    expand_tabs : bool, optional
        If True, tab characters will be expanded to spaces (default: True).
    replace_whitespace : bool, optional
        If True, each whitespace character (as defined by string.whitespace)
        remaining after tab expansion will be replaced by a single space
        (default: True).
    drop_whitespace : bool, optional
        If True, whitespace that, after wrapping, happens to end up at the
        beginning or end of a line is dropped (default: True).
    break_long_words : bool, optional
        If True, then words longer than width will be broken in order to ensure
        that no lines are longer than width. If it is false, long words will
        not be broken, and some lines may be longer than width (default: True).
    break_on_hyphens : bool, optional
        If True, wrapping will occur preferably on whitespace and right after
        hyphens in compound words, as it is customary in English. If false,
        only whitespaces will be considered as potentially good places for line
        breaks, but you need to set break_long_words to false if you want truly
        insecable words (default: True).

    Returns
    -------
    Series or Index

    Notes
    -----
    Internally, this method uses a :class:`textwrap.TextWrapper` instance with
    default settings. To achieve behavior matching R's stringr library str_wrap
    function, use the arguments:

    - expand_tabs = False
    - replace_whitespace = True
    - drop_whitespace = True
    - break_long_words = False
    - break_on_hyphens = False

    Examples
    --------
    >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
    >>> s.str.wrap(12)
    0             line to be\nwrapped
    1    another line\nto be\nwrapped
    dtype: object
    """
    kwargs['width'] = width

    tw = textwrap.TextWrapper(**kwargs)

    return _na_map(lambda s: '\n'.join(tw.wrap(s)), arr)


def str_translate(arr, table, deletechars=None):
    """
    Map all characters in the string through the given mapping table.

    Equivalent to standard :meth:`str.translate`. Note that the optional
    argument deletechars is only valid if you are using python 2. For python 3,
    character deletion should be specified via the table argument.

    Parameters
    ----------
    table : dict (python 3), str or None (python 2)
        In python 3, table is a mapping of Unicode ordinals to Unicode
        ordinals, strings, or None. Unmapped characters are left untouched.
        Characters mapped to None are deleted. :meth:`str.maketrans` is a
        helper function for making translation tables.
        In python 2, table is either a string of length 256 or None. If the
        table argument is None, no translation is applied and the operation
        simply removes the characters in deletechars. :func:`string.maketrans`
        is a helper function for making translation tables.
    deletechars : str, optional (python 2)
        A string of characters to delete. This argument is only valid
        in python 2.

    Returns
    -------
    Series or Index
    """
    if deletechars is None:
        f = lambda x: x.translate(table)
    else:
        if compat.PY3:
            raise ValueError("deletechars is not a valid argument for "
                             "str.translate in python 3. You should simply "
                             "specify character deletions in the table "
                             "argument")
        f = lambda x: x.translate(table, deletechars)
    return _na_map(f, arr)
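
# Illustrative sketch (not part of the pandas source), assuming Python 3
# where the table is built with str.maketrans:
#
#   >>> table = str.maketrans({'a': 'b'})
#   >>> pd.Series(['abc']).str.translate(table)
#   0    bbc
#   dtype: object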


def str_get(arr, i):
    """
    Extract element from each component at specified position.

    Extract element from lists, tuples, or strings in each element in the
    Series/Index.

    Parameters
    ----------
    i : int
        Position of element to extract.

    Returns
    -------
    Series or Index

    Examples
    --------
    >>> s = pd.Series(["String",
    ...                (1, 2, 3),
    ...                ["a", "b", "c"],
    ...                123,
    ...                -456,
    ...                {1: "Hello", "2": "World"}])
    >>> s
    0                        String
    1                     (1, 2, 3)
    2                     [a, b, c]
    3                           123
    4                          -456
    5    {1: 'Hello', '2': 'World'}
    dtype: object

    >>> s.str.get(1)
    0        t
    1        2
    2        b
    3      NaN
    4      NaN
    5    Hello
    dtype: object

    >>> s.str.get(-1)
    0       g
    1       3
    2       c
    3     NaN
    4     NaN
    5    None
    dtype: object
    """
    def f(x):
        if isinstance(x, dict):
            return x.get(i)
        elif len(x) > i >= -len(x):
            return x[i]
        return np.nan
    return _na_map(f, arr)


def str_decode(arr, encoding, errors="strict"):
    """
    Decode character string in the Series/Index using indicated encoding.

    Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in
    python3.

    Parameters
    ----------
    encoding : str
    errors : str, optional

    Returns
    -------
    Series or Index
    """
    if encoding in _cpython_optimized_decoders:
        # CPython optimized implementation
        f = lambda x: x.decode(encoding, errors)
    else:
        decoder = codecs.getdecoder(encoding)
        f = lambda x: decoder(x, errors)[0]
    return _na_map(f, arr)


def str_encode(arr, encoding, errors="strict"):
    """
    Encode character string in the Series/Index using indicated encoding.

    Equivalent to :meth:`str.encode`.

    Parameters
    ----------
    encoding : str
    errors : str, optional

    Returns
    -------
    encoded : Series/Index of objects
    """
    if encoding in _cpython_optimized_encoders:
        # CPython optimized implementation
        f = lambda x: x.encode(encoding, errors)
    else:
        encoder = codecs.getencoder(encoding)
        f = lambda x: encoder(x, errors)[0]
    return _na_map(f, arr)
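
# Illustrative sketch (not part of the pandas source): encode and decode are
# inverses for a given codec; CPython-optimized codecs call the bytes/str
# methods directly, others go through the codecs registry.
#
#   >>> pd.Series(['a']).str.encode('utf-8').str.decode('utf-8')
#   0    a
#   dtype: object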


def _noarg_wrapper(f, docstring=None, **kargs):
    def wrapper(self):
        result = _na_map(f, self._parent, **kargs)
        return self._wrap_result(result)

    wrapper.__name__ = f.__name__
    if docstring is not None:
        wrapper.__doc__ = docstring
    else:
        raise ValueError('Provide docstring')

    return wrapper


def _pat_wrapper(f, flags=False, na=False, **kwargs):
    def wrapper1(self, pat):
        result = f(self._parent, pat)
        return self._wrap_result(result)

    def wrapper2(self, pat, flags=0, **kwargs):
        result = f(self._parent, pat, flags=flags, **kwargs)
        return self._wrap_result(result)

    def wrapper3(self, pat, na=np.nan):
        result = f(self._parent, pat, na=na)
        return self._wrap_result(result)

    wrapper = wrapper3 if na else wrapper2 if flags else wrapper1

    wrapper.__name__ = f.__name__
    if f.__doc__:
        wrapper.__doc__ = f.__doc__

    return wrapper
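
# Note (added for clarity, not in the pandas source): _noarg_wrapper and
# _pat_wrapper turn the module-level str_* functions into StringMethods
# methods. _pat_wrapper picks a signature based on whether the wrapped
# function takes `flags` and/or `na`; _noarg_wrapper requires an explicit
# docstring, while _pat_wrapper copies the wrapped function's docstring.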


def copy(source):
    "Copy a docstring from another source function (if present)"

    def do_copy(target):
        if source.__doc__:
            target.__doc__ = source.__doc__
        return target

    return do_copy


class StringMethods(NoNewAttributesMixin):
    """
    Vectorized string functions for Series and Index. NAs stay NA unless
    handled otherwise by a particular method. Patterned after Python's string
    methods, with some inspiration from R's stringr package.

    Examples
    --------
    >>> s.str.split('_')
    >>> s.str.replace('_', '')
    """

    def __init__(self, data):
        self._validate(data)
        self._is_categorical = is_categorical_dtype(data)

        # .values.categories works for both Series/Index
        self._parent = data.values.categories if self._is_categorical else data
        # save orig to blow up categoricals to the right type
        self._orig = data
        self._freeze()

    @staticmethod
    def _validate(data):
        from pandas.core.index import Index

        if (isinstance(data, ABCSeries) and
                not ((is_categorical_dtype(data.dtype) and
                      is_object_dtype(data.values.categories)) or
                     (is_object_dtype(data.dtype)))):
            # it's neither a string series nor a categorical series with
            # strings inside the categories.
            # this really should exclude all series with any non-string values
            # (instead of test for object dtype), but that isn't practical for
            # performance reasons until we have a str dtype (GH 9343)
            raise AttributeError("Can only use .str accessor with string "
                                 "values, which use np.object_ dtype in "
                                 "pandas")
        elif isinstance(data, Index):
            # can't use ABCIndex to exclude non-str
            # see src/inference.pyx which can contain string values
            allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer')
            if is_categorical_dtype(data.dtype):
                inf_type = data.categories.inferred_type
            else:
                inf_type = data.inferred_type
            if inf_type not in allowed_types:
                message = ("Can only use .str accessor with string values "
                           "(i.e. inferred_type is 'string', 'unicode' or "
                           "'mixed')")
                raise AttributeError(message)
            if data.nlevels > 1:
                message = ("Can only use .str accessor with Index, not "
                           "MultiIndex")
                raise AttributeError(message)

    def __getitem__(self, key):
        if isinstance(key, slice):
            return self.slice(start=key.start, stop=key.stop, step=key.step)
        else:
            return self.get(key)

    def __iter__(self):
        i = 0
        g = self.get(i)
        while g.notna().any():
            yield g
            i += 1
            g = self.get(i)

    def _wrap_result(self, result, use_codes=True,
                     name=None, expand=None, fill_value=np.nan):

        from pandas import Index, Series, MultiIndex

        # for category, we do the stuff on the categories, so blow it up
        # to the full series again
        # But for some operations, we have to do the stuff on the full values,
        # so make it possible to skip this step as the method already did this
        # before the transformation...
        if use_codes and self._is_categorical:
            # if self._orig is a CategoricalIndex, there is no .cat-accessor
            result = take_1d(result, Series(self._orig, copy=False).cat.codes,
                             fill_value=fill_value)

        if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'):
            return result
        assert result.ndim < 3

        if expand is None:
            # infer from ndim if expand is not specified
            expand = result.ndim != 1

        elif expand is True and not isinstance(self._orig, Index):
            # required when expand=True is explicitly specified
            # not needed when inferred

            def cons_row(x):
                if is_list_like(x):
                    return x
                else:
                    return [x]

            result = [cons_row(x) for x in result]
            if result:
                # propagate nan values to match longest sequence (GH 18450)
                max_len = max(len(x) for x in result)
                result = [x * max_len if len(x) == 0 or x[0] is np.nan
                          else x for x in result]

        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        if expand is False:
            # if expand is False, result should have the same name
            # as the original otherwise specified
            if name is None:
                name = getattr(result, 'name', None)
            if name is None:
                # do not use logical or, _orig may be a DataFrame
                # which has "name" column
                name = self._orig.name

        # Wait until we are sure result is a Series or Index before
        # checking attributes (GH 12180)
        if isinstance(self._orig, Index):
            # if result is a boolean np.array, return the np.array
            # instead of wrapping it into a boolean Index (GH 8875)
            if is_bool_dtype(result):
                return result

            if expand:
                result = list(result)
                out = MultiIndex.from_tuples(result, names=name)
                if out.nlevels == 1:
                    # We had all tuples of length-one, which are
                    # better represented as a regular Index.
                    out = out.get_level_values(0)
                return out
            else:
                return Index(result, name=name)
        else:
            index = self._orig.index
            if expand:
                cons = self._orig._constructor_expanddim
                return cons(result, columns=name, index=index)
            else:
                # Must be a Series
                cons = self._orig._constructor
                return cons(result, name=name, index=index)
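
    # Note (added for clarity, not in the pandas source): _wrap_result maps
    # the raw result computed on self._parent back onto the original object:
    # it re-expands categoricals via their codes, chooses Series, Index,
    # MultiIndex or DataFrame based on `expand` and the type of self._orig,
    # and propagates the name/index of the caller.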
  1593. def _get_series_list(self, others, ignore_index=False):
  1594. """
  1595. Auxiliary function for :meth:`str.cat`. Turn potentially mixed input
  1596. into a list of Series (elements without an index must match the length
  1597. of the calling Series/Index).
  1598. Parameters
  1599. ----------
  1600. others : Series, Index, DataFrame, np.ndarray, list-like or list-like
  1601. of objects that are Series, Index or np.ndarray (1-dim)
  1602. ignore_index : boolean, default False
  1603. Determines whether to forcefully align others with index of caller
  1604. Returns
  1605. -------
  1606. tuple : (others transformed into list of Series,
  1607. boolean whether FutureWarning should be raised)
  1608. """
  1609. # Once str.cat defaults to alignment, this function can be simplified;
  1610. # will not need `ignore_index` and the second boolean output anymore
  1611. from pandas import Index, Series, DataFrame
  1612. # self._orig is either Series or Index
  1613. idx = self._orig if isinstance(self._orig, Index) else self._orig.index
1614. err_msg = ('others must be Series, Index, DataFrame, np.ndarray or '
  1615. 'list-like (either containing only strings or containing '
  1616. 'only objects of type Series/Index/list-like/np.ndarray)')
  1617. # Generally speaking, all objects without an index inherit the index
  1618. # `idx` of the calling Series/Index - i.e. must have matching length.
  1619. # Objects with an index (i.e. Series/Index/DataFrame) keep their own
  1620. # index, *unless* ignore_index is set to True.
  1621. if isinstance(others, Series):
  1622. warn = not others.index.equals(idx)
  1623. # only reconstruct Series when absolutely necessary
  1624. los = [Series(others.values, index=idx)
  1625. if ignore_index and warn else others]
  1626. return (los, warn)
  1627. elif isinstance(others, Index):
  1628. warn = not others.equals(idx)
  1629. los = [Series(others.values,
  1630. index=(idx if ignore_index else others))]
  1631. return (los, warn)
  1632. elif isinstance(others, DataFrame):
  1633. warn = not others.index.equals(idx)
  1634. if ignore_index and warn:
  1635. # without copy, this could change "others"
  1636. # that was passed to str.cat
  1637. others = others.copy()
  1638. others.index = idx
  1639. return ([others[x] for x in others], warn)
  1640. elif isinstance(others, np.ndarray) and others.ndim == 2:
  1641. others = DataFrame(others, index=idx)
  1642. return ([others[x] for x in others], False)
  1643. elif is_list_like(others, allow_sets=False):
  1644. others = list(others) # ensure iterators do not get read twice etc
  1645. # in case of list-like `others`, all elements must be
  1646. # either one-dimensional list-likes or scalars
  1647. if all(is_list_like(x, allow_sets=False) for x in others):
  1648. los = []
  1649. join_warn = False
  1650. depr_warn = False
  1651. # iterate through list and append list of series for each
  1652. # element (which we check to be one-dimensional and non-nested)
  1653. while others:
  1654. nxt = others.pop(0) # nxt is guaranteed list-like by above
  1655. # GH 21950 - DeprecationWarning
  1656. # only allowing Series/Index/np.ndarray[1-dim] will greatly
1657. # simplify this function post-deprecation.
  1658. if not (isinstance(nxt, (Series, Index)) or
  1659. (isinstance(nxt, np.ndarray) and nxt.ndim == 1)):
  1660. depr_warn = True
  1661. if not isinstance(nxt, (DataFrame, Series,
  1662. Index, np.ndarray)):
  1663. # safety for non-persistent list-likes (e.g. iterators)
  1664. # do not map indexed/typed objects; info needed below
  1665. nxt = list(nxt)
  1666. # known types for which we can avoid deep inspection
  1667. no_deep = ((isinstance(nxt, np.ndarray) and nxt.ndim == 1)
  1668. or isinstance(nxt, (Series, Index)))
  1669. # nested list-likes are forbidden:
  1670. # -> elements of nxt must not be list-like
  1671. is_legal = ((no_deep and nxt.dtype == object)
  1672. or all(not is_list_like(x) for x in nxt))
  1673. # DataFrame is false positive of is_legal
  1674. # because "x in df" returns column names
  1675. if not is_legal or isinstance(nxt, DataFrame):
  1676. raise TypeError(err_msg)
  1677. nxt, wnx = self._get_series_list(nxt,
  1678. ignore_index=ignore_index)
  1679. los = los + nxt
  1680. join_warn = join_warn or wnx
  1681. if depr_warn:
  1682. warnings.warn('list-likes other than Series, Index, or '
  1683. 'np.ndarray WITHIN another list-like are '
  1684. 'deprecated and will be removed in a future '
  1685. 'version.', FutureWarning, stacklevel=3)
  1686. return (los, join_warn)
  1687. elif all(not is_list_like(x) for x in others):
  1688. return ([Series(others, index=idx)], False)
  1689. raise TypeError(err_msg)
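# [Editor's note] Rough sketch of this helper's contract; the call below is
# for illustration only (it is a private method) and uses made-up data. A
# single already-aligned Series comes back unchanged, together with a flag
# saying whether the alignment FutureWarning is needed.
# >>> s = pd.Series(['a', 'b'])
# >>> los, warn = s.str._get_series_list(pd.Series(['x', 'y']))
# >>> len(los), warn
# (1, False)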
  1690. def cat(self, others=None, sep=None, na_rep=None, join=None):
  1691. """
  1692. Concatenate strings in the Series/Index with given separator.
  1693. If `others` is specified, this function concatenates the Series/Index
  1694. and elements of `others` element-wise.
  1695. If `others` is not passed, then all values in the Series/Index are
  1696. concatenated into a single string with a given `sep`.
  1697. Parameters
  1698. ----------
1699. others : Series, Index, DataFrame, np.ndarray or list-like
  1700. Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
  1701. other list-likes of strings must have the same length as the
  1702. calling Series/Index, with the exception of indexed objects (i.e.
  1703. Series/Index/DataFrame) if `join` is not None.
  1704. If others is a list-like that contains a combination of Series,
  1705. Index or np.ndarray (1-dim), then all elements will be unpacked and
  1706. must satisfy the above criteria individually.
  1707. If others is None, the method returns the concatenation of all
  1708. strings in the calling Series/Index.
  1709. sep : str, default ''
  1710. The separator between the different elements/columns. By default
  1711. the empty string `''` is used.
  1712. na_rep : str or None, default None
  1713. Representation that is inserted for all missing values:
  1714. - If `na_rep` is None, and `others` is None, missing values in the
  1715. Series/Index are omitted from the result.
  1716. - If `na_rep` is None, and `others` is not None, a row containing a
  1717. missing value in any of the columns (before concatenation) will
  1718. have a missing value in the result.
  1719. join : {'left', 'right', 'outer', 'inner'}, default None
  1720. Determines the join-style between the calling Series/Index and any
  1721. Series/Index/DataFrame in `others` (objects without an index need
  1722. to match the length of the calling Series/Index). If None,
  1723. alignment is disabled, but this option will be removed in a future
  1724. version of pandas and replaced with a default of `'left'`. To
  1725. disable alignment, use `.values` on any Series/Index/DataFrame in
  1726. `others`.
  1727. .. versionadded:: 0.23.0
  1728. Returns
  1729. -------
  1730. str, Series or Index
  1731. If `others` is None, `str` is returned, otherwise a `Series/Index`
  1732. (same type as caller) of objects is returned.
  1733. See Also
  1734. --------
  1735. split : Split each string in the Series/Index.
  1736. join : Join lists contained as elements in the Series/Index.
  1737. Examples
  1738. --------
  1739. When not passing `others`, all values are concatenated into a single
  1740. string:
  1741. >>> s = pd.Series(['a', 'b', np.nan, 'd'])
  1742. >>> s.str.cat(sep=' ')
  1743. 'a b d'
  1744. By default, NA values in the Series are ignored. Using `na_rep`, they
  1745. can be given a representation:
  1746. >>> s.str.cat(sep=' ', na_rep='?')
  1747. 'a b ? d'
  1748. If `others` is specified, corresponding values are concatenated with
  1749. the separator. Result will be a Series of strings.
  1750. >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
  1751. 0 a,A
  1752. 1 b,B
  1753. 2 NaN
  1754. 3 d,D
  1755. dtype: object
  1756. Missing values will remain missing in the result, but can again be
  1757. represented using `na_rep`
  1758. >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
  1759. 0 a,A
  1760. 1 b,B
  1761. 2 -,C
  1762. 3 d,D
  1763. dtype: object
  1764. If `sep` is not specified, the values are concatenated without
  1765. separation.
  1766. >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
  1767. 0 aA
  1768. 1 bB
  1769. 2 -C
  1770. 3 dD
  1771. dtype: object
  1772. Series with different indexes can be aligned before concatenation. The
  1773. `join`-keyword works as in other methods.
  1774. >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
  1775. >>> s.str.cat(t, join='left', na_rep='-')
  1776. 0 aa
  1777. 1 b-
  1778. 2 -c
  1779. 3 dd
  1780. dtype: object
  1781. >>>
  1782. >>> s.str.cat(t, join='outer', na_rep='-')
  1783. 0 aa
  1784. 1 b-
  1785. 2 -c
  1786. 3 dd
  1787. 4 -e
  1788. dtype: object
  1789. >>>
  1790. >>> s.str.cat(t, join='inner', na_rep='-')
  1791. 0 aa
  1792. 2 -c
  1793. 3 dd
  1794. dtype: object
  1795. >>>
  1796. >>> s.str.cat(t, join='right', na_rep='-')
  1797. 3 dd
  1798. 0 aa
  1799. 4 -e
  1800. 2 -c
  1801. dtype: object
  1802. For more examples, see :ref:`here <text.concatenate>`.
  1803. """
  1804. from pandas import Index, Series, concat
  1805. if isinstance(others, compat.string_types):
  1806. raise ValueError("Did you mean to supply a `sep` keyword?")
  1807. if sep is None:
  1808. sep = ''
  1809. if isinstance(self._orig, Index):
  1810. data = Series(self._orig, index=self._orig)
  1811. else: # Series
  1812. data = self._orig
  1813. # concatenate Series/Index with itself if no "others"
  1814. if others is None:
  1815. data = ensure_object(data)
  1816. na_mask = isna(data)
  1817. if na_rep is None and na_mask.any():
  1818. data = data[~na_mask]
  1819. elif na_rep is not None and na_mask.any():
  1820. data = np.where(na_mask, na_rep, data)
  1821. return sep.join(data)
  1822. try:
  1823. # turn anything in "others" into lists of Series
  1824. others, warn = self._get_series_list(others,
  1825. ignore_index=(join is None))
  1826. except ValueError: # do not catch TypeError raised by _get_series_list
  1827. if join is None:
  1828. raise ValueError('All arrays must be same length, except '
  1829. 'those having an index if `join` is not None')
  1830. else:
  1831. raise ValueError('If `others` contains arrays or lists (or '
  1832. 'other list-likes without an index), these '
  1833. 'must all be of the same length as the '
  1834. 'calling Series/Index.')
  1835. if join is None and warn:
  1836. warnings.warn("A future version of pandas will perform index "
  1837. "alignment when `others` is a Series/Index/"
  1838. "DataFrame (or a list-like containing one). To "
  1839. "disable alignment (the behavior before v.0.23) and "
  1840. "silence this warning, use `.values` on any Series/"
  1841. "Index/DataFrame in `others`. To enable alignment "
  1842. "and silence this warning, pass `join='left'|"
  1843. "'outer'|'inner'|'right'`. The future default will "
  1844. "be `join='left'`.", FutureWarning, stacklevel=2)
  1845. # if join is None, _get_series_list already force-aligned indexes
  1846. join = 'left' if join is None else join
  1847. # align if required
  1848. if any(not data.index.equals(x.index) for x in others):
  1849. # Need to add keys for uniqueness in case of duplicate columns
  1850. others = concat(others, axis=1,
  1851. join=(join if join == 'inner' else 'outer'),
  1852. keys=range(len(others)), sort=False, copy=False)
  1853. data, others = data.align(others, join=join)
  1854. others = [others[x] for x in others] # again list of Series
  1855. all_cols = [ensure_object(x) for x in [data] + others]
  1856. na_masks = np.array([isna(x) for x in all_cols])
  1857. union_mask = np.logical_or.reduce(na_masks, axis=0)
  1858. if na_rep is None and union_mask.any():
  1859. # no na_rep means NaNs for all rows where any column has a NaN
  1860. # only necessary if there are actually any NaNs
  1861. result = np.empty(len(data), dtype=object)
  1862. np.putmask(result, union_mask, np.nan)
  1863. not_masked = ~union_mask
  1864. result[not_masked] = cat_core([x[not_masked] for x in all_cols],
  1865. sep)
  1866. elif na_rep is not None and union_mask.any():
  1867. # fill NaNs with na_rep in case there are actually any NaNs
  1868. all_cols = [np.where(nm, na_rep, col)
  1869. for nm, col in zip(na_masks, all_cols)]
  1870. result = cat_core(all_cols, sep)
  1871. else:
  1872. # no NaNs - can just concatenate
  1873. result = cat_core(all_cols, sep)
  1874. if isinstance(self._orig, Index):
  1875. # add dtype for case that result is all-NA
  1876. result = Index(result, dtype=object, name=self._orig.name)
  1877. else: # Series
  1878. result = Series(result, dtype=object, index=data.index,
  1879. name=self._orig.name)
  1880. return result
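# [Editor's note] A small sketch (made-up data) of the NaN handling paths
# above: without ``na_rep`` a missing value in any column makes the whole
# row missing; with ``na_rep`` the missing cell is filled before cat_core
# concatenates the columns.
# >>> s = pd.Series(['a', np.nan])
# >>> s.str.cat(['x', 'y']).tolist()
# ['ax', nan]
# >>> s.str.cat(['x', 'y'], na_rep='-').tolist()
# ['ax', '-y']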
  1881. _shared_docs['str_split'] = ("""
  1882. Split strings around given separator/delimiter.
  1883. Splits the string in the Series/Index from the %(side)s,
  1884. at the specified delimiter string. Equivalent to :meth:`str.%(method)s`.
  1885. Parameters
  1886. ----------
  1887. pat : str, optional
  1888. String or regular expression to split on.
  1889. If not specified, split on whitespace.
  1890. n : int, default -1 (all)
  1891. Limit number of splits in output.
  1892. ``None``, 0 and -1 will be interpreted as return all splits.
  1893. expand : bool, default False
1894. Expand the split strings into separate columns.
  1895. * If ``True``, return DataFrame/MultiIndex expanding dimensionality.
  1896. * If ``False``, return Series/Index, containing lists of strings.
  1897. Returns
  1898. -------
  1899. Series, Index, DataFrame or MultiIndex
  1900. Type matches caller unless ``expand=True`` (see Notes).
  1901. See Also
  1902. --------
  1903. Series.str.split : Split strings around given separator/delimiter.
  1904. Series.str.rsplit : Splits string around given separator/delimiter,
  1905. starting from the right.
  1906. Series.str.join : Join lists contained as elements in the Series/Index
  1907. with passed delimiter.
  1908. str.split : Standard library version for split.
  1909. str.rsplit : Standard library version for rsplit.
  1910. Notes
  1911. -----
  1912. The handling of the `n` keyword depends on the number of found splits:
  1913. - If found splits > `n`, make first `n` splits only
  1914. - If found splits <= `n`, make all splits
  1915. - If for a certain row the number of found splits < `n`,
  1916. append `None` for padding up to `n` if ``expand=True``
  1917. If using ``expand=True``, Series and Index callers return DataFrame and
  1918. MultiIndex objects, respectively.
  1919. Examples
  1920. --------
  1921. >>> s = pd.Series(["this is a regular sentence",
  1922. "https://docs.python.org/3/tutorial/index.html", np.nan])
  1923. In the default setting, the string is split by whitespace.
  1924. >>> s.str.split()
  1925. 0 [this, is, a, regular, sentence]
  1926. 1 [https://docs.python.org/3/tutorial/index.html]
  1927. 2 NaN
  1928. dtype: object
  1929. Without the `n` parameter, the outputs of `rsplit` and `split`
  1930. are identical.
  1931. >>> s.str.rsplit()
  1932. 0 [this, is, a, regular, sentence]
  1933. 1 [https://docs.python.org/3/tutorial/index.html]
  1934. 2 NaN
  1935. dtype: object
  1936. The `n` parameter can be used to limit the number of splits on the
  1937. delimiter. The outputs of `split` and `rsplit` are different.
  1938. >>> s.str.split(n=2)
  1939. 0 [this, is, a regular sentence]
  1940. 1 [https://docs.python.org/3/tutorial/index.html]
  1941. 2 NaN
  1942. dtype: object
  1943. >>> s.str.rsplit(n=2)
  1944. 0 [this is a, regular, sentence]
  1945. 1 [https://docs.python.org/3/tutorial/index.html]
  1946. 2 NaN
  1947. dtype: object
  1948. The `pat` parameter can be used to split by other characters.
  1949. >>> s.str.split(pat = "/")
  1950. 0 [this is a regular sentence]
  1951. 1 [https:, , docs.python.org, 3, tutorial, index...
  1952. 2 NaN
  1953. dtype: object
  1954. When using ``expand=True``, the split elements will expand out into
  1955. separate columns. If NaN is present, it is propagated throughout
  1956. the columns during the split.
  1957. >>> s.str.split(expand=True)
  1958. 0 1 2 3
  1959. 0 this is a regular
  1960. 1 https://docs.python.org/3/tutorial/index.html None None None
  1961. 2 NaN NaN NaN NaN \
  1962. 4
  1963. 0 sentence
  1964. 1 None
  1965. 2 NaN
  1966. For slightly more complex use cases like splitting the html document name
  1967. from a url, a combination of parameter settings can be used.
  1968. >>> s.str.rsplit("/", n=1, expand=True)
  1969. 0 1
  1970. 0 this is a regular sentence None
  1971. 1 https://docs.python.org/3/tutorial index.html
  1972. 2 NaN NaN
  1973. """)
  1974. @Appender(_shared_docs['str_split'] % {
  1975. 'side': 'beginning',
  1976. 'method': 'split'})
  1977. def split(self, pat=None, n=-1, expand=False):
  1978. result = str_split(self._parent, pat, n=n)
  1979. return self._wrap_result(result, expand=expand)
  1980. @Appender(_shared_docs['str_split'] % {
  1981. 'side': 'end',
  1982. 'method': 'rsplit'})
  1983. def rsplit(self, pat=None, n=-1, expand=False):
  1984. result = str_rsplit(self._parent, pat, n=n)
  1985. return self._wrap_result(result, expand=expand)
  1986. _shared_docs['str_partition'] = ("""
  1987. Split the string at the %(side)s occurrence of `sep`.
  1988. This method splits the string at the %(side)s occurrence of `sep`,
  1989. and returns 3 elements containing the part before the separator,
  1990. the separator itself, and the part after the separator.
  1991. If the separator is not found, return %(return)s.
  1992. Parameters
  1993. ----------
  1994. sep : str, default whitespace
  1995. String to split on.
  1996. pat : str, default whitespace
  1997. .. deprecated:: 0.24.0
  1998. Use ``sep`` instead
  1999. expand : bool, default True
  2000. If True, return DataFrame/MultiIndex expanding dimensionality.
  2001. If False, return Series/Index.
  2002. Returns
  2003. -------
  2004. DataFrame/MultiIndex or Series/Index of objects
  2005. See Also
  2006. --------
  2007. %(also)s
  2008. Series.str.split : Split strings around given separators.
  2009. str.partition : Standard library version.
  2010. Examples
  2011. --------
  2012. >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
  2013. >>> s
  2014. 0 Linda van der Berg
  2015. 1 George Pitt-Rivers
  2016. dtype: object
  2017. >>> s.str.partition()
  2018. 0 1 2
  2019. 0 Linda van der Berg
  2020. 1 George Pitt-Rivers
  2021. To partition by the last space instead of the first one:
  2022. >>> s.str.rpartition()
  2023. 0 1 2
  2024. 0 Linda van der Berg
  2025. 1 George Pitt-Rivers
  2026. To partition by something different than a space:
  2027. >>> s.str.partition('-')
  2028. 0 1 2
  2029. 0 Linda van der Berg
  2030. 1 George Pitt - Rivers
2031. To return a Series containing tuples instead of a DataFrame:
  2032. >>> s.str.partition('-', expand=False)
  2033. 0 (Linda van der Berg, , )
  2034. 1 (George Pitt, -, Rivers)
  2035. dtype: object
  2036. Also available on indices:
  2037. >>> idx = pd.Index(['X 123', 'Y 999'])
  2038. >>> idx
  2039. Index(['X 123', 'Y 999'], dtype='object')
  2040. Which will create a MultiIndex:
  2041. >>> idx.str.partition()
  2042. MultiIndex(levels=[['X', 'Y'], [' '], ['123', '999']],
  2043. codes=[[0, 1], [0, 0], [0, 1]])
  2044. Or an index with tuples with ``expand=False``:
  2045. >>> idx.str.partition(expand=False)
  2046. Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
  2047. """)
  2048. @Appender(_shared_docs['str_partition'] % {
  2049. 'side': 'first',
  2050. 'return': '3 elements containing the string itself, followed by two '
  2051. 'empty strings',
  2052. 'also': 'rpartition : Split the string at the last occurrence of '
  2053. '`sep`.'
  2054. })
  2055. @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep')
  2056. def partition(self, sep=' ', expand=True):
  2057. f = lambda x: x.partition(sep)
  2058. result = _na_map(f, self._parent)
  2059. return self._wrap_result(result, expand=expand)
  2060. @Appender(_shared_docs['str_partition'] % {
  2061. 'side': 'last',
  2062. 'return': '3 elements containing two empty strings, followed by the '
  2063. 'string itself',
  2064. 'also': 'partition : Split the string at the first occurrence of '
  2065. '`sep`.'
  2066. })
  2067. @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep')
  2068. def rpartition(self, sep=' ', expand=True):
  2069. f = lambda x: x.rpartition(sep)
  2070. result = _na_map(f, self._parent)
  2071. return self._wrap_result(result, expand=expand)
  2072. @copy(str_get)
  2073. def get(self, i):
  2074. result = str_get(self._parent, i)
  2075. return self._wrap_result(result)
  2076. @copy(str_join)
  2077. def join(self, sep):
  2078. result = str_join(self._parent, sep)
  2079. return self._wrap_result(result)
  2080. @copy(str_contains)
  2081. def contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
  2082. result = str_contains(self._parent, pat, case=case, flags=flags, na=na,
  2083. regex=regex)
  2084. return self._wrap_result(result, fill_value=na)
  2085. @copy(str_match)
  2086. def match(self, pat, case=True, flags=0, na=np.nan):
  2087. result = str_match(self._parent, pat, case=case, flags=flags, na=na)
  2088. return self._wrap_result(result, fill_value=na)
  2089. @copy(str_replace)
  2090. def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
  2091. result = str_replace(self._parent, pat, repl, n=n, case=case,
  2092. flags=flags, regex=regex)
  2093. return self._wrap_result(result)
  2094. @copy(str_repeat)
  2095. def repeat(self, repeats):
  2096. result = str_repeat(self._parent, repeats)
  2097. return self._wrap_result(result)
  2098. @copy(str_pad)
  2099. def pad(self, width, side='left', fillchar=' '):
  2100. result = str_pad(self._parent, width, side=side, fillchar=fillchar)
  2101. return self._wrap_result(result)
  2102. _shared_docs['str_pad'] = ("""
  2103. Filling %(side)s side of strings in the Series/Index with an
  2104. additional character. Equivalent to :meth:`str.%(method)s`.
  2105. Parameters
  2106. ----------
  2107. width : int
  2108. Minimum width of resulting string; additional characters will be filled
  2109. with ``fillchar``
  2110. fillchar : str
  2111. Additional character for filling, default is whitespace
  2112. Returns
  2113. -------
  2114. filled : Series/Index of objects
  2115. """)
  2116. @Appender(_shared_docs['str_pad'] % dict(side='left and right',
  2117. method='center'))
  2118. def center(self, width, fillchar=' '):
  2119. return self.pad(width, side='both', fillchar=fillchar)
  2120. @Appender(_shared_docs['str_pad'] % dict(side='right', method='ljust'))
  2121. def ljust(self, width, fillchar=' '):
  2122. return self.pad(width, side='right', fillchar=fillchar)
  2123. @Appender(_shared_docs['str_pad'] % dict(side='left', method='rjust'))
  2124. def rjust(self, width, fillchar=' '):
  2125. return self.pad(width, side='left', fillchar=fillchar)
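# [Editor's note] center/ljust/rjust above are thin wrappers that map to
# pad(side='both'/'right'/'left'); a one-line sketch with made-up data:
# >>> pd.Series(['cat']).str.pad(5, side='both', fillchar='*').tolist()
# ['*cat*']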
  2126. def zfill(self, width):
  2127. """
  2128. Pad strings in the Series/Index by prepending '0' characters.
  2129. Strings in the Series/Index are padded with '0' characters on the
  2130. left of the string to reach a total string length `width`. Strings
  2131. in the Series/Index with length greater or equal to `width` are
  2132. unchanged.
  2133. Parameters
  2134. ----------
  2135. width : int
  2136. Minimum length of resulting string; strings with length less
2137. than `width` will be prepended with '0' characters.
  2138. Returns
  2139. -------
  2140. Series/Index of objects
  2141. See Also
  2142. --------
  2143. Series.str.rjust : Fills the left side of strings with an arbitrary
  2144. character.
  2145. Series.str.ljust : Fills the right side of strings with an arbitrary
  2146. character.
  2147. Series.str.pad : Fills the specified sides of strings with an arbitrary
  2148. character.
2149. Series.str.center : Fills both sides of strings with an arbitrary
  2150. character.
  2151. Notes
  2152. -----
  2153. Differs from :meth:`str.zfill` which has special handling
  2154. for '+'/'-' in the string.
  2155. Examples
  2156. --------
  2157. >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])
  2158. >>> s
  2159. 0 -1
  2160. 1 1
  2161. 2 1000
  2162. 3 10
  2163. 4 NaN
  2164. dtype: object
  2165. Note that ``10`` and ``NaN`` are not strings, therefore they are
  2166. converted to ``NaN``. The minus sign in ``'-1'`` is treated as a
  2167. regular character and the zero is added to the left of it
  2168. (:meth:`str.zfill` would have moved it to the left). ``1000``
  2169. remains unchanged as it is longer than `width`.
  2170. >>> s.str.zfill(3)
  2171. 0 0-1
  2172. 1 001
  2173. 2 1000
  2174. 3 NaN
  2175. 4 NaN
  2176. dtype: object
  2177. """
  2178. result = str_pad(self._parent, width, side='left', fillchar='0')
  2179. return self._wrap_result(result)
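# [Editor's note] Minimal illustration of the Notes section of ``zfill``
# (assumes standard CPython behaviour for the built-in): the built-in keeps
# a leading sign in place, while this method pads blindly on the left.
# >>> '-1'.zfill(3)
# '-01'
# >>> pd.Series(['-1']).str.zfill(3).tolist()
# ['0-1']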
  2180. @copy(str_slice)
  2181. def slice(self, start=None, stop=None, step=None):
  2182. result = str_slice(self._parent, start, stop, step)
  2183. return self._wrap_result(result)
  2184. @copy(str_slice_replace)
  2185. def slice_replace(self, start=None, stop=None, repl=None):
  2186. result = str_slice_replace(self._parent, start, stop, repl)
  2187. return self._wrap_result(result)
  2188. @copy(str_decode)
  2189. def decode(self, encoding, errors="strict"):
  2190. result = str_decode(self._parent, encoding, errors)
  2191. return self._wrap_result(result)
  2192. @copy(str_encode)
  2193. def encode(self, encoding, errors="strict"):
  2194. result = str_encode(self._parent, encoding, errors)
  2195. return self._wrap_result(result)
  2196. _shared_docs['str_strip'] = (r"""
  2197. Remove leading and trailing characters.
  2198. Strip whitespaces (including newlines) or a set of specified characters
  2199. from each string in the Series/Index from %(side)s.
  2200. Equivalent to :meth:`str.%(method)s`.
  2201. Parameters
  2202. ----------
  2203. to_strip : str or None, default None
  2204. Specifying the set of characters to be removed.
  2205. All combinations of this set of characters will be stripped.
  2206. If None then whitespaces are removed.
  2207. Returns
  2208. -------
  2209. Series/Index of objects
  2210. See Also
  2211. --------
  2212. Series.str.strip : Remove leading and trailing characters in Series/Index.
  2213. Series.str.lstrip : Remove leading characters in Series/Index.
  2214. Series.str.rstrip : Remove trailing characters in Series/Index.
  2215. Examples
  2216. --------
  2217. >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan])
  2218. >>> s
  2219. 0 1. Ant.
  2220. 1 2. Bee!\n
  2221. 2 3. Cat?\t
  2222. 3 NaN
  2223. dtype: object
  2224. >>> s.str.strip()
  2225. 0 1. Ant.
  2226. 1 2. Bee!
  2227. 2 3. Cat?
  2228. 3 NaN
  2229. dtype: object
  2230. >>> s.str.lstrip('123.')
  2231. 0 Ant.
  2232. 1 Bee!\n
  2233. 2 Cat?\t
  2234. 3 NaN
  2235. dtype: object
  2236. >>> s.str.rstrip('.!? \n\t')
  2237. 0 1. Ant
  2238. 1 2. Bee
  2239. 2 3. Cat
  2240. 3 NaN
  2241. dtype: object
  2242. >>> s.str.strip('123.!? \n\t')
  2243. 0 Ant
  2244. 1 Bee
  2245. 2 Cat
  2246. 3 NaN
  2247. dtype: object
  2248. """)
  2249. @Appender(_shared_docs['str_strip'] % dict(side='left and right sides',
  2250. method='strip'))
  2251. def strip(self, to_strip=None):
  2252. result = str_strip(self._parent, to_strip, side='both')
  2253. return self._wrap_result(result)
  2254. @Appender(_shared_docs['str_strip'] % dict(side='left side',
  2255. method='lstrip'))
  2256. def lstrip(self, to_strip=None):
  2257. result = str_strip(self._parent, to_strip, side='left')
  2258. return self._wrap_result(result)
  2259. @Appender(_shared_docs['str_strip'] % dict(side='right side',
  2260. method='rstrip'))
  2261. def rstrip(self, to_strip=None):
  2262. result = str_strip(self._parent, to_strip, side='right')
  2263. return self._wrap_result(result)
  2264. @copy(str_wrap)
  2265. def wrap(self, width, **kwargs):
  2266. result = str_wrap(self._parent, width, **kwargs)
  2267. return self._wrap_result(result)
  2268. @copy(str_get_dummies)
  2269. def get_dummies(self, sep='|'):
  2270. # we need to cast to Series of strings as only that has all
  2271. # methods available for making the dummies...
  2272. data = self._orig.astype(str) if self._is_categorical else self._parent
  2273. result, name = str_get_dummies(data, sep)
  2274. return self._wrap_result(result, use_codes=(not self._is_categorical),
  2275. name=name, expand=True)
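# [Editor's note] Usage sketch with made-up data: each ``sep``-separated
# token becomes an indicator column, and missing values yield all-zero rows.
# >>> pd.Series(['a|b', 'a', np.nan]).str.get_dummies(sep='|')
#    a  b
# 0  1  1
# 1  1  0
# 2  0  0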
  2276. @copy(str_translate)
  2277. def translate(self, table, deletechars=None):
  2278. result = str_translate(self._parent, table, deletechars)
  2279. return self._wrap_result(result)
  2280. count = _pat_wrapper(str_count, flags=True)
  2281. startswith = _pat_wrapper(str_startswith, na=True)
  2282. endswith = _pat_wrapper(str_endswith, na=True)
  2283. findall = _pat_wrapper(str_findall, flags=True)
  2284. @copy(str_extract)
  2285. def extract(self, pat, flags=0, expand=True):
  2286. return str_extract(self, pat, flags=flags, expand=expand)
  2287. @copy(str_extractall)
  2288. def extractall(self, pat, flags=0):
  2289. return str_extractall(self._orig, pat, flags=flags)
  2290. _shared_docs['find'] = ("""
2291. Return %(side)s indexes in each string in the Series/Index
  2292. where the substring is fully contained between [start:end].
  2293. Return -1 on failure. Equivalent to standard :meth:`str.%(method)s`.
  2294. Parameters
  2295. ----------
  2296. sub : str
  2297. Substring being searched
  2298. start : int
  2299. Left edge index
  2300. end : int
  2301. Right edge index
  2302. Returns
  2303. -------
  2304. found : Series/Index of integer values
  2305. See Also
  2306. --------
  2307. %(also)s
  2308. """)
  2309. @Appender(_shared_docs['find'] %
  2310. dict(side='lowest', method='find',
2311. also='rfind : Return highest indexes in each string.'))
  2312. def find(self, sub, start=0, end=None):
  2313. result = str_find(self._parent, sub, start=start, end=end, side='left')
  2314. return self._wrap_result(result)
  2315. @Appender(_shared_docs['find'] %
  2316. dict(side='highest', method='rfind',
2317. also='find : Return lowest indexes in each string.'))
  2318. def rfind(self, sub, start=0, end=None):
  2319. result = str_find(self._parent, sub,
  2320. start=start, end=end, side='right')
  2321. return self._wrap_result(result)
  2322. def normalize(self, form):
  2323. """
  2324. Return the Unicode normal form for the strings in the Series/Index.
  2325. For more information on the forms, see the
  2326. :func:`unicodedata.normalize`.
  2327. Parameters
  2328. ----------
  2329. form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
  2330. Unicode form
  2331. Returns
  2332. -------
  2333. normalized : Series/Index of objects
  2334. """
  2335. import unicodedata
  2336. f = lambda x: unicodedata.normalize(form, compat.u_safe(x))
  2337. result = _na_map(f, self._parent)
  2338. return self._wrap_result(result)
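# [Editor's note] Illustrative example (made-up data): NFKC folds
# compatibility characters such as fullwidth letters into their ASCII
# equivalents.
# >>> pd.Series([u'\uff21\uff22\uff23']).str.normalize('NFKC').tolist()
# ['ABC']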
  2339. _shared_docs['index'] = ("""
2340. Return %(side)s indexes in each string where the substring is
  2341. fully contained between [start:end]. This is the same as
  2342. ``str.%(similar)s`` except instead of returning -1, it raises a ValueError
  2343. when the substring is not found. Equivalent to standard ``str.%(method)s``.
  2344. Parameters
  2345. ----------
  2346. sub : str
  2347. Substring being searched
  2348. start : int
  2349. Left edge index
  2350. end : int
  2351. Right edge index
  2352. Returns
  2353. -------
  2354. found : Series/Index of objects
  2355. See Also
  2356. --------
  2357. %(also)s
  2358. """)
  2359. @Appender(_shared_docs['index'] %
  2360. dict(side='lowest', similar='find', method='index',
2361. also='rindex : Return highest indexes in each string.'))
  2362. def index(self, sub, start=0, end=None):
  2363. result = str_index(self._parent, sub,
  2364. start=start, end=end, side='left')
  2365. return self._wrap_result(result)
  2366. @Appender(_shared_docs['index'] %
  2367. dict(side='highest', similar='rfind', method='rindex',
2368. also='index : Return lowest indexes in each string.'))
  2369. def rindex(self, sub, start=0, end=None):
  2370. result = str_index(self._parent, sub,
  2371. start=start, end=end, side='right')
  2372. return self._wrap_result(result)
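# [Editor's note] Sketch of the difference called out in the shared
# docstring (made-up data): ``find`` returns -1 for a missing substring,
# while ``index`` propagates the ValueError from str.index.
# >>> s = pd.Series(['apple'])
# >>> s.str.find('z').tolist()
# [-1]
# >>> s.str.index('z')
# Traceback (most recent call last):
#     ...
# ValueError: substring not found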
  2373. _shared_docs['len'] = ("""
  2374. Compute the length of each element in the Series/Index. The element may be
  2375. a sequence (such as a string, tuple or list) or a collection
  2376. (such as a dictionary).
  2377. Returns
  2378. -------
  2379. Series or Index of int
  2380. A Series or Index of integer values indicating the length of each
  2381. element in the Series or Index.
  2382. See Also
  2383. --------
  2384. str.len : Python built-in function returning the length of an object.
  2385. Series.size : Returns the length of the Series.
  2386. Examples
  2387. --------
  2388. Returns the length (number of characters) in a string. Returns the
  2389. number of entries for dictionaries, lists or tuples.
  2390. >>> s = pd.Series(['dog',
  2391. ... '',
  2392. ... 5,
  2393. ... {'foo' : 'bar'},
  2394. ... [2, 3, 5, 7],
  2395. ... ('one', 'two', 'three')])
  2396. >>> s
  2397. 0 dog
  2398. 1
  2399. 2 5
  2400. 3 {'foo': 'bar'}
  2401. 4 [2, 3, 5, 7]
  2402. 5 (one, two, three)
  2403. dtype: object
  2404. >>> s.str.len()
  2405. 0 3.0
  2406. 1 0.0
  2407. 2 NaN
  2408. 3 1.0
  2409. 4 4.0
  2410. 5 3.0
  2411. dtype: float64
  2412. """)
  2413. len = _noarg_wrapper(len, docstring=_shared_docs['len'], dtype=int)
  2414. _shared_docs['casemethods'] = ("""
  2415. Convert strings in the Series/Index to %(type)s.
  2416. %(version)s
  2417. Equivalent to :meth:`str.%(method)s`.
  2418. Returns
  2419. -------
  2420. Series/Index of objects
  2421. See Also
  2422. --------
  2423. Series.str.lower : Converts all characters to lowercase.
  2424. Series.str.upper : Converts all characters to uppercase.
  2425. Series.str.title : Converts first character of each word to uppercase and
  2426. remaining to lowercase.
  2427. Series.str.capitalize : Converts first character to uppercase and
  2428. remaining to lowercase.
  2429. Series.str.swapcase : Converts uppercase to lowercase and lowercase to
  2430. uppercase.
2431. Series.str.casefold : Removes all case distinctions in the string.
  2432. Examples
  2433. --------
  2434. >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
  2435. >>> s
  2436. 0 lower
  2437. 1 CAPITALS
  2438. 2 this is a sentence
  2439. 3 SwApCaSe
  2440. dtype: object
  2441. >>> s.str.lower()
  2442. 0 lower
  2443. 1 capitals
  2444. 2 this is a sentence
  2445. 3 swapcase
  2446. dtype: object
  2447. >>> s.str.upper()
  2448. 0 LOWER
  2449. 1 CAPITALS
  2450. 2 THIS IS A SENTENCE
  2451. 3 SWAPCASE
  2452. dtype: object
  2453. >>> s.str.title()
  2454. 0 Lower
  2455. 1 Capitals
  2456. 2 This Is A Sentence
  2457. 3 Swapcase
  2458. dtype: object
  2459. >>> s.str.capitalize()
  2460. 0 Lower
  2461. 1 Capitals
  2462. 2 This is a sentence
  2463. 3 Swapcase
  2464. dtype: object
  2465. >>> s.str.swapcase()
  2466. 0 LOWER
  2467. 1 capitals
  2468. 2 THIS IS A SENTENCE
  2469. 3 sWaPcAsE
  2470. dtype: object
  2471. """)
  2472. _shared_docs['lower'] = dict(type='lowercase', method='lower', version='')
  2473. _shared_docs['upper'] = dict(type='uppercase', method='upper', version='')
  2474. _shared_docs['title'] = dict(type='titlecase', method='title', version='')
  2475. _shared_docs['capitalize'] = dict(type='be capitalized',
  2476. method='capitalize', version='')
  2477. _shared_docs['swapcase'] = dict(type='be swapcased', method='swapcase',
  2478. version='')
  2479. _shared_docs['casefold'] = dict(type='be casefolded', method='casefold',
  2480. version='\n .. versionadded:: 0.25.0\n')
  2481. lower = _noarg_wrapper(lambda x: x.lower(),
  2482. docstring=_shared_docs['casemethods'] %
  2483. _shared_docs['lower'])
  2484. upper = _noarg_wrapper(lambda x: x.upper(),
  2485. docstring=_shared_docs['casemethods'] %
  2486. _shared_docs['upper'])
  2487. title = _noarg_wrapper(lambda x: x.title(),
  2488. docstring=_shared_docs['casemethods'] %
  2489. _shared_docs['title'])
  2490. capitalize = _noarg_wrapper(lambda x: x.capitalize(),
  2491. docstring=_shared_docs['casemethods'] %
  2492. _shared_docs['capitalize'])
  2493. swapcase = _noarg_wrapper(lambda x: x.swapcase(),
  2494. docstring=_shared_docs['casemethods'] %
  2495. _shared_docs['swapcase'])
  2496. casefold = _noarg_wrapper(lambda x: x.casefold(),
  2497. docstring=_shared_docs['casemethods'] %
  2498. _shared_docs['casefold'])
  2499. _shared_docs['ismethods'] = ("""
  2500. Check whether all characters in each string are %(type)s.
  2501. This is equivalent to running the Python string method
  2502. :meth:`str.%(method)s` for each element of the Series/Index. If a string
  2503. has zero characters, ``False`` is returned for that check.
  2504. Returns
  2505. -------
  2506. Series or Index of bool
  2507. Series or Index of boolean values with the same length as the original
  2508. Series/Index.
  2509. See Also
  2510. --------
  2511. Series.str.isalpha : Check whether all characters are alphabetic.
  2512. Series.str.isnumeric : Check whether all characters are numeric.
  2513. Series.str.isalnum : Check whether all characters are alphanumeric.
  2514. Series.str.isdigit : Check whether all characters are digits.
  2515. Series.str.isdecimal : Check whether all characters are decimal.
  2516. Series.str.isspace : Check whether all characters are whitespace.
  2517. Series.str.islower : Check whether all characters are lowercase.
  2518. Series.str.isupper : Check whether all characters are uppercase.
  2519. Series.str.istitle : Check whether all characters are titlecase.
  2520. Examples
  2521. --------
  2522. **Checks for Alphabetic and Numeric Characters**
  2523. >>> s1 = pd.Series(['one', 'one1', '1', ''])
  2524. >>> s1.str.isalpha()
  2525. 0 True
  2526. 1 False
  2527. 2 False
  2528. 3 False
  2529. dtype: bool
  2530. >>> s1.str.isnumeric()
  2531. 0 False
  2532. 1 False
  2533. 2 True
  2534. 3 False
  2535. dtype: bool
  2536. >>> s1.str.isalnum()
  2537. 0 True
  2538. 1 True
  2539. 2 True
  2540. 3 False
  2541. dtype: bool
  2542. Note that checks against characters mixed with any additional punctuation
2543. or whitespace will evaluate to ``False`` for an alphanumeric check.
  2544. >>> s2 = pd.Series(['A B', '1.5', '3,000'])
  2545. >>> s2.str.isalnum()
  2546. 0 False
  2547. 1 False
  2548. 2 False
  2549. dtype: bool
  2550. **More Detailed Checks for Numeric Characters**
  2551. There are several different but overlapping sets of numeric characters that
  2552. can be checked for.
  2553. >>> s3 = pd.Series(['23', '³', '⅕', ''])
  2554. The ``s3.str.isdecimal`` method checks for characters used to form numbers
  2555. in base 10.
  2556. >>> s3.str.isdecimal()
  2557. 0 True
  2558. 1 False
  2559. 2 False
  2560. 3 False
  2561. dtype: bool
2562. The ``s3.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
  2563. includes special digits, like superscripted and subscripted digits in
  2564. unicode.
  2565. >>> s3.str.isdigit()
  2566. 0 True
  2567. 1 True
  2568. 2 False
  2569. 3 False
  2570. dtype: bool
2571. The ``s3.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
  2572. includes other characters that can represent quantities such as unicode
  2573. fractions.
  2574. >>> s3.str.isnumeric()
  2575. 0 True
  2576. 1 True
  2577. 2 True
  2578. 3 False
  2579. dtype: bool
  2580. **Checks for Whitespace**
  2581. >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
  2582. >>> s4.str.isspace()
  2583. 0 True
  2584. 1 True
  2585. 2 False
  2586. dtype: bool
  2587. **Checks for Character Case**
  2588. >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
  2589. >>> s5.str.islower()
  2590. 0 True
  2591. 1 False
  2592. 2 False
  2593. 3 False
  2594. dtype: bool
  2595. >>> s5.str.isupper()
  2596. 0 False
  2597. 1 False
  2598. 2 True
  2599. 3 False
  2600. dtype: bool
  2601. The ``s5.str.istitle`` method checks for whether all words are in title
  2602. case (whether only the first letter of each word is capitalized). Words are
2603. assumed to be any sequence of non-numeric characters separated by
  2604. whitespace characters.
  2605. >>> s5.str.istitle()
  2606. 0 False
  2607. 1 True
  2608. 2 False
  2609. 3 False
  2610. dtype: bool
  2611. """)
  2612. _shared_docs['isalnum'] = dict(type='alphanumeric', method='isalnum')
  2613. _shared_docs['isalpha'] = dict(type='alphabetic', method='isalpha')
  2614. _shared_docs['isdigit'] = dict(type='digits', method='isdigit')
  2615. _shared_docs['isspace'] = dict(type='whitespace', method='isspace')
  2616. _shared_docs['islower'] = dict(type='lowercase', method='islower')
  2617. _shared_docs['isupper'] = dict(type='uppercase', method='isupper')
  2618. _shared_docs['istitle'] = dict(type='titlecase', method='istitle')
  2619. _shared_docs['isnumeric'] = dict(type='numeric', method='isnumeric')
  2620. _shared_docs['isdecimal'] = dict(type='decimal', method='isdecimal')
  2621. isalnum = _noarg_wrapper(lambda x: x.isalnum(),
  2622. docstring=_shared_docs['ismethods'] %
  2623. _shared_docs['isalnum'])
  2624. isalpha = _noarg_wrapper(lambda x: x.isalpha(),
  2625. docstring=_shared_docs['ismethods'] %
  2626. _shared_docs['isalpha'])
  2627. isdigit = _noarg_wrapper(lambda x: x.isdigit(),
  2628. docstring=_shared_docs['ismethods'] %
  2629. _shared_docs['isdigit'])
  2630. isspace = _noarg_wrapper(lambda x: x.isspace(),
  2631. docstring=_shared_docs['ismethods'] %
  2632. _shared_docs['isspace'])
  2633. islower = _noarg_wrapper(lambda x: x.islower(),
  2634. docstring=_shared_docs['ismethods'] %
  2635. _shared_docs['islower'])
  2636. isupper = _noarg_wrapper(lambda x: x.isupper(),
  2637. docstring=_shared_docs['ismethods'] %
  2638. _shared_docs['isupper'])
  2639. istitle = _noarg_wrapper(lambda x: x.istitle(),
  2640. docstring=_shared_docs['ismethods'] %
  2641. _shared_docs['istitle'])
  2642. isnumeric = _noarg_wrapper(lambda x: compat.u_safe(x).isnumeric(),
  2643. docstring=_shared_docs['ismethods'] %
  2644. _shared_docs['isnumeric'])
  2645. isdecimal = _noarg_wrapper(lambda x: compat.u_safe(x).isdecimal(),
  2646. docstring=_shared_docs['ismethods'] %
  2647. _shared_docs['isdecimal'])
  2648. @classmethod
  2649. def _make_accessor(cls, data):
  2650. cls._validate(data)
  2651. return cls(data)