/pandas/core/strings.py
Python | 3201 lines | 2996 code | 59 blank | 146 comment | 96 complexity | ad3630c026496b522aad5d44da706a3f MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
Large files are truncated, but you can click here to view the full file
- # -*- coding: utf-8 -*-
- import codecs
- import re
- import textwrap
- import warnings
- import numpy as np
- import pandas._libs.lib as lib
- import pandas._libs.ops as libops
- import pandas.compat as compat
- from pandas.compat import zip
- from pandas.util._decorators import Appender, deprecate_kwarg
- from pandas.core.dtypes.common import (
- ensure_object, is_bool_dtype, is_categorical_dtype, is_integer,
- is_list_like, is_object_dtype, is_re, is_scalar, is_string_like)
- from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
- from pandas.core.dtypes.missing import isna
- from pandas.core.algorithms import take_1d
- from pandas.core.base import NoNewAttributesMixin
- import pandas.core.common as com
# Codec names kept as module-level tuples; the decoder tuple is the encoder
# tuple extended with the UTF-16 / UTF-32 names.
# NOTE(review): judging by the names, these are the codecs CPython has fast
# paths for -- confirm against the code that consumes these tuples.
_cpython_optimized_encoders = (
    "utf-8",
    "utf8",
    "latin-1",
    "latin1",
    "iso-8859-1",
    "mbcs",
    "ascii",
)
_cpython_optimized_decoders = _cpython_optimized_encoders + (
    "utf-16",
    "utf-32",
)

# Module-level dict, created empty here.
# NOTE(review): presumably filled with shared docstring templates further
# down the file -- not visible from this section.
_shared_docs = {}
def cat_core(list_of_columns, sep):
    """
    Auxiliary function for :meth:`str.cat`

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep
    """
    # Interleave the separator between consecutive columns:
    # [col_0, sep, col_1, sep, ..., col_n].
    list_with_sep = [sep] * (2 * len(list_of_columns) - 1)
    list_with_sep[::2] = list_of_columns
    # Build the object array explicitly.  Handing the mixed list (arrays
    # interleaved with a scalar separator) straight to ``np.sum`` relies on
    # NumPy implicitly creating a ragged object array, which is deprecated
    # since NumPy 1.19 and raises ValueError since NumPy 1.24.
    arr_with_sep = np.asarray(list_with_sep, dtype=object)
    # Summing along axis 0 applies ``+`` left to right, i.e. element-wise
    # string concatenation of the columns and separators.
    return np.sum(arr_with_sep, axis=0)
def _na_map(f, arr, na_result=np.nan, dtype=object):
    """
    Apply ``f`` element-wise to ``arr`` with missing-value masking enabled.

    Thin wrapper around ``_map`` that always passes ``na_mask=True`` and
    forwards ``na_result`` as the ``na_value`` fill.
    """
    # should really _check_ for NA
    return _map(
        f,
        arr,
        dtype=dtype,
        na_value=na_result,
        na_mask=True,
    )
def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
    """
    Apply ``f`` to every element of ``arr`` and return the results.

    Parameters
    ----------
    f : callable
        Function called with a single element.
    arr : Series, ndarray or array-like
        Input values.  A Series is unwrapped to its ``values``; anything
        that is still not an ndarray is converted to an object ndarray.
    na_mask : bool, default False
        If True, the ``isna`` mask of ``arr`` is handed to
        ``lib.map_infer_mask`` and ``na_value`` is written into the masked
        positions of the result.  If False, ``lib.map_infer`` is used.
    na_value : object, default NaN
        Fill value for masked positions, and the value substituted when
        ``f`` raises TypeError/AttributeError on an element.
    dtype : dtype, default object
        dtype of the empty array returned for empty input.

    Returns
    -------
    ndarray
    """
    if not len(arr):
        return np.ndarray(0, dtype=dtype)

    if isinstance(arr, ABCSeries):
        arr = arr.values
    if not isinstance(arr, np.ndarray):
        arr = np.asarray(arr, dtype=object)

    if not na_mask:
        return lib.map_infer(arr, f)

    mask = isna(arr)
    try:
        # ``mask`` is a NumPy boolean array here, so use the vectorised
        # reduction rather than iterating it with the builtin ``all``.
        convert = not mask.all()
        result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
    except (TypeError, AttributeError) as e:
        # Reraise the exception if callable `f` got wrong number of args.
        # The user may want to be warned by this, instead of getting NaN
        if compat.PY2:
            p_err = r'takes (no|(exactly|at (least|most)) ?\d+) arguments?'
        else:
            p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ '
                     r'(?(3)required )positional arguments?')

        # Only inspect the message when it really is a string; an exception
        # raised with a non-string first argument would otherwise make
        # ``re.search`` itself fail with an unrelated TypeError.
        if (len(e.args) >= 1 and
                isinstance(e.args[0], compat.string_types) and
                re.search(p_err, e.args[0])):
            # bare ``raise`` keeps the original traceback
            raise

        def g(x):
            # Per-element fallback: elements that ``f`` cannot handle
            # become ``na_value`` instead of aborting the whole map.
            try:
                return f(x)
            except (TypeError, AttributeError):
                return na_value

        return _map(g, arr, dtype=dtype)

    if na_value is not np.nan:
        np.putmask(result, mask, na_value)
        if result.dtype == object:
            result = lib.maybe_convert_objects(result)
    return result
def str_count(arr, pat, flags=0):
    """
    Count occurrences of pattern in each string of the Series/Index.

    This function is used to count the number of times a particular regex
    pattern is repeated in each of the string elements of the
    :class:`~pandas.Series`.

    Parameters
    ----------
    pat : str
        Valid regular expression.
    flags : int, default 0, meaning no flags
        Flags for the `re` module. For a complete list, `see here
        <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
    **kwargs
        For compatibility with other string methods. Not used.

    Returns
    -------
    Series or Index
        Same type as the calling object containing the integer counts.

    See Also
    --------
    re : Standard library module for regular expressions.
    str.count : Standard library version, without regular expression support.

    Notes
    -----
    Some characters need to be escaped when passing in `pat`.
    eg. ``'$'`` has a special meaning in regex and must be escaped when
    finding this literal character.

    Examples
    --------
    >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
    >>> s.str.count('a')
    0    0.0
    1    0.0
    2    2.0
    3    2.0
    4    NaN
    5    0.0
    6    1.0
    dtype: float64

    Escape ``'$'`` to find the literal dollar sign.

    >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
    >>> s.str.count('\\$')
    0    1
    1    0
    2    1
    3    2
    4    2
    5    0
    dtype: int64

    This is also available on Index

    >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
    Int64Index([0, 0, 2, 1], dtype='int64')
    """
    pattern = re.compile(pat, flags=flags)

    def count_matches(x):
        # number of non-overlapping matches of the compiled pattern
        return len(pattern.findall(x))

    return _na_map(count_matches, arr, dtype=int)
def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
    """
    Test if pattern or regex is contained within a string of a Series or Index.

    Return boolean Series or Index based on whether a given pattern or regex is
    contained within a string of a Series or Index.

    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        Flags to pass through to the re module, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.
    regex : bool, default True
        If True, assumes the pat is a regular expression.

        If False, treats the pat as a literal string.

    Returns
    -------
    Series or Index of boolean values
        A Series or Index of boolean values indicating whether the
        given pattern is contained within the string of each element
        of the Series or Index.

    See Also
    --------
    match : Analogous, but stricter, relying on re.match instead of re.search.
    Series.str.startswith : Test if the start of each string element matches a
        pattern.
    Series.str.endswith : Same as startswith, but tests the end of string.

    Examples
    --------
    Returning a Series of booleans using only a literal pattern.

    >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan])
    >>> s1.str.contains('og', regex=False)
    0    False
    1     True
    2    False
    3    False
    4      NaN
    dtype: object

    Returning an Index of booleans using only a literal pattern.

    >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.nan])
    >>> ind.str.contains('23', regex=False)
    Index([False, False, False, True, nan], dtype='object')

    Specifying case sensitivity using `case`.

    >>> s1.str.contains('oG', case=True, regex=True)
    0    False
    1    False
    2    False
    3    False
    4      NaN
    dtype: object

    Specifying `na` to be `False` instead of `NaN` replaces NaN values
    with `False`. If Series or Index does not contain NaN values
    the resultant dtype will be `bool`, otherwise, an `object` dtype.

    >>> s1.str.contains('og', na=False, regex=True)
    0    False
    1     True
    2    False
    3    False
    4    False
    dtype: bool

    Returning 'house' or 'dog' when either expression occurs in a string.

    >>> s1.str.contains('house|dog', regex=True)
    0    False
    1     True
    2     True
    3    False
    4      NaN
    dtype: object

    Ignoring case sensitivity using `flags` with regex.

    >>> import re
    >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
    0    False
    1    False
    2     True
    3    False
    4      NaN
    dtype: object

    Returning any digit using regular expression.

    >>> s1.str.contains('\\d', regex=True)
    0    False
    1    False
    2    False
    3     True
    4      NaN
    dtype: object

    Ensure `pat` is not a literal pattern when `regex` is set to True.
    Note in the following example one might expect only `s2[1]` and `s2[3]` to
    return `True`. However, '.0' as a regex matches any character
    followed by a 0.

    >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])
    >>> s2.str.contains('.0', regex=True)
    0     True
    1     True
    2    False
    3     True
    4    False
    dtype: bool
    """
    if not regex:
        # Literal substring search.
        if case:
            return _na_map(lambda x: pat in x, arr, na, dtype=bool)
        # Case-insensitive literal search: upper-case both the needle and
        # every element, then do a plain containment test.
        upper_pat = pat.upper()
        uppered = _na_map(lambda x: x.upper(), arr)
        return _na_map(lambda x: upper_pat in x, uppered, na, dtype=bool)

    # Regular-expression search.
    if not case:
        flags |= re.IGNORECASE
    compiled = re.compile(pat, flags=flags)
    if compiled.groups > 0:
        warnings.warn("This pattern has match groups. To actually get the"
                      " groups, use str.extract.", UserWarning,
                      stacklevel=3)
    return _na_map(lambda x: bool(compiled.search(x)), arr, na, dtype=bool)
def str_startswith(arr, pat, na=np.nan):
    """
    Test if the start of each string element matches a pattern.

    Equivalent to :meth:`str.startswith`.

    Parameters
    ----------
    pat : str
        Character sequence. Regular expressions are not accepted.
    na : object, default NaN
        Object shown if element tested is not a string.

    Returns
    -------
    Series or Index of bool
        A Series of booleans indicating whether the given pattern matches
        the start of each string element.

    See Also
    --------
    str.startswith : Python standard library string method.
    Series.str.endswith : Same as startswith, but tests the end of string.
    Series.str.contains : Tests if string element contains a pattern.

    Examples
    --------
    >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
    >>> s
    0     bat
    1    Bear
    2     cat
    3     NaN
    dtype: object

    >>> s.str.startswith('b')
    0     True
    1    False
    2    False
    3      NaN
    dtype: object

    Specifying `na` to be `False` instead of `NaN`.

    >>> s.str.startswith('b', na=False)
    0     True
    1    False
    2    False
    3    False
    dtype: bool
    """
    def has_prefix(x):
        return x.startswith(pat)

    return _na_map(has_prefix, arr, na, dtype=bool)
def str_endswith(arr, pat, na=np.nan):
    """
    Test if the end of each string element matches a pattern.

    Equivalent to :meth:`str.endswith`.

    Parameters
    ----------
    pat : str
        Character sequence. Regular expressions are not accepted.
    na : object, default NaN
        Object shown if element tested is not a string.

    Returns
    -------
    Series or Index of bool
        A Series of booleans indicating whether the given pattern matches
        the end of each string element.

    See Also
    --------
    str.endswith : Python standard library string method.
    Series.str.startswith : Same as endswith, but tests the start of string.
    Series.str.contains : Tests if string element contains a pattern.

    Examples
    --------
    >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
    >>> s
    0     bat
    1    bear
    2     caT
    3     NaN
    dtype: object

    >>> s.str.endswith('t')
    0     True
    1    False
    2    False
    3      NaN
    dtype: object

    Specifying `na` to be `False` instead of `NaN`.

    >>> s.str.endswith('t', na=False)
    0     True
    1    False
    2    False
    3    False
    dtype: bool
    """
    def has_suffix(x):
        return x.endswith(pat)

    return _na_map(has_suffix, arr, na, dtype=bool)
def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
    r"""
    Replace occurrences of pattern/regex in the Series/Index with
    some other string. Equivalent to :meth:`str.replace` or
    :func:`re.sub`.

    Parameters
    ----------
    pat : str or compiled regex
        String can be a character sequence or regular expression.

        .. versionadded:: 0.20.0
            `pat` also accepts a compiled regex.

    repl : str or callable
        Replacement string or a callable. The callable is passed the regex
        match object and must return a replacement string to be used.
        See :func:`re.sub`.

        .. versionadded:: 0.20.0
            `repl` also accepts a callable.

    n : int, default -1 (all)
        Number of replacements to make from start.
    case : bool, default None
        - If True, case sensitive (the default if `pat` is a string)
        - Set to False for case insensitive
        - Cannot be set if `pat` is a compiled regex
    flags : int, default 0 (no flags)
        - re module flags, e.g. re.IGNORECASE
        - Cannot be set if `pat` is a compiled regex
    regex : bool, default True
        - If True, assumes the passed-in pattern is a regular expression.
        - If False, treats the pattern as a literal string
        - Cannot be set to False if `pat` is a compiled regex or `repl` is
          a callable.

        .. versionadded:: 0.23.0

    Returns
    -------
    Series or Index of object
        A copy of the object with all matching occurrences of `pat` replaced by
        `repl`.

    Raises
    ------
    ValueError
        * if `regex` is False and `repl` is a callable or `pat` is a compiled
          regex
        * if `pat` is a compiled regex and `case` or `flags` is set

    Notes
    -----
    When `pat` is a compiled regex, all flags should be included in the
    compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
    regex will raise an error.

    Examples
    --------
    When `pat` is a string and `regex` is True (the default), the given `pat`
    is compiled as a regex. When `repl` is a string, it replaces matching
    regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
    left as is:

    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
    0    bao
    1    baz
    2    NaN
    dtype: object

    When `pat` is a string and `regex` is False, every `pat` is replaced with
    `repl` as with :meth:`str.replace`:

    >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
    0    bao
    1    fuz
    2    NaN
    dtype: object

    When `repl` is a callable, it is called on every `pat` using
    :func:`re.sub`. The callable should expect one positional argument
    (a regex object) and return a string.

    To get the idea:

    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)
    0    <_sre.SRE_Match object; span=(0, 1), match='f'>oo
    1    <_sre.SRE_Match object; span=(0, 1), match='f'>uz
    2                                                  NaN
    dtype: object

    Reverse every lowercase alphabetic word:

    >>> repl = lambda m: m.group(0)[::-1]
    >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
    0    oof 123
    1    rab zab
    2        NaN
    dtype: object

    Using regex groups (extract second group and swap case):

    >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
    >>> repl = lambda m: m.group('two').swapcase()
    >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
    0    tWO
    1    bAR
    dtype: object

    Using a compiled regex with flags

    >>> import re
    >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
    0    foo
    1    bar
    2    NaN
    dtype: object
    """
    # Check whether repl is valid (GH 13438, GH 15055)
    if not (is_string_like(repl) or callable(repl)):
        raise TypeError("repl must be a string or callable")

    is_compiled_re = is_re(pat)

    if not regex:
        # Literal replacement: neither a compiled pattern nor a callable
        # replacement makes sense here.
        if is_compiled_re:
            raise ValueError("Cannot use a compiled regex as replacement "
                             "pattern with regex=False")
        if callable(repl):
            raise ValueError("Cannot use a callable replacement when "
                             "regex=False")
        return _na_map(lambda x: x.replace(pat, repl, n), arr)

    if is_compiled_re:
        if (case is not None) or (flags != 0):
            raise ValueError("case and flags cannot be set"
                             " when pat is a compiled regex")
    elif case is False:
        # ``case=None`` means case sensitive for a string pattern, so only
        # an explicit False adds the flag.
        flags |= re.IGNORECASE

    use_re = is_compiled_re or len(pat) > 1 or flags or callable(repl)
    if not use_re:
        # A single-character string pattern with no flags and a string
        # replacement is handled with plain ``str.replace``.
        return _na_map(lambda x: x.replace(pat, repl, n), arr)

    # ``re.sub`` uses count=0 for "replace all", whereas ``n`` uses -1.
    count = n if n >= 0 else 0
    compiled = re.compile(pat, flags=flags)
    return _na_map(
        lambda x: compiled.sub(repl=repl, string=x, count=count), arr)
def str_repeat(arr, repeats):
    """
    Duplicate each string in the Series or Index.

    Parameters
    ----------
    repeats : int or sequence of int
        Same value for all (int) or different value per (sequence).

    Returns
    -------
    Series or Index of object
        Series or Index of repeated string objects specified by
        input parameter repeats.

    Examples
    --------
    >>> s = pd.Series(['a', 'b', 'c'])
    >>> s
    0    a
    1    b
    2    c
    dtype: object

    Single int repeats string in Series

    >>> s.str.repeat(repeats=2)
    0    aa
    1    bb
    2    cc
    dtype: object

    Sequence of int repeats corresponding string in Series

    >>> s.str.repeat(repeats=[1, 2, 3])
    0      a
    1     bb
    2    ccc
    dtype: object
    """
    def repeat_one(value, times):
        # Try the binary-string multiplication first and fall back to the
        # text-string one when the element is not a binary string.
        try:
            return compat.binary_type.__mul__(value, times)
        except TypeError:
            return compat.text_type.__mul__(value, times)

    if not is_scalar(repeats):
        # One repeat count per element: pair the values with the counts.
        counts = np.asarray(repeats, dtype=object)
        values = com.values_from_object(arr)
        return libops.vec_binop(values, counts, repeat_one)

    return _na_map(lambda value: repeat_one(value, repeats), arr)
def str_match(arr, pat, case=True, flags=0, na=np.nan):
    """
    Determine if each string matches a regular expression.

    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.

    Returns
    -------
    Series/array of boolean values

    See Also
    --------
    contains : Analogous, but less strict, relying on re.search instead of
        re.match.
    extract : Extract matched groups.
    """
    if not case:
        flags |= re.IGNORECASE
    compiled = re.compile(pat, flags=flags)

    def matches_at_start(x):
        # ``re.match`` anchors at the beginning of the string only
        return bool(compiled.match(x))

    return _na_map(matches_at_start, arr, na, dtype=bool)
def _get_single_group_name(rx):
    """
    Return a named-group name of the compiled regex ``rx``, or None.

    The name is the last key of ``rx.groupindex``; None is returned when the
    pattern has no named groups.
    """
    group_names = list(rx.groupindex.keys())
    if not group_names:
        return None
    return group_names[-1]
def _groups_or_na_fun(regex):
    """
    Used in both extract_noexpand and extract_frame

    Build a function that maps one value to the list of capture groups of
    the first match of ``regex``.  Non-string values and strings without a
    match yield a row of NaN; groups that did not participate in the match
    are replaced by NaN.

    Raises
    ------
    ValueError
        If ``regex`` has no capture groups.
    """
    n_groups = regex.groups
    if n_groups == 0:
        raise ValueError("pattern contains no capture groups")
    # A single shared row is returned for every non-match.
    empty_row = [np.nan] * n_groups

    def f(x):
        is_text = isinstance(x, compat.string_types)
        match = regex.search(x) if is_text else None
        if not match:
            return empty_row
        return [np.nan if item is None else item
                for item in match.groups()]

    return f
def _str_extract_noexpand(arr, pat, flags=0):
    """
    Find groups in each string in the Series using passed regular
    expression. This function is called from
    str_extract(expand=False), and can return Series, DataFrame, or
    Index.

    Returns a ``(result, name)`` pair: for a single capture group the result
    is an object ndarray and ``name`` the group's name (or None); for
    several groups the result is a DataFrame and ``name`` is None.
    """
    from pandas import DataFrame, Index

    regex = re.compile(pat, flags=flags)
    groups_or_na = _groups_or_na_fun(regex)

    if regex.groups == 1:
        first_groups = [groups_or_na(val)[0] for val in arr]
        result = np.array(first_groups, dtype=object)
        return result, _get_single_group_name(regex)

    if isinstance(arr, Index):
        raise ValueError("only one regex group is supported with Index")

    # Column labels: the group's name where it has one, else its 0-based
    # position.
    number_to_name = {
        number: name for name, number in regex.groupindex.items()}
    columns = [number_to_name.get(i + 1, i) for i in range(regex.groups)]

    if arr.empty:
        result = DataFrame(columns=columns, dtype=object)
    else:
        rows = [groups_or_na(val) for val in arr]
        result = DataFrame(rows, columns=columns, index=arr.index,
                           dtype=object)
    return result, None
def _str_extract_frame(arr, pat, flags=0):
    """
    For each subject string in the Series, extract groups from the
    first match of regular expression pat. This function is called from
    str_extract(expand=True), and always returns a DataFrame.
    """
    from pandas import DataFrame

    regex = re.compile(pat, flags=flags)
    groups_or_na = _groups_or_na_fun(regex)

    # Column labels: the group's name where it has one, else its 0-based
    # position.
    number_to_name = {
        number: name for name, number in regex.groupindex.items()}
    columns = [number_to_name.get(i + 1, i) for i in range(regex.groups)]

    if len(arr) == 0:
        return DataFrame(columns=columns, dtype=object)

    # Inputs without an ``index`` attribute get the default index.
    result_index = getattr(arr, 'index', None)
    rows = [groups_or_na(val) for val in arr]
    return DataFrame(rows, columns=columns, index=result_index,
                     dtype=object)
def str_extract(arr, pat, flags=0, expand=True):
    r"""
    Extract capture groups in the regex `pat` as columns in a DataFrame.

    For each subject string in the Series, extract groups from the
    first match of regular expression `pat`.

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
        modify regular expression matching for things like case,
        spaces, etc. For more details, see :mod:`re`.
    expand : bool, default True
        If True, return DataFrame with one column per capture group.
        If False, return a Series/Index if there is one capture group
        or DataFrame if there are multiple capture groups.

        .. versionadded:: 0.18.0

    Returns
    -------
    DataFrame or Series or Index
        A DataFrame with one row for each subject string, and one
        column for each group. Any capture group names in regular
        expression pat will be used for column names; otherwise
        capture group numbers will be used. The dtype of each result
        column is always object, even when no match is found. If
        ``expand=False`` and pat has only one capture group, then
        return a Series (if subject is a Series) or Index (if subject
        is an Index).

    See Also
    --------
    extractall : Returns all matches (not just the first match).

    Examples
    --------
    A pattern with two groups will return a DataFrame with two columns.
    Non-matches will be NaN.

    >>> s = pd.Series(['a1', 'b2', 'c3'])
    >>> s.str.extract(r'([ab])(\d)')
         0    1
    0    a    1
    1    b    2
    2  NaN  NaN

    A pattern may contain optional groups.

    >>> s.str.extract(r'([ab])?(\d)')
         0  1
    0    a  1
    1    b  2
    2  NaN  3

    Named groups will become column names in the result.

    >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
      letter digit
    0      a     1
    1      b     2
    2    NaN   NaN

    A pattern with one group will return a DataFrame with one column
    if expand=True.

    >>> s.str.extract(r'[ab](\d)', expand=True)
         0
    0    1
    1    2
    2  NaN

    A pattern with one group will return a Series if expand=False.

    >>> s.str.extract(r'[ab](\d)', expand=False)
    0      1
    1      2
    2    NaN
    dtype: object
    """
    if not isinstance(expand, bool):
        raise ValueError("expand must be True or False")

    if not expand:
        # ``arr`` is the accessor object here: the extraction runs on its
        # ``_parent`` and the accessor wraps the raw result.
        result, name = _str_extract_noexpand(arr._parent, pat, flags=flags)
        return arr._wrap_result(result, name=name, expand=expand)

    return _str_extract_frame(arr._orig, pat, flags=flags)
def str_extractall(arr, pat, flags=0):
    r"""
    For each subject string in the Series, extract groups from all
    matches of regular expression pat. When each subject string in the
    Series has exactly one match, extractall(pat).xs(0, level='match')
    is the same as extract(pat).

    .. versionadded:: 0.18.0

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        A ``re`` module flag, for example ``re.IGNORECASE``. These allow
        to modify regular expression matching for things like case, spaces,
        etc. Multiple flags can be combined with the bitwise OR operator,
        for example ``re.IGNORECASE | re.MULTILINE``.

    Returns
    -------
    DataFrame
        A ``DataFrame`` with one row for each match, and one column for each
        group. Its rows have a ``MultiIndex`` with first levels that come from
        the subject ``Series``. The last level is named 'match' and indexes the
        matches in each item of the ``Series``. Any capture group names in
        regular expression pat will be used for column names; otherwise capture
        group numbers will be used.

    Raises
    ------
    ValueError
        If `pat` contains no capture groups.

    See Also
    --------
    extract : Returns first match only (not all matches).

    Examples
    --------
    A pattern with one group will return a DataFrame with one column.
    Indices with no matches will not appear in the result.

    >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
    >>> s.str.extractall(r"[ab](\d)")
             0
      match
    A 0      1
      1      2
    B 0      1

    Capture group names are used for column names of the result.

    >>> s.str.extractall(r"[ab](?P<digit>\d)")
            digit
      match
    A 0         1
      1         2
    B 0         1

    A pattern with two groups will return a DataFrame with two columns.

    >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
            letter digit
      match
    A 0          a     1
      1          a     2
    B 0          b     1

    Optional groups that do not match are NaN in the result.

    >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
            letter digit
      match
    A 0          a     1
      1          a     2
    B 0          b     1
    C 0        NaN     1
    """
    regex = re.compile(pat, flags=flags)
    # the regex must contain capture groups.
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndexClass):
        arr = arr.to_series().reset_index(drop=True)

    # Column labels: the group's name where it has one, else its 0-based
    # position.
    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
    columns = [names.get(1 + i, i) for i in range(regex.groups)]
    match_list = []
    index_list = []
    is_mi = arr.index.nlevels > 1

    for subject_key, subject in arr.iteritems():
        if not isinstance(subject, compat.string_types):
            # non-string entries contribute no rows
            continue
        if not is_mi:
            # normalise the key to a tuple so the match number can be
            # appended uniformly below
            subject_key = (subject_key, )

        for match_i, match_tuple in enumerate(regex.findall(subject)):
            # ``findall`` yields plain strings for a single-group pattern
            if isinstance(match_tuple, compat.string_types):
                match_tuple = (match_tuple,)
            # ``findall`` reports non-participating groups as "".  Use
            # ``np.nan``: the ``np.NaN`` alias was removed in NumPy 2.0.
            na_tuple = [np.nan if group == "" else group
                        for group in match_tuple]
            match_list.append(na_tuple)
            result_key = tuple(subject_key + (match_i, ))
            index_list.append(result_key)

    from pandas import MultiIndex
    index = MultiIndex.from_tuples(
        index_list, names=arr.index.names + ["match"])

    result = arr._constructor_expanddim(match_list, index=index,
                                        columns=columns)
    return result
def str_get_dummies(arr, sep='|'):
    """
    Split each string in the Series by sep and return a DataFrame
    of dummy/indicator variables.

    Parameters
    ----------
    sep : str, default "|"
        String to split on.

    Returns
    -------
    DataFrame
        Dummy variables corresponding to values of the Series.

    See Also
    --------
    get_dummies : Convert categorical variable into dummy/indicator
        variables.

    Examples
    --------
    >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  1  0  0
    2  1  0  1

    >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  0  0  0
    2  1  0  1
    """
    # NOTE: this helper itself returns the raw ``(dummies, tags)`` pair --
    # an int64 indicator matrix and the sorted list of tags.
    filled = arr.fillna('')
    try:
        wrapped = sep + filled + sep
    except TypeError:
        # non-string entries: stringify before adding the separators
        wrapped = sep + filled.astype(str) + sep

    # Every distinct non-empty piece becomes one indicator column.
    found = {tag for pieces in wrapped.str.split(sep) for tag in pieces}
    tags = sorted(found - {""})

    dummies = np.empty((len(wrapped), len(tags)), dtype=np.int64)
    values = wrapped.values
    for col, tag in enumerate(tags):
        # Each element is wrapped in separators, so searching for
        # ``sep + tag + sep`` only matches whole tags.
        pat = sep + tag + sep
        dummies[:, col] = lib.map_infer(values, lambda x: pat in x)
    return dummies, tags
def str_join(arr, sep):
    """
    Join lists contained as elements in the Series/Index with passed delimiter.

    If the elements of a Series are lists themselves, join the content of these
    lists using the delimiter passed to the function.
    This function is an equivalent to :meth:`str.join`.

    Parameters
    ----------
    sep : str
        Delimiter to use between list entries.

    Returns
    -------
    Series/Index: object
        The list entries concatenated by intervening occurrences of the
        delimiter.

    Raises
    ------
    AttributeError
        If the supplied Series contains neither strings nor lists.

    See Also
    --------
    str.join : Standard library version of this method.
    Series.str.split : Split strings around given separator/delimiter.

    Notes
    -----
    If any of the list items is not a string object, the result of the join
    will be `NaN`.

    Examples
    --------
    Example with a list that contains non-string elements.

    >>> s = pd.Series([['lion', 'elephant', 'zebra'],
    ...                [1.1, 2.2, 3.3],
    ...                ['cat', np.nan, 'dog'],
    ...                ['cow', 4.5, 'goat'],
    ...                ['duck', ['swan', 'fish'], 'guppy']])
    >>> s
    0        [lion, elephant, zebra]
    1                [1.1, 2.2, 3.3]
    2                [cat, nan, dog]
    3               [cow, 4.5, goat]
    4    [duck, [swan, fish], guppy]
    dtype: object

    Join all lists using a '-'. The lists containing object(s) of types other
    than str will produce a NaN.

    >>> s.str.join('-')
    0    lion-elephant-zebra
    1                    NaN
    2                    NaN
    3                    NaN
    4                    NaN
    dtype: object
    """
    # The bound ``join`` method of the separator is the per-element function.
    joiner = sep.join
    return _na_map(joiner, arr)
def str_findall(arr, pat, flags=0):
    """
    Find all occurrences of pattern or regular expression in the Series/Index.

    Equivalent to applying :func:`re.findall` to all the elements in the
    Series/Index.

    Parameters
    ----------
    pat : str
        Pattern or regular expression.
    flags : int, default 0
        Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which
        means no flags).

    Returns
    -------
    Series/Index of lists of strings
        All non-overlapping matches of pattern or regular expression in each
        string of this Series/Index.

    See Also
    --------
    count : Count occurrences of pattern or regular expression in each string
        of the Series/Index.
    extractall : For each string in the Series, extract groups from all matches
        of regular expression and return a DataFrame with one row for each
        match and one column for each group.
    re.findall : The equivalent ``re`` function to all non-overlapping matches
        of pattern or regular expression in string, as a list of strings.

    Examples
    --------

    >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])

    The search for the pattern 'Monkey' returns one match:

    >>> s.str.findall('Monkey')
    0          []
    1    [Monkey]
    2          []
    dtype: object

    On the other hand, the search for the pattern 'MONKEY' doesn't return any
    match:

    >>> s.str.findall('MONKEY')
    0    []
    1    []
    2    []
    dtype: object

    Flags can be added to the pattern or regular expression. For instance,
    to find the pattern 'MONKEY' ignoring the case:

    >>> import re
    >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
    0          []
    1    [Monkey]
    2          []
    dtype: object

    When the pattern matches more than one string in the Series, all matches
    are returned:

    >>> s.str.findall('on')
    0    [on]
    1    [on]
    2      []
    dtype: object

    Regular expressions are supported too. For instance, the search for all the
    strings ending with the word 'on' is shown next:

    >>> s.str.findall('on$')
    0    [on]
    1      []
    2      []
    dtype: object

    If the pattern is found more than once in the same string, then a list of
    multiple strings is returned:

    >>> s.str.findall('b')
    0        []
    1        []
    2    [b, b]
    dtype: object
    """
    # Compile once; the bound ``findall`` is the per-element function.
    find_all = re.compile(pat, flags=flags).findall
    return _na_map(find_all, arr)
def str_find(arr, sub, start=0, end=None, side='left'):
    """
    Return indexes in each strings in the Series/Index where the
    substring is fully contained between [start:end]. Return -1 on failure.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.
    side : {'left', 'right'}, default 'left'
        Specifies a starting side, equivalent to ``find`` or ``rfind``.

    Returns
    -------
    Series or Index
        Indexes where substring is found.
    """
    if not isinstance(sub, compat.string_types):
        raise TypeError(
            'expected a string object, not {0}'.format(type(sub).__name__))

    if side == 'left':
        method = 'find'
    elif side == 'right':
        method = 'rfind'
    else:  # pragma: no cover
        raise ValueError('Invalid side')

    # ``end`` is only forwarded when it was given explicitly.
    bounds = (start,) if end is None else (start, end)

    def locate(x):
        return getattr(x, method)(sub, *bounds)

    return _na_map(locate, arr, dtype=int)
def str_index(arr, sub, start=0, end=None, side='left'):
    """
    Return the index of ``sub`` within [start:end] for each element.

    Works like ``str_find`` but dispatches to the element's ``index`` /
    ``rindex`` method, which raise ``ValueError`` when the substring is
    not found instead of returning -1.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.
    side : {'left', 'right'}, default 'left'
        'left' uses ``index``, 'right' uses ``rindex``.

    Raises
    ------
    TypeError
        If ``sub`` is not a string.
    ValueError
        If ``side`` is neither 'left' nor 'right'.
    """
    if not isinstance(sub, compat.string_types):
        raise TypeError(
            'expected a string object, not {0}'.format(type(sub).__name__))

    if side == 'left':
        method = 'index'
    elif side == 'right':
        method = 'rindex'
    else:  # pragma: no cover
        raise ValueError('Invalid side')

    # ``end`` is only forwarded when it was given explicitly.
    bounds = (start,) if end is None else (start, end)

    def locate(x):
        return getattr(x, method)(sub, *bounds)

    return _na_map(locate, arr, dtype=int)
def str_pad(arr, width, side='left', fillchar=' '):
    """
    Pad strings in the Series/Index up to width.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be filled
        with character defined in `fillchar`.
    side : {'left', 'right', 'both'}, default 'left'
        Side from which to fill resulting string.
    fillchar : str, default ' '
        Additional character for filling, default is whitespace.

    Returns
    -------
    Series or Index of object
        Returns Series or Index with minimum number of char in object.

    See Also
    --------
    Series.str.rjust : Fills the left side of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='left')``.
    Series.str.ljust : Fills the right side of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='right')``.
    Series.str.center : Fills both sides of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='both')``.
    Series.str.zfill : Pad strings in the Series/Index by prepending '0'
        character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.

    Examples
    --------
    >>> s = pd.Series(["caribou", "tiger"])
    >>> s
    0    caribou
    1      tiger
    dtype: object

    >>> s.str.pad(width=10)
    0       caribou
    1         tiger
    dtype: object

    >>> s.str.pad(width=10, side='right', fillchar='-')
    0    caribou---
    1    tiger-----
    dtype: object

    >>> s.str.pad(width=10, side='both', fillchar='-')
    0    -caribou--
    1    --tiger---
    dtype: object
    """
    if not isinstance(fillchar, compat.string_types):
        raise TypeError('fillchar must be a character, not {0}'
                        .format(type(fillchar).__name__))

    if len(fillchar) != 1:
        raise TypeError('fillchar must be a character, not str')

    if not is_integer(width):
        raise TypeError('width must be of integer type, not {0}'
                        .format(type(width).__name__))

    # Padding on the left means right-justifying the text, and vice versa.
    if side == 'left':
        pad_method = 'rjust'
    elif side == 'right':
        pad_method = 'ljust'
    elif side == 'both':
        pad_method = 'center'
    else:  # pragma: no cover
        raise ValueError('Invalid side')

    def pad(x):
        return getattr(x, pad_method)(width, fillchar)

    return _na_map(pad, arr)
def str_split(arr, pat=None, n=None):
    """
    Split every element of ``arr`` around ``pat``.

    A ``pat`` of None or of length one is handed to the element's own
    ``split`` method; any other ``pat`` is compiled with ``re.compile`` and
    split through the compiled pattern.

    Parameters
    ----------
    arr : object
        Collection handed to ``_na_map``.
    pat : str, optional
        Separator (or regular expression when longer than one character).
    n : int, optional
        Split limit. None means "no limit"; in the non-regex branch 0 is
        also mapped to "no limit" (-1), and in the regex branch -1 is
        mapped to "no limit" (0) to match each API's convention.

    Returns
    -------
    Whatever ``_na_map`` produces for the per-element split function.
    """
    use_regex = pat is not None and len(pat) != 1

    if use_regex:
        # re.split treats maxsplit=0 as unlimited.
        limit = 0 if (n is None or n == -1) else n
        splitter = re.compile(pat)
        f = lambda x: splitter.split(x, maxsplit=limit)
    else:
        # str.split treats maxsplit=-1 as unlimited.
        limit = -1 if (n is None or n == 0) else n
        f = lambda x: x.split(pat, limit)

    return _na_map(f, arr)
def str_rsplit(arr, pat=None, n=None):
    """
    Split every element of ``arr`` from the right using its ``rsplit`` method.

    Parameters
    ----------
    arr : object
        Collection handed to ``_na_map``.
    pat : str, optional
        Separator forwarded unchanged to ``rsplit``.
    n : int, optional
        Split limit; None and 0 are both translated to -1 (no limit).

    Returns
    -------
    Whatever ``_na_map`` produces for the per-element split function.
    """
    limit = -1 if (n is None or n == 0) else n
    return _na_map(lambda x: x.rsplit(pat, limit), arr)
def str_slice(arr, start=None, stop=None, step=None):
    """
    Slice substrings from each element in the Series or Index.

    Parameters
    ----------
    start : int, optional
        Start position for slice operation.
    stop : int, optional
        Stop position for slice operation.
    step : int, optional
        Step size for slice operation.

    Returns
    -------
    Series or Index of object
        Series or Index from sliced substring from original string object.

    See Also
    --------
    Series.str.slice_replace : Replace a slice with a string.
    Series.str.get : Return element at position.
        Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
        being the position.

    Examples
    --------
    >>> s = pd.Series(["koala", "fox", "chameleon"])
    >>> s
    0        koala
    1          fox
    2    chameleon
    dtype: object

    >>> s.str.slice(start=1)
    0        oala
    1          ox
    2    hameleon
    dtype: object

    >>> s.str.slice(stop=2)
    0    ko
    1    fo
    2    ch
    dtype: object

    >>> s.str.slice(step=2)
    0      kaa
    1       fx
    2    caeen
    dtype: object

    >>> s.str.slice(start=0, stop=5, step=3)
    0    kl
    1     f
    2    cm
    dtype: object

    Equivalent behaviour to:

    >>> s.str[0:5:3]
    0    kl
    1     f
    2    cm
    dtype: object
    """
    # Extended slice syntax builds the same slice(start, stop, step) object.
    return _na_map(lambda x: x[start:stop:step], arr)
def str_slice_replace(arr, start=None, stop=None, repl=None):
    """
    Replace a positional slice of a string with another value.

    Parameters
    ----------
    start : int, optional
        Left index position to use for the slice. If not specified (None),
        the slice is unbounded on the left, i.e. slice from the start
        of the string.
    stop : int, optional
        Right index position to use for the slice. If not specified (None),
        the slice is unbounded on the right, i.e. slice until the
        end of the string.
    repl : str, optional
        String for replacement. If not specified (None), the sliced region
        is replaced with an empty string.

    Returns
    -------
    Series or Index
        Same type as the original object.

    See Also
    --------
    Series.str.slice : Just slicing without replacement.

    Examples
    --------
    >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
    >>> s
    0        a
    1       ab
    2      abc
    3     abdc
    4    abcde
    dtype: object

    Specify just `start`, meaning replace `start` until the end of the
    string with `repl`.

    >>> s.str.slice_replace(1, repl='X')
    0    aX
    1    aX
    2    aX
    3    aX
    4    aX
    dtype: object

    Specify just `stop`, meaning the start of the string to `stop` is replaced
    with `repl`, and the rest of the string is included.

    >>> s.str.slice_replace(stop=2, repl='X')
    0       X
    1       X
    2      Xc
    3     Xdc
    4    Xcde
    dtype: object

    Specify `start` and `stop`, meaning the slice from `start` to `stop` is
    replaced with `repl`. Everything before or after `start` and `stop` is
    included as is.

    >>> s.str.slice_replace(start=1, stop=3, repl='X')
    0      aX
    1      aX
    2      aX
    3     aXc
    4    aXde
    dtype: object
    """
    if repl is None:
        repl = ''

    def f(x):
        # When the targeted slice is empty nothing is removed, so the
        # retained tail has to resume at `start` rather than at `stop`.
        tail_from = start if x[start:stop] == '' else stop
        head = x[:start] if start is not None else ''
        tail = x[tail_from:] if stop is not None else ''
        return head + repl + tail

    return _na_map(f, arr)
def str_strip(arr, to_strip=None, side='both'):
    """
    Strip whitespace (including newlines) from each string in the
    Series/Index.

    Parameters
    ----------
    to_strip : str or unicode
        Forwarded unchanged to ``strip`` / ``lstrip`` / ``rstrip`` of each
        element.
    side : {'left', 'right', 'both'}, default 'both'
        'both' uses ``strip``, 'left' uses ``lstrip`` and 'right' uses
        ``rstrip``.

    Returns
    -------
    Series or Index
    """
    if side == 'both':
        method = 'strip'
    elif side == 'left':
        method = 'lstrip'
    elif side == 'right':
        method = 'rstrip'
    else:  # pragma: no cover
        raise ValueError('Invalid side')

    return _na_map(lambda x: getattr(x, method)(to_strip), arr)
def str_wrap(arr, width, **kwargs):
    r"""
    Wrap long strings in the Series/Index to be formatted in
    paragraphs with length less than a given width.

    This method has the same keyword parameters and defaults as
    :class:`textwrap.TextWrapper`.

    Parameters
    ----------
    width : int
        Maximum line width.
    expand_tabs : bool, optional
        If True, tab characters will be expanded to spaces (default: True).
    replace_whitespace : bool, optional
        If True, each whitespace character (as defined by string.whitespace)
        remaining after tab expansion will be replaced by a single space
        (default: True).
    drop_whitespace : bool, optional
        If True, whitespace that, after wrapping, happens to end up at the
        beginning or end of a line is dropped (default: True).
    break_long_words : bool, optional
        If True, then words longer than width will be broken in order to ensure
        that no lines are longer than width. If it is false, long words will
        not be broken, and some lines may be longer than width (default: True).
    break_on_hyphens : bool, optional
        If True, wrapping will occur preferably on whitespace and right after
        hyphens in compound words, as it is customary in English. If false,
        only whitespaces will be considered as potentially good places for line
        breaks, but you need to set break_long_words to false if you want truly
        insecable words (default: True).

    Returns
    -------
    Series or Index

    Notes
    -----
    Internally, this method uses a :class:`textwrap.TextWrapper` instance with
    default settings. To achieve behavior matching R's stringr library str_wrap
    function, use the arguments:

    - expand_tabs = False
    - replace_whitespace = True
    - drop_whitespace = True
    - break_long_words = False
    - break_on_hyphens = False

    Examples
    --------
    >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
    >>> s.str.wrap(12)
    0             line to be\nwrapped
    1    another line\nto be\nwrapped
    dtype: object
    """
    # One wrapper instance is shared by every element.
    wrapper = textwrap.TextWrapper(width=width, **kwargs)

    def f(s):
        return '\n'.join(wrapper.wrap(s))

    return _na_map(f, arr)
def str_translate(arr, table, deletechars=None):
    """
    Map all characters in the string through the given mapping table.
    Equivalent to standard :meth:`str.translate`. Note that the optional
    argument deletechars is only valid if you are using python 2. For python 3,
    character deletion should be specified via the table argument.

    Parameters
    ----------
    table : dict (python 3), str or None (python 2)
        In python 3, table is a mapping of Unicode ordinals to Unicode
        ordinals, strings, or None. Unmapped characters are left untouched.
        Characters mapped to None are deleted. :meth:`str.maketrans` is a
        helper function for making translation tables.
        In python 2, table is either a string of length 256 or None. If the
        table argument is None, no translation is applied and the operation
        simply removes the characters in deletechars. :func:`string.maketrans`
        is a helper function for making translation tables.
    deletechars : str, optional (python 2)
        A string of characters to delete. This argument is only valid
        in python 2.

    Returns
    -------
    Series or Index

    Raises
    ------
    ValueError
        If ``deletechars`` is given while running under python 3.
    """
    # Common case first: single-argument translate.
    if deletechars is None:
        return _na_map(lambda x: x.translate(table), arr)

    if compat.PY3:
        raise ValueError("deletechars is not a valid argument for "
                         "str.translate in python 3. You should simply "
                         "specify character deletions in the table "
                         "argument")

    return _na_map(lambda x: x.translate(table, deletechars), arr)
def str_get(arr, i):
    """
    Extract element from each component at specified position.

    Extract element from lists, tuples, or strings in each element in the
    Series/Index.

    Parameters
    ----------
    i : int
        Position of element to extract.

    Returns
    -------
    Series or Index

    Examples
    --------
    >>> s = pd.Series(["String",
    ...                (1, 2, 3),
    ...                ["a", "b", "c"],
    ...                123,
    ...                -456,
    ...                {1: "Hello", "2": "World"}])
    >>> s
    0                        String
    1                     (1, 2, 3)
    2                     [a, b, c]
    3                           123
    4                          -456
    5    {1: 'Hello', '2': 'World'}
    dtype: object

    >>> s.str.get(1)
    0        t
    1        2
    2        b
    3      NaN
    4      NaN
    5    Hello
    dtype: object

    >>> s.str.get(-1)
    0       g
    1       3
    2       c
    3     NaN
    4     NaN
    5    None
    dtype: object
    """
    def f(x):
        # Dicts are looked up by key; everything else is indexed by position.
        if isinstance(x, dict):
            return x.get(i)
        size = len(x)
        if size > i >= -size:
            return x[i]
        # Position falls outside the element: report missing.
        return np.nan

    return _na_map(f, arr)
def str_decode(arr, encoding, errors="strict"):
    """
    Decode character string in the Series/Index using indicated encoding.
    Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in
    python3.

    Parameters
    ----------
    encoding : str
    errors : str, optional

    Returns
    -------
    Series or Index
    """
    if encoding not in _cpython_optimized_decoders:
        # Resolve the codec once instead of once per element.
        decoder = codecs.getdecoder(encoding)
        f = lambda x: decoder(x, errors)[0]
    else:
        # CPython optimized implementation
        f = lambda x: x.decode(encoding, errors)
    return _na_map(f, arr)
def str_encode(arr, encoding, errors="strict"):
    """
    Encode character string in the Series/Index using indicated encoding.
    Equivalent to :meth:`str.encode`.

    Parameters
    ----------
    encoding : str
    errors : str, optional

    Returns
    -------
    encoded : Series/Index of objects
    """
    if encoding not in _cpython_optimized_encoders:
        # Resolve the codec once instead of once per element.
        encoder = codecs.getencoder(encoding)
        f = lambda x: encoder(x, errors)[0]
    else:
        # CPython optimized implementation
        f = lambda x: x.encode(encoding, errors)
    return _na_map(f, arr)
- def _noarg_wrapper(f, docstring=None, **kargs):
- def wrapper(self):
- result = _na_map(f, self._parent, **kargs)
- return self._wrap_result(result)
- wrapper.__name__ = f.__name__
- if docstring is not None:
- wrapper.__doc__ = docstring
- else:
- raise ValueError('Provide docstring')
- return wrapper
- def _pat_wrapper(f, flags=False, na=False, **kwargs):
- def wrapper1(self, pat):
- result = f(self._parent, pat)
- return self._wrap_result(result)
- def wrapper2(self, pat, flags=0, **kwargs):
- result = f(self._parent, pat, flags=flags, **kwargs)
- return self._wrap_result(result)
- def wrapper3(self, pat, na=np.nan):
- result = f(self._parent, pat, na=na)
- return self._wrap_result(result)
- wrapper = wrapper3 if na else wrapper2 if flags else wrapper1
- wrapper.__name__ = f.__name__
- if f.__doc__:
- wrapper.__doc__ = f.__doc__
- return wrapper
- def copy(source):
- "Copy a docstring from another source function (if …
Large files are truncated, but you can click here to view the full file