libs/text-splitters/langchain_text_splitters/character.py PYTHON 798 lines View on github.com → Search inside
"""Character text splitters."""

from __future__ import annotations

import re
from typing import Any, Literal

from langchain_text_splitters.base import Language, TextSplitter


class CharacterTextSplitter(TextSplitter):
    """Splitting text that looks at characters."""

    def __init__(
        self,
        separator: str = "\n\n",
        is_separator_regex: bool = False,  # noqa: FBT001,FBT002
        **kwargs: Any,
    ) -> None:
        """Create a new TextSplitter.

        Args:
            separator: The string (or regex pattern) to split on.
            is_separator_regex: When `True`, `separator` is used as a regular
                expression as-is; otherwise it is escaped and matched literally.
            **kwargs: Forwarded unchanged to the `TextSplitter` base class.
        """
        super().__init__(**kwargs)
        self._separator = separator
        self._is_separator_regex = is_separator_regex

    def split_text(self, text: str) -> list[str]:
        """Split into chunks without re-inserting lookaround separators.

        Args:
            text: The text to split.

        Returns:
            A list of text chunks.
        """
        # 1. Determine split pattern: raw regex or escaped literal
        sep_pattern = (
            self._separator if self._is_separator_regex else re.escape(self._separator)
        )

        # 2. Initial split (keep separator if requested)
        splits = _split_text_with_regex(
            text, sep_pattern, keep_separator=self._keep_separator
        )

        # 3. Detect zero-width lookaround so we never re-insert it
        lookaround_prefixes = ("(?=", "(?<!", "(?<=", "(?!")
        is_lookaround = self._is_separator_regex and any(
            self._separator.startswith(p) for p in lookaround_prefixes
        )

        # 4. Decide merge separator:
        #    - if keep_separator or lookaround -> don't re-insert
        #    - else -> re-insert literal separator
        merge_sep = ""
        if not (self._keep_separator or is_lookaround):
            merge_sep = self._separator

        # 5. Merge adjacent splits and return
        return self._merge_splits(splits, merge_sep)


def _split_text_with_regex(
    text: str, separator: str, *, keep_separator: bool | Literal["start", "end"]
) -> list[str]:
    """Split `text` on the regex `separator`, optionally keeping the matches.

    Args:
        text: The text to split.
        separator: A regular expression pattern. An empty pattern splits the
            text into individual characters.
        keep_separator: If falsy, the matched separators are dropped. If
            `"end"`, each match is appended to the piece that precedes it. Any
            other truthy value prepends each match to the piece that follows
            it.

    Returns:
        The non-empty pieces, in order.
    """
    # NOTE(review): the keep_separator pairing below assumes `separator` adds no
    # capturing groups of its own, since `re.split` also returns those groups.
    if separator:
        if keep_separator:
            # The parentheses in the pattern keep the delimiters in the result,
            # which therefore alternates: piece, delimiter, piece, ...
            splits_ = re.split(f"({separator})", text)
            splits = (
                # "end": glue each delimiter onto the piece before it.
                ([splits_[i] + splits_[i + 1] for i in range(0, len(splits_) - 1, 2)])
                if keep_separator == "end"
                # otherwise: glue each delimiter onto the piece after it.
                else ([splits_[i] + splits_[i + 1] for i in range(1, len(splits_), 2)])
            )
            if len(splits_) % 2 == 0:
                splits += splits_[-1:]
            # Re-attach the one piece that has no delimiter to pair with:
            # the trailing piece for "end", the leading piece otherwise.
            splits = (
                ([*splits, splits_[-1]])
                if keep_separator == "end"
                else ([splits_[0], *splits])
            )
        else:
            splits = re.split(separator, text)
    else:
        splits = list(text)
    return [s for s in splits if s]


class RecursiveCharacterTextSplitter(TextSplitter):
    """Splitting text by recursively look at characters.

    Recursively tries to split by different characters to find one
    that works.
    """

    def __init__(
        self,
        separators: list[str] | None = None,
        keep_separator: bool | Literal["start", "end"] = True,  # noqa: FBT001,FBT002
        is_separator_regex: bool = False,  # noqa: FBT001,FBT002
        **kwargs: Any,
    ) -> None:
        """Create a new TextSplitter.

        Args:
            separators: Separators to try, in priority order. Defaults to
                `["\\n\\n", "\\n", " ", ""]` when omitted or empty.
            keep_separator: Forwarded to the `TextSplitter` base class; also
                controls whether matched separators are kept in the pieces.
            is_separator_regex: When `True`, every separator is used as a
                regular expression as-is; otherwise separators are escaped.
            **kwargs: Forwarded unchanged to the `TextSplitter` base class.
        """
        super().__init__(keep_separator=keep_separator, **kwargs)
        self._separators = separators or ["\n\n", "\n", " ", ""]
        self._is_separator_regex = is_separator_regex

    def _split_text(self, text: str, separators: list[str]) -> list[str]:
        """Split incoming text and return chunks.

        Args:
            text: The text to split.
            separators: Separators still available, in priority order.

        Returns:
            The chunks produced for `text`.
        """
        final_chunks = []
        # Get appropriate separator to use: the first one found in the text,
        # falling back to the last entry when none matches.
        separator = separators[-1]
        new_separators = []
        for i, s_ in enumerate(separators):
            separator_ = s_ if self._is_separator_regex else re.escape(s_)
            if not s_:
                # The empty separator always applies (character-level split).
                separator = s_
                break
            if re.search(separator_, text):
                separator = s_
                # Only lower-priority separators remain for the recursion.
                new_separators = separators[i + 1 :]
                break

        separator_ = separator if self._is_separator_regex else re.escape(separator)
        splits = _split_text_with_regex(
            text, separator_, keep_separator=self._keep_separator
        )

        # Now go merging things, recursively splitting longer texts.
        good_splits = []
        # When separators are kept inside the pieces, nothing is re-inserted.
        separator_ = "" if self._keep_separator else separator
        for s in splits:
            if self._length_function(s) < self._chunk_size:
                good_splits.append(s)
            else:
                # Flush the small pieces gathered so far before handling the
                # oversized one, so the output order follows the input.
                if good_splits:
                    merged_text = self._merge_splits(good_splits, separator_)
                    final_chunks.extend(merged_text)
                    good_splits = []
                if not new_separators:
                    # Nothing finer to split on: emit the piece as it is.
                    final_chunks.append(s)
                else:
                    other_info = self._split_text(s, new_separators)
                    final_chunks.extend(other_info)
        if good_splits:
            merged_text = self._merge_splits(good_splits, separator_)
            final_chunks.extend(merged_text)
        return final_chunks

    def split_text(self, text: str) -> list[str]:
        """Split the input text into smaller chunks based on predefined separators.

        Args:
            text: The input text to be split.

        Returns:
            A list of text chunks obtained after splitting.
        """
        return self._split_text(text, self._separators)

    @classmethod
    def from_language(
        cls, language: Language, **kwargs: Any
    ) -> RecursiveCharacterTextSplitter:
        """Return an instance of this class based on a specific language.

        This method initializes the text splitter with language-specific separators.
        The separators are passed with `is_separator_regex=True`, so every entry
        returned by `get_separators_for_language` is used as a regex pattern.

        Args:
            language: The language to configure the text splitter for.
            **kwargs: Additional keyword arguments to customize the splitter.

        Returns:
            An instance of the text splitter configured for the specified language.
        """
        separators = cls.get_separators_for_language(language)
        return cls(separators=separators, is_separator_regex=True, **kwargs)

    @staticmethod
    def get_separators_for_language(language: Language) -> list[str]:
        """Retrieve a list of separators specific to the given language.

        The entries are regular-expression patterns (`from_language` uses them
        with `is_separator_regex=True`), so regex metacharacters that are meant
        literally are escaped.

        Args:
            language: The language for which to get the separators.

        Returns:
            A list of separators appropriate for the specified language.

        Raises:
            ValueError: If the language is not implemented or supported.
        """
        if language in {Language.C, Language.CPP}:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along function definitions
                "\nvoid ",
                "\nint ",
                "\nfloat ",
                "\ndouble ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.GO:
            return [
                # Split along function definitions
                "\nfunc ",
                "\nvar ",
                "\nconst ",
                "\ntype ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.JAVA:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\nstatic ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.KOTLIN:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\ninternal ",
                "\ncompanion ",
                "\nfun ",
                "\nval ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nwhen ",
                "\nelse ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.JS:
            return [
                # Split along function definitions
                "\nfunction ",
                "\nconst ",
                "\nlet ",
                "\nvar ",
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                "\ndefault ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.TS:
            return [
                "\nenum ",
                "\ninterface ",
                "\nnamespace ",
                "\ntype ",
                # Split along class definitions
                "\nclass ",
                # Split along function definitions
                "\nfunction ",
                "\nconst ",
                "\nlet ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                "\ndefault ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.PHP:
            return [
                # Split along function definitions
                "\nfunction ",
                # Split along class definitions
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nforeach ",
                "\nwhile ",
                "\ndo ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.PROTO:
            return [
                # Split along message definitions
                "\nmessage ",
                # Split along service definitions
                "\nservice ",
                # Split along enum definitions
                "\nenum ",
                # Split along option definitions
                "\noption ",
                # Split along import statements
                "\nimport ",
                # Split along syntax declarations
                "\nsyntax ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.PYTHON:
            return [
                # First, try to split along class definitions
                "\nclass ",
                "\ndef ",
                "\n\tdef ",
                # Now split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.R:
            return [
                # Split along function definitions
                "\nfunction ",
                # Split along S4 class and method definitions
                "\nsetClass\\(",
                "\nsetMethod\\(",
                "\nsetGeneric\\(",
                # Split along control flow statements
                "\nif ",
                "\nelse ",
                "\nfor ",
                "\nwhile ",
                "\nrepeat ",
                # Split along package loading
                "\nlibrary\\(",
                "\nrequire\\(",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.RST:
            return [
                # Split along section titles
                "\n=+\n",
                "\n-+\n",
                "\n\\*+\n",
                # Split along directive markers
                # NOTE(review): the dots are unescaped, so as a regex `..`
                # matches any two characters - confirm this is intended.
                "\n\n.. *\n\n",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.RUBY:
            return [
                # Split along method definitions
                "\ndef ",
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nunless ",
                "\nwhile ",
                "\nfor ",
                "\ndo ",
                "\nbegin ",
                "\nrescue ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.ELIXIR:
            return [
                # Split along method function and module definition
                "\ndef ",
                "\ndefp ",
                "\ndefmodule ",
                "\ndefprotocol ",
                "\ndefmacro ",
                "\ndefmacrop ",
                # Split along control flow statements
                "\nif ",
                "\nunless ",
                "\ncase ",
                "\ncond ",
                "\nwith ",
                "\nfor ",
                "\ndo ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.RUST:
            return [
                # Split along function definitions
                "\nfn ",
                "\nconst ",
                "\nlet ",
                # Split along control flow statements
                "\nif ",
                "\nwhile ",
                "\nfor ",
                "\nloop ",
                "\nmatch ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.SCALA:
            return [
                # Split along class definitions
                "\nclass ",
                "\nobject ",
                # Split along method definitions
                "\ndef ",
                "\nval ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nmatch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.SWIFT:
            return [
                # Split along function definitions
                "\nfunc ",
                # Split along class definitions
                "\nclass ",
                "\nstruct ",
                "\nenum ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\ndo ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.MARKDOWN:
            return [
                # First, try to split along Markdown headings (levels 1 to 6)
                "\n#{1,6} ",
                # Note the alternative syntax for headings (below) is not handled here
                # Heading level 2
                # ---------------
                # End of code block
                "```\n",
                # Horizontal lines: three or more of the same character
                # (*, - or _) on a line of their own
                "\n\\*\\*\\*+\n",
                "\n---+\n",
                "\n___+\n",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.LATEX:
            return [
                # First, try to split along Latex sections
                "\n\\\\chapter{",
                "\n\\\\section{",
                "\n\\\\subsection{",
                "\n\\\\subsubsection{",
                # Now split by environments
                "\n\\\\begin{enumerate}",
                "\n\\\\begin{itemize}",
                "\n\\\\begin{description}",
                "\n\\\\begin{list}",
                "\n\\\\begin{quote}",
                "\n\\\\begin{quotation}",
                "\n\\\\begin{verse}",
                "\n\\\\begin{verbatim}",
                # Now split by math environments
                "\n\\\\begin{align}",
                # `$` is escaped: a bare `$` in a regex is a zero-width
                # end-of-string anchor and would never match a math delimiter.
                "\\$\\$",
                "\\$",
                # Now split by the normal type of lines
                " ",
                "",
            ]
        if language == Language.HTML:
            return [
                # First, try to split along HTML tags
                "<body",
                "<div",
                "<p",
                "<br",
                "<li",
                "<h1",
                "<h2",
                "<h3",
                "<h4",
                "<h5",
                "<h6",
                "<span",
                "<table",
                "<tr",
                "<td",
                "<th",
                "<ul",
                "<ol",
                "<header",
                "<footer",
                "<nav",
                # Head
                "<head",
                "<style",
                "<script",
                "<meta",
                "<title",
                "",
            ]
        if language == Language.CSHARP:
            return [
                "\ninterface ",
                "\nenum ",
                "\ndelegate ",
                "\nevent ",
                # Split along class definitions
                "\nclass ",
                "\nabstract ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\nstatic ",
                "\nreturn ",
                # Split along control flow statements
                "\nif ",
                "\ncontinue ",
                "\nfor ",
                "\nforeach ",
                "\nwhile ",
                "\nswitch ",
                "\nbreak ",
                "\ncase ",
                "\nelse ",
                # Split by exceptions
                "\ntry ",
                "\nthrow ",
                "\nfinally ",
                "\ncatch ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.SOL:
            return [
                # Split along compiler information definitions
                "\npragma ",
                "\nusing ",
                # Split along contract definitions
                "\ncontract ",
                "\ninterface ",
                "\nlibrary ",
                # Split along method definitions
                "\nconstructor ",
                "\ntype ",
                "\nfunction ",
                "\nevent ",
                "\nmodifier ",
                "\nerror ",
                "\nstruct ",
                "\nenum ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\ndo while ",
                "\nassembly ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.COBOL:
            return [
                # Split along divisions
                "\nIDENTIFICATION DIVISION.",
                "\nENVIRONMENT DIVISION.",
                "\nDATA DIVISION.",
                "\nPROCEDURE DIVISION.",
                # Split along sections within DATA DIVISION
                "\nWORKING-STORAGE SECTION.",
                "\nLINKAGE SECTION.",
                "\nFILE SECTION.",
                # Split along sections within PROCEDURE DIVISION
                "\nINPUT-OUTPUT SECTION.",
                # Split along paragraphs and common statements
                "\nOPEN ",
                "\nCLOSE ",
                "\nREAD ",
                "\nWRITE ",
                "\nIF ",
                "\nELSE ",
                "\nMOVE ",
                "\nPERFORM ",
                "\nUNTIL ",
                "\nVARYING ",
                "\nACCEPT ",
                "\nDISPLAY ",
                "\nSTOP RUN.",
                # Split by the normal type of lines
                "\n",
                " ",
                "",
            ]
        if language == Language.LUA:
            return [
                # Split along variable and table definitions
                "\nlocal ",
                # Split along function definitions
                "\nfunction ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nrepeat ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.HASKELL:
            return [
                # Split along function definitions
                "\nmain :: ",
                "\nmain = ",
                "\nlet ",
                "\nin ",
                "\ndo ",
                "\nwhere ",
                "\n:: ",
                "\n= ",
                # Split along type declarations
                "\ndata ",
                "\nnewtype ",
                "\ntype ",
                # Split along module declarations
                "\nmodule ",
                # Split along import statements
                "\nimport ",
                "\nqualified ",
                "\nimport qualified ",
                # Split along typeclass declarations
                "\nclass ",
                "\ninstance ",
                # Split along case expressions
                "\ncase ",
                # Split along guards in function definitions
                # `|` is escaped: unescaped it is regex alternation, which
                # would turn this pattern into "newline OR space".
                "\n\\| ",
                # Split along record field declarations
                "\n= {",
                "\n, ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.POWERSHELL:
            return [
                # Split along function definitions
                "\nfunction ",
                # Split along parameter declarations
                "\nparam ",
                # Split along control flow statements
                "\nif ",
                "\nforeach ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                # Split along class definitions (for PowerShell 5.0 and above)
                "\nclass ",
                # Split along try-catch-finally blocks
                "\ntry ",
                "\ncatch ",
                "\nfinally ",
                # Split by normal lines and empty spaces
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.VISUALBASIC6:
            # Optional visibility keyword that may precede a definition.
            vis = r"(?:Public|Private|Friend|Global|Static)\s+"
            return [
                # Split along definitions; the negative lookahead keeps
                # closing lines such as "End Sub" from matching.
                rf"\n(?!End\s){vis}?Sub\s+",
                rf"\n(?!End\s){vis}?Function\s+",
                rf"\n(?!End\s){vis}?Property\s+(?:Get|Let|Set)\s+",
                rf"\n(?!End\s){vis}?Type\s+",
                rf"\n(?!End\s){vis}?Enum\s+",
                # Split along control flow statements
                r"\n(?!End\s)If\s+",
                r"\nElseIf\s+",
                r"\nElse\s+",
                r"\nSelect\s+Case\s+",
                r"\nCase\s+",
                r"\nFor\s+",
                r"\nDo\s+",
                r"\nWhile\s+",
                r"\nWith\s+",
                # Split by the normal type of lines
                r"\n\n",
                r"\n",
                " ",
                "",
            ]

        # A known enum member without a branch above is "not implemented";
        # anything else is not a supported language at all.
        if language in Language._value2member_map_:
            msg = f"Language {language} is not implemented yet!"
            raise ValueError(msg)
        msg = (
            f"Language {language} is not supported! Please choose from {list(Language)}"
        )
        raise ValueError(msg)

Code quality findings 7

Avoid unnecessary list conversions; use generators where possible
unnecessary-list
splits = list(text)
Ensure functions have docstrings for documentation
missing-docstring
def from_language(
Ensure functions have docstrings for documentation
missing-docstring
"\ndef ",
Ensure functions have docstrings for documentation
missing-docstring
"\n\tdef ",
Ensure functions have docstrings for documentation
missing-docstring
"\ndef ",
Ensure functions have docstrings for documentation
missing-docstring
"\ndef ",
Ensure functions have docstrings for documentation
missing-docstring
"\ndef ",

Get this view in your editor

Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.