Avoid unnecessary list conversions; use generators where possible
splits = list(text)
"""Character text splitters."""

from __future__ import annotations

import re
from typing import Any, Literal

from langchain_text_splitters.base import Language, TextSplitter


class CharacterTextSplitter(TextSplitter):
    """Splitting text that looks at characters."""

    def __init__(
        self,
        separator: str = "\n\n",
        is_separator_regex: bool = False,  # noqa: FBT001,FBT002
        **kwargs: Any,
    ) -> None:
        """Create a new TextSplitter.

        Args:
            separator: The string (or regex pattern) to split on.
            is_separator_regex: Whether `separator` is used as a regex pattern
                as-is; when `False` it is escaped and matched literally.
            **kwargs: Forwarded unchanged to the base class initializer.
        """
        super().__init__(**kwargs)
        self._separator = separator
        self._is_separator_regex = is_separator_regex

    def split_text(self, text: str) -> list[str]:
        """Split into chunks without re-inserting lookaround separators.

        Args:
            text: The text to split.

        Returns:
            A list of text chunks.
        """
        # 1. Determine split pattern: raw regex or escaped literal
        sep_pattern = (
            self._separator if self._is_separator_regex else re.escape(self._separator)
        )

        # 2. Initial split (keep separator if requested)
        splits = _split_text_with_regex(
            text, sep_pattern, keep_separator=self._keep_separator
        )

        # 3. Detect zero-width lookaround so we never re-insert it.
        #    `str.startswith` accepts a tuple, so one call covers every prefix.
        lookaround_prefixes = ("(?=", "(?<!", "(?<=", "(?!")
        is_lookaround = self._is_separator_regex and self._separator.startswith(
            lookaround_prefixes
        )

        # 4. Decide merge separator:
        #    - if keep_separator or lookaround -> don't re-insert
        #    - else -> re-insert literal separator
        merge_sep = ""
        if not (self._keep_separator or is_lookaround):
            merge_sep = self._separator

        # 5. Merge adjacent splits and return
        return self._merge_splits(splits, merge_sep)


def _split_text_with_regex(
    text: str, separator: str, *, keep_separator: bool | Literal["start", "end"]
) -> list[str]:
    """Split `text` on the regex `separator` and drop empty pieces.

    Args:
        text: The text to split.
        separator: A regex pattern. An empty pattern splits the text into
            individual characters.
        keep_separator: Falsy to discard the matched separators. `"end"` glues
            each separator onto the end of the piece before it; any other
            truthy value glues it onto the start of the piece after it.

    Returns:
        The non-empty pieces, in order.
    """
    if not separator:
        # Every element of a str is a non-empty one-character string, so the
        # list can be returned directly without a second filtering pass.
        return list(text)

    if not keep_separator:
        return [piece for piece in re.split(separator, text) if piece]

    # The parentheses in the pattern keep the delimiters in the result, so
    # `parts` alternates: text, separator, text, separator, ..., text.
    parts = re.split(f"({separator})", text)
    if keep_separator == "end":
        # Pair each text piece with the separator that follows it.
        splits = [parts[i] + parts[i + 1] for i in range(0, len(parts) - 1, 2)]
    else:
        # Pair each separator with the text piece that follows it.
        splits = [parts[i] + parts[i + 1] for i in range(1, len(parts), 2)]
    if len(parts) % 2 == 0:
        splits += parts[-1:]
    if keep_separator == "end":
        # The trailing text has no separator after it; append it as-is.
        splits = [*splits, parts[-1]]
    else:
        # The leading text has no separator before it; prepend it as-is.
        splits = [parts[0], *splits]
    return [piece for piece in splits if piece]


class RecursiveCharacterTextSplitter(TextSplitter):
    """Split text by recursively looking at characters.

    Recursively tries to split by different characters to find one
    that works.
    """

    def __init__(
        self,
        separators: list[str] | None = None,
        keep_separator: bool | Literal["start", "end"] = True,  # noqa: FBT001,FBT002
        is_separator_regex: bool = False,  # noqa: FBT001,FBT002
        **kwargs: Any,
    ) -> None:
        """Create a new TextSplitter.

        Args:
            separators: Separators to try, in order. Defaults to
                `["\\n\\n", "\\n", " ", ""]` when omitted or empty.
            keep_separator: Forwarded to the base class initializer.
            is_separator_regex: Whether the separators are used as regex
                patterns as-is; when `False` they are escaped first.
            **kwargs: Forwarded unchanged to the base class initializer.
        """
        super().__init__(keep_separator=keep_separator, **kwargs)
        self._separators = separators or ["\n\n", "\n", " ", ""]
        self._is_separator_regex = is_separator_regex

    def _split_text(self, text: str, separators: list[str]) -> list[str]:
        """Split incoming text and return chunks.

        Args:
            text: The text to split.
            separators: Candidate separators, tried in order.

        Returns:
            The resulting chunks, in order.
        """
        final_chunks: list[str] = []

        # Pick the first separator that occurs in the text; an empty separator
        # always applies. Fall back to the last one if nothing matches.
        separator = separators[-1]
        new_separators: list[str] = []
        for i, candidate in enumerate(separators):
            if not candidate:
                separator = candidate
                break
            pattern = candidate if self._is_separator_regex else re.escape(candidate)
            if re.search(pattern, text):
                separator = candidate
                # Only the separators after the chosen one are used on recursion.
                new_separators = separators[i + 1 :]
                break

        split_pattern = separator if self._is_separator_regex else re.escape(separator)
        splits = _split_text_with_regex(
            text, split_pattern, keep_separator=self._keep_separator
        )

        # Now go merging things, recursively splitting longer texts.
        # When separators were kept inside the pieces, nothing is re-inserted.
        merge_separator = "" if self._keep_separator else separator
        good_splits: list[str] = []
        for piece in splits:
            if self._length_function(piece) < self._chunk_size:
                good_splits.append(piece)
                continue
            # An oversized piece: flush the small pieces collected so far first
            # so that chunk order follows the text.
            if good_splits:
                final_chunks.extend(self._merge_splits(good_splits, merge_separator))
                good_splits = []
            if not new_separators:
                # No separators left to try; emit the piece as-is.
                final_chunks.append(piece)
            else:
                final_chunks.extend(self._split_text(piece, new_separators))
        if good_splits:
            final_chunks.extend(self._merge_splits(good_splits, merge_separator))
        return final_chunks

    def split_text(self, text: str) -> list[str]:
        """Split the input text into smaller chunks based on predefined separators.

        Args:
            text: The input text to be split.

        Returns:
            A list of text chunks obtained after splitting.
        """
        return self._split_text(text, self._separators)

    @classmethod
    def from_language(
        cls, language: Language, **kwargs: Any
    ) -> RecursiveCharacterTextSplitter:
        """Return an instance of this class based on a specific language.

        This method initializes the text splitter with language-specific separators.
        The separators are passed with `is_separator_regex=True`.

        Args:
            language: The language to configure the text splitter for.
            **kwargs: Additional keyword arguments to customize the splitter.

        Returns:
            An instance of the text splitter configured for the specified language.
        """
        separators = cls.get_separators_for_language(language)
        return cls(separators=separators, is_separator_regex=True, **kwargs)

    @staticmethod
    def get_separators_for_language(language: Language) -> list[str]:
        """Retrieve a list of separators specific to the given language.

        Args:
            language: The language for which to get the separators.

        Returns:
            A list of separators appropriate for the specified language.

        Raises:
            ValueError: If the language is not implemented or supported.
        """
        if language in {Language.C, Language.CPP}:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along function definitions
                "\nvoid ",
                "\nint ",
                "\nfloat ",
                "\ndouble ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.GO:
            return [
                # Split along function definitions
                "\nfunc ",
                "\nvar ",
                "\nconst ",
                "\ntype ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.JAVA:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\nstatic ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.KOTLIN:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\ninternal ",
                "\ncompanion ",
                "\nfun ",
                "\nval ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nwhen ",
                "\nelse ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.JS:
            return [
                # Split along function definitions
                "\nfunction ",
                "\nconst ",
                "\nlet ",
                "\nvar ",
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                "\ndefault ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.TS:
            return [
                "\nenum ",
                "\ninterface ",
                "\nnamespace ",
                "\ntype ",
                # Split along class definitions
                "\nclass ",
                # Split along function definitions
                "\nfunction ",
                "\nconst ",
                "\nlet ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                "\ndefault ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.PHP:
            return [
                # Split along function definitions
                "\nfunction ",
                # Split along class definitions
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nforeach ",
                "\nwhile ",
                "\ndo ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.PROTO:
            return [
                # Split along message definitions
                "\nmessage ",
                # Split along service definitions
                "\nservice ",
                # Split along enum definitions
                "\nenum ",
                # Split along option definitions
                "\noption ",
                # Split along import statements
                "\nimport ",
                # Split along syntax declarations
                "\nsyntax ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.PYTHON:
            return [
                # First, try to split along class definitions
                "\nclass ",
                "\ndef ",
                "\n\tdef ",
                # Now split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.R:
            return [
                # Split along function definitions
                "\nfunction ",
                # Split along S4 class and method definitions
                "\nsetClass\\(",
                "\nsetMethod\\(",
                "\nsetGeneric\\(",
                # Split along control flow statements
                "\nif ",
                "\nelse ",
                "\nfor ",
                "\nwhile ",
                "\nrepeat ",
                # Split along package loading
                "\nlibrary\\(",
                "\nrequire\\(",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.RST:
            return [
                # Split along section titles
                "\n=+\n",
                "\n-+\n",
                "\n\\*+\n",
                # Split along directive markers
                # NOTE(review): as a regex, the unescaped dots match any two
                # characters rather than a literal ".." - confirm intent.
                "\n\n.. *\n\n",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.RUBY:
            return [
                # Split along method definitions
                "\ndef ",
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nunless ",
                "\nwhile ",
                "\nfor ",
                "\ndo ",
                "\nbegin ",
                "\nrescue ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.ELIXIR:
            return [
                # Split along method function and module definition
                "\ndef ",
                "\ndefp ",
                "\ndefmodule ",
                "\ndefprotocol ",
                "\ndefmacro ",
                "\ndefmacrop ",
                # Split along control flow statements
                "\nif ",
                "\nunless ",
                "\ncase ",
                "\ncond ",
                "\nwith ",
                "\nfor ",
                "\ndo ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.RUST:
            return [
                # Split along function definitions
                "\nfn ",
                "\nconst ",
                "\nlet ",
                # Split along control flow statements
                "\nif ",
                "\nwhile ",
                "\nfor ",
                "\nloop ",
                "\nmatch ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.SCALA:
            return [
                # Split along class definitions
                "\nclass ",
                "\nobject ",
                # Split along method definitions
                "\ndef ",
                "\nval ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nmatch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.SWIFT:
            return [
                # Split along function definitions
                "\nfunc ",
                # Split along class definitions
                "\nclass ",
                "\nstruct ",
                "\nenum ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\ndo ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.MARKDOWN:
            return [
                # First, try to split along Markdown headings (levels 1 to 6)
                "\n#{1,6} ",
                # Note the alternative syntax for headings (below) is not handled here
                # Heading level 2
                # ---------------
                # End of code block
                "```\n",
                # Horizontal lines made of three or more '*', '-', or '_' characters
                "\n\\*\\*\\*+\n",
                "\n---+\n",
                "\n___+\n",
                # Note that spaced horizontal-line variants such as "* * *" are
                # not matched by the patterns above.
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.LATEX:
            return [
                # First, try to split along Latex sections
                "\n\\\\chapter{",
                "\n\\\\section{",
                "\n\\\\subsection{",
                "\n\\\\subsubsection{",
                # Now split by environments
                "\n\\\\begin{enumerate}",
                "\n\\\\begin{itemize}",
                "\n\\\\begin{description}",
                "\n\\\\begin{list}",
                "\n\\\\begin{quote}",
                "\n\\\\begin{quotation}",
                "\n\\\\begin{verse}",
                "\n\\\\begin{verbatim}",
                # Now split by math environments
                "\n\\\\begin{align}",
                # NOTE(review): as a regex, "$" is an end-of-string anchor, not
                # a literal dollar sign - confirm intent.
                "$$",
                "$",
                # Now split by the normal type of lines
                " ",
                "",
            ]
        if language == Language.HTML:
            return [
                # First, try to split along HTML tags
                "<body",
                "<div",
                "<p",
                "<br",
                "<li",
                "<h1",
                "<h2",
                "<h3",
                "<h4",
                "<h5",
                "<h6",
                "<span",
                "<table",
                "<tr",
                "<td",
                "<th",
                "<ul",
                "<ol",
                "<header",
                "<footer",
                "<nav",
                # Head
                "<head",
                "<style",
                "<script",
                "<meta",
                "<title",
                "",
            ]
        if language == Language.CSHARP:
            return [
                "\ninterface ",
                "\nenum ",
                "\ndelegate ",
                "\nevent ",
                # Split along class definitions
                "\nclass ",
                "\nabstract ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\nstatic ",
                "\nreturn ",
                # Split along control flow statements
                "\nif ",
                "\ncontinue ",
                "\nfor ",
                "\nforeach ",
                "\nwhile ",
                "\nswitch ",
                "\nbreak ",
                "\ncase ",
                "\nelse ",
                # Split by exceptions
                "\ntry ",
                "\nthrow ",
                "\nfinally ",
                "\ncatch ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.SOL:
            return [
                # Split along compiler information definitions
                "\npragma ",
                "\nusing ",
                # Split along contract definitions
                "\ncontract ",
                "\ninterface ",
                "\nlibrary ",
                # Split along method definitions
                "\nconstructor ",
                "\ntype ",
                "\nfunction ",
                "\nevent ",
                "\nmodifier ",
                "\nerror ",
                "\nstruct ",
                "\nenum ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\ndo while ",
                "\nassembly ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.COBOL:
            return [
                # Split along divisions
                "\nIDENTIFICATION DIVISION.",
                "\nENVIRONMENT DIVISION.",
                "\nDATA DIVISION.",
                "\nPROCEDURE DIVISION.",
                # Split along sections within DATA DIVISION
                "\nWORKING-STORAGE SECTION.",
                "\nLINKAGE SECTION.",
                "\nFILE SECTION.",
                # Split along sections within PROCEDURE DIVISION
                "\nINPUT-OUTPUT SECTION.",
                # Split along paragraphs and common statements
                "\nOPEN ",
                "\nCLOSE ",
                "\nREAD ",
                "\nWRITE ",
                "\nIF ",
                "\nELSE ",
                "\nMOVE ",
                "\nPERFORM ",
                "\nUNTIL ",
                "\nVARYING ",
                "\nACCEPT ",
                "\nDISPLAY ",
                "\nSTOP RUN.",
                # Split by the normal type of lines
                "\n",
                " ",
                "",
            ]
        if language == Language.LUA:
            return [
                # Split along variable and table definitions
                "\nlocal ",
                # Split along function definitions
                "\nfunction ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nrepeat ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.HASKELL:
            return [
                # Split along function definitions
                "\nmain :: ",
                "\nmain = ",
                "\nlet ",
                "\nin ",
                "\ndo ",
                "\nwhere ",
                "\n:: ",
                "\n= ",
                # Split along type declarations
                "\ndata ",
                "\nnewtype ",
                "\ntype ",
                # Split along module declarations
                "\nmodule ",
                # Split along import statements
                "\nimport ",
                "\nqualified ",
                "\nimport qualified ",
                # Split along typeclass declarations
                "\nclass ",
                "\ninstance ",
                # Split along case expressions
                "\ncase ",
                # Split along guards in function definitions
                # NOTE(review): as a regex, the unescaped "|" is alternation
                # (newline OR space), not a literal pipe - confirm intent.
                "\n| ",
                # Split along record field declarations
                "\n= {",
                "\n, ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.POWERSHELL:
            return [
                # Split along function definitions
                "\nfunction ",
                # Split along parameter declarations
                "\nparam ",
                # Split along control flow statements
                "\nif ",
                "\nforeach ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                # Split along class definitions (for PowerShell 5.0 and above)
                "\nclass ",
                # Split along try-catch-finally blocks
                "\ntry ",
                "\ncatch ",
                "\nfinally ",
                # Split by normal lines and empty spaces
                "\n\n",
                "\n",
                " ",
                "",
            ]
        if language == Language.VISUALBASIC6:
            # Optional visibility prefix. The whole "keyword + whitespace" group
            # must be optional: appending "?" directly after a pattern ending in
            # `\s+` would only make that quantifier lazy and still require the
            # keyword, so unqualified declarations would never match.
            vis = r"(?:(?:Public|Private|Friend|Global|Static)\s+)?"
            return [
                # Split along definitions
                rf"\n(?!End\s){vis}Sub\s+",
                rf"\n(?!End\s){vis}Function\s+",
                rf"\n(?!End\s){vis}Property\s+(?:Get|Let|Set)\s+",
                rf"\n(?!End\s){vis}Type\s+",
                rf"\n(?!End\s){vis}Enum\s+",
                # Split along control flow statements
                r"\n(?!End\s)If\s+",
                r"\nElseIf\s+",
                r"\nElse\s+",
                r"\nSelect\s+Case\s+",
                r"\nCase\s+",
                r"\nFor\s+",
                r"\nDo\s+",
                r"\nWhile\s+",
                r"\nWith\s+",
                # Split by the normal type of lines
                r"\n\n",
                r"\n",
                " ",
                "",
            ]

        # A known Language member with no separator list above.
        if language in Language._value2member_map_:
            msg = f"Language {language} is not implemented yet!"
            raise ValueError(msg)
        msg = (
            f"Language {language} is not supported! Please choose from {list(Language)}"
        )
        raise ValueError(msg)
Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.