"""Test text splitting functionality."""

from __future__ import annotations

import json
import random
import re
import string
import textwrap
from typing import TYPE_CHECKING, Any

import pytest
from langchain_core._api import suppress_langchain_beta_warning
from langchain_core.documents import Document

from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
    TextSplitter,
    Tokenizer,
)
from langchain_text_splitters.base import split_text_on_tokens
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_text_splitters.html import (
    HTMLHeaderTextSplitter,
    HTMLSectionSplitter,
    HTMLSemanticPreservingSplitter,
)
from langchain_text_splitters.json import RecursiveJsonSplitter
from langchain_text_splitters.jsx import JSFrameworkTextSplitter
from langchain_text_splitters.markdown import (
    ExperimentalMarkdownSyntaxTextSplitter,
    MarkdownHeaderTextSplitter,
)
from langchain_text_splitters.python import PythonCodeTextSplitter

if TYPE_CHECKING:
    from collections.abc import Callable

    from bs4 import Tag

# Sample Python source used by test_python_text_splitter.
FAKE_PYTHON_TEXT = """
class Foo:

 def bar():


def foo():

def testing_func():

def bar():
"""


def test_character_text_splitter() -> None:
    """Test splitting by character count."""
    text = "foo bar baz 123"
    splitter = CharacterTextSplitter(separator=" ", chunk_size=7, chunk_overlap=3)
    output = splitter.split_text(text)
    expected_output = ["foo bar", "bar baz", "baz 123"]
    assert output == expected_output


def test_character_text_splitter_empty_doc() -> None:
    """Test splitting by character count doesn't create empty documents."""
    text = "foo bar"
    splitter = CharacterTextSplitter(separator=" ", chunk_size=2, chunk_overlap=0)
    output = splitter.split_text(text)
    expected_output = ["foo", "bar"]
    assert output == expected_output


def test_character_text_splitter_separtor_empty_doc() -> None:
    """Test edge cases are separators."""
    text = "f b"
    splitter = CharacterTextSplitter(separator=" ", chunk_size=2, chunk_overlap=0)
    output = splitter.split_text(text)
    expected_output = ["f", "b"]
    assert output == expected_output


def test_character_text_splitter_long() -> None:
    """Test splitting by character count on long words."""
    text = "foo bar baz a a"
    splitter = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=1)
    output = splitter.split_text(text)
    expected_output = ["foo", "bar", "baz", "a a"]
    assert output == expected_output


def test_character_text_splitter_short_words_first() -> None:
    """Test splitting by character count when shorter words are first."""
    text = "a a foo bar baz"
    splitter = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=1)
    output = splitter.split_text(text)
    expected_output = ["a a", "foo", "bar", "baz"]
    assert output == expected_output


def test_character_text_splitter_longer_words() -> None:
    """Test splitting by characters when splits not found easily."""
    text = "foo bar baz 123"
    splitter = CharacterTextSplitter(separator=" ", chunk_size=1, chunk_overlap=1)
    output = splitter.split_text(text)
    expected_output = ["foo", "bar", "baz", "123"]
    assert output == expected_output


# edge cases
def test_character_text_splitter_no_separator_in_text() -> None:
    """Text splitting where there is no separator but a single word."""
    text = "singleword"
    splitter = CharacterTextSplitter(separator=" ", chunk_size=10, chunk_overlap=0)
    output = splitter.split_text(text)
    expected_output = ["singleword"]
    assert output == expected_output


def test_character_text_splitter_handle_chunksize_equal_to_chunkoverlap() -> None:
    """Text splitting safe guards when chunk size is equal chunk overlap."""
    text = "hello"
    splitter = CharacterTextSplitter(separator=" ", chunk_size=5, chunk_overlap=5)
    output = splitter.split_text(text)
    expected_output = ["hello"]
    assert output == expected_output


def test_character_text_splitter_empty_input() -> None:
    """Test splitting safely where there is no input to process."""
    text = ""
    splitter = CharacterTextSplitter(separator=" ", chunk_size=5, chunk_overlap=0)
    output = splitter.split_text(text)
    expected_output: list[str] = []
    assert output == expected_output


def test_character_text_splitter_whitespace_only() -> None:
    """Test splitting safely where there is white space."""
    text = " "
    splitter = CharacterTextSplitter(separator=" ", chunk_size=5, chunk_overlap=0)
    output = splitter.split_text(text)
    expected_output: list[str] = []
    assert output == expected_output


@pytest.mark.parametrize(
    ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
)
def test_character_text_splitter_keep_separator_regex(
    *, separator: str, is_separator_regex: bool
) -> None:
    """Test CharacterTextSplitter keep separator regex.

    Test splitting by characters while keeping the separator
    that is a regex special character.
    """
    text = "foo.bar.baz.123"
    splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=1,
        chunk_overlap=0,
        keep_separator=True,
        is_separator_regex=is_separator_regex,
    )
    output = splitter.split_text(text)
    expected_output = ["foo", ".bar", ".baz", ".123"]
    assert output == expected_output


@pytest.mark.parametrize(
    ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
)
def test_character_text_splitter_keep_separator_regex_start(
    *, separator: str, is_separator_regex: bool
) -> None:
    """Test CharacterTextSplitter keep separator regex and put at start.

    Test splitting by characters while keeping the separator
    that is a regex special character and placing it at the start of each chunk.
    """
    text = "foo.bar.baz.123"
    splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=1,
        chunk_overlap=0,
        keep_separator="start",
        is_separator_regex=is_separator_regex,
    )
    output = splitter.split_text(text)
    expected_output = ["foo", ".bar", ".baz", ".123"]
    assert output == expected_output


@pytest.mark.parametrize(
    ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
)
def test_character_text_splitter_keep_separator_regex_end(
    *, separator: str, is_separator_regex: bool
) -> None:
    """Test CharacterTextSplitter keep separator regex and put at end.

    Test splitting by characters while keeping the separator
    that is a regex special character and placing it at the end of each chunk.
    """
    text = "foo.bar.baz.123"
    splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=1,
        chunk_overlap=0,
        keep_separator="end",
        is_separator_regex=is_separator_regex,
    )
    output = splitter.split_text(text)
    expected_output = ["foo.", "bar.", "baz.", "123"]
    assert output == expected_output


@pytest.mark.parametrize(
    ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]
)
def test_character_text_splitter_discard_separator_regex(
    *, separator: str, is_separator_regex: bool
) -> None:
    """Test CharacterTextSplitter discard separator regex.

    Test splitting by characters discarding the separator
    that is a regex special character.
    """
    text = "foo.bar.baz.123"
    splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=1,
        chunk_overlap=0,
        keep_separator=False,
        is_separator_regex=is_separator_regex,
    )
    output = splitter.split_text(text)
    expected_output = ["foo", "bar", "baz", "123"]
    assert output == expected_output


def test_recursive_character_text_splitter_keep_separators() -> None:
    """Test RecursiveCharacterTextSplitter with keep_separator "start" and "end"."""
    split_tags = [",", "."]
    query = "Apple,banana,orange and tomato."
    # start
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=10,
        chunk_overlap=0,
        separators=split_tags,
        keep_separator="start",
    )
    result = splitter.split_text(query)
    assert result == ["Apple", ",banana", ",orange and tomato", "."]

    # end
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=10,
        chunk_overlap=0,
        separators=split_tags,
        keep_separator="end",
    )
    result = splitter.split_text(query)
    assert result == ["Apple,", "banana,", "orange and tomato."]


def test_character_text_splitting_args() -> None:
    """Test invalid arguments."""
    with pytest.raises(
        ValueError,
        match=re.escape(
            "Got a larger chunk overlap (4) than chunk size (2), should be smaller."
        ),
    ):
        CharacterTextSplitter(chunk_size=2, chunk_overlap=4)
    for invalid_size in (0, -1):
        with pytest.raises(ValueError, match="chunk_size must be > 0, got"):
            CharacterTextSplitter(chunk_size=invalid_size)
    with pytest.raises(ValueError, match="chunk_overlap must be >= 0, got -1"):
        CharacterTextSplitter(chunk_size=2, chunk_overlap=-1)


def test_merge_splits() -> None:
    """Test merging splits with a given separator."""
    splitter = CharacterTextSplitter(separator=" ", chunk_size=9, chunk_overlap=2)
    splits = ["foo", "bar", "baz"]
    expected_output = ["foo bar", "baz"]
    output = splitter._merge_splits(splits, separator=" ")
    assert output == expected_output


def test_create_documents() -> None:
    """Test create documents method."""
    texts = ["foo bar", "baz"]
    splitter = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=0)
    docs = splitter.create_documents(texts)
    expected_docs = [
        Document(page_content="foo"),
        Document(page_content="bar"),
        Document(page_content="baz"),
    ]
    assert docs == expected_docs


def test_create_documents_with_metadata() -> None:
    """Test create documents with metadata method."""
    texts = ["foo bar", "baz"]
    splitter = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=0)
    docs = splitter.create_documents(texts, [{"source": "1"}, {"source": "2"}])
    expected_docs = [
        Document(page_content="foo", metadata={"source": "1"}),
        Document(page_content="bar", metadata={"source": "1"}),
        Document(page_content="baz", metadata={"source": "2"}),
    ]
    assert docs == expected_docs


@pytest.mark.parametrize(
    ("splitter", "text", "expected_docs"),
    [
        (
            CharacterTextSplitter(
                separator=" ", chunk_size=7, chunk_overlap=3, add_start_index=True
            ),
            "foo bar baz 123",
            [
                Document(page_content="foo bar", metadata={"start_index": 0}),
                Document(page_content="bar baz", metadata={"start_index": 4}),
                Document(page_content="baz 123", metadata={"start_index": 8}),
            ],
        ),
        (
            RecursiveCharacterTextSplitter(
                chunk_size=6,
                chunk_overlap=0,
                separators=["\n\n", "\n", " ", ""],
                add_start_index=True,
            ),
            "w1 w1 w1 w1 w1 w1 w1 w1 w1",
            [
                Document(page_content="w1 w1", metadata={"start_index": 0}),
                Document(page_content="w1 w1", metadata={"start_index": 6}),
                Document(page_content="w1 w1", metadata={"start_index": 12}),
                Document(page_content="w1 w1", metadata={"start_index": 18}),
                Document(page_content="w1", metadata={"start_index": 24}),
            ],
        ),
    ],
)
def test_create_documents_with_start_index(
    splitter: TextSplitter, text: str, expected_docs: list[Document]
) -> None:
    """Test create_documents records each chunk's start index in its metadata."""
    docs = splitter.create_documents([text])
    assert docs == expected_docs
    # Each recorded start_index must locate the chunk's text inside the input.
    for doc in docs:
        s_i = doc.metadata["start_index"]
        assert text[s_i : s_i + len(doc.page_content)] == doc.page_content


def test_metadata_not_shallow() -> None:
    """Test that metadatas are not shallow."""
    texts = ["foo bar"]
    splitter = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=0)
    docs = splitter.create_documents(texts, [{"source": "1"}])
    expected_docs = [
        Document(page_content="foo", metadata={"source": "1"}),
        Document(page_content="bar", metadata={"source": "1"}),
    ]
    assert docs == expected_docs
    # Mutating one document's metadata must not leak into its sibling.
    docs[0].metadata["foo"] = 1
    assert docs[0].metadata == {"source": "1", "foo": 1}
    assert docs[1].metadata == {"source": "1"}


def test_iterative_text_splitter_keep_separator() -> None:
    """Test the "X"/"Y" recursive split when separators are kept in the chunks."""
    chunk_size = 5
    output = __test_iterative_text_splitter(chunk_size=chunk_size, keep_separator=True)

    assert output == [
        "....5",
        "X..3",
        "Y...4",
        "X....5",
        "Y...",
    ]


def test_iterative_text_splitter_discard_separator() -> None:
    """Test the "X"/"Y" recursive split when separators are dropped from the chunks."""
    chunk_size = 5
    output = __test_iterative_text_splitter(chunk_size=chunk_size, keep_separator=False)

    assert output == [
        "....5",
        "..3",
        "...4",
        "....5",
        "...",
    ]


def __test_iterative_text_splitter(
    *, chunk_size: int, keep_separator: bool
) -> list[str]:
    """Split a fixed sample on "X" and "Y" and check every chunk fits the limit.

    When ``keep_separator`` is true the chunk size is raised by one before the
    splitter is built. Returns the chunks produced by the splitter.
    """
    # Presumably leaves room for the one-character separator kept with a chunk.
    chunk_size += 1 if keep_separator else 0

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=0,
        separators=["X", "Y"],
        keep_separator=keep_separator,
    )
    text = "....5X..3Y...4X....5Y..."
    output = splitter.split_text(text)
    for chunk in output:
        assert len(chunk) <= chunk_size, f"Chunk is larger than {chunk_size}"
    return output


def test_iterative_text_splitter() -> None:
    """Test iterative text splitter."""
    text = """Hi.\n\nI'm Harrison.\n\nHow? Are? You?\nOkay then f f f f.
This is a weird text to write, but gotta test the splittingggg some how.

Bye!\n\n-H."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=10, chunk_overlap=1)
    output = splitter.split_text(text)
    expected_output = [
        "Hi.",
        "I'm",
        "Harrison.",
        "How? Are?",
        "You?",
        "Okay then",
        "f f f f.",
        "This is a",
        "weird",
        "text to",
        "write,",
        "but gotta",
        "test the",
        "splitting",
        "gggg",
        "some how.",
        "Bye!",
        "-H.",
    ]
    assert output == expected_output


def test_split_documents() -> None:
    """Test split_documents."""
    splitter = CharacterTextSplitter(separator="", chunk_size=1, chunk_overlap=0)
    docs = [
        Document(page_content="foo", metadata={"source": "1"}),
        Document(page_content="bar", metadata={"source": "2"}),
        Document(page_content="baz", metadata={"source": "1"}),
    ]
    expected_output = [
        Document(page_content="f", metadata={"source": "1"}),
        Document(page_content="o", metadata={"source": "1"}),
        Document(page_content="o", metadata={"source": "1"}),
        Document(page_content="b", metadata={"source": "2"}),
        Document(page_content="a", metadata={"source": "2"}),
        Document(page_content="r", metadata={"source": "2"}),
        Document(page_content="b", metadata={"source": "1"}),
        Document(page_content="a", metadata={"source": "1"}),
        Document(page_content="z", metadata={"source": "1"}),
    ]
    assert splitter.split_documents(docs) == expected_output


def test_python_text_splitter() -> None:
    """Test PythonCodeTextSplitter on FAKE_PYTHON_TEXT."""
    splitter = PythonCodeTextSplitter(chunk_size=30, chunk_overlap=0)
    splits = splitter.split_text(FAKE_PYTHON_TEXT)
    split_0 = """class Foo:\n\n def bar():"""
    split_1 = """def foo():"""
    split_2 = """def testing_func():"""
    split_3 = """def bar():"""
    expected_splits = [split_0, split_1, split_2, split_3]
    assert splits == expected_splits


# Sample React/JSX component used by the JSFrameworkTextSplitter tests.
FAKE_JSX_TEXT = """
import React from 'react';
import OtherComponent from './OtherComponent';

function MyComponent() {
 const [count, setCount] = React.useState(0);

 const handleClick = () => {
 setCount(count + 1);
 };

 return (
 <div>
 <h1>Counter: {count}</h1>
 <button onClick={handleClick}>
 Increment
 </button>
 <OtherComponent />
 </div>
 );
}

export default MyComponent;
"""


def test_jsx_text_splitter() -> None:
    """Test JSFrameworkTextSplitter on a JSX component, comparing stripped chunks."""
    splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)
    splits = splitter.split_text(FAKE_JSX_TEXT)

    expected_splits = [
        (
            "\nimport React from 'react';\n"
            "import OtherComponent from './OtherComponent';\n"
        ),
        "\nfunction MyComponent() {\n const [count, setCount] = React.useState(0);",
        "\n\n const handleClick = () => {\n setCount(count + 1);\n };",
        "return (",
        "<div>",
        "<h1>Counter: {count}</h1>\n ",
        "<button onClick={handleClick}>\n Increment\n </button>\n ",
        "<OtherComponent />\n </div>\n );\n}\n",
        "export default MyComponent;",
    ]
    assert [s.strip() for s in splits] == [s.strip() for s in expected_splits]


# Sample Vue single-file component used by test_vue_text_splitter.
FAKE_VUE_TEXT = """
<template>
 <div>
 <h1>{{ title }}</h1>
 <button @click="increment">
 Count is: {{ count }}
 </button>
 </div>
</template>

<script>
export default {
 data() {
 return {
 title: 'Counter App',
 count: 0
 }
 },
 methods: {
 increment() {
 this.count++
 }
 }
}
</script>

<style>
button {
 color: blue;
}
</style>
"""


def test_vue_text_splitter() -> None:
    """Test JSFrameworkTextSplitter on a Vue component, comparing stripped chunks."""
    splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)
    splits = splitter.split_text(FAKE_VUE_TEXT)

    expected_splits = [
        "<template>",
        "<div>",
        "<h1>{{ title }}</h1>",
        (
            '<button @click="increment">\n Count is: {{ count }}\n'
            " </button>\n </div>\n</template>"
        ),
        "<script>",
        "export",
        (
            " default {\n data() {\n return {\n title: 'Counter App',\n "
            "count: 0\n }\n },\n methods: {\n increment() {\n "
            "this.count++\n }\n }\n}\n</script>"
        ),
        "<style>\nbutton {\n color: blue;\n}\n</style>",
    ]
    assert [s.strip() for s in splits] == [s.strip() for s in expected_splits]


# Sample Svelte component used by test_svelte_text_splitter.
FAKE_SVELTE_TEXT = """
<script>
 let count = 0

 function increment() {
 count += 1
 }
</script>

<main>
 <h1>Counter App</h1>
 <button on:click={increment}>
 Count is: {count}
 </button>
</main>

<style>
 button {
 color: blue;
 }
</style>
"""


def test_svelte_text_splitter() -> None:
    """Test JSFrameworkTextSplitter on a Svelte component, comparing stripped chunks."""
    splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)
    splits = splitter.split_text(FAKE_SVELTE_TEXT)

    expected_splits = [
        "<script>\n let count = 0",
        "\n\n function increment() {\n count += 1\n }\n</script>",
        "<main>",
        "<h1>Counter App</h1>",
        "<button on:click={increment}>\n Count is: {count}\n </button>\n</main>",
        "<style>\n button {\n color: blue;\n }\n</style>",
    ]
    assert [s.strip() for s in splits] == [s.strip() for s in expected_splits]


def test_jsx_splitter_separator_not_mutated_across_calls() -> None:
    """Regression test: repeated split_text() calls must not mutate separators.

    Calling split_text() multiple times on the same JSFrameworkTextSplitter
    instance must not grow the internal separator list between calls.

    Before the fix, self._separators was overwritten with the full expanded list
    on every invocation, so a second call would start with the already-expanded
    list and append even more separators.
    """
    splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)

    # Record separator count after constructing (should be 0 - no custom separators)
    initial_sep_count = len(splitter._separators)

    # Call split_text twice; the results should be identical for identical input
    splits_first = splitter.split_text(FAKE_JSX_TEXT)
    splits_second = splitter.split_text(FAKE_JSX_TEXT)

    assert splits_first == splits_second, (
        "split_text() must return identical results on repeated calls with the "
        "same input"
    )
    assert len(splitter._separators) == initial_sep_count, (
        "split_text() must not mutate self._separators between calls"
    )


# Chunk size shared by the from_language tests below that pass chunk_size=CHUNK_SIZE.
CHUNK_SIZE = 16


def test_python_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.PYTHON."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.PYTHON, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
def hello_world():
 print("Hello, World!")

# Call the function
hello_world()
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "def",
        "hello_world():",
        'print("Hello,',
        'World!")',
        "# Call the",
        "function",
        "hello_world()",
    ]


def test_golang_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.GO."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.GO, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
package main

import "fmt"

func helloWorld() {
 fmt.Println("Hello, World!")
}

func main() {
 helloWorld()
}
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "package main",
        'import "fmt"',
        "func",
        "helloWorld() {",
        'fmt.Println("He',
        "llo,",
        'World!")',
        "}",
        "func main() {",
        "helloWorld()",
        "}",
    ]


def test_rst_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.RST."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.RST, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
Sample Document
===============

Section
-------

This is the content of the section.

Lists
-----

- Item 1
- Item 2
- Item 3

Comment
*******
Not a comment

.. This is a comment
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "Sample Document",
        "===============",
        "Section",
        "-------",
        "This is the",
        "content of the",
        "section.",
        "Lists",
        "-----",
        "- Item 1",
        "- Item 2",
        "- Item 3",
        "Comment",
        "*******",
        "Not a comment",
        ".. This is a",
        "comment",
    ]
    # Special test for special characters
    code = "harry\n***\nbabylon is"
    chunks = splitter.split_text(code)
    assert chunks == ["harry", "***\nbabylon is"]


def test_proto_file_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.PROTO."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.PROTO, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
syntax = "proto3";

package example;

message Person {
 string name = 1;
 int32 age = 2;
 repeated string hobbies = 3;
}
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "syntax =",
        '"proto3";',
        "package",
        "example;",
        "message Person",
        "{",
        "string name",
        "= 1;",
        "int32 age =",
        "2;",
        "repeated",
        "string hobbies",
        "= 3;",
        "}",
    ]


def test_javascript_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.JS."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.JS, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
function helloWorld() {
 console.log("Hello, World!");
}

// Call the function
helloWorld();
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "function",
        "helloWorld() {",
        'console.log("He',
        "llo,",
        'World!");',
        "}",
        "// Call the",
        "function",
        "helloWorld();",
    ]


def test_cobol_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.COBOL."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.COBOL, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
IDENTIFICATION DIVISION.
PROGRAM-ID. HelloWorld.
DATA DIVISION.
WORKING-STORAGE SECTION.
01 GREETING PIC X(12) VALUE 'Hello, World!'.
PROCEDURE DIVISION.
DISPLAY GREETING.
STOP RUN.
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "IDENTIFICATION",
        "DIVISION.",
        "PROGRAM-ID.",
        "HelloWorld.",
        "DATA DIVISION.",
        "WORKING-STORAGE",
        "SECTION.",
        "01 GREETING",
        "PIC X(12)",
        "VALUE 'Hello,",
        "World!'.",
        "PROCEDURE",
        "DIVISION.",
        "DISPLAY",
        "GREETING.",
        "STOP RUN.",
    ]


def test_typescript_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.TS."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.TS, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
function helloWorld(): void {
 console.log("Hello, World!");
}

// Call the function
helloWorld();
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "function",
        "helloWorld():",
        "void {",
        'console.log("He',
        "llo,",
        'World!");',
        "}",
        "// Call the",
        "function",
        "helloWorld();",
    ]


def test_java_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.JAVA."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.JAVA, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
public class HelloWorld {
 public static void main(String[] args) {
 System.out.println("Hello, World!");
 }
}
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "public class",
        "HelloWorld {",
        "public",
        "static void",
        "main(String[]",
        "args) {",
        "System.out.prin",
        'tln("Hello,',
        'World!");',
        "}\n}",
    ]


def test_kotlin_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.KOTLIN."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.KOTLIN, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
class HelloWorld {
 companion object {
 @JvmStatic
 fun main(args: Array<String>) {
 println("Hello, World!")
 }
 }
}
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "class",
        "HelloWorld {",
        "companion",
        "object {",
        "@JvmStatic",
        "fun",
        "main(args:",
        "Array<String>)",
        "{",
        'println("Hello,',
        'World!")',
        "}\n }",
        "}",
    ]


def test_csharp_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.CSHARP."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.CSHARP, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
using System;
class Program
{
 static void Main()
 {
 int age = 30; // Change the age value as needed

 // Categorize the age without any console output
 if (age < 18)
 {
 // Age is under 18
 }
 else if (age >= 18 && age < 65)
 {
 // Age is an adult
 }
 else
 {
 // Age is a senior citizen
 }
 }
}
 """

    chunks = splitter.split_text(code)
    assert chunks == [
        "using System;",
        "class Program\n{",
        "static void",
        "Main()",
        "{",
        "int age",
        "= 30; // Change",
        "the age value",
        "as needed",
        "//",
        "Categorize the",
        "age without any",
        "console output",
        "if (age",
        "< 18)",
        "{",
        "//",
        "Age is under 18",
        "}",
        "else if",
        "(age >= 18 &&",
        "age < 65)",
        "{",
        "//",
        "Age is an adult",
        "}",
        "else",
        "{",
        "//",
        "Age is a senior",
        "citizen",
        "}\n }",
        "}",
    ]


def test_csharp_separators_no_java_keywords() -> None:
    """C# separators should not contain Java-only keywords."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.CSHARP, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # "implements" is a Java keyword; C# uses ":" for interface implementation
    assert "\nimplements " not in splitter._separators


def test_elixir_separators_no_while() -> None:
    """Elixir has no while loop; the separator should not be present."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.ELIXIR, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    assert "\nwhile " not in splitter._separators


def test_cpp_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.CPP."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.CPP, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
#include <iostream>

int main() {
 std::cout << "Hello, World!" << std::endl;
 return 0;
}
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "#include",
        "<iostream>",
        "int main() {",
        "std::cout",
        '<< "Hello,',
        'World!" <<',
        "std::endl;",
        "return 0;\n}",
    ]


def test_scala_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.SCALA."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.SCALA, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
object HelloWorld {
 def main(args: Array[String]): Unit = {
 println("Hello, World!")
 }
}
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "object",
        "HelloWorld {",
        "def",
        "main(args:",
        "Array[String]):",
        "Unit = {",
        'println("Hello,',
        'World!")',
        "}\n}",
    ]


def test_ruby_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.RUBY."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.RUBY, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
def hello_world
 puts "Hello, World!"
end

hello_world
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "def hello_world",
        'puts "Hello,',
        'World!"',
        "end",
        "hello_world",
    ]


def test_php_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.PHP."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.PHP, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
<?php
function hello_world() {
 echo "Hello, World!";
}

hello_world();
?>
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "<?php",
        "function",
        "hello_world() {",
        "echo",
        '"Hello,',
        'World!";',
        "}",
        "hello_world();",
        "?>",
    ]


def test_swift_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.SWIFT."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.SWIFT, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
func helloWorld() {
 print("Hello, World!")
}

helloWorld()
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "func",
        "helloWorld() {",
        'print("Hello,',
        'World!")',
        "}",
        "helloWorld()",
    ]


def test_rust_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.RUST."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.RUST, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
fn main() {
 println!("Hello, World!");
}
 """
    chunks = splitter.split_text(code)
    assert chunks == ["fn main() {", 'println!("Hello', ",", 'World!");', "}"]


def test_r_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.R."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.R, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
library(dplyr)

my_func <- function(x) {
 return(x + 1)
}

if (TRUE) {
 print("Hello")
}
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "library(dplyr)",
        "my_func <-",
        "function(x) {",
        "return(x +",
        "1)",
        "}",
        "if (TRUE) {",
        'print("Hello")',
        "}",
    ]


def test_markdown_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.MARKDOWN."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.MARKDOWN, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
# Sample Document

## Section

This is the content of the section.

## Lists

- Item 1
- Item 2
- Item 3

### Horizontal lines

***********
____________
-------------------

#### Code blocks
```
This is a code block

# sample code
a = 1
b = 2
```
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "# Sample",
        "Document",
        "## Section",
        "This is the",
        "content of the",
        "section.",
        "## Lists",
        "- Item 1",
        "- Item 2",
        "- Item 3",
        "### Horizontal",
        "lines",
        "***********",
        "____________",
        "---------------",
        "----",
        "#### Code",
        "blocks",
        "```",
        "This is a code",
        "block",
        "# sample code",
        "a = 1\nb = 2",
        "```",
    ]
    # Special test for special characters
    code = "harry\n***\nbabylon is"
    chunks = splitter.split_text(code)
    assert chunks == ["harry", "***\nbabylon is"]


def test_latex_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.LATEX."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.LATEX, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
Hi Harrison!
\\chapter{1}
"""
    chunks = splitter.split_text(code)
    assert chunks == ["Hi Harrison!", "\\chapter{1}"]


def test_html_code_splitter() -> None:
    """Test RecursiveCharacterTextSplitter.from_language with Language.HTML."""
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.HTML, chunk_size=60, chunk_overlap=0
    )
    code = """
<h1>Sample Document</h1>
 <h2>Section</h2>
 <p id="1234">Reference content.</p>

 <h2>Lists</h2>
 <ul>
 <li>Item 1</li>
 <li>Item 2</li>
 <li>Item 3</li>
 </ul>

 <h3>A block</h3>
 <div class="amazing">
 <p>Some text</p>
 <p>Some more text</p>
 </div>
 """
    chunks = splitter.split_text(code)
    assert chunks == [
        "<h1>Sample Document</h1>\n <h2>Section</h2>",
        '<p id="1234">Reference content.</p>',
        "<h2>Lists</h2>\n <ul>",
        "<li>Item 1</li>\n <li>Item 2</li>",
        "<li>Item 3</li>\n </ul>",
        "<h3>A block</h3>",
        '<div class="amazing">',
        "<p>Some text</p>",
        "<p>Some more text</p>\n </div>",
    ]


def test_md_header_text_splitter_1() -> None:
    """Test markdown splitter by header: Case 1."""
    markdown_document = (
        "# Foo\n\n"
        " ## Bar\n\n"
        "Hi this is Jim\n\n"
        "Hi this is Joe\n\n"
        " ## Baz\n\n"
        " Hi this is Molly"
    )
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
    )
    output = markdown_splitter.split_text(markdown_document)
    expected_output = [
        Document(
            page_content="Hi this is Jim \nHi this is Joe",
            metadata={"Header 1": "Foo", "Header 2": "Bar"},
        ),
        Document(
            page_content="Hi this is Molly",
            metadata={"Header 1": "Foo", "Header 2": "Baz"},
        ),
    ]
    assert output == expected_output


def test_md_header_text_splitter_2() -> None:
    """Test markdown splitter by header: Case 2."""
    markdown_document = (
        "# Foo\n\n"
        " ## Bar\n\n"
        "Hi this is Jim\n\n"
        "Hi this is Joe\n\n"
        " ### Boo \n\n"
        " Hi this is Lance \n\n"
        " ## Baz\n\n"
        " Hi this is Molly"
    )

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
    )
    output = markdown_splitter.split_text(markdown_document)
    expected_output = [
        Document(
            page_content="Hi this is Jim \nHi this is Joe",
            metadata={"Header 1": "Foo", "Header 2": "Bar"},
        ),
        Document(
            page_content="Hi this is Lance",
            metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
        ),
        Document(
            page_content="Hi this is Molly",
            metadata={"Header 1": "Foo", "Header 2": "Baz"},
        ),
    ]
    assert output == expected_output


def test_md_header_text_splitter_3() -> None:
    """Test markdown splitter by header: Case 3."""
    markdown_document = (
        "# Foo\n\n"
        " ## Bar\n\n"
        "Hi this is Jim\n\n"
        "Hi this is Joe\n\n"
        " ### Boo \n\n"
        " Hi this is Lance \n\n"
        " #### Bim \n\n"
        " Hi this is John \n\n"
        " ## Baz\n\n"
        " Hi this is Molly"
    )

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
    )
    output = markdown_splitter.split_text(markdown_document)

    expected_output = [
        Document(
            page_content="Hi this is Jim \nHi this is Joe",
            metadata={"Header 1": "Foo", "Header 2": "Bar"},
        ),
        Document(
            page_content="Hi this is Lance",
            metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
        ),
        Document(
            page_content="Hi this is John",
            metadata={
                "Header 1": "Foo",
                "Header 2": "Bar",
                "Header 3": "Boo",
                "Header 4": "Bim",
            },
        ),
        Document(
            page_content="Hi this is Molly",
            metadata={"Header 1": "Foo", "Header 2": "Baz"},
        ),
    ]

    assert output == expected_output


def test_md_header_text_splitter_preserve_headers_1() -> None:
    """Test markdown splitter by header: Preserve Headers."""
    markdown_document = (
        "# Foo\n\n"
        " ## Bat\n\n"
        "Hi this is Jim\n\n"
        "Hi Joe\n\n"
        "## Baz\n\n"
        "# Bar\n\n"
        "This is Alice\n\n"
        "This is Bob"
    )
    headers_to_split_on = [
        ("#", "Header 1"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
        strip_headers=False,
    )
    output = markdown_splitter.split_text(markdown_document)
    expected_output = [
        Document(
            page_content="# Foo \n## Bat \nHi this is Jim \nHi Joe \n## Baz",
            metadata={"Header 1": "Foo"},
        ),
        Document(
            page_content="# Bar \nThis is Alice \nThis is Bob",
            metadata={"Header 1": "Bar"},
        ),
    ]
    assert output == expected_output
## Buz\n"1475 "# Bop"1476 )1477 headers_to_split_on = [1478 ("#", "Header 1"),1479 ("##", "Header 2"),1480 ("###", "Header 3"),1481 ]1482 markdown_splitter = MarkdownHeaderTextSplitter(1483 headers_to_split_on=headers_to_split_on,1484 strip_headers=False,1485 )1486 output = markdown_splitter.split_text(markdown_document)1487 expected_output = [1488 Document(1489 page_content="# Foo \n## Bar \nHi this is Jim \nHi this is Joe",1490 metadata={"Header 1": "Foo", "Header 2": "Bar"},1491 ),1492 Document(1493 page_content="### Boo \nHi this is Lance",1494 metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},1495 ),1496 Document(1497 page_content="## Baz \nHi this is Molly",1498 metadata={"Header 1": "Foo", "Header 2": "Baz"},1499 ),1500 Document(1501 page_content="## Buz",1502 metadata={"Header 1": "Foo", "Header 2": "Buz"},1503 ),1504 Document(page_content="# Bop", metadata={"Header 1": "Bop"}),1505 ]1506 assert output == expected_output150715081509@pytest.mark.parametrize("fence", [("```"), ("~~~")])1510def test_md_header_text_splitter_fenced_code_block(fence: str) -> None:1511 """Test markdown splitter by header: Fenced code block."""1512 markdown_document = (1513 f"# This is a Header\n\n{fence}\nfoo()\n# Not a header\nbar()\n{fence}"1514 )15151516 headers_to_split_on = [1517 ("#", "Header 1"),1518 ("##", "Header 2"),1519 ]15201521 markdown_splitter = MarkdownHeaderTextSplitter(1522 headers_to_split_on=headers_to_split_on,1523 )1524 output = markdown_splitter.split_text(markdown_document)15251526 expected_output = [1527 Document(1528 page_content=f"{fence}\nfoo()\n# Not a header\nbar()\n{fence}",1529 metadata={"Header 1": "This is a Header"},1530 ),1531 ]15321533 assert output == expected_output153415351536@pytest.mark.parametrize(("fence", "other_fence"), [("```", "~~~"), ("~~~", "```")])1537def test_md_header_text_splitter_fenced_code_block_interleaved(1538 fence: str, other_fence: str1539) -> None:1540 """Test markdown splitter by header: Interleaved 
fenced code block."""1541 markdown_document = (1542 "# This is a Header\n\n"1543 f"{fence}\n"1544 "foo\n"1545 "# Not a header\n"1546 f"{other_fence}\n"1547 "# Not a header\n"1548 f"{fence}"1549 )15501551 headers_to_split_on = [1552 ("#", "Header 1"),1553 ("##", "Header 2"),1554 ]15551556 markdown_splitter = MarkdownHeaderTextSplitter(1557 headers_to_split_on=headers_to_split_on,1558 )1559 output = markdown_splitter.split_text(markdown_document)15601561 expected_output = [1562 Document(1563 page_content=(1564 f"{fence}\nfoo\n# Not a header\n{other_fence}\n# Not a header\n{fence}"1565 ),1566 metadata={"Header 1": "This is a Header"},1567 ),1568 ]15691570 assert output == expected_output157115721573@pytest.mark.parametrize("characters", ["\ufeff"])1574def test_md_header_text_splitter_with_invisible_characters(characters: str) -> None:1575 """Test markdown splitter by header: Fenced code block."""1576 markdown_document = f"{characters}# Foo\n\nfoo()\n{characters}## Bar\n\nbar()"15771578 headers_to_split_on = [1579 ("#", "Header 1"),1580 ("##", "Header 2"),1581 ]15821583 markdown_splitter = MarkdownHeaderTextSplitter(1584 headers_to_split_on=headers_to_split_on,1585 )1586 output = markdown_splitter.split_text(markdown_document)15871588 expected_output = [1589 Document(1590 page_content="foo()",1591 metadata={"Header 1": "Foo"},1592 ),1593 Document(1594 page_content="bar()",1595 metadata={"Header 1": "Foo", "Header 2": "Bar"},1596 ),1597 ]15981599 assert output == expected_output160016011602def test_md_header_text_splitter_with_custom_headers() -> None:1603 """Test markdown splitter with custom header patterns like **Header**."""1604 markdown_document = """**Chapter 1**16051606This is the content for chapter 1.16071608***Section 1.1***16091610This is the content for section 1.1.16111612**Chapter 2**16131614This is the content for chapter 2.16151616***Section 2.1***16171618This is the content for section 2.1.1619"""16201621 headers_to_split_on = [1622 ("**", "Bold 
Header"),1623 ("***", "Bold Italic Header"),1624 ]16251626 custom_header_patterns = {1627 "**": 1, # Level 1 headers1628 "***": 2, # Level 2 headers1629 }1630 markdown_splitter = MarkdownHeaderTextSplitter(1631 headers_to_split_on=headers_to_split_on,1632 custom_header_patterns=custom_header_patterns,1633 )1634 output = markdown_splitter.split_text(markdown_document)16351636 expected_output = [1637 Document(1638 page_content="This is the content for chapter 1.",1639 metadata={"Bold Header": "Chapter 1"},1640 ),1641 Document(1642 page_content="This is the content for section 1.1.",1643 metadata={"Bold Header": "Chapter 1", "Bold Italic Header": "Section 1.1"},1644 ),1645 Document(1646 page_content="This is the content for chapter 2.",1647 metadata={"Bold Header": "Chapter 2"},1648 ),1649 Document(1650 page_content="This is the content for section 2.1.",1651 metadata={"Bold Header": "Chapter 2", "Bold Italic Header": "Section 2.1"},1652 ),1653 ]16541655 assert output == expected_output165616571658def test_md_header_text_splitter_mixed_headers() -> None:1659 """Test markdown splitter with both standard and custom headers."""1660 markdown_document = """# Standard Header 116611662Content under standard header.16631664**Custom Header 1**16651666Content under custom header.16671668## Standard Header 216691670Content under standard header 2.16711672***Custom Header 2***16731674Content under custom header 2.1675"""16761677 headers_to_split_on = [1678 ("#", "Header 1"),1679 ("##", "Header 2"),1680 ("**", "Bold Header"),1681 ("***", "Bold Italic Header"),1682 ]16831684 custom_header_patterns = {1685 "**": 1, # Same level as #1686 "***": 2, # Same level as ##1687 }16881689 markdown_splitter = MarkdownHeaderTextSplitter(1690 headers_to_split_on=headers_to_split_on,1691 custom_header_patterns=custom_header_patterns,1692 )1693 output = markdown_splitter.split_text(markdown_document)16941695 expected_output = [1696 Document(1697 page_content="Content under standard header.",1698 
metadata={"Header 1": "Standard Header 1"},1699 ),1700 Document(1701 page_content="Content under custom header.",1702 metadata={"Bold Header": "Custom Header 1"},1703 ),1704 Document(1705 page_content="Content under standard header 2.",1706 metadata={1707 "Bold Header": "Custom Header 1",1708 "Header 2": "Standard Header 2",1709 },1710 ),1711 Document(1712 page_content="Content under custom header 2.",1713 metadata={1714 "Bold Header": "Custom Header 1",1715 "Bold Italic Header": "Custom Header 2",1716 },1717 ),1718 ]17191720 assert output == expected_output172117221723EXPERIMENTAL_MARKDOWN_DOCUMENT = (1724 "# My Header 1\n"1725 "Content for header 1\n"1726 "## Header 2\n"1727 "Content for header 2\n"1728 "### Header 3\n"1729 "Content for header 3\n"1730 "## Header 2 Again\n"1731 "This should be tagged with Header 1 and Header 2 Again\n"1732 "```python\n"1733 "def func_definition():\n"1734 " print('Keep the whitespace consistent')\n"1735 "```\n"1736 "# Header 1 again\n"1737 "We should also split on the horizontal line\n"1738 "----\n"1739 "This will be a new doc but with the same header metadata\n\n"1740 "And it includes a new paragraph"1741)174217431744def test_experimental_markdown_syntax_text_splitter() -> None:1745 """Test experimental markdown syntax splitter."""1746 markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter()1747 output = markdown_splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)17481749 expected_output = [1750 Document(1751 page_content="Content for header 1\n",1752 metadata={"Header 1": "My Header 1"},1753 ),1754 Document(1755 page_content="Content for header 2\n",1756 metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},1757 ),1758 Document(1759 page_content="Content for header 3\n",1760 metadata={1761 "Header 1": "My Header 1",1762 "Header 2": "Header 2",1763 "Header 3": "Header 3",1764 },1765 ),1766 Document(1767 page_content="This should be tagged with Header 1 and Header 2 Again\n",1768 metadata={"Header 1": "My Header 1", 
"Header 2": "Header 2 Again"},1769 ),1770 Document(1771 page_content=(1772 "```python\ndef func_definition():\n "1773 "print('Keep the whitespace consistent')\n```\n"1774 ),1775 metadata={1776 "Code": "python",1777 "Header 1": "My Header 1",1778 "Header 2": "Header 2 Again",1779 },1780 ),1781 Document(1782 page_content="We should also split on the horizontal line\n",1783 metadata={"Header 1": "Header 1 again"},1784 ),1785 Document(1786 page_content=(1787 "This will be a new doc but with the same header metadata\n\n"1788 "And it includes a new paragraph"1789 ),1790 metadata={"Header 1": "Header 1 again"},1791 ),1792 ]17931794 assert output == expected_output179517961797def test_experimental_markdown_syntax_text_splitter_header_configuration() -> None:1798 """Test experimental markdown syntax splitter."""1799 headers_to_split_on = [("#", "Encabezamiento 1")]18001801 markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(1802 headers_to_split_on=headers_to_split_on1803 )1804 output = markdown_splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)18051806 expected_output = [1807 Document(1808 page_content=(1809 "Content for header 1\n"1810 "## Header 2\n"1811 "Content for header 2\n"1812 "### Header 3\n"1813 "Content for header 3\n"1814 "## Header 2 Again\n"1815 "This should be tagged with Header 1 and Header 2 Again\n"1816 ),1817 metadata={"Encabezamiento 1": "My Header 1"},1818 ),1819 Document(1820 page_content=(1821 "```python\ndef func_definition():\n "1822 "print('Keep the whitespace consistent')\n```\n"1823 ),1824 metadata={"Code": "python", "Encabezamiento 1": "My Header 1"},1825 ),1826 Document(1827 page_content="We should also split on the horizontal line\n",1828 metadata={"Encabezamiento 1": "Header 1 again"},1829 ),1830 Document(1831 page_content=(1832 "This will be a new doc but with the same header metadata\n\n"1833 "And it includes a new paragraph"1834 ),1835 metadata={"Encabezamiento 1": "Header 1 again"},1836 ),1837 ]18381839 assert output == 
expected_output184018411842def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:1843 """Test experimental markdown syntax splitter."""1844 markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(strip_headers=False)1845 output = markdown_splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)18461847 expected_output = [1848 Document(1849 page_content="# My Header 1\nContent for header 1\n",1850 metadata={"Header 1": "My Header 1"},1851 ),1852 Document(1853 page_content="## Header 2\nContent for header 2\n",1854 metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},1855 ),1856 Document(1857 page_content="### Header 3\nContent for header 3\n",1858 metadata={1859 "Header 1": "My Header 1",1860 "Header 2": "Header 2",1861 "Header 3": "Header 3",1862 },1863 ),1864 Document(1865 page_content=(1866 "## Header 2 Again\n"1867 "This should be tagged with Header 1 and Header 2 Again\n"1868 ),1869 metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},1870 ),1871 Document(1872 page_content=(1873 "```python\ndef func_definition():\n "1874 "print('Keep the whitespace consistent')\n```\n"1875 ),1876 metadata={1877 "Code": "python",1878 "Header 1": "My Header 1",1879 "Header 2": "Header 2 Again",1880 },1881 ),1882 Document(1883 page_content=(1884 "# Header 1 again\nWe should also split on the horizontal line\n"1885 ),1886 metadata={"Header 1": "Header 1 again"},1887 ),1888 Document(1889 page_content=(1890 "This will be a new doc but with the same header metadata\n\n"1891 "And it includes a new paragraph"1892 ),1893 metadata={"Header 1": "Header 1 again"},1894 ),1895 ]18961897 assert output == expected_output189818991900def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:1901 """Test experimental markdown syntax splitter."""1902 markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(return_each_line=True)1903 output = markdown_splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)19041905 expected_output = [1906 
Document(1907 page_content="Content for header 1", metadata={"Header 1": "My Header 1"}1908 ),1909 Document(1910 page_content="Content for header 2",1911 metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},1912 ),1913 Document(1914 page_content="Content for header 3",1915 metadata={1916 "Header 1": "My Header 1",1917 "Header 2": "Header 2",1918 "Header 3": "Header 3",1919 },1920 ),1921 Document(1922 page_content="This should be tagged with Header 1 and Header 2 Again",1923 metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},1924 ),1925 Document(1926 page_content="```python",1927 metadata={1928 "Code": "python",1929 "Header 1": "My Header 1",1930 "Header 2": "Header 2 Again",1931 },1932 ),1933 Document(1934 page_content="def func_definition():",1935 metadata={1936 "Code": "python",1937 "Header 1": "My Header 1",1938 "Header 2": "Header 2 Again",1939 },1940 ),1941 Document(1942 page_content=" print('Keep the whitespace consistent')",1943 metadata={1944 "Code": "python",1945 "Header 1": "My Header 1",1946 "Header 2": "Header 2 Again",1947 },1948 ),1949 Document(1950 page_content="```",1951 metadata={1952 "Code": "python",1953 "Header 1": "My Header 1",1954 "Header 2": "Header 2 Again",1955 },1956 ),1957 Document(1958 page_content="We should also split on the horizontal line",1959 metadata={"Header 1": "Header 1 again"},1960 ),1961 Document(1962 page_content="This will be a new doc but with the same header metadata",1963 metadata={"Header 1": "Header 1 again"},1964 ),1965 Document(1966 page_content="And it includes a new paragraph",1967 metadata={"Header 1": "Header 1 again"},1968 ),1969 ]19701971 assert output == expected_output197219731974EXPERIMENTAL_MARKDOWN_DOCUMENTS = [1975 (1976 "# My Header 1 From Document 1\n"1977 "Content for header 1 from Document 1\n"1978 "## Header 2 From Document 1\n"1979 "Content for header 2 from Document 1\n"1980 "```python\n"1981 "def func_definition():\n"1982 " print('Keep the whitespace consistent')\n"1983 
"```\n"1984 "# Header 1 again From Document 1\n"1985 "We should also split on the horizontal line\n"1986 "----\n"1987 "This will be a new doc but with the same header metadata\n\n"1988 "And it includes a new paragraph"1989 ),1990 (1991 "# My Header 1 From Document 2\n"1992 "Content for header 1 from Document 2\n"1993 "## Header 2 From Document 2\n"1994 "Content for header 2 from Document 2\n"1995 "```python\n"1996 "def func_definition():\n"1997 " print('Keep the whitespace consistent')\n"1998 "```\n"1999 "# Header 1 again From Document 2\n"2000 "We should also split on the horizontal line\n"
Findings
✓ No findings reported for this file.