libs/text-splitters/tests/unit_tests/test_text_splitters.py PYTHON 4,246 lines View on github.com → Search inside
File is large — showing lines 1–2,000 of 4,246.
1"""Test text splitting functionality."""23from __future__ import annotations45import json6import random7import re8import string9import textwrap10from typing import TYPE_CHECKING, Any1112import pytest13from langchain_core._api import suppress_langchain_beta_warning14from langchain_core.documents import Document1516from langchain_text_splitters import (17    Language,18    RecursiveCharacterTextSplitter,19    TextSplitter,20    Tokenizer,21)22from langchain_text_splitters.base import split_text_on_tokens23from langchain_text_splitters.character import CharacterTextSplitter24from langchain_text_splitters.html import (25    HTMLHeaderTextSplitter,26    HTMLSectionSplitter,27    HTMLSemanticPreservingSplitter,28)29from langchain_text_splitters.json import RecursiveJsonSplitter30from langchain_text_splitters.jsx import JSFrameworkTextSplitter31from langchain_text_splitters.markdown import (32    ExperimentalMarkdownSyntaxTextSplitter,33    MarkdownHeaderTextSplitter,34)35from langchain_text_splitters.python import PythonCodeTextSplitter3637if TYPE_CHECKING:38    from collections.abc import Callable3940    from bs4 import Tag4142FAKE_PYTHON_TEXT = """43class Foo:4445    def bar():464748def foo():4950def testing_func():5152def bar():53"""545556def test_character_text_splitter() -> None:57    """Test splitting by character count."""58    text = "foo bar baz 123"59    splitter = CharacterTextSplitter(separator=" ", chunk_size=7, chunk_overlap=3)60    output = splitter.split_text(text)61    expected_output = ["foo bar", "bar baz", "baz 123"]62    assert output == expected_output636465def test_character_text_splitter_empty_doc() -> None:66    """Test splitting by character count doesn't create empty documents."""67    text = "foo  bar"68    splitter = CharacterTextSplitter(separator=" ", chunk_size=2, chunk_overlap=0)69    output = splitter.split_text(text)70    expected_output = ["foo", "bar"]71    assert output == expected_output727374def test_character_text_splitter_separtor_empty_doc() -> None:75    """Test edge cases are separators."""76    text = "f b"77    splitter = CharacterTextSplitter(separator=" ", chunk_size=2, chunk_overlap=0)78    output = splitter.split_text(text)79    expected_output = ["f", "b"]80    assert output == expected_output818283def test_character_text_splitter_long() -> None:84    """Test splitting by character count on long words."""85    text = "foo bar baz a a"86    splitter = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=1)87    output = splitter.split_text(text)88    expected_output = ["foo", "bar", "baz", "a a"]89    assert output == expected_output909192def test_character_text_splitter_short_words_first() -> None:93    """Test splitting by character count when shorter words are first."""94    text = "a a foo bar baz"95    splitter = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=1)96    output = splitter.split_text(text)97    expected_output = ["a a", "foo", "bar", "baz"]98    assert output == expected_output99100101def test_character_text_splitter_longer_words() -> None:102    """Test splitting by characters when splits not found easily."""103    text = "foo bar baz 123"104    splitter = CharacterTextSplitter(separator=" ", chunk_size=1, chunk_overlap=1)105    output = splitter.split_text(text)106    expected_output = ["foo", "bar", "baz", "123"]107    assert output == expected_output108109110# edge cases111def test_character_text_splitter_no_separator_in_text() -> None:112    """Text splitting where there is no separator but a single word."""113    text = "singleword"114    splitter = CharacterTextSplitter(separator=" ", chunk_size=10, chunk_overlap=0)115    output = splitter.split_text(text)116    expected_output = ["singleword"]117    assert output == expected_output118119120def test_character_text_splitter_handle_chunksize_equal_to_chunkoverlap() -> None:121    """Text splitting safe guards when chunk size is equal chunk overlap."""122    text = "hello"123    splitter = CharacterTextSplitter(separator=" ", chunk_size=5, chunk_overlap=5)124    output = splitter.split_text(text)125    expected_output = ["hello"]126    assert output == expected_output127128129def test_character_text_splitter_empty_input() -> None:130    """Test splitting safely where there is no input to process."""131    text = ""132    splitter = CharacterTextSplitter(separator=" ", chunk_size=5, chunk_overlap=0)133    output = splitter.split_text(text)134    expected_output: list[str] = []135    assert output == expected_output136137138def test_character_text_splitter_whitespace_only() -> None:139    """Test splitting safely where there is white space."""140    text = " "141    splitter = CharacterTextSplitter(separator=" ", chunk_size=5, chunk_overlap=0)142    output = splitter.split_text(text)143    expected_output: list[str] = []144    assert output == expected_output145146147@pytest.mark.parametrize(148    ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]149)150def test_character_text_splitter_keep_separator_regex(151    *, separator: str, is_separator_regex: bool152) -> None:153    """Test CharacterTextSplitter keep separator regex.154155    Test splitting by characters while keeping the separator156    that is a regex special character.157    """158    text = "foo.bar.baz.123"159    splitter = CharacterTextSplitter(160        separator=separator,161        chunk_size=1,162        chunk_overlap=0,163        keep_separator=True,164        is_separator_regex=is_separator_regex,165    )166    output = splitter.split_text(text)167    expected_output = ["foo", ".bar", ".baz", ".123"]168    assert output == expected_output169170171@pytest.mark.parametrize(172    ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]173)174def test_character_text_splitter_keep_separator_regex_start(175    *, separator: str, is_separator_regex: bool176) -> None:177    """Test CharacterTextSplitter keep separator regex and put at start.178179    Test splitting by characters while keeping the separator180    that is a regex special character and placing it at the start of each chunk.181    """182    text = "foo.bar.baz.123"183    splitter = CharacterTextSplitter(184        separator=separator,185        chunk_size=1,186        chunk_overlap=0,187        keep_separator="start",188        is_separator_regex=is_separator_regex,189    )190    output = splitter.split_text(text)191    expected_output = ["foo", ".bar", ".baz", ".123"]192    assert output == expected_output193194195@pytest.mark.parametrize(196    ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]197)198def test_character_text_splitter_keep_separator_regex_end(199    *, separator: str, is_separator_regex: bool200) -> None:201    """Test CharacterTextSplitter keep separator regex and put at end.202203    Test splitting by characters while keeping the separator204    that is a regex special character and placing it at the end of each chunk.205    """206    text = "foo.bar.baz.123"207    splitter = CharacterTextSplitter(208        separator=separator,209        chunk_size=1,210        chunk_overlap=0,211        keep_separator="end",212        is_separator_regex=is_separator_regex,213    )214    output = splitter.split_text(text)215    expected_output = ["foo.", "bar.", "baz.", "123"]216    assert output == expected_output217218219@pytest.mark.parametrize(220    ("separator", "is_separator_regex"), [(re.escape("."), True), (".", False)]221)222def test_character_text_splitter_discard_separator_regex(223    *, separator: str, is_separator_regex: bool224) -> None:225    """Test CharacterTextSplitter discard separator regex.226227    Test splitting by characters discarding the separator228    that is a regex special character.229    """230    text = "foo.bar.baz.123"231    splitter = CharacterTextSplitter(232        separator=separator,233        chunk_size=1,234        chunk_overlap=0,235        keep_separator=False,236        is_separator_regex=is_separator_regex,237    )238    output = splitter.split_text(text)239    expected_output = ["foo", "bar", "baz", "123"]240    assert output == expected_output241242243def test_recursive_character_text_splitter_keep_separators() -> None:244    split_tags = [",", "."]245    query = "Apple,banana,orange and tomato."246    # start247    splitter = RecursiveCharacterTextSplitter(248        chunk_size=10,249        chunk_overlap=0,250        separators=split_tags,251        keep_separator="start",252    )253    result = splitter.split_text(query)254    assert result == ["Apple", ",banana", ",orange and tomato", "."]255256    # end257    splitter = RecursiveCharacterTextSplitter(258        chunk_size=10,259        chunk_overlap=0,260        separators=split_tags,261        keep_separator="end",262    )263    result = splitter.split_text(query)264    assert result == ["Apple,", "banana,", "orange and tomato."]265266267def test_character_text_splitting_args() -> None:268    """Test invalid arguments."""269    with pytest.raises(270        ValueError,271        match=re.escape(272            "Got a larger chunk overlap (4) than chunk size (2), should be smaller."273        ),274    ):275        CharacterTextSplitter(chunk_size=2, chunk_overlap=4)276    for invalid_size in (0, -1):277        with pytest.raises(ValueError, match="chunk_size must be > 0, got"):278            CharacterTextSplitter(chunk_size=invalid_size)279    with pytest.raises(ValueError, match="chunk_overlap must be >= 0, got -1"):280        CharacterTextSplitter(chunk_size=2, chunk_overlap=-1)281282283def test_merge_splits() -> None:284    """Test merging splits with a given separator."""285    splitter = CharacterTextSplitter(separator=" ", chunk_size=9, chunk_overlap=2)286    splits = ["foo", "bar", "baz"]287    expected_output = ["foo bar", "baz"]288    output = splitter._merge_splits(splits, separator=" ")289    assert output == expected_output290291292def test_create_documents() -> None:293    """Test create documents method."""294    texts = ["foo bar", "baz"]295    splitter = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=0)296    docs = splitter.create_documents(texts)297    expected_docs = [298        Document(page_content="foo"),299        Document(page_content="bar"),300        Document(page_content="baz"),301    ]302    assert docs == expected_docs303304305def test_create_documents_with_metadata() -> None:306    """Test create documents with metadata method."""307    texts = ["foo bar", "baz"]308    splitter = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=0)309    docs = splitter.create_documents(texts, [{"source": "1"}, {"source": "2"}])310    expected_docs = [311        Document(page_content="foo", metadata={"source": "1"}),312        Document(page_content="bar", metadata={"source": "1"}),313        Document(page_content="baz", metadata={"source": "2"}),314    ]315    assert docs == expected_docs316317318@pytest.mark.parametrize(319    ("splitter", "text", "expected_docs"),320    [321        (322            CharacterTextSplitter(323                separator=" ", chunk_size=7, chunk_overlap=3, add_start_index=True324            ),325            "foo bar baz 123",326            [327                Document(page_content="foo bar", metadata={"start_index": 0}),328                Document(page_content="bar baz", metadata={"start_index": 4}),329                Document(page_content="baz 123", metadata={"start_index": 8}),330            ],331        ),332        (333            RecursiveCharacterTextSplitter(334                chunk_size=6,335                chunk_overlap=0,336                separators=["\n\n", "\n", " ", ""],337                add_start_index=True,338            ),339            "w1 w1 w1 w1 w1 w1 w1 w1 w1",340            [341                Document(page_content="w1 w1", metadata={"start_index": 0}),342                Document(page_content="w1 w1", metadata={"start_index": 6}),343                Document(page_content="w1 w1", metadata={"start_index": 12}),344                Document(page_content="w1 w1", metadata={"start_index": 18}),345                Document(page_content="w1", metadata={"start_index": 24}),346            ],347        ),348    ],349)350def test_create_documents_with_start_index(351    splitter: TextSplitter, text: str, expected_docs: list[Document]352) -> None:353    """Test create documents method."""354    docs = splitter.create_documents([text])355    assert docs == expected_docs356    for doc in docs:357        s_i = doc.metadata["start_index"]358        assert text[s_i : s_i + len(doc.page_content)] == doc.page_content359360361def test_metadata_not_shallow() -> None:362    """Test that metadatas are not shallow."""363    texts = ["foo bar"]364    splitter = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=0)365    docs = splitter.create_documents(texts, [{"source": "1"}])366    expected_docs = [367        Document(page_content="foo", metadata={"source": "1"}),368        Document(page_content="bar", metadata={"source": "1"}),369    ]370    assert docs == expected_docs371    docs[0].metadata["foo"] = 1372    assert docs[0].metadata == {"source": "1", "foo": 1}373    assert docs[1].metadata == {"source": "1"}374375376def test_iterative_text_splitter_keep_separator() -> None:377    chunk_size = 5378    output = __test_iterative_text_splitter(chunk_size=chunk_size, keep_separator=True)379380    assert output == [381        "....5",382        "X..3",383        "Y...4",384        "X....5",385        "Y...",386    ]387388389def test_iterative_text_splitter_discard_separator() -> None:390    chunk_size = 5391    output = __test_iterative_text_splitter(chunk_size=chunk_size, keep_separator=False)392393    assert output == [394        "....5",395        "..3",396        "...4",397        "....5",398        "...",399    ]400401402def __test_iterative_text_splitter(403    *, chunk_size: int, keep_separator: bool404) -> list[str]:405    chunk_size += 1 if keep_separator else 0406407    splitter = RecursiveCharacterTextSplitter(408        chunk_size=chunk_size,409        chunk_overlap=0,410        separators=["X", "Y"],411        keep_separator=keep_separator,412    )413    text = "....5X..3Y...4X....5Y..."414    output = splitter.split_text(text)415    for chunk in output:416        assert len(chunk) <= chunk_size, f"Chunk is larger than {chunk_size}"417    return output418419420def test_iterative_text_splitter() -> None:421    """Test iterative text splitter."""422    text = """Hi.\n\nI'm Harrison.\n\nHow? Are? You?\nOkay then f f f f.423This is a weird text to write, but gotta test the splittingggg some how.424425Bye!\n\n-H."""426    splitter = RecursiveCharacterTextSplitter(chunk_size=10, chunk_overlap=1)427    output = splitter.split_text(text)428    expected_output = [429        "Hi.",430        "I'm",431        "Harrison.",432        "How? Are?",433        "You?",434        "Okay then",435        "f f f f.",436        "This is a",437        "weird",438        "text to",439        "write,",440        "but gotta",441        "test the",442        "splitting",443        "gggg",444        "some how.",445        "Bye!",446        "-H.",447    ]448    assert output == expected_output449450451def test_split_documents() -> None:452    """Test split_documents."""453    splitter = CharacterTextSplitter(separator="", chunk_size=1, chunk_overlap=0)454    docs = [455        Document(page_content="foo", metadata={"source": "1"}),456        Document(page_content="bar", metadata={"source": "2"}),457        Document(page_content="baz", metadata={"source": "1"}),458    ]459    expected_output = [460        Document(page_content="f", metadata={"source": "1"}),461        Document(page_content="o", metadata={"source": "1"}),462        Document(page_content="o", metadata={"source": "1"}),463        Document(page_content="b", metadata={"source": "2"}),464        Document(page_content="a", metadata={"source": "2"}),465        Document(page_content="r", metadata={"source": "2"}),466        Document(page_content="b", metadata={"source": "1"}),467        Document(page_content="a", metadata={"source": "1"}),468        Document(page_content="z", metadata={"source": "1"}),469    ]470    assert splitter.split_documents(docs) == expected_output471472473def test_python_text_splitter() -> None:474    splitter = PythonCodeTextSplitter(chunk_size=30, chunk_overlap=0)475    splits = splitter.split_text(FAKE_PYTHON_TEXT)476    split_0 = """class Foo:\n\n    def bar():"""477    split_1 = """def foo():"""478    split_2 = """def testing_func():"""479    split_3 = """def bar():"""480    expected_splits = [split_0, split_1, split_2, split_3]481    assert splits == expected_splits482483484FAKE_JSX_TEXT = """485import React from 'react';486import OtherComponent from './OtherComponent';487488function MyComponent() {489  const [count, setCount] = React.useState(0);490491  const handleClick = () => {492    setCount(count + 1);493  };494495  return (496    <div>497      <h1>Counter: {count}</h1>498      <button onClick={handleClick}>499        Increment500      </button>501      <OtherComponent />502    </div>503  );504}505506export default MyComponent;507"""508509510def test_jsx_text_splitter() -> None:511    splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)512    splits = splitter.split_text(FAKE_JSX_TEXT)513514    expected_splits = [515        (516            "\nimport React from 'react';\n"517            "import OtherComponent from './OtherComponent';\n"518        ),519        "\nfunction MyComponent() {\n  const [count, setCount] = React.useState(0);",520        "\n\n  const handleClick = () => {\n    setCount(count + 1);\n  };",521        "return (",522        "<div>",523        "<h1>Counter: {count}</h1>\n      ",524        "<button onClick={handleClick}>\n        Increment\n      </button>\n      ",525        "<OtherComponent />\n    </div>\n  );\n}\n",526        "export default MyComponent;",527    ]528    assert [s.strip() for s in splits] == [s.strip() for s in expected_splits]529530531FAKE_VUE_TEXT = """532<template>533  <div>534    <h1>{{ title }}</h1>535    <button @click="increment">536      Count is: {{ count }}537    </button>538  </div>539</template>540541<script>542export default {543  data() {544    return {545      title: 'Counter App',546      count: 0547    }548  },549  methods: {550    increment() {551      this.count++552    }553  }554}555</script>556557<style>558button {559  color: blue;560}561</style>562"""563564565def test_vue_text_splitter() -> None:566    splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)567    splits = splitter.split_text(FAKE_VUE_TEXT)568569    expected_splits = [570        "<template>",571        "<div>",572        "<h1>{{ title }}</h1>",573        (574            '<button @click="increment">\n      Count is: {{ count }}\n'575            "    </button>\n  </div>\n</template>"576        ),577        "<script>",578        "export",579        (580            " default {\n  data() {\n    return {\n      title: 'Counter App',\n      "581            "count: 0\n    }\n  },\n  methods: {\n    increment() {\n      "582            "this.count++\n    }\n  }\n}\n</script>"583        ),584        "<style>\nbutton {\n  color: blue;\n}\n</style>",585    ]586    assert [s.strip() for s in splits] == [s.strip() for s in expected_splits]587588589FAKE_SVELTE_TEXT = """590<script>591  let count = 0592593  function increment() {594    count += 1595  }596</script>597598<main>599  <h1>Counter App</h1>600  <button on:click={increment}>601    Count is: {count}602  </button>603</main>604605<style>606  button {607    color: blue;608  }609</style>610"""611612613def test_svelte_text_splitter() -> None:614    splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)615    splits = splitter.split_text(FAKE_SVELTE_TEXT)616617    expected_splits = [618        "<script>\n  let count = 0",619        "\n\n  function increment() {\n    count += 1\n  }\n</script>",620        "<main>",621        "<h1>Counter App</h1>",622        "<button on:click={increment}>\n    Count is: {count}\n  </button>\n</main>",623        "<style>\n  button {\n    color: blue;\n  }\n</style>",624    ]625    assert [s.strip() for s in splits] == [s.strip() for s in expected_splits]626627628def test_jsx_splitter_separator_not_mutated_across_calls() -> None:629    """Regression test: repeated split_text() calls must not mutate separators.630631    Calling split_text() multiple times on the same JSFrameworkTextSplitter632    instance must not grow the internal separator list between calls.633634    Before the fix, self._separators was overwritten with the full expanded list635    on every invocation, so a second call would start with the already-expanded636    list and append even more separators.637    """638    splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)639640    # Record separator count after constructing (should be 0 - no custom separators)641    initial_sep_count = len(splitter._separators)642643    # Call split_text twice; the results should be identical for identical input644    splits_first = splitter.split_text(FAKE_JSX_TEXT)645    splits_second = splitter.split_text(FAKE_JSX_TEXT)646647    assert splits_first == splits_second, (648        "split_text() must return identical results on repeated calls with the "649        "same input"650    )651    assert len(splitter._separators) == initial_sep_count, (652        "split_text() must not mutate self._separators between calls"653    )654655656CHUNK_SIZE = 16657658659def test_python_code_splitter() -> None:660    splitter = RecursiveCharacterTextSplitter.from_language(661        Language.PYTHON, chunk_size=CHUNK_SIZE, chunk_overlap=0662    )663    code = """664def hello_world():665    print("Hello, World!")666667# Call the function668hello_world()669    """670    chunks = splitter.split_text(code)671    assert chunks == [672        "def",673        "hello_world():",674        'print("Hello,',675        'World!")',676        "# Call the",677        "function",678        "hello_world()",679    ]680681682def test_golang_code_splitter() -> None:683    splitter = RecursiveCharacterTextSplitter.from_language(684        Language.GO, chunk_size=CHUNK_SIZE, chunk_overlap=0685    )686    code = """687package main688689import "fmt"690691func helloWorld() {692    fmt.Println("Hello, World!")693}694695func main() {696    helloWorld()697}698    """699    chunks = splitter.split_text(code)700    assert chunks == [701        "package main",702        'import "fmt"',703        "func",704        "helloWorld() {",705        'fmt.Println("He',706        "llo,",707        'World!")',708        "}",709        "func main() {",710        "helloWorld()",711        "}",712    ]713714715def test_rst_code_splitter() -> None:716    splitter = RecursiveCharacterTextSplitter.from_language(717        Language.RST, chunk_size=CHUNK_SIZE, chunk_overlap=0718    )719    code = """720Sample Document721===============722723Section724-------725726This is the content of the section.727728Lists729-----730731- Item 1732- Item 2733- Item 3734735Comment736*******737Not a comment738739.. This is a comment740    """741    chunks = splitter.split_text(code)742    assert chunks == [743        "Sample Document",744        "===============",745        "Section",746        "-------",747        "This is the",748        "content of the",749        "section.",750        "Lists",751        "-----",752        "- Item 1",753        "- Item 2",754        "- Item 3",755        "Comment",756        "*******",757        "Not a comment",758        ".. This is a",759        "comment",760    ]761    # Special test for special characters762    code = "harry\n***\nbabylon is"763    chunks = splitter.split_text(code)764    assert chunks == ["harry", "***\nbabylon is"]765766767def test_proto_file_splitter() -> None:768    splitter = RecursiveCharacterTextSplitter.from_language(769        Language.PROTO, chunk_size=CHUNK_SIZE, chunk_overlap=0770    )771    code = """772syntax = "proto3";773774package example;775776message Person {777    string name = 1;778    int32 age = 2;779    repeated string hobbies = 3;780}781    """782    chunks = splitter.split_text(code)783    assert chunks == [784        "syntax =",785        '"proto3";',786        "package",787        "example;",788        "message Person",789        "{",790        "string name",791        "= 1;",792        "int32 age =",793        "2;",794        "repeated",795        "string hobbies",796        "= 3;",797        "}",798    ]799800801def test_javascript_code_splitter() -> None:802    splitter = RecursiveCharacterTextSplitter.from_language(803        Language.JS, chunk_size=CHUNK_SIZE, chunk_overlap=0804    )805    code = """806function helloWorld() {807  console.log("Hello, World!");808}809810// Call the function811helloWorld();812    """813    chunks = splitter.split_text(code)814    assert chunks == [815        "function",816        "helloWorld() {",817        'console.log("He',818        "llo,",819        'World!");',820        "}",821        "// Call the",822        "function",823        "helloWorld();",824    ]825826827def test_cobol_code_splitter() -> None:828    splitter = RecursiveCharacterTextSplitter.from_language(829        Language.COBOL, chunk_size=CHUNK_SIZE, chunk_overlap=0830    )831    code = """832IDENTIFICATION DIVISION.833PROGRAM-ID. HelloWorld.834DATA DIVISION.835WORKING-STORAGE SECTION.83601 GREETING           PIC X(12)   VALUE 'Hello, World!'.837PROCEDURE DIVISION.838DISPLAY GREETING.839STOP RUN.840    """841    chunks = splitter.split_text(code)842    assert chunks == [843        "IDENTIFICATION",844        "DIVISION.",845        "PROGRAM-ID.",846        "HelloWorld.",847        "DATA DIVISION.",848        "WORKING-STORAGE",849        "SECTION.",850        "01 GREETING",851        "PIC X(12)",852        "VALUE 'Hello,",853        "World!'.",854        "PROCEDURE",855        "DIVISION.",856        "DISPLAY",857        "GREETING.",858        "STOP RUN.",859    ]860861862def test_typescript_code_splitter() -> None:863    splitter = RecursiveCharacterTextSplitter.from_language(864        Language.TS, chunk_size=CHUNK_SIZE, chunk_overlap=0865    )866    code = """867function helloWorld(): void {868  console.log("Hello, World!");869}870871// Call the function872helloWorld();873    """874    chunks = splitter.split_text(code)875    assert chunks == [876        "function",877        "helloWorld():",878        "void {",879        'console.log("He',880        "llo,",881        'World!");',882        "}",883        "// Call the",884        "function",885        "helloWorld();",886    ]887888889def test_java_code_splitter() -> None:890    splitter = RecursiveCharacterTextSplitter.from_language(891        Language.JAVA, chunk_size=CHUNK_SIZE, chunk_overlap=0892    )893    code = """894public class HelloWorld {895    public static void main(String[] args) {896        System.out.println("Hello, World!");897    }898}899    """900    chunks = splitter.split_text(code)901    assert chunks == [902        "public class",903        "HelloWorld {",904        "public",905        "static void",906        "main(String[]",907        "args) {",908        "System.out.prin",909        'tln("Hello,',910        'World!");',911        "}\n}",912    ]913914915def test_kotlin_code_splitter() -> None:916    splitter = RecursiveCharacterTextSplitter.from_language(917        Language.KOTLIN, chunk_size=CHUNK_SIZE, chunk_overlap=0918    )919    code = """920class HelloWorld {921    companion object {922        @JvmStatic923        fun main(args: Array<String>) {924            println("Hello, World!")925        }926    }927}928    """929    chunks = splitter.split_text(code)930    assert chunks == [931        "class",932        "HelloWorld {",933        "companion",934        "object {",935        "@JvmStatic",936        "fun",937        "main(args:",938        "Array<String>)",939        "{",940        'println("Hello,',941        'World!")',942        "}\n    }",943        "}",944    ]945946947def test_csharp_code_splitter() -> None:948    splitter = RecursiveCharacterTextSplitter.from_language(949        Language.CSHARP, chunk_size=CHUNK_SIZE, chunk_overlap=0950    )951    code = """952using System;953class Program954{955    static void Main()956    {957        int age = 30; // Change the age value as needed958959        // Categorize the age without any console output960        if (age < 18)961        {962            // Age is under 18963        }964        else if (age >= 18 && age < 65)965        {966            // Age is an adult967        }968        else969        {970            // Age is a senior citizen971        }972    }973}974    """975976    chunks = splitter.split_text(code)977    assert chunks == [978        "using System;",979        "class Program\n{",980        "static void",981        "Main()",982        "{",983        "int age",984        "= 30; // Change",985        "the age value",986        "as needed",987        "//",988        "Categorize the",989        "age without any",990        "console output",991        "if (age",992        "< 18)",993        "{",994        "//",995        "Age is under 18",996        "}",997        "else if",998        "(age >= 18 &&",999        "age < 65)",1000        "{",1001        "//",1002        "Age is an adult",1003        "}",1004        "else",1005        "{",1006        "//",1007        "Age is a senior",1008        "citizen",1009        "}\n    }",1010        "}",1011    ]101210131014def test_csharp_separators_no_java_keywords() -> None:1015    """C# separators should not contain Java-only keywords."""1016    splitter = RecursiveCharacterTextSplitter.from_language(1017        Language.CSHARP, chunk_size=CHUNK_SIZE, chunk_overlap=01018    )1019    # "implements" is a Java keyword; C# uses ":" for interface implementation1020    assert "\nimplements " not in splitter._separators102110221023def test_elixir_separators_no_while() -> None:1024    """Elixir has no while loop; the separator should not be present."""1025    splitter = RecursiveCharacterTextSplitter.from_language(1026        Language.ELIXIR, chunk_size=CHUNK_SIZE, chunk_overlap=01027    )1028    assert "\nwhile " not in splitter._separators102910301031def test_cpp_code_splitter() -> None:1032    splitter = RecursiveCharacterTextSplitter.from_language(1033        Language.CPP, chunk_size=CHUNK_SIZE, chunk_overlap=01034    )1035    code = """1036#include <iostream>10371038int main() {1039    std::cout << "Hello, World!" << std::endl;1040    return 0;1041}1042    """1043    chunks = splitter.split_text(code)1044    assert chunks == [1045        "#include",1046        "<iostream>",1047        "int main() {",1048        "std::cout",1049        '<< "Hello,',1050        'World!" <<',1051        "std::endl;",1052        "return 0;\n}",1053    ]105410551056def test_scala_code_splitter() -> None:1057    splitter = RecursiveCharacterTextSplitter.from_language(1058        Language.SCALA, chunk_size=CHUNK_SIZE, chunk_overlap=01059    )1060    code = """1061object HelloWorld {1062  def main(args: Array[String]): Unit = {1063    println("Hello, World!")1064  }1065}1066    """1067    chunks = splitter.split_text(code)1068    assert chunks == [1069        "object",1070        "HelloWorld {",1071        "def",1072        "main(args:",1073        "Array[String]):",1074        "Unit = {",1075        'println("Hello,',1076        'World!")',1077        "}\n}",1078    ]107910801081def test_ruby_code_splitter() -> None:1082    splitter = RecursiveCharacterTextSplitter.from_language(1083        Language.RUBY, chunk_size=CHUNK_SIZE, chunk_overlap=01084    )1085    code = """1086def hello_world1087  puts "Hello, World!"1088end10891090hello_world1091    """1092    chunks = splitter.split_text(code)1093    assert chunks == [1094        "def hello_world",1095        'puts "Hello,',1096        'World!"',1097        "end",1098        "hello_world",1099    ]110011011102def test_php_code_splitter() -> None:1103    splitter = RecursiveCharacterTextSplitter.from_language(1104        Language.PHP, chunk_size=CHUNK_SIZE, chunk_overlap=01105    )1106    code = """1107<?php1108function hello_world() {1109    echo "Hello, World!";1110}11111112hello_world();1113?>1114    """1115    chunks = splitter.split_text(code)1116    assert chunks == [1117        "<?php",1118        "function",1119        "hello_world() {",1120        "echo",1121        '"Hello,',1122        'World!";',1123        "}",1124        "hello_world();",1125        "?>",1126    ]112711281129def test_swift_code_splitter() -> None:1130    splitter = RecursiveCharacterTextSplitter.from_language(1131        Language.SWIFT, chunk_size=CHUNK_SIZE, chunk_overlap=01132    )1133    code = """1134func helloWorld() {1135    print("Hello, World!")1136}11371138helloWorld()1139    """1140    chunks = splitter.split_text(code)1141    assert chunks == [1142        "func",1143        "helloWorld() {",1144        'print("Hello,',1145        'World!")',1146        "}",1147        "helloWorld()",1148    ]114911501151def test_rust_code_splitter() -> None:1152    splitter = RecursiveCharacterTextSplitter.from_language(1153        Language.RUST, chunk_size=CHUNK_SIZE, chunk_overlap=01154    )1155    code = """1156fn main() {1157    println!("Hello, World!");1158}1159    """1160    chunks = splitter.split_text(code)1161    assert chunks == ["fn main() {", 'println!("Hello', ",", 'World!");', "}"]116211631164def test_r_code_splitter() -> None:1165    splitter = RecursiveCharacterTextSplitter.from_language(1166        Language.R, chunk_size=CHUNK_SIZE, chunk_overlap=01167    )1168    code = """1169library(dplyr)11701171my_func <- function(x) {1172    return(x + 1)1173}11741175if (TRUE) {1176    print("Hello")1177}1178    """1179    chunks = splitter.split_text(code)1180    assert chunks == [1181        "library(dplyr)",1182        "my_func <-",1183        "function(x) {",1184        "return(x +",1185        "1)",1186        "}",1187        "if (TRUE) {",1188        'print("Hello")',1189        "}",1190    ]119111921193def test_markdown_code_splitter() -> None:1194    splitter = RecursiveCharacterTextSplitter.from_language(1195        Language.MARKDOWN, chunk_size=CHUNK_SIZE, chunk_overlap=01196    )1197    code = """1198# Sample Document11991200## Section12011202This is the content of the section.12031204## Lists12051206- Item 11207- Item 21208- Item 312091210### Horizontal lines12111212***********1213____________1214-------------------12151216#### Code blocks1217```1218This is a code block12191220# sample code1221a = 11222b = 21223```1224    """1225    chunks = splitter.split_text(code)1226    assert chunks == [1227        "# Sample",1228        "Document",1229        "## Section",1230        "This is the",1231        "content of the",1232        "section.",1233        "## Lists",1234        "- Item 1",1235        "- Item 2",1236        "- Item 3",1237        "### Horizontal",1238        "lines",1239        "***********",1240        "____________",1241        "---------------",1242        "----",1243        "#### Code",1244        "blocks",1245        "```",1246        "This is a code",1247        "block",1248        "# sample code",1249        "a = 1\nb = 2",1250        "```",1251    ]1252    # Special test for special characters1253    code = "harry\n***\nbabylon is"1254    chunks = splitter.split_text(code)1255    assert chunks == ["harry", "***\nbabylon is"]125612571258def test_latex_code_splitter() -> None:1259    splitter = RecursiveCharacterTextSplitter.from_language(1260        Language.LATEX, chunk_size=CHUNK_SIZE, chunk_overlap=01261    )1262    code = """1263Hi Harrison!1264\\chapter{1}1265"""1266    chunks = splitter.split_text(code)1267    assert chunks == ["Hi Harrison!", "\\chapter{1}"]126812691270def test_html_code_splitter() -> None:1271    splitter = RecursiveCharacterTextSplitter.from_language(1272        Language.HTML, chunk_size=60, chunk_overlap=01273    )1274    code = """1275<h1>Sample Document</h1>1276    <h2>Section</h2>1277        <p id="1234">Reference content.</p>12781279    <h2>Lists</h2>1280        <ul>1281            <li>Item 1</li>1282            <li>Item 2</li>1283            <li>Item 3</li>1284        </ul>12851286        <h3>A block</h3>1287            <div class="amazing">1288                <p>Some text</p>1289                <p>Some more text</p>1290            </div>1291    """1292    chunks = splitter.split_text(code)1293    assert chunks == [1294        "<h1>Sample Document</h1>\n    <h2>Section</h2>",1295        '<p id="1234">Reference content.</p>',1296        "<h2>Lists</h2>\n        <ul>",1297        "<li>Item 1</li>\n            <li>Item 2</li>",1298        "<li>Item 3</li>\n        </ul>",1299        "<h3>A block</h3>",1300        '<div class="amazing">',1301        "<p>Some text</p>",1302        "<p>Some more text</p>\n            </div>",1303    ]130413051306def test_md_header_text_splitter_1() -> None:1307    """Test markdown splitter by header: Case 1."""1308    markdown_document = (1309        "# Foo\n\n"1310        "    ## Bar\n\n"1311        "Hi this is Jim\n\n"1312        "Hi this is Joe\n\n"1313        " ## Baz\n\n"1314        " Hi this is Molly"1315    )1316    headers_to_split_on = [1317        ("#", "Header 1"),1318        ("##", "Header 2"),1319    ]1320    markdown_splitter = MarkdownHeaderTextSplitter(1321        headers_to_split_on=headers_to_split_on,1322    )1323    output = markdown_splitter.split_text(markdown_document)1324    expected_output = [1325        Document(1326            page_content="Hi this is Jim  \nHi this is Joe",1327            metadata={"Header 1": "Foo", "Header 2": "Bar"},1328        ),1329        Document(1330            page_content="Hi this is Molly",1331            metadata={"Header 1": "Foo", "Header 2": "Baz"},1332        ),1333    ]1334    assert output == expected_output133513361337def test_md_header_text_splitter_2() -> None:1338    """Test markdown splitter by header: Case 2."""1339    markdown_document = (1340        "# Foo\n\n"1341        "    ## Bar\n\n"1342        "Hi this is Jim\n\n"1343        "Hi this is Joe\n\n"1344        " ### Boo \n\n"1345        " Hi this is Lance \n\n"1346        " ## Baz\n\n"1347        " Hi this is Molly"1348    )13491350    headers_to_split_on = [1351        ("#", "Header 1"),1352        ("##", "Header 2"),1353        ("###", "Header 3"),1354    ]1355    markdown_splitter = MarkdownHeaderTextSplitter(1356        headers_to_split_on=headers_to_split_on,1357    )1358    output = markdown_splitter.split_text(markdown_document)1359    expected_output = [1360        Document(1361            page_content="Hi this is Jim  \nHi this is Joe",1362            metadata={"Header 1": "Foo", "Header 2": "Bar"},1363        ),1364        Document(1365            page_content="Hi this is Lance",1366            metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},1367        ),1368        Document(1369            page_content="Hi this is Molly",1370            metadata={"Header 1": "Foo", "Header 2": "Baz"},1371        ),1372    ]1373    assert output == expected_output137413751376def test_md_header_text_splitter_3() -> None:1377    """Test markdown splitter by header: Case 3."""1378    markdown_document = (1379        "# Foo\n\n"1380        "    ## Bar\n\n"1381        "Hi this is Jim\n\n"1382        "Hi this is Joe\n\n"1383        " ### Boo \n\n"1384        " Hi this is Lance \n\n"1385        " #### Bim \n\n"1386        " Hi this is John \n\n"1387        " ## Baz\n\n"1388        " Hi this is Molly"1389    )13901391    headers_to_split_on = [1392        ("#", "Header 1"),1393        ("##", "Header 2"),1394        ("###", "Header 3"),1395        ("####", "Header 4"),1396    ]13971398    markdown_splitter = MarkdownHeaderTextSplitter(1399        headers_to_split_on=headers_to_split_on,1400    )1401    output = markdown_splitter.split_text(markdown_document)14021403    expected_output = [1404        Document(1405            page_content="Hi this is Jim  \nHi this is Joe",1406            metadata={"Header 1": "Foo", "Header 2": "Bar"},1407        ),1408        Document(1409            page_content="Hi this is Lance",1410            metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},1411        ),1412        Document(1413            page_content="Hi this is John",1414            metadata={1415                "Header 1": "Foo",1416                "Header 2": "Bar",1417                "Header 3": "Boo",1418                "Header 4": "Bim",1419            },1420        ),1421        Document(1422            page_content="Hi this is Molly",1423            metadata={"Header 1": "Foo", "Header 2": "Baz"},1424        ),1425    ]14261427    assert output == expected_output142814291430def test_md_header_text_splitter_preserve_headers_1() -> None:1431    """Test markdown splitter by header: Preserve Headers."""1432    markdown_document = (1433        "# Foo\n\n"1434        "    ## Bat\n\n"1435        "Hi this is Jim\n\n"1436        "Hi Joe\n\n"1437        "## Baz\n\n"1438        "# Bar\n\n"1439        "This is Alice\n\n"1440        "This is Bob"1441    )1442    headers_to_split_on = [1443        ("#", "Header 1"),1444    ]1445    markdown_splitter = MarkdownHeaderTextSplitter(1446        headers_to_split_on=headers_to_split_on,1447        strip_headers=False,1448    )1449    output = markdown_splitter.split_text(markdown_document)1450    expected_output = [1451        Document(1452            page_content="# Foo  \n## Bat  \nHi this is Jim  \nHi Joe  \n## Baz",1453            metadata={"Header 1": "Foo"},1454        ),1455        Document(1456            page_content="# Bar  \nThis is Alice  \nThis is Bob",1457            metadata={"Header 1": "Bar"},1458        ),1459    ]1460    assert output == expected_output146114621463def test_md_header_text_splitter_preserve_headers_2() -> None:1464    """Test markdown splitter by header: Preserve Headers."""1465    markdown_document = (1466        "# Foo\n\n"1467        "    ## Bar\n\n"1468        "Hi this is Jim\n\n"1469        "Hi this is Joe\n\n"1470        "### Boo \n\n"1471        "Hi this is Lance\n\n"1472        "## Baz\n\n"1473        "Hi this is Molly\n"1474        "    ## Buz\n"1475        "# Bop"1476    )1477    headers_to_split_on = [1478        ("#", "Header 1"),1479        ("##", "Header 2"),1480        ("###", "Header 3"),1481    ]1482    markdown_splitter = MarkdownHeaderTextSplitter(1483        headers_to_split_on=headers_to_split_on,1484        strip_headers=False,1485    )1486    output = markdown_splitter.split_text(markdown_document)1487    expected_output = [1488        Document(1489            page_content="# Foo  \n## Bar  \nHi this is Jim  \nHi this is Joe",1490            metadata={"Header 1": "Foo", "Header 2": "Bar"},1491        ),1492        Document(1493            page_content="### Boo  \nHi this is Lance",1494            metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},1495        ),1496        Document(1497            page_content="## Baz  \nHi this is Molly",1498            metadata={"Header 1": "Foo", "Header 2": "Baz"},1499        ),1500        Document(1501            page_content="## Buz",1502            metadata={"Header 1": "Foo", "Header 2": "Buz"},1503        ),1504        Document(page_content="# Bop", metadata={"Header 1": "Bop"}),1505    ]1506    assert output == expected_output150715081509@pytest.mark.parametrize("fence", [("```"), ("~~~")])1510def test_md_header_text_splitter_fenced_code_block(fence: str) -> None:1511    """Test markdown splitter by header: Fenced code block."""1512    markdown_document = (1513        f"# This is a Header\n\n{fence}\nfoo()\n# Not a header\nbar()\n{fence}"1514    )15151516    headers_to_split_on = [1517        ("#", "Header 1"),1518        ("##", "Header 2"),1519    ]15201521    markdown_splitter = MarkdownHeaderTextSplitter(1522        headers_to_split_on=headers_to_split_on,1523    )1524    output = markdown_splitter.split_text(markdown_document)15251526    expected_output = [1527        Document(1528            page_content=f"{fence}\nfoo()\n# Not a header\nbar()\n{fence}",1529            metadata={"Header 1": "This is a Header"},1530        ),1531    ]15321533    assert output == expected_output153415351536@pytest.mark.parametrize(("fence", "other_fence"), [("```", "~~~"), ("~~~", "```")])1537def test_md_header_text_splitter_fenced_code_block_interleaved(1538    fence: str, other_fence: str1539) -> None:1540    """Test markdown splitter by header: Interleaved fenced code block."""1541    markdown_document = (1542        "# This is a Header\n\n"1543        f"{fence}\n"1544        "foo\n"1545        "# Not a header\n"1546        f"{other_fence}\n"1547        "# Not a header\n"1548        f"{fence}"1549    )15501551    headers_to_split_on = [1552        ("#", "Header 1"),1553        ("##", "Header 2"),1554    ]15551556    markdown_splitter = MarkdownHeaderTextSplitter(1557        headers_to_split_on=headers_to_split_on,1558    )1559    output = markdown_splitter.split_text(markdown_document)15601561    expected_output = [1562        Document(1563            page_content=(1564                f"{fence}\nfoo\n# Not a header\n{other_fence}\n# Not a header\n{fence}"1565            ),1566            metadata={"Header 1": "This is a Header"},1567        ),1568    ]15691570    assert output == expected_output157115721573@pytest.mark.parametrize("characters", ["\ufeff"])1574def test_md_header_text_splitter_with_invisible_characters(characters: str) -> None:1575    """Test markdown splitter by header: Fenced code block."""1576    markdown_document = f"{characters}# Foo\n\nfoo()\n{characters}## Bar\n\nbar()"15771578    headers_to_split_on = [1579        ("#", "Header 1"),1580        ("##", "Header 2"),1581    ]15821583    markdown_splitter = MarkdownHeaderTextSplitter(1584        headers_to_split_on=headers_to_split_on,1585    )1586    output = markdown_splitter.split_text(markdown_document)15871588    expected_output = [1589        Document(1590            page_content="foo()",1591            metadata={"Header 1": "Foo"},1592        ),1593        Document(1594            page_content="bar()",1595            metadata={"Header 1": "Foo", "Header 2": "Bar"},1596        ),1597    ]15981599    assert output == expected_output160016011602def test_md_header_text_splitter_with_custom_headers() -> None:1603    """Test markdown splitter with custom header patterns like **Header**."""1604    markdown_document = """**Chapter 1**16051606This is the content for chapter 1.16071608***Section 1.1***16091610This is the content for section 1.1.16111612**Chapter 2**16131614This is the content for chapter 2.16151616***Section 2.1***16171618This is the content for section 2.1.1619"""16201621    headers_to_split_on = [1622        ("**", "Bold Header"),1623        ("***", "Bold Italic Header"),1624    ]16251626    custom_header_patterns = {1627        "**": 1,  # Level 1 headers1628        "***": 2,  # Level 2 headers1629    }1630    markdown_splitter = MarkdownHeaderTextSplitter(1631        headers_to_split_on=headers_to_split_on,1632        custom_header_patterns=custom_header_patterns,1633    )1634    output = markdown_splitter.split_text(markdown_document)16351636    expected_output = [1637        Document(1638            page_content="This is the content for chapter 1.",1639            metadata={"Bold Header": "Chapter 1"},1640        ),1641        Document(1642            page_content="This is the content for section 1.1.",1643            metadata={"Bold Header": "Chapter 1", "Bold Italic Header": "Section 1.1"},1644        ),1645        Document(1646            page_content="This is the content for chapter 2.",1647            metadata={"Bold Header": "Chapter 2"},1648        ),1649        Document(1650            page_content="This is the content for section 2.1.",1651            metadata={"Bold Header": "Chapter 2", "Bold Italic Header": "Section 2.1"},1652        ),1653    ]16541655    assert output == expected_output165616571658def test_md_header_text_splitter_mixed_headers() -> None:1659    """Test markdown splitter with both standard and custom headers."""1660    markdown_document = """# Standard Header 116611662Content under standard header.16631664**Custom Header 1**16651666Content under custom header.16671668## Standard Header 216691670Content under standard header 2.16711672***Custom Header 2***16731674Content under custom header 2.1675"""16761677    headers_to_split_on = [1678        ("#", "Header 1"),1679        ("##", "Header 2"),1680        ("**", "Bold Header"),1681        ("***", "Bold Italic Header"),1682    ]16831684    custom_header_patterns = {1685        "**": 1,  # Same level as #1686        "***": 2,  # Same level as ##1687    }16881689    markdown_splitter = MarkdownHeaderTextSplitter(1690        headers_to_split_on=headers_to_split_on,1691        custom_header_patterns=custom_header_patterns,1692    )1693    output = markdown_splitter.split_text(markdown_document)16941695    expected_output = [1696        Document(1697            page_content="Content under standard header.",1698            metadata={"Header 1": "Standard Header 1"},1699        ),1700        Document(1701            page_content="Content under custom header.",1702            metadata={"Bold Header": "Custom Header 1"},1703        ),1704        Document(1705            page_content="Content under standard header 2.",1706            metadata={1707                "Bold Header": "Custom Header 1",1708                "Header 2": "Standard Header 2",1709            },1710        ),1711        Document(1712            page_content="Content under custom header 2.",1713            metadata={1714                "Bold Header": "Custom Header 1",1715                "Bold Italic Header": "Custom Header 2",1716            },1717        ),1718    ]17191720    assert output == expected_output172117221723EXPERIMENTAL_MARKDOWN_DOCUMENT = (1724    "# My Header 1\n"1725    "Content for header 1\n"1726    "## Header 2\n"1727    "Content for header 2\n"1728    "### Header 3\n"1729    "Content for header 3\n"1730    "## Header 2 Again\n"1731    "This should be tagged with Header 1 and Header 2 Again\n"1732    "```python\n"1733    "def func_definition():\n"1734    "   print('Keep the whitespace consistent')\n"1735    "```\n"1736    "# Header 1 again\n"1737    "We should also split on the horizontal line\n"1738    "----\n"1739    "This will be a new doc but with the same header metadata\n\n"1740    "And it includes a new paragraph"1741)174217431744def test_experimental_markdown_syntax_text_splitter() -> None:1745    """Test experimental markdown syntax splitter."""1746    markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter()1747    output = markdown_splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)17481749    expected_output = [1750        Document(1751            page_content="Content for header 1\n",1752            metadata={"Header 1": "My Header 1"},1753        ),1754        Document(1755            page_content="Content for header 2\n",1756            metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},1757        ),1758        Document(1759            page_content="Content for header 3\n",1760            metadata={1761                "Header 1": "My Header 1",1762                "Header 2": "Header 2",1763                "Header 3": "Header 3",1764            },1765        ),1766        Document(1767            page_content="This should be tagged with Header 1 and Header 2 Again\n",1768            metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},1769        ),1770        Document(1771            page_content=(1772                "```python\ndef func_definition():\n   "1773                "print('Keep the whitespace consistent')\n```\n"1774            ),1775            metadata={1776                "Code": "python",1777                "Header 1": "My Header 1",1778                "Header 2": "Header 2 Again",1779            },1780        ),1781        Document(1782            page_content="We should also split on the horizontal line\n",1783            metadata={"Header 1": "Header 1 again"},1784        ),1785        Document(1786            page_content=(1787                "This will be a new doc but with the same header metadata\n\n"1788                "And it includes a new paragraph"1789            ),1790            metadata={"Header 1": "Header 1 again"},1791        ),1792    ]17931794    assert output == expected_output179517961797def test_experimental_markdown_syntax_text_splitter_header_configuration() -> None:1798    """Test experimental markdown syntax splitter."""1799    headers_to_split_on = [("#", "Encabezamiento 1")]18001801    markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(1802        headers_to_split_on=headers_to_split_on1803    )1804    output = markdown_splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)18051806    expected_output = [1807        Document(1808            page_content=(1809                "Content for header 1\n"1810                "## Header 2\n"1811                "Content for header 2\n"1812                "### Header 3\n"1813                "Content for header 3\n"1814                "## Header 2 Again\n"1815                "This should be tagged with Header 1 and Header 2 Again\n"1816            ),1817            metadata={"Encabezamiento 1": "My Header 1"},1818        ),1819        Document(1820            page_content=(1821                "```python\ndef func_definition():\n   "1822                "print('Keep the whitespace consistent')\n```\n"1823            ),1824            metadata={"Code": "python", "Encabezamiento 1": "My Header 1"},1825        ),1826        Document(1827            page_content="We should also split on the horizontal line\n",1828            metadata={"Encabezamiento 1": "Header 1 again"},1829        ),1830        Document(1831            page_content=(1832                "This will be a new doc but with the same header metadata\n\n"1833                "And it includes a new paragraph"1834            ),1835            metadata={"Encabezamiento 1": "Header 1 again"},1836        ),1837    ]18381839    assert output == expected_output184018411842def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:1843    """Test experimental markdown syntax splitter."""1844    markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(strip_headers=False)1845    output = markdown_splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)18461847    expected_output = [1848        Document(1849            page_content="# My Header 1\nContent for header 1\n",1850            metadata={"Header 1": "My Header 1"},1851        ),1852        Document(1853            page_content="## Header 2\nContent for header 2\n",1854            metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},1855        ),1856        Document(1857            page_content="### Header 3\nContent for header 3\n",1858            metadata={1859                "Header 1": "My Header 1",1860                "Header 2": "Header 2",1861                "Header 3": "Header 3",1862            },1863        ),1864        Document(1865            page_content=(1866                "## Header 2 Again\n"1867                "This should be tagged with Header 1 and Header 2 Again\n"1868            ),1869            metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},1870        ),1871        Document(1872            page_content=(1873                "```python\ndef func_definition():\n   "1874                "print('Keep the whitespace consistent')\n```\n"1875            ),1876            metadata={1877                "Code": "python",1878                "Header 1": "My Header 1",1879                "Header 2": "Header 2 Again",1880            },1881        ),1882        Document(1883            page_content=(1884                "# Header 1 again\nWe should also split on the horizontal line\n"1885            ),1886            metadata={"Header 1": "Header 1 again"},1887        ),1888        Document(1889            page_content=(1890                "This will be a new doc but with the same header metadata\n\n"1891                "And it includes a new paragraph"1892            ),1893            metadata={"Header 1": "Header 1 again"},1894        ),1895    ]18961897    assert output == expected_output189818991900def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:1901    """Test experimental markdown syntax splitter."""1902    markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(return_each_line=True)1903    output = markdown_splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)19041905    expected_output = [1906        Document(1907            page_content="Content for header 1", metadata={"Header 1": "My Header 1"}1908        ),1909        Document(1910            page_content="Content for header 2",1911            metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},1912        ),1913        Document(1914            page_content="Content for header 3",1915            metadata={1916                "Header 1": "My Header 1",1917                "Header 2": "Header 2",1918                "Header 3": "Header 3",1919            },1920        ),1921        Document(1922            page_content="This should be tagged with Header 1 and Header 2 Again",1923            metadata={"Header 1": "My Header 1", "Header 2": "Header 2 Again"},1924        ),1925        Document(1926            page_content="```python",1927            metadata={1928                "Code": "python",1929                "Header 1": "My Header 1",1930                "Header 2": "Header 2 Again",1931            },1932        ),1933        Document(1934            page_content="def func_definition():",1935            metadata={1936                "Code": "python",1937                "Header 1": "My Header 1",1938                "Header 2": "Header 2 Again",1939            },1940        ),1941        Document(1942            page_content="   print('Keep the whitespace consistent')",1943            metadata={1944                "Code": "python",1945                "Header 1": "My Header 1",1946                "Header 2": "Header 2 Again",1947            },1948        ),1949        Document(1950            page_content="```",1951            metadata={1952                "Code": "python",1953                "Header 1": "My Header 1",1954                "Header 2": "Header 2 Again",1955            },1956        ),1957        Document(1958            page_content="We should also split on the horizontal line",1959            metadata={"Header 1": "Header 1 again"},1960        ),1961        Document(1962            page_content="This will be a new doc but with the same header metadata",1963            metadata={"Header 1": "Header 1 again"},1964        ),1965        Document(1966            page_content="And it includes a new paragraph",1967            metadata={"Header 1": "Header 1 again"},1968        ),1969    ]19701971    assert output == expected_output197219731974EXPERIMENTAL_MARKDOWN_DOCUMENTS = [1975    (1976        "# My Header 1 From Document 1\n"1977        "Content for header 1 from Document 1\n"1978        "## Header 2 From Document 1\n"1979        "Content for header 2 from Document 1\n"1980        "```python\n"1981        "def func_definition():\n"1982        "   print('Keep the whitespace consistent')\n"1983        "```\n"1984        "# Header 1 again From Document 1\n"1985        "We should also split on the horizontal line\n"1986        "----\n"1987        "This will be a new doc but with the same header metadata\n\n"1988        "And it includes a new paragraph"1989    ),1990    (1991        "# My Header 1 From Document 2\n"1992        "Content for header 1 from Document 2\n"1993        "## Header 2 From Document 2\n"1994        "Content for header 2 from Document 2\n"1995        "```python\n"1996        "def func_definition():\n"1997        "   print('Keep the whitespace consistent')\n"1998        "```\n"1999        "# Header 1 again From Document 2\n"2000        "We should also split on the horizontal line\n"

Findings

✓ No findings reported for this file.

Get this view in your editor

Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.