libs/text-splitters/langchain_text_splitters/html.py · langchain-ai/langchain

1"""HTML text splitters."""23from __future__ import annotations45import copy6import pathlib7import re8from io import StringIO9from typing import (10    IO,11    TYPE_CHECKING,12    Any,13    Literal,14    TypedDict,15    cast,16)1718from langchain_core._api import beta, deprecated19from langchain_core.documents import BaseDocumentTransformer, Document20from typing_extensions import override2122from langchain_text_splitters.character import RecursiveCharacterTextSplitter2324if TYPE_CHECKING:25    from collections.abc import Callable, Iterable, Iterator, Sequence2627    from bs4 import BeautifulSoup, Tag28    from bs4.element import NavigableString, PageElement, ResultSet293031class ElementType(TypedDict):32    """Element type as typed dict."""3334    url: str35    xpath: str36    content: str37    metadata: dict[str, str]383940def _import_bs4(41    *, import_error_message: str42) -> tuple[type[BeautifulSoup], type[Tag], type[NavigableString]]:43    try:44        from bs4 import BeautifulSoup, Tag  # noqa: PLC041545        from bs4.element import NavigableString  # noqa: PLC041546    except ImportError as err:47        raise ImportError(import_error_message) from err48    return BeautifulSoup, Tag, NavigableString495051def _import_lxml_etree() -> object:52    try:53        from lxml import etree  # noqa: PLC041554    except ImportError as err:55        msg = "Unable to import lxml, please install with `pip install lxml`."56        raise ImportError(msg) from err57    return etree585960def _import_nltk() -> object:61    try:62        import nltk  # noqa: PLC041563    except ImportError as err:64        msg = "Could not import nltk. Please install it with 'pip install nltk'."65        raise ImportError(msg) from err66    return nltk676869# Unfortunately, BeautifulSoup doesn't define overloads for Tag.find_all.70# So doing the type resolution ourselves.717273def _find_all_strings(74    tag: Tag,75    *,76    recursive: bool = True,77) -> ResultSet[NavigableString]:78    return tag.find_all(string=True, recursive=recursive)798081def _find_all_tags(82    tag: Tag,83    *,84    name: bool | str | list[str] | None = None,85    recursive: bool = True,86) -> ResultSet[Tag]:87    return tag.find_all(name, recursive=recursive)888990class HTMLHeaderTextSplitter:91    """Split HTML content into structured Documents based on specified headers.9293    Splits HTML content by detecting specified header tags and creating hierarchical94    `Document` objects that reflect the semantic structure of the original content. For95    each identified section, the splitter associates the extracted text with metadata96    corresponding to the encountered headers.9798    If no specified headers are found, the entire content is returned as a single99    `Document`. This allows for flexible handling of HTML input, ensuring that100    information is organized according to its semantic headers.101102    The splitter provides the option to return each HTML element as a separate103    `Document` or aggregate them into semantically meaningful chunks. It also104    gracefully handles multiple levels of nested headers, creating a rich,105    hierarchical representation of the content.106107    Example:108        ```python109        from langchain_text_splitters.html_header_text_splitter import (110            HTMLHeaderTextSplitter,111        )112113        # Define headers for splitting on h1 and h2 tags.114        headers_to_split_on = [("h1", "Main Topic"), ("h2", "Sub Topic")]115116        splitter = HTMLHeaderTextSplitter(117            headers_to_split_on=headers_to_split_on,118            return_each_element=False119        )120121        html_content = \"\"\"122        <html>123            <body>124                <h1>Introduction</h1>125                <p>Welcome to the introduction section.</p>126                <h2>Background</h2>127                <p>Some background details here.</p>128                <h1>Conclusion</h1>129                <p>Final thoughts.</p>130            </body>131        </html>132        \"\"\"133134        documents = splitter.split_text(html_content)135136        # 'documents' now contains Document objects reflecting the hierarchy:137        # - Document with metadata={"Main Topic": "Introduction"} and138        #   content="Introduction"139        # - Document with metadata={"Main Topic": "Introduction"} and140        #   content="Welcome to the introduction section."141        # - Document with metadata={"Main Topic": "Introduction",142        #   "Sub Topic": "Background"} and content="Background"143        # - Document with metadata={"Main Topic": "Introduction",144        #   "Sub Topic": "Background"} and content="Some background details here."145        # - Document with metadata={"Main Topic": "Conclusion"} and146        #   content="Conclusion"147        # - Document with metadata={"Main Topic": "Conclusion"} and148        #   content="Final thoughts."149        ```150    """151152    def __init__(153        self,154        headers_to_split_on: list[tuple[str, str]],155        return_each_element: bool = False,  # noqa: FBT001,FBT002156    ) -> None:157        """Initialize with headers to split on.158159        Args:160            headers_to_split_on: A list of `(header_tag,161                header_name)` pairs representing the headers that define splitting162                boundaries.163164                For example, `[("h1", "Header 1"), ("h2", "Header 2")]` will split165                content by `h1` and `h2` tags, assigning their textual content to the166                `Document` metadata.167            return_each_element: If `True`, every HTML element encountered168                (including headers, paragraphs, etc.) is returned as a separate169                `Document`.170171                If `False`, content under the same header hierarchy is aggregated into172                fewer `Document` objects.173        """174        # Sort headers by their numeric level so that h1 < h2 < h3...175        self.headers_to_split_on = sorted(176            headers_to_split_on, key=lambda x: int(x[0][1:])177        )178        self.header_mapping = dict(self.headers_to_split_on)179        self.header_tags = [tag for tag, _ in self.headers_to_split_on]180        self.return_each_element = return_each_element181182    def split_text(self, text: str) -> list[Document]:183        """Split the given text into a list of `Document` objects.184185        Args:186            text: The HTML text to split.187188        Returns:189            A list of split `Document` objects.190191                Each `Document` contains `page_content` holding the extracted text and192                `metadata` that maps the header hierarchy to their corresponding titles.193        """194        return self.split_text_from_file(StringIO(text))195196    @deprecated(197        since="1.1.2",198        removal="2.0.0",199        addendum=(200            "Fetch the HTML content from the URL yourself and pass it to `split_text`."201        ),202    )203    def split_text_from_url(204        self,205        url: str,206        timeout: int = 10,207        **kwargs: Any,  # noqa: ARG002208    ) -> list[Document]:209        """Fetch text content from a URL and split it into documents.210211        Args:212            url: The URL to fetch content from.213            timeout: Timeout for the request.214            **kwargs: Additional keyword arguments for the request.215216        Returns:217            A list of split `Document` objects.218219                Each `Document` contains `page_content` holding the extracted text and220                `metadata` that maps the header hierarchy to their corresponding titles.221222        Raises:223            requests.RequestException: If the HTTP request fails.224        """225        from langchain_core._security._transport import (  # noqa: PLC0415226            ssrf_safe_client,227        )228229        with ssrf_safe_client() as client:230            response = client.get(url, timeout=timeout)231            response.raise_for_status()232            return self.split_text(response.text)233234    def split_text_from_file(self, file: str | IO[str]) -> list[Document]:235        """Split HTML content from a file into a list of `Document` objects.236237        Args:238            file: A file path or a file-like object containing HTML content.239240        Returns:241            A list of split `Document` objects.242243                Each `Document` contains `page_content` holding the extracted text and244                `metadata` that maps the header hierarchy to their corresponding titles.245        """246        if isinstance(file, str):247            html_content = pathlib.Path(file).read_text(encoding="utf-8")248        else:249            html_content = file.read()250        return list(self._generate_documents(html_content))251252    def _generate_documents(self, html_content: str) -> Iterator[Document]:253        """Private method that performs a DFS traversal over the DOM and yields.254255        Document objects on-the-fly. This approach maintains the same splitting logic256        (headers vs. non-headers, chunking, etc.) while walking the DOM explicitly in257        code.258259        Args:260            html_content: The raw HTML content.261262        Yields:263            Document objects as they are created.264265        Raises:266            ImportError: If BeautifulSoup is not installed.267        """268        beautiful_soup, tag_cls, _ = _import_bs4(269            import_error_message=(270                "Unable to import BeautifulSoup. Please install via `pip install bs4`."271            )272        )273274        soup = beautiful_soup(html_content, "html.parser")275        body = soup.body or soup276277        # Dictionary of active headers:278        #   key = user-defined header name (e.g. "Header 1")279        #   value = tuple of header_text, level, dom_depth280        active_headers: dict[str, tuple[str, int, int]] = {}281        current_chunk: list[str] = []282283        def finalize_chunk() -> Document | None:284            """Finalize the accumulated chunk into a single Document."""285            if not current_chunk:286                return None287288            final_text = "  \n".join(line for line in current_chunk if line.strip())289            current_chunk.clear()290            if not final_text.strip():291                return None292293            final_meta = {k: v[0] for k, v in active_headers.items()}294            return Document(page_content=final_text, metadata=final_meta)295296        # We'll use a stack for DFS traversal297        stack = [body]298        while stack:299            node = stack.pop()300            children = list(node.children)301302            stack.extend(303                child for child in reversed(children) if isinstance(child, tag_cls)304            )305306            tag = getattr(node, "name", None)307            if not tag:308                continue309310            text_elements = [311                str(child).strip() for child in _find_all_strings(node, recursive=False)312            ]313            node_text = " ".join(elem for elem in text_elements if elem)314            if not node_text:315                continue316317            dom_depth = len(list(node.parents))318319            # If this node is one of our headers320            if tag in self.header_tags:321                # If we're aggregating, finalize whatever chunk we had322                if not self.return_each_element:323                    doc = finalize_chunk()324                    if doc:325                        yield doc326327                # Determine numeric level (h1->1, h2->2, etc.)328                try:329                    level = int(tag[1:])330                except ValueError:331                    level = 9999332333                # Remove any active headers that are at or deeper than this new level334                headers_to_remove = [335                    k for k, (_, lvl, d) in active_headers.items() if lvl >= level336                ]337                for key in headers_to_remove:338                    del active_headers[key]339340                # Add/Update the active header341                header_name = self.header_mapping[tag]342                active_headers[header_name] = (node_text, level, dom_depth)343344                # Always yield a Document for the header345                header_meta = {k: v[0] for k, v in active_headers.items()}346                yield Document(page_content=node_text, metadata=header_meta)347348            else:349                headers_out_of_scope = [350                    k for k, (_, _, d) in active_headers.items() if dom_depth < d351                ]352                for key in headers_out_of_scope:353                    del active_headers[key]354355                if self.return_each_element:356                    # Yield each element's text as its own Document357                    meta = {k: v[0] for k, v in active_headers.items()}358                    yield Document(page_content=node_text, metadata=meta)359                else:360                    # Accumulate text in our chunk361                    current_chunk.append(node_text)362363        # If we're aggregating and have leftover chunk, yield it364        if not self.return_each_element:365            doc = finalize_chunk()366            if doc:367                yield doc368369370class HTMLSectionSplitter:371    """Splitting HTML files based on specified tag and font sizes.372373    Requires lxml package.374    """375376    def __init__(377        self,378        headers_to_split_on: list[tuple[str, str]],379        **kwargs: Any,380    ) -> None:381        """Create a new `HTMLSectionSplitter`.382383        Args:384            headers_to_split_on: List of tuples of headers we want to track mapped to385                (arbitrary) keys for metadata.386387                Allowed header values: `h1`, `h2`, `h3`, `h4`, `h5`, `h6`, e.g.:388                `[("h1", "Header 1"), ("h2", "Header 2"]`.389            **kwargs: Additional optional arguments for customizations.390391        """392        self.headers_to_split_on = dict(headers_to_split_on)393        self.xslt_path = (394            pathlib.Path(__file__).parent / "xsl/converting_to_header.xslt"395        ).absolute()396        self.kwargs = kwargs397398    def split_documents(self, documents: Iterable[Document]) -> list[Document]:399        """Split documents.400401        Args:402            documents: Iterable of `Document` objects to be split.403404        Returns:405            A list of split `Document` objects.406        """407        texts, metadatas = [], []408        for doc in documents:409            texts.append(doc.page_content)410            metadatas.append(doc.metadata)411        results = self.create_documents(texts, metadatas=metadatas)412413        text_splitter = RecursiveCharacterTextSplitter(**self.kwargs)414415        return text_splitter.split_documents(results)416417    def split_text(self, text: str) -> list[Document]:418        """Split HTML text string.419420        Args:421            text: HTML text422423        Returns:424            A list of split `Document` objects.425        """426        return self.split_text_from_file(StringIO(text))427428    def create_documents(429        self, texts: list[str], metadatas: list[dict[Any, Any]] | None = None430    ) -> list[Document]:431        """Create a list of `Document` objects from a list of texts.432433        Args:434            texts: A list of texts to be split and converted into documents.435            metadatas: Optional list of metadata to associate with each document.436437        Returns:438            A list of `Document` objects.439        """440        metadatas_ = metadatas or [{}] * len(texts)441        documents = []442        for i, text in enumerate(texts):443            for chunk in self.split_text(text):444                metadata = copy.deepcopy(metadatas_[i])445446                for key in chunk.metadata:447                    if chunk.metadata[key] == "#TITLE#":448                        chunk.metadata[key] = metadata["Title"]449                metadata = {**metadata, **chunk.metadata}450                new_doc = Document(page_content=chunk.page_content, metadata=metadata)451                documents.append(new_doc)452        return documents453454    def split_html_by_headers(self, html_doc: str) -> list[dict[str, str | None]]:455        """Split an HTML document into sections based on specified header tags.456457        This method uses BeautifulSoup to parse the HTML content and divides it into458        sections based on headers defined in `headers_to_split_on`. Each section459        contains the header text, content under the header, and the tag name.460461        Args:462            html_doc: The HTML document to be split into sections.463464        Returns:465            A list of dictionaries representing sections.466467                Each dictionary contains:468469                * `'header'`: The header text or a default title for the first section.470                * `'content'`: The content under the header.471                * `'tag_name'`: The name of the header tag (e.g., `h1`, `h2`).472473        Raises:474            ImportError: If BeautifulSoup is not installed.475        """476        beautiful_soup, _, _ = _import_bs4(477            import_error_message=(478                "Unable to import BeautifulSoup/PageElement, "479                "please install with `pip install bs4`."480            )481        )482483        soup = beautiful_soup(html_doc, "html.parser")484        header_names = list(self.headers_to_split_on.keys())485        sections: list[dict[str, str | None]] = []486487        headers = _find_all_tags(soup, name=["body", *header_names])488489        for i, header in enumerate(headers):490            if i == 0:491                current_header = "#TITLE#"492                current_header_tag = "h1"493                section_content: list[str] = []494            else:495                current_header = header.text.strip()496                current_header_tag = header.name497                section_content = []498            for element in header.next_elements:499                if i + 1 < len(headers) and element == headers[i + 1]:500                    break501                if isinstance(element, str):502                    section_content.append(element)503            content = " ".join(section_content).strip()504505            if content:506                sections.append(507                    {508                        "header": current_header,509                        "content": content,510                        "tag_name": current_header_tag,511                    }512                )513514        return sections515516    def convert_possible_tags_to_header(self, html_content: str) -> str:517        """Convert specific HTML tags to headers using an XSLT transformation.518519        This method uses an XSLT file to transform the HTML content, converting520        certain tags into headers for easier parsing. If no XSLT path is provided,521        the HTML content is returned unchanged.522523        Args:524            html_content: The HTML content to be transformed.525526        Returns:527            The transformed HTML content as a string.528529        Raises:530            ImportError: If the `lxml` library is not installed.531        """532        etree = cast("Any", _import_lxml_etree())533        # use lxml library to parse html document and return xml ElementTree534        # Create secure parsers to prevent XXE attacks535        html_parser = etree.HTMLParser(no_network=True)536        xslt_parser = etree.XMLParser(537            resolve_entities=False, no_network=True, load_dtd=False538        )539540        # Apply XSLT access control to prevent file/network access541        # DENY_ALL is a predefined access control that blocks all file/network access542        ac = etree.XSLTAccessControl.DENY_ALL543544        tree = etree.parse(StringIO(html_content), html_parser)545        xslt_tree = etree.parse(self.xslt_path, xslt_parser)546        transform = etree.XSLT(xslt_tree, access_control=ac)547        result = transform(tree)548        return str(result)549550    def split_text_from_file(self, file: StringIO) -> list[Document]:551        """Split HTML content from a file into a list of `Document` objects.552553        Args:554            file: A file path or a file-like object containing HTML content.555556        Returns:557            A list of split `Document` objects.558        """559        file_content = file.getvalue()560        file_content = self.convert_possible_tags_to_header(file_content)561        sections = self.split_html_by_headers(file_content)562563        return [564            Document(565                cast("str", section["content"]),566                metadata={567                    self.headers_to_split_on[str(section["tag_name"])]: section[568                        "header"569                    ]570                },571            )572            for section in sections573        ]574575576@beta()577class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):578    """Split HTML content preserving semantic structure.579580    Splits HTML content by headers into generalized chunks, preserving semantic581    structure. If chunks exceed the maximum chunk size, it uses582    `RecursiveCharacterTextSplitter` for further splitting.583584    The splitter preserves full HTML elements and converts links to Markdown-like links.585    It can also preserve images, videos, and audio elements by converting them into586    Markdown format. Note that some chunks may exceed the maximum size to maintain587    semantic integrity.588589    !!! version-added "Added in `langchain-text-splitters` 0.3.5"590591    Example:592        ```python593        from langchain_text_splitters.html import HTMLSemanticPreservingSplitter594595        def custom_iframe_extractor(iframe_tag):596            ```597            Custom handler function to extract the 'src' attribute from an <iframe> tag.598            Converts the iframe to a Markdown-like link: [iframe:<src>](src).599600            Args:601                iframe_tag (bs4.element.Tag): The <iframe> tag to be processed.602603            Returns:604                str: A formatted string representing the iframe in Markdown-like format.605            ```606            iframe_src = iframe_tag.get('src', '')607            return f"[iframe:{iframe_src}]({iframe_src})"608609        text_splitter = HTMLSemanticPreservingSplitter(610            headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")],611            max_chunk_size=500,612            preserve_links=True,613            preserve_images=True,614            custom_handlers={"iframe": custom_iframe_extractor}615        )616        ```617    """  # noqa: D214618619    def __init__(620        self,621        headers_to_split_on: list[tuple[str, str]],622        *,623        max_chunk_size: int = 1000,624        chunk_overlap: int = 0,625        separators: list[str] | None = None,626        elements_to_preserve: list[str] | None = None,627        preserve_links: bool = False,628        preserve_images: bool = False,629        preserve_videos: bool = False,630        preserve_audio: bool = False,631        custom_handlers: dict[str, Callable[[Tag], str]] | None = None,632        stopword_removal: bool = False,633        stopword_lang: str = "english",634        normalize_text: bool = False,635        external_metadata: dict[str, str] | None = None,636        allowlist_tags: list[str] | None = None,637        denylist_tags: list[str] | None = None,638        preserve_parent_metadata: bool = False,639        keep_separator: bool | Literal["start", "end"] = True,640    ) -> None:641        """Initialize splitter.642643        Args:644            headers_to_split_on: HTML headers (e.g., `h1`, `h2`) that define content645                sections.646            max_chunk_size: Maximum size for each chunk, with allowance for exceeding647                this limit to preserve semantics.648            chunk_overlap: Number of characters to overlap between chunks to ensure649                contextual continuity.650            separators: Delimiters used by `RecursiveCharacterTextSplitter` for651                further splitting.652            elements_to_preserve: HTML tags (e.g., `table`, `ul`) to remain653                intact during splitting.654            preserve_links: Converts `a` tags to Markdown links (`[text](url)`).655            preserve_images: Converts `img` tags to Markdown images (`![alt](src)`).656            preserve_videos: Converts `video` tags to Markdown video links657                (`![video](src)`).658            preserve_audio: Converts `audio` tags to Markdown audio links659                (`![audio](src)`).660            custom_handlers: Optional custom handlers for specific HTML tags, allowing661                tailored extraction or processing.662            stopword_removal: Optionally remove stopwords from the text.663            stopword_lang: The language of stopwords to remove.664            normalize_text: Optionally normalize text (e.g., lowercasing, removing665                punctuation).666            external_metadata: Additional metadata to attach to the Document objects.667            allowlist_tags: Only these tags will be retained in the HTML.668            denylist_tags: These tags will be removed from the HTML.669            preserve_parent_metadata: Whether to pass through parent document metadata670                to split documents when calling671                `transform_documents/atransform_documents()`.672            keep_separator: Whether separators should be at the beginning of a chunk, at673                the end, or not at all.674675        Raises:676            ImportError: If BeautifulSoup or NLTK (when stopword removal is enabled)677                is not installed.678        """679        _import_bs4(680            import_error_message=(681                "Could not import BeautifulSoup. "682                "Please install it with 'pip install bs4'."683            )684        )685686        self._headers_to_split_on = sorted(headers_to_split_on)687        self._max_chunk_size = max_chunk_size688        self._elements_to_preserve = elements_to_preserve or []689        self._preserve_links = preserve_links690        self._preserve_images = preserve_images691        self._preserve_videos = preserve_videos692        self._preserve_audio = preserve_audio693        self._custom_handlers = custom_handlers or {}694        self._stopword_removal = stopword_removal695        self._stopword_lang = stopword_lang696        self._normalize_text = normalize_text697        self._external_metadata = external_metadata or {}698        self._allowlist_tags = allowlist_tags699        self._preserve_parent_metadata = preserve_parent_metadata700        self._keep_separator = keep_separator701        if allowlist_tags:702            self._allowlist_tags = list(703                set(allowlist_tags + [header[0] for header in headers_to_split_on])704            )705        self._denylist_tags = denylist_tags706        if denylist_tags:707            self._denylist_tags = [708                tag709                for tag in denylist_tags710                if tag not in [header[0] for header in headers_to_split_on]711            ]712        if separators:713            self._recursive_splitter = RecursiveCharacterTextSplitter(714                separators=separators,715                keep_separator=keep_separator,716                chunk_size=max_chunk_size,717                chunk_overlap=chunk_overlap,718            )719        else:720            self._recursive_splitter = RecursiveCharacterTextSplitter(721                keep_separator=keep_separator,722                chunk_size=max_chunk_size,723                chunk_overlap=chunk_overlap,724            )725726        if self._stopword_removal:727            nltk = cast("Any", _import_nltk())728            nltk.download("stopwords")729            self._stopwords = set(nltk.corpus.stopwords.words(self._stopword_lang))730731    def split_text(self, text: str) -> list[Document]:732        """Splits the provided HTML text into smaller chunks based on the configuration.733734        Args:735            text: The HTML content to be split.736737        Returns:738            A list of `Document` objects containing the split content.739        """740        beautiful_soup, _, _ = _import_bs4(741            import_error_message=(742                "Could not import BeautifulSoup. "743                "Please install it with 'pip install bs4'."744            )745        )746        soup = beautiful_soup(text, "html.parser")747748        self._process_media(soup)749750        if self._preserve_links:751            self._process_links(soup)752753        if self._allowlist_tags or self._denylist_tags:754            self._filter_tags(soup)755756        return self._process_html(soup)757758    @override759    def transform_documents(760        self, documents: Sequence[Document], **kwargs: Any761    ) -> list[Document]:762        """Transform sequence of documents by splitting them.763764        Args:765            documents: A sequence of `Document` objects to be split.766767        Returns:768            A sequence of split `Document` objects.769        """770        transformed = []771        for doc in documents:772            splits = self.split_text(doc.page_content)773            if self._preserve_parent_metadata:774                splits = [775                    Document(776                        page_content=split_doc.page_content,777                        metadata={**doc.metadata, **split_doc.metadata},778                    )779                    for split_doc in splits780                ]781            transformed.extend(splits)782        return transformed783784    def _process_media(self, soup: BeautifulSoup) -> None:785        """Processes the media elements.786787        Process elements in the HTML content by wrapping them in a <media-wrapper> tag788        and converting them to Markdown format.789790        Args:791            soup: Parsed HTML content using BeautifulSoup.792        """793        if self._preserve_images:794            for img_tag in _find_all_tags(soup, name="img"):795                img_src = img_tag.get("src", "")796                markdown_img = f"![image:{img_src}]({img_src})"797                wrapper = soup.new_tag("media-wrapper")798                wrapper.string = markdown_img799                img_tag.replace_with(wrapper)800801        if self._preserve_videos:802            for video_tag in _find_all_tags(soup, name="video"):803                video_src = video_tag.get("src", "")804                markdown_video = f"![video:{video_src}]({video_src})"805                wrapper = soup.new_tag("media-wrapper")806                wrapper.string = markdown_video807                video_tag.replace_with(wrapper)808809        if self._preserve_audio:810            for audio_tag in _find_all_tags(soup, name="audio"):811                audio_src = audio_tag.get("src", "")812                markdown_audio = f"![audio:{audio_src}]({audio_src})"813                wrapper = soup.new_tag("media-wrapper")814                wrapper.string = markdown_audio815                audio_tag.replace_with(wrapper)816817    @staticmethod818    def _process_links(soup: BeautifulSoup) -> None:819        """Processes the links in the HTML content.820821        Args:822            soup: Parsed HTML content using BeautifulSoup.823        """824        _, _, navigable_string = _import_bs4(825            import_error_message=(826                "Could not import BeautifulSoup. "827                "Please install it with 'pip install bs4'."828            )829        )830        for a_tag in _find_all_tags(soup, name="a"):831            a_href = a_tag.get("href", "")832            a_text = a_tag.get_text(strip=True)833            markdown_link = f"[{a_text}]({a_href})"834            wrapper = soup.new_tag("link-wrapper")835            wrapper.string = markdown_link836            a_tag.replace_with(navigable_string(markdown_link))837838    def _filter_tags(self, soup: BeautifulSoup) -> None:839        """Filters the HTML content based on the allowlist and denylist tags.840841        Args:842            soup: Parsed HTML content using BeautifulSoup.843        """844        if self._allowlist_tags:845            for tag in _find_all_tags(soup, name=True):846                if tag.name not in self._allowlist_tags:847                    tag.decompose()848849        if self._denylist_tags:850            for tag in _find_all_tags(soup, name=self._denylist_tags):851                tag.decompose()852853    def _normalize_and_clean_text(self, text: str) -> str:854        """Normalizes the text by removing extra spaces and newlines.855856        Args:857            text: The text to be normalized.858859        Returns:860            The normalized text.861        """862        if self._normalize_text:863            text = text.lower()864            text = re.sub(r"[^\w\s]", "", text)865            text = re.sub(r"\s+", " ", text).strip()866867        if self._stopword_removal:868            text = " ".join(869                [word for word in text.split() if word not in self._stopwords]870            )871872        return text873874    def _process_html(self, soup: BeautifulSoup) -> list[Document]:875        """Processes the HTML content using BeautifulSoup and splits it using headers.876877        Args:878            soup: Parsed HTML content using BeautifulSoup.879880        Returns:881            A list of `Document` objects containing the split content.882        """883        _, tag_cls, _ = _import_bs4(884            import_error_message=(885                "Could not import BeautifulSoup. "886                "Please install it with 'pip install bs4'."887            )888        )889        documents: list[Document] = []890        current_headers: dict[str, str] = {}891        current_content: list[str] = []892        preserved_elements: dict[str, str] = {}893        placeholder_count: int = 0894895        def _get_element_text(element: PageElement) -> str:896            """Recursively extracts and processes the text of an element.897898            Applies custom handlers where applicable, and ensures correct spacing.899900            Args:901                element: The HTML element to process.902903            Returns:904                The processed text of the element, or an empty string for905                    elements with no extractable text.906            """907            if isinstance(element, tag_cls):908                if element.name in self._custom_handlers:909                    return self._custom_handlers[element.name](element)910911                text = ""912                for child in element.children:913                    child_text = _get_element_text(child).strip()914                    if text and child_text:915                        text += " "916                    text += child_text917918                return self._normalize_and_clean_text(text)919920            if hasattr(element, "string") and isinstance(element.string, str):921                return self._normalize_and_clean_text(element.string)922923            return ""924925        elements = _find_all_tags(soup, recursive=False)926927        def _process_element(928            element: ResultSet[Tag],929            documents: list[Document],930            current_headers: dict[str, str],931            current_content: list[str],932            preserved_elements: dict[str, str],933            placeholder_count: int,934        ) -> tuple[list[Document], dict[str, str], list[str], dict[str, str], int]:935            for elem in element:936                if elem.name in [h[0] for h in self._headers_to_split_on]:937                    if current_content:938                        documents.extend(939                            self._create_documents(940                                current_headers,941                                " ".join(current_content),942                                preserved_elements,943                            )944                        )945                        current_content.clear()946                        preserved_elements.clear()947                    header_name = elem.get_text(strip=True)948                    current_headers = {949                        dict(self._headers_to_split_on)[elem.name]: header_name950                    }951                elif elem.name in self._elements_to_preserve:952                    placeholder = f"PRESERVED_{placeholder_count}"953                    preserved_elements[placeholder] = _get_element_text(elem)954                    current_content.append(placeholder)955                    placeholder_count += 1956                else:957                    # Recursively process children to find nested headers or958                    # preserved elements.959                    children = _find_all_tags(elem, recursive=False)960                    if children:961                        # Element has children - recursively process them.962                        (963                            documents,964                            current_headers,965                            current_content,966                            preserved_elements,967                            placeholder_count,968                        ) = _process_element(969                            children,970                            documents,971                            current_headers,972                            current_content,973                            preserved_elements,974                            placeholder_count,975                        )976                        # After processing children, extract only text977                        # strings from this element (not its children). Used978                        # recursive=False to avoid double-counting.979                        content = " ".join(_find_all_strings(elem, recursive=False))980                        if content:981                            content = self._normalize_and_clean_text(content)982                            current_content.append(content)983                    else:984                        # Leaf element with no children, so we extract its985                        # text and add to current content. Handles986                        # text-only elements like <p>, <span>, <div>987                        content = _get_element_text(elem)988                        if content:989                            current_content.append(content)990991            return (992                documents,993                current_headers,994                current_content,995                preserved_elements,996                placeholder_count,997            )998999        # Process the elements1000        (1001            documents,1002            current_headers,1003            current_content,1004            preserved_elements,1005            placeholder_count,1006        ) = _process_element(1007            elements,1008            documents,1009            current_headers,1010            current_content,1011            preserved_elements,1012            placeholder_count,1013        )10141015        # Handle any remaining content1016        if current_content:1017            documents.extend(1018                self._create_documents(1019                    current_headers,1020                    " ".join(current_content),1021                    preserved_elements,1022                )1023            )10241025        return documents10261027    def _create_documents(1028        self, headers: dict[str, str], content: str, preserved_elements: dict[str, str]1029    ) -> list[Document]:1030        """Creates Document objects from the provided headers, content, and elements.10311032        Args:1033            headers: The headers to attach as metadata to the `Document`.1034            content: The content of the `Document`.1035            preserved_elements: Preserved elements to be reinserted into the content.10361037        Returns:1038            A list of `Document` objects.1039        """1040        content = re.sub(r"\s+", " ", content).strip()10411042        metadata = {**headers, **self._external_metadata}10431044        if len(content) <= self._max_chunk_size:1045            page_content = self._reinsert_preserved_elements(1046                content, preserved_elements1047            )1048            return [Document(page_content=page_content, metadata=metadata)]1049        return self._further_split_chunk(content, metadata, preserved_elements)10501051    def _further_split_chunk(1052        self, content: str, metadata: dict[Any, Any], preserved_elements: dict[str, str]1053    ) -> list[Document]:1054        """Further splits the content into smaller chunks.10551056        Args:1057            content: The content to be split.1058            metadata: Metadata to attach to each chunk.1059            preserved_elements: Preserved elements to be reinserted into each chunk.10601061        Returns:1062            A list of `Document` objects containing the split content.1063        """1064        splits = self._recursive_splitter.split_text(content)1065        result = []10661067        for split in splits:1068            split_with_preserved = self._reinsert_preserved_elements(1069                split, preserved_elements1070            )1071            if split_with_preserved.strip():1072                result.append(1073                    Document(1074                        page_content=split_with_preserved.strip(),1075                        metadata=metadata,1076                    )1077                )10781079        return result10801081    @staticmethod1082    def _reinsert_preserved_elements(1083        content: str, preserved_elements: dict[str, str]1084    ) -> str:1085        """Reinserts preserved elements into the content into their original positions.10861087        Args:1088            content: The content where placeholders need to be replaced.1089            preserved_elements: Preserved elements to be reinserted.10901091        Returns:1092            The content with placeholders replaced by preserved elements.1093        """1094        for placeholder, preserved_content in reversed(preserved_elements.items()):1095            content = content.replace(placeholder, preserved_content.strip())1096        return content109710981099# %%