# Source: libs/text-splitters/langchain_text_splitters/html.py (Python, 1,078 lines; viewed on github.com)
1"""HTML text splitters."""23from __future__ import annotations45import copy6import pathlib7import re8from io import StringIO9from typing import (10    IO,11    TYPE_CHECKING,12    Any,13    Literal,14    TypedDict,15    cast,16)1718from langchain_core._api import beta, deprecated19from langchain_core.documents import BaseDocumentTransformer, Document20from typing_extensions import override2122from langchain_text_splitters.character import RecursiveCharacterTextSplitter2324if TYPE_CHECKING:25    from collections.abc import Callable, Iterable, Iterator, Sequence2627    from bs4.element import ResultSet2829try:30    import nltk3132    _HAS_NLTK = True33except ImportError:34    _HAS_NLTK = False3536try:37    from bs4 import BeautifulSoup, Tag38    from bs4.element import NavigableString, PageElement3940    _HAS_BS4 = True41except ImportError:42    _HAS_BS4 = False4344try:45    from lxml import etree4647    _HAS_LXML = True48except ImportError:49    _HAS_LXML = False505152class ElementType(TypedDict):53    """Element type as typed dict."""5455    url: str56    xpath: str57    content: str58    metadata: dict[str, str]596061# Unfortunately, BeautifulSoup doesn't define overloads for Tag.find_all.62# So doing the type resolution ourselves.636465def _find_all_strings(66    tag: Tag,67    *,68    recursive: bool = True,69) -> ResultSet[NavigableString]:70    return tag.find_all(string=True, recursive=recursive)717273def _find_all_tags(74    tag: Tag,75    *,76    name: bool | str | list[str] | None = None,77    recursive: bool = True,78) -> ResultSet[Tag]:79    return tag.find_all(name, recursive=recursive)808182class HTMLHeaderTextSplitter:83    """Split HTML content into structured Documents based on specified headers.8485    Splits HTML content by detecting specified header tags and creating hierarchical86    `Document` objects that reflect the semantic structure of the original content. 
For87    each identified section, the splitter associates the extracted text with metadata88    corresponding to the encountered headers.8990    If no specified headers are found, the entire content is returned as a single91    `Document`. This allows for flexible handling of HTML input, ensuring that92    information is organized according to its semantic headers.9394    The splitter provides the option to return each HTML element as a separate95    `Document` or aggregate them into semantically meaningful chunks. It also96    gracefully handles multiple levels of nested headers, creating a rich,97    hierarchical representation of the content.9899    Example:100        ```python101        from langchain_text_splitters.html_header_text_splitter import (102            HTMLHeaderTextSplitter,103        )104105        # Define headers for splitting on h1 and h2 tags.106        headers_to_split_on = [("h1", "Main Topic"), ("h2", "Sub Topic")]107108        splitter = HTMLHeaderTextSplitter(109            headers_to_split_on=headers_to_split_on,110            return_each_element=False111        )112113        html_content = \"\"\"114        <html>115            <body>116                <h1>Introduction</h1>117                <p>Welcome to the introduction section.</p>118                <h2>Background</h2>119                <p>Some background details here.</p>120                <h1>Conclusion</h1>121                <p>Final thoughts.</p>122            </body>123        </html>124        \"\"\"125126        documents = splitter.split_text(html_content)127128        # 'documents' now contains Document objects reflecting the hierarchy:129        # - Document with metadata={"Main Topic": "Introduction"} and130        #   content="Introduction"131        # - Document with metadata={"Main Topic": "Introduction"} and132        #   content="Welcome to the introduction section."133        # - Document with metadata={"Main Topic": "Introduction",134        #   "Sub Topic": 
"Background"} and content="Background"135        # - Document with metadata={"Main Topic": "Introduction",136        #   "Sub Topic": "Background"} and content="Some background details here."137        # - Document with metadata={"Main Topic": "Conclusion"} and138        #   content="Conclusion"139        # - Document with metadata={"Main Topic": "Conclusion"} and140        #   content="Final thoughts."141        ```142    """143144    def __init__(145        self,146        headers_to_split_on: list[tuple[str, str]],147        return_each_element: bool = False,  # noqa: FBT001,FBT002148    ) -> None:149        """Initialize with headers to split on.150151        Args:152            headers_to_split_on: A list of `(header_tag,153                header_name)` pairs representing the headers that define splitting154                boundaries.155156                For example, `[("h1", "Header 1"), ("h2", "Header 2")]` will split157                content by `h1` and `h2` tags, assigning their textual content to the158                `Document` metadata.159            return_each_element: If `True`, every HTML element encountered160                (including headers, paragraphs, etc.) 
is returned as a separate161                `Document`.162163                If `False`, content under the same header hierarchy is aggregated into164                fewer `Document` objects.165        """166        # Sort headers by their numeric level so that h1 < h2 < h3...167        self.headers_to_split_on = sorted(168            headers_to_split_on, key=lambda x: int(x[0][1:])169        )170        self.header_mapping = dict(self.headers_to_split_on)171        self.header_tags = [tag for tag, _ in self.headers_to_split_on]172        self.return_each_element = return_each_element173174    def split_text(self, text: str) -> list[Document]:175        """Split the given text into a list of `Document` objects.176177        Args:178            text: The HTML text to split.179180        Returns:181            A list of split `Document` objects.182183                Each `Document` contains `page_content` holding the extracted text and184                `metadata` that maps the header hierarchy to their corresponding titles.185        """186        return self.split_text_from_file(StringIO(text))187188    @deprecated(189        since="1.1.2",190        removal="2.0.0",191        addendum=(192            "Fetch the HTML content from the URL yourself and pass it to `split_text`."193        ),194    )195    def split_text_from_url(196        self,197        url: str,198        timeout: int = 10,199        **kwargs: Any,  # noqa: ARG002200    ) -> list[Document]:201        """Fetch text content from a URL and split it into documents.202203        Args:204            url: The URL to fetch content from.205            timeout: Timeout for the request.206            **kwargs: Additional keyword arguments for the request.207208        Returns:209            A list of split `Document` objects.210211                Each `Document` contains `page_content` holding the extracted text and212                `metadata` that maps the header hierarchy to their corresponding 
titles.213214        Raises:215            requests.RequestException: If the HTTP request fails.216        """217        from langchain_core._security._transport import (  # noqa: PLC0415218            ssrf_safe_client,219        )220221        with ssrf_safe_client() as client:222            response = client.get(url, timeout=timeout)223            response.raise_for_status()224            return self.split_text(response.text)225226    def split_text_from_file(self, file: str | IO[str]) -> list[Document]:227        """Split HTML content from a file into a list of `Document` objects.228229        Args:230            file: A file path or a file-like object containing HTML content.231232        Returns:233            A list of split `Document` objects.234235                Each `Document` contains `page_content` holding the extracted text and236                `metadata` that maps the header hierarchy to their corresponding titles.237        """238        if isinstance(file, str):239            html_content = pathlib.Path(file).read_text(encoding="utf-8")240        else:241            html_content = file.read()242        return list(self._generate_documents(html_content))243244    def _generate_documents(self, html_content: str) -> Iterator[Document]:245        """Private method that performs a DFS traversal over the DOM and yields.246247        Document objects on-the-fly. This approach maintains the same splitting logic248        (headers vs. non-headers, chunking, etc.) while walking the DOM explicitly in249        code.250251        Args:252            html_content: The raw HTML content.253254        Yields:255            Document objects as they are created.256257        Raises:258            ImportError: If BeautifulSoup is not installed.259        """260        if not _HAS_BS4:261            msg = (262                "Unable to import BeautifulSoup. 
Please install via `pip install bs4`."263            )264            raise ImportError(msg)265266        soup = BeautifulSoup(html_content, "html.parser")267        body = soup.body or soup268269        # Dictionary of active headers:270        #   key = user-defined header name (e.g. "Header 1")271        #   value = tuple of header_text, level, dom_depth272        active_headers: dict[str, tuple[str, int, int]] = {}273        current_chunk: list[str] = []274275        def finalize_chunk() -> Document | None:276            """Finalize the accumulated chunk into a single Document."""277            if not current_chunk:278                return None279280            final_text = "  \n".join(line for line in current_chunk if line.strip())281            current_chunk.clear()282            if not final_text.strip():283                return None284285            final_meta = {k: v[0] for k, v in active_headers.items()}286            return Document(page_content=final_text, metadata=final_meta)287288        # We'll use a stack for DFS traversal289        stack = [body]290        while stack:291            node = stack.pop()292            children = list(node.children)293294            stack.extend(295                child for child in reversed(children) if isinstance(child, Tag)296            )297298            tag = getattr(node, "name", None)299            if not tag:300                continue301302            text_elements = [303                str(child).strip() for child in _find_all_strings(node, recursive=False)304            ]305            node_text = " ".join(elem for elem in text_elements if elem)306            if not node_text:307                continue308309            dom_depth = len(list(node.parents))310311            # If this node is one of our headers312            if tag in self.header_tags:313                # If we're aggregating, finalize whatever chunk we had314                if not self.return_each_element:315                    doc = 
finalize_chunk()316                    if doc:317                        yield doc318319                # Determine numeric level (h1->1, h2->2, etc.)320                try:321                    level = int(tag[1:])322                except ValueError:323                    level = 9999324325                # Remove any active headers that are at or deeper than this new level326                headers_to_remove = [327                    k for k, (_, lvl, d) in active_headers.items() if lvl >= level328                ]329                for key in headers_to_remove:330                    del active_headers[key]331332                # Add/Update the active header333                header_name = self.header_mapping[tag]334                active_headers[header_name] = (node_text, level, dom_depth)335336                # Always yield a Document for the header337                header_meta = {k: v[0] for k, v in active_headers.items()}338                yield Document(page_content=node_text, metadata=header_meta)339340            else:341                headers_out_of_scope = [342                    k for k, (_, _, d) in active_headers.items() if dom_depth < d343                ]344                for key in headers_out_of_scope:345                    del active_headers[key]346347                if self.return_each_element:348                    # Yield each element's text as its own Document349                    meta = {k: v[0] for k, v in active_headers.items()}350                    yield Document(page_content=node_text, metadata=meta)351                else:352                    # Accumulate text in our chunk353                    current_chunk.append(node_text)354355        # If we're aggregating and have leftover chunk, yield it356        if not self.return_each_element:357            doc = finalize_chunk()358            if doc:359                yield doc360361362class HTMLSectionSplitter:363    """Splitting HTML files based on specified tag and font 
sizes.364365    Requires lxml package.366    """367368    def __init__(369        self,370        headers_to_split_on: list[tuple[str, str]],371        **kwargs: Any,372    ) -> None:373        """Create a new `HTMLSectionSplitter`.374375        Args:376            headers_to_split_on: List of tuples of headers we want to track mapped to377                (arbitrary) keys for metadata.378379                Allowed header values: `h1`, `h2`, `h3`, `h4`, `h5`, `h6`, e.g.:380                `[("h1", "Header 1"), ("h2", "Header 2"]`.381            **kwargs: Additional optional arguments for customizations.382383        """384        self.headers_to_split_on = dict(headers_to_split_on)385        self.xslt_path = (386            pathlib.Path(__file__).parent / "xsl/converting_to_header.xslt"387        ).absolute()388        self.kwargs = kwargs389390    def split_documents(self, documents: Iterable[Document]) -> list[Document]:391        """Split documents.392393        Args:394            documents: Iterable of `Document` objects to be split.395396        Returns:397            A list of split `Document` objects.398        """399        texts, metadatas = [], []400        for doc in documents:401            texts.append(doc.page_content)402            metadatas.append(doc.metadata)403        results = self.create_documents(texts, metadatas=metadatas)404405        text_splitter = RecursiveCharacterTextSplitter(**self.kwargs)406407        return text_splitter.split_documents(results)408409    def split_text(self, text: str) -> list[Document]:410        """Split HTML text string.411412        Args:413            text: HTML text414415        Returns:416            A list of split `Document` objects.417        """418        return self.split_text_from_file(StringIO(text))419420    def create_documents(421        self, texts: list[str], metadatas: list[dict[Any, Any]] | None = None422    ) -> list[Document]:423        """Create a list of `Document` objects from a list of 
texts.424425        Args:426            texts: A list of texts to be split and converted into documents.427            metadatas: Optional list of metadata to associate with each document.428429        Returns:430            A list of `Document` objects.431        """432        metadatas_ = metadatas or [{}] * len(texts)433        documents = []434        for i, text in enumerate(texts):435            for chunk in self.split_text(text):436                metadata = copy.deepcopy(metadatas_[i])437438                for key in chunk.metadata:439                    if chunk.metadata[key] == "#TITLE#":440                        chunk.metadata[key] = metadata["Title"]441                metadata = {**metadata, **chunk.metadata}442                new_doc = Document(page_content=chunk.page_content, metadata=metadata)443                documents.append(new_doc)444        return documents445446    def split_html_by_headers(self, html_doc: str) -> list[dict[str, str | None]]:447        """Split an HTML document into sections based on specified header tags.448449        This method uses BeautifulSoup to parse the HTML content and divides it into450        sections based on headers defined in `headers_to_split_on`. 
Each section451        contains the header text, content under the header, and the tag name.452453        Args:454            html_doc: The HTML document to be split into sections.455456        Returns:457            A list of dictionaries representing sections.458459                Each dictionary contains:460461                * `'header'`: The header text or a default title for the first section.462                * `'content'`: The content under the header.463                * `'tag_name'`: The name of the header tag (e.g., `h1`, `h2`).464465        Raises:466            ImportError: If BeautifulSoup is not installed.467        """468        if not _HAS_BS4:469            msg = "Unable to import BeautifulSoup/PageElement, \470                    please install with `pip install \471                    bs4`."472            raise ImportError(msg)473474        soup = BeautifulSoup(html_doc, "html.parser")475        header_names = list(self.headers_to_split_on.keys())476        sections: list[dict[str, str | None]] = []477478        headers = _find_all_tags(soup, name=["body", *header_names])479480        for i, header in enumerate(headers):481            if i == 0:482                current_header = "#TITLE#"483                current_header_tag = "h1"484                section_content: list[str] = []485            else:486                current_header = header.text.strip()487                current_header_tag = header.name488                section_content = []489            for element in header.next_elements:490                if i + 1 < len(headers) and element == headers[i + 1]:491                    break492                if isinstance(element, str):493                    section_content.append(element)494            content = " ".join(section_content).strip()495496            if content:497                sections.append(498                    {499                        "header": current_header,500                        "content": content,501            
            "tag_name": current_header_tag,502                    }503                )504505        return sections506507    def convert_possible_tags_to_header(self, html_content: str) -> str:508        """Convert specific HTML tags to headers using an XSLT transformation.509510        This method uses an XSLT file to transform the HTML content, converting511        certain tags into headers for easier parsing. If no XSLT path is provided,512        the HTML content is returned unchanged.513514        Args:515            html_content: The HTML content to be transformed.516517        Returns:518            The transformed HTML content as a string.519520        Raises:521            ImportError: If the `lxml` library is not installed.522        """523        if not _HAS_LXML:524            msg = "Unable to import lxml, please install with `pip install lxml`."525            raise ImportError(msg)526        # use lxml library to parse html document and return xml ElementTree527        # Create secure parsers to prevent XXE attacks528        html_parser = etree.HTMLParser(no_network=True)529        xslt_parser = etree.XMLParser(530            resolve_entities=False, no_network=True, load_dtd=False531        )532533        # Apply XSLT access control to prevent file/network access534        # DENY_ALL is a predefined access control that blocks all file/network access535        # Type ignore needed due to incomplete lxml type stubs536        ac = etree.XSLTAccessControl.DENY_ALL  # type: ignore[attr-defined]537538        tree = etree.parse(StringIO(html_content), html_parser)539        xslt_tree = etree.parse(self.xslt_path, xslt_parser)540        transform = etree.XSLT(xslt_tree, access_control=ac)541        result = transform(tree)542        return str(result)543544    def split_text_from_file(self, file: StringIO) -> list[Document]:545        """Split HTML content from a file into a list of `Document` objects.546547        Args:548            file: A file path or a 
file-like object containing HTML content.549550        Returns:551            A list of split `Document` objects.552        """553        file_content = file.getvalue()554        file_content = self.convert_possible_tags_to_header(file_content)555        sections = self.split_html_by_headers(file_content)556557        return [558            Document(559                cast("str", section["content"]),560                metadata={561                    self.headers_to_split_on[str(section["tag_name"])]: section[562                        "header"563                    ]564                },565            )566            for section in sections567        ]568569570@beta()571class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):572    """Split HTML content preserving semantic structure.573574    Splits HTML content by headers into generalized chunks, preserving semantic575    structure. If chunks exceed the maximum chunk size, it uses576    `RecursiveCharacterTextSplitter` for further splitting.577578    The splitter preserves full HTML elements and converts links to Markdown-like links.579    It can also preserve images, videos, and audio elements by converting them into580    Markdown format. Note that some chunks may exceed the maximum size to maintain581    semantic integrity.582583    !!! 
version-added "Added in `langchain-text-splitters` 0.3.5"584585    Example:586        ```python587        from langchain_text_splitters.html import HTMLSemanticPreservingSplitter588589        def custom_iframe_extractor(iframe_tag):590            ```591            Custom handler function to extract the 'src' attribute from an <iframe> tag.592            Converts the iframe to a Markdown-like link: [iframe:<src>](src).593594            Args:595                iframe_tag (bs4.element.Tag): The <iframe> tag to be processed.596597            Returns:598                str: A formatted string representing the iframe in Markdown-like format.599            ```600            iframe_src = iframe_tag.get('src', '')601            return f"[iframe:{iframe_src}]({iframe_src})"602603        text_splitter = HTMLSemanticPreservingSplitter(604            headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")],605            max_chunk_size=500,606            preserve_links=True,607            preserve_images=True,608            custom_handlers={"iframe": custom_iframe_extractor}609        )610        ```611    """  # noqa: D214612613    def __init__(614        self,615        headers_to_split_on: list[tuple[str, str]],616        *,617        max_chunk_size: int = 1000,618        chunk_overlap: int = 0,619        separators: list[str] | None = None,620        elements_to_preserve: list[str] | None = None,621        preserve_links: bool = False,622        preserve_images: bool = False,623        preserve_videos: bool = False,624        preserve_audio: bool = False,625        custom_handlers: dict[str, Callable[[Tag], str]] | None = None,626        stopword_removal: bool = False,627        stopword_lang: str = "english",628        normalize_text: bool = False,629        external_metadata: dict[str, str] | None = None,630        allowlist_tags: list[str] | None = None,631        denylist_tags: list[str] | None = None,632        preserve_parent_metadata: bool = False,633        
keep_separator: bool | Literal["start", "end"] = True,634    ) -> None:635        """Initialize splitter.636637        Args:638            headers_to_split_on: HTML headers (e.g., `h1`, `h2`) that define content639                sections.640            max_chunk_size: Maximum size for each chunk, with allowance for exceeding641                this limit to preserve semantics.642            chunk_overlap: Number of characters to overlap between chunks to ensure643                contextual continuity.644            separators: Delimiters used by `RecursiveCharacterTextSplitter` for645                further splitting.646            elements_to_preserve: HTML tags (e.g., `table`, `ul`) to remain647                intact during splitting.648            preserve_links: Converts `a` tags to Markdown links (`[text](url)`).649            preserve_images: Converts `img` tags to Markdown images (`![alt](src)`).650            preserve_videos: Converts `video` tags to Markdown video links651                (`![video](src)`).652            preserve_audio: Converts `audio` tags to Markdown audio links653                (`![audio](src)`).654            custom_handlers: Optional custom handlers for specific HTML tags, allowing655                tailored extraction or processing.656            stopword_removal: Optionally remove stopwords from the text.657            stopword_lang: The language of stopwords to remove.658            normalize_text: Optionally normalize text (e.g., lowercasing, removing659                punctuation).660            external_metadata: Additional metadata to attach to the Document objects.661            allowlist_tags: Only these tags will be retained in the HTML.662            denylist_tags: These tags will be removed from the HTML.663            preserve_parent_metadata: Whether to pass through parent document metadata664                to split documents when calling665                `transform_documents/atransform_documents()`.666            
keep_separator: Whether separators should be at the beginning of a chunk, at667                the end, or not at all.668669        Raises:670            ImportError: If BeautifulSoup or NLTK (when stopword removal is enabled)671                is not installed.672        """673        if not _HAS_BS4:674            msg = (675                "Could not import BeautifulSoup. "676                "Please install it with 'pip install bs4'."677            )678            raise ImportError(msg)679680        self._headers_to_split_on = sorted(headers_to_split_on)681        self._max_chunk_size = max_chunk_size682        self._elements_to_preserve = elements_to_preserve or []683        self._preserve_links = preserve_links684        self._preserve_images = preserve_images685        self._preserve_videos = preserve_videos686        self._preserve_audio = preserve_audio687        self._custom_handlers = custom_handlers or {}688        self._stopword_removal = stopword_removal689        self._stopword_lang = stopword_lang690        self._normalize_text = normalize_text691        self._external_metadata = external_metadata or {}692        self._allowlist_tags = allowlist_tags693        self._preserve_parent_metadata = preserve_parent_metadata694        self._keep_separator = keep_separator695        if allowlist_tags:696            self._allowlist_tags = list(697                set(allowlist_tags + [header[0] for header in headers_to_split_on])698            )699        self._denylist_tags = denylist_tags700        if denylist_tags:701            self._denylist_tags = [702                tag703                for tag in denylist_tags704                if tag not in [header[0] for header in headers_to_split_on]705            ]706        if separators:707            self._recursive_splitter = RecursiveCharacterTextSplitter(708                separators=separators,709                keep_separator=keep_separator,710                chunk_size=max_chunk_size,711                
chunk_overlap=chunk_overlap,712            )713        else:714            self._recursive_splitter = RecursiveCharacterTextSplitter(715                keep_separator=keep_separator,716                chunk_size=max_chunk_size,717                chunk_overlap=chunk_overlap,718            )719720        if self._stopword_removal:721            if not _HAS_NLTK:722                msg = (723                    "Could not import nltk. Please install it with 'pip install nltk'."724                )725                raise ImportError(msg)726            nltk.download("stopwords")727            self._stopwords = set(nltk.corpus.stopwords.words(self._stopword_lang))728729    def split_text(self, text: str) -> list[Document]:730        """Splits the provided HTML text into smaller chunks based on the configuration.731732        Args:733            text: The HTML content to be split.734735        Returns:736            A list of `Document` objects containing the split content.737        """738        soup = BeautifulSoup(text, "html.parser")739740        self._process_media(soup)741742        if self._preserve_links:743            self._process_links(soup)744745        if self._allowlist_tags or self._denylist_tags:746            self._filter_tags(soup)747748        return self._process_html(soup)749750    @override751    def transform_documents(752        self, documents: Sequence[Document], **kwargs: Any753    ) -> list[Document]:754        """Transform sequence of documents by splitting them.755756        Args:757            documents: A sequence of `Document` objects to be split.758759        Returns:760            A sequence of split `Document` objects.761        """762        transformed = []763        for doc in documents:764            splits = self.split_text(doc.page_content)765            if self._preserve_parent_metadata:766                splits = [767                    Document(768                        page_content=split_doc.page_content,769                  
      metadata={**doc.metadata, **split_doc.metadata},770                    )771                    for split_doc in splits772                ]773            transformed.extend(splits)774        return transformed775776    def _process_media(self, soup: BeautifulSoup) -> None:777        """Processes the media elements.778779        Process elements in the HTML content by wrapping them in a <media-wrapper> tag780        and converting them to Markdown format.781782        Args:783            soup: Parsed HTML content using BeautifulSoup.784        """785        if self._preserve_images:786            for img_tag in _find_all_tags(soup, name="img"):787                img_src = img_tag.get("src", "")788                markdown_img = f"![image:{img_src}]({img_src})"789                wrapper = soup.new_tag("media-wrapper")790                wrapper.string = markdown_img791                img_tag.replace_with(wrapper)792793        if self._preserve_videos:794            for video_tag in _find_all_tags(soup, name="video"):795                video_src = video_tag.get("src", "")796                markdown_video = f"![video:{video_src}]({video_src})"797                wrapper = soup.new_tag("media-wrapper")798                wrapper.string = markdown_video799                video_tag.replace_with(wrapper)800801        if self._preserve_audio:802            for audio_tag in _find_all_tags(soup, name="audio"):803                audio_src = audio_tag.get("src", "")804                markdown_audio = f"![audio:{audio_src}]({audio_src})"805                wrapper = soup.new_tag("media-wrapper")806                wrapper.string = markdown_audio807                audio_tag.replace_with(wrapper)808809    @staticmethod810    def _process_links(soup: BeautifulSoup) -> None:811        """Processes the links in the HTML content.812813        Args:814            soup: Parsed HTML content using BeautifulSoup.815        """816        for a_tag in _find_all_tags(soup, name="a"):817        
    a_href = a_tag.get("href", "")818            a_text = a_tag.get_text(strip=True)819            markdown_link = f"[{a_text}]({a_href})"820            wrapper = soup.new_tag("link-wrapper")821            wrapper.string = markdown_link822            a_tag.replace_with(NavigableString(markdown_link))823824    def _filter_tags(self, soup: BeautifulSoup) -> None:825        """Filters the HTML content based on the allowlist and denylist tags.826827        Args:828            soup: Parsed HTML content using BeautifulSoup.829        """830        if self._allowlist_tags:831            for tag in _find_all_tags(soup, name=True):832                if tag.name not in self._allowlist_tags:833                    tag.decompose()834835        if self._denylist_tags:836            for tag in _find_all_tags(soup, name=self._denylist_tags):837                tag.decompose()838839    def _normalize_and_clean_text(self, text: str) -> str:840        """Normalizes the text by removing extra spaces and newlines.841842        Args:843            text: The text to be normalized.844845        Returns:846            The normalized text.847        """848        if self._normalize_text:849            text = text.lower()850            text = re.sub(r"[^\w\s]", "", text)851            text = re.sub(r"\s+", " ", text).strip()852853        if self._stopword_removal:854            text = " ".join(855                [word for word in text.split() if word not in self._stopwords]856            )857858        return text859860    def _process_html(self, soup: BeautifulSoup) -> list[Document]:861        """Processes the HTML content using BeautifulSoup and splits it using headers.862863        Args:864            soup: Parsed HTML content using BeautifulSoup.865866        Returns:867            A list of `Document` objects containing the split content.868        """869        documents: list[Document] = []870        current_headers: dict[str, str] = {}871        current_content: list[str] = []872     
   preserved_elements: dict[str, str] = {}873        placeholder_count: int = 0874875        def _get_element_text(element: PageElement) -> str:876            """Recursively extracts and processes the text of an element.877878            Applies custom handlers where applicable, and ensures correct spacing.879880            Args:881                element: The HTML element to process.882883            Returns:884                The processed text of the element.885            """886            element = cast("Tag | NavigableString", element)887            if element.name in self._custom_handlers:888                return self._custom_handlers[element.name](element)889890            text = ""891892            if element.name is not None:893                for child in element.children:894                    child_text = _get_element_text(child).strip()895                    if text and child_text:896                        text += " "897                    text += child_text898            elif element.string:899                text += element.string900901            return self._normalize_and_clean_text(text)902903        elements = _find_all_tags(soup, recursive=False)904905        def _process_element(906            element: ResultSet[Tag],907            documents: list[Document],908            current_headers: dict[str, str],909            current_content: list[str],910            preserved_elements: dict[str, str],911            placeholder_count: int,912        ) -> tuple[list[Document], dict[str, str], list[str], dict[str, str], int]:913            for elem in element:914                if elem.name in [h[0] for h in self._headers_to_split_on]:915                    if current_content:916                        documents.extend(917                            self._create_documents(918                                current_headers,919                                " ".join(current_content),920                                preserved_elements,921              
              )922                        )923                        current_content.clear()924                        preserved_elements.clear()925                    header_name = elem.get_text(strip=True)926                    current_headers = {927                        dict(self._headers_to_split_on)[elem.name]: header_name928                    }929                elif elem.name in self._elements_to_preserve:930                    placeholder = f"PRESERVED_{placeholder_count}"931                    preserved_elements[placeholder] = _get_element_text(elem)932                    current_content.append(placeholder)933                    placeholder_count += 1934                else:935                    # Recursively process children to find nested headers or936                    # preserved elements.937                    children = _find_all_tags(elem, recursive=False)938                    if children:939                        # Element has children - recursively process them.940                        (941                            documents,942                            current_headers,943                            current_content,944                            preserved_elements,945                            placeholder_count,946                        ) = _process_element(947                            children,948                            documents,949                            current_headers,950                            current_content,951                            preserved_elements,952                            placeholder_count,953                        )954                        # After processing children, extract only text955                        # strings from this element (not its children). 
Used956                        # recursive=False to avoid double-counting.957                        content = " ".join(_find_all_strings(elem, recursive=False))958                        if content:959                            content = self._normalize_and_clean_text(content)960                            current_content.append(content)961                    else:962                        # Leaf element with no children, so we extract its963                        # text and add to current content. Handles964                        # text-only elements like <p>, <span>, <div>965                        content = _get_element_text(elem)966                        if content:967                            current_content.append(content)968969            return (970                documents,971                current_headers,972                current_content,973                preserved_elements,974                placeholder_count,975            )976977        # Process the elements978        (979            documents,980            current_headers,981            current_content,982            preserved_elements,983            placeholder_count,984        ) = _process_element(985            elements,986            documents,987            current_headers,988            current_content,989            preserved_elements,990            placeholder_count,991        )992993        # Handle any remaining content994        if current_content:995            documents.extend(996                self._create_documents(997                    current_headers,998                    " ".join(current_content),999                    preserved_elements,1000                )1001            )10021003        return documents10041005    def _create_documents(1006        self, headers: dict[str, str], content: str, preserved_elements: dict[str, str]1007    ) -> list[Document]:1008        """Creates Document objects from the provided headers, content, and elements.10091010        
Args:1011            headers: The headers to attach as metadata to the `Document`.1012            content: The content of the `Document`.1013            preserved_elements: Preserved elements to be reinserted into the content.10141015        Returns:1016            A list of `Document` objects.1017        """1018        content = re.sub(r"\s+", " ", content).strip()10191020        metadata = {**headers, **self._external_metadata}10211022        if len(content) <= self._max_chunk_size:1023            page_content = self._reinsert_preserved_elements(1024                content, preserved_elements1025            )1026            return [Document(page_content=page_content, metadata=metadata)]1027        return self._further_split_chunk(content, metadata, preserved_elements)10281029    def _further_split_chunk(1030        self, content: str, metadata: dict[Any, Any], preserved_elements: dict[str, str]1031    ) -> list[Document]:1032        """Further splits the content into smaller chunks.10331034        Args:1035            content: The content to be split.1036            metadata: Metadata to attach to each chunk.1037            preserved_elements: Preserved elements to be reinserted into each chunk.10381039        Returns:1040            A list of `Document` objects containing the split content.1041        """1042        splits = self._recursive_splitter.split_text(content)1043        result = []10441045        for split in splits:1046            split_with_preserved = self._reinsert_preserved_elements(1047                split, preserved_elements1048            )1049            if split_with_preserved.strip():1050                result.append(1051                    Document(1052                        page_content=split_with_preserved.strip(),1053                        metadata=metadata,1054                    )1055                )10561057        return result10581059    @staticmethod1060    def _reinsert_preserved_elements(1061        content: str, 
preserved_elements: dict[str, str]1062    ) -> str:1063        """Reinserts preserved elements into the content into their original positions.10641065        Args:1066            content: The content where placeholders need to be replaced.1067            preserved_elements: Preserved elements to be reinserted.10681069        Returns:1070            The content with placeholders replaced by preserved elements.1071        """1072        for placeholder, preserved_content in reversed(preserved_elements.items()):1073            content = content.replace(placeholder, preserved_content.strip())1074        return content107510761077# %%

Code quality findings 13

Ensure functions have docstrings for documentation
missing-docstring
def split_text_from_url(
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(file, str):
Avoid unnecessary list conversions; use generators where possible
unnecessary-list
return list(self._generate_documents(html_content))
Avoid unnecessary list conversions; use generators where possible
unnecessary-list
children = list(node.children)
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
child for child in reversed(children) if isinstance(child, Tag)
Avoid unless necessary; Python's garbage collector typically handles object deletion
unnecessary-del
del active_headers[key]
Avoid unless necessary; Python's garbage collector typically handles object deletion
unnecessary-del
del active_headers[key]
Ensure functions have docstrings for documentation
missing-docstring
def create_documents(
Avoid unnecessary list conversions; use generators where possible
unnecessary-list
header_names = list(self.headers_to_split_on.keys())
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(element, str):
Ensure functions have docstrings for documentation
missing-docstring
def custom_iframe_extractor(iframe_tag):
Avoid unnecessary list conversions; use generators where possible
unnecessary-list
self._allowlist_tags = list(
Ensure functions have docstrings for documentation
missing-docstring
def transform_documents(

Get this view in your editor

Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.