Ensure functions have docstrings for documentation
def split_text_from_url(
1"""HTML text splitters."""23from __future__ import annotations45import copy6import pathlib7import re8from io import StringIO9from typing import (10 IO,11 TYPE_CHECKING,12 Any,13 Literal,14 TypedDict,15 cast,16)1718from langchain_core._api import beta, deprecated19from langchain_core.documents import BaseDocumentTransformer, Document20from typing_extensions import override2122from langchain_text_splitters.character import RecursiveCharacterTextSplitter2324if TYPE_CHECKING:25 from collections.abc import Callable, Iterable, Iterator, Sequence2627 from bs4.element import ResultSet2829try:30 import nltk3132 _HAS_NLTK = True33except ImportError:34 _HAS_NLTK = False3536try:37 from bs4 import BeautifulSoup, Tag38 from bs4.element import NavigableString, PageElement3940 _HAS_BS4 = True41except ImportError:42 _HAS_BS4 = False4344try:45 from lxml import etree4647 _HAS_LXML = True48except ImportError:49 _HAS_LXML = False505152class ElementType(TypedDict):53 """Element type as typed dict."""5455 url: str56 xpath: str57 content: str58 metadata: dict[str, str]596061# Unfortunately, BeautifulSoup doesn't define overloads for Tag.find_all.62# So doing the type resolution ourselves.636465def _find_all_strings(66 tag: Tag,67 *,68 recursive: bool = True,69) -> ResultSet[NavigableString]:70 return tag.find_all(string=True, recursive=recursive)717273def _find_all_tags(74 tag: Tag,75 *,76 name: bool | str | list[str] | None = None,77 recursive: bool = True,78) -> ResultSet[Tag]:79 return tag.find_all(name, recursive=recursive)808182class HTMLHeaderTextSplitter:83 """Split HTML content into structured Documents based on specified headers.8485 Splits HTML content by detecting specified header tags and creating hierarchical86 `Document` objects that reflect the semantic structure of the original content. For87 each identified section, the splitter associates the extracted text with metadata88 corresponding to the encountered headers.8990 If no specified headers are found, the entire content is returned as a single91 `Document`. This allows for flexible handling of HTML input, ensuring that92 information is organized according to its semantic headers.9394 The splitter provides the option to return each HTML element as a separate95 `Document` or aggregate them into semantically meaningful chunks. It also96 gracefully handles multiple levels of nested headers, creating a rich,97 hierarchical representation of the content.9899 Example:100 ```python101 from langchain_text_splitters.html_header_text_splitter import (102 HTMLHeaderTextSplitter,103 )104105 # Define headers for splitting on h1 and h2 tags.106 headers_to_split_on = [("h1", "Main Topic"), ("h2", "Sub Topic")]107108 splitter = HTMLHeaderTextSplitter(109 headers_to_split_on=headers_to_split_on,110 return_each_element=False111 )112113 html_content = \"\"\"114 <html>115 <body>116 <h1>Introduction</h1>117 <p>Welcome to the introduction section.</p>118 <h2>Background</h2>119 <p>Some background details here.</p>120 <h1>Conclusion</h1>121 <p>Final thoughts.</p>122 </body>123 </html>124 \"\"\"125126 documents = splitter.split_text(html_content)127128 # 'documents' now contains Document objects reflecting the hierarchy:129 # - Document with metadata={"Main Topic": "Introduction"} and130 # content="Introduction"131 # - Document with metadata={"Main Topic": "Introduction"} and132 # content="Welcome to the introduction section."133 # - Document with metadata={"Main Topic": "Introduction",134 # "Sub Topic": "Background"} and content="Background"135 # - Document with metadata={"Main Topic": "Introduction",136 # "Sub Topic": "Background"} and content="Some background details here."137 # - Document with metadata={"Main Topic": "Conclusion"} and138 # content="Conclusion"139 # - Document with metadata={"Main Topic": "Conclusion"} and140 # content="Final thoughts."141 ```142 """143144 def __init__(145 self,146 headers_to_split_on: list[tuple[str, str]],147 return_each_element: bool = False, # noqa: FBT001,FBT002148 ) -> None:149 """Initialize with headers to split on.150151 Args:152 headers_to_split_on: A list of `(header_tag,153 header_name)` pairs representing the headers that define splitting154 boundaries.155156 For example, `[("h1", "Header 1"), ("h2", "Header 2")]` will split157 content by `h1` and `h2` tags, assigning their textual content to the158 `Document` metadata.159 return_each_element: If `True`, every HTML element encountered160 (including headers, paragraphs, etc.) is returned as a separate161 `Document`.162163 If `False`, content under the same header hierarchy is aggregated into164 fewer `Document` objects.165 """166 # Sort headers by their numeric level so that h1 < h2 < h3...167 self.headers_to_split_on = sorted(168 headers_to_split_on, key=lambda x: int(x[0][1:])169 )170 self.header_mapping = dict(self.headers_to_split_on)171 self.header_tags = [tag for tag, _ in self.headers_to_split_on]172 self.return_each_element = return_each_element173174 def split_text(self, text: str) -> list[Document]:175 """Split the given text into a list of `Document` objects.176177 Args:178 text: The HTML text to split.179180 Returns:181 A list of split `Document` objects.182183 Each `Document` contains `page_content` holding the extracted text and184 `metadata` that maps the header hierarchy to their corresponding titles.185 """186 return self.split_text_from_file(StringIO(text))187188 @deprecated(189 since="1.1.2",190 removal="2.0.0",191 addendum=(192 "Fetch the HTML content from the URL yourself and pass it to `split_text`."193 ),194 )195 def split_text_from_url(196 self,197 url: str,198 timeout: int = 10,199 **kwargs: Any, # noqa: ARG002200 ) -> list[Document]:201 """Fetch text content from a URL and split it into documents.202203 Args:204 url: The URL to fetch content from.205 timeout: Timeout for the request.206 **kwargs: Additional keyword arguments for the request.207208 Returns:209 A list of split `Document` objects.210211 Each `Document` contains `page_content` holding the extracted text and212 `metadata` that maps the header hierarchy to their corresponding titles.213214 Raises:215 requests.RequestException: If the HTTP request fails.216 """217 from langchain_core._security._transport import ( # noqa: PLC0415218 ssrf_safe_client,219 )220221 with ssrf_safe_client() as client:222 response = client.get(url, timeout=timeout)223 response.raise_for_status()224 return self.split_text(response.text)225226 def split_text_from_file(self, file: str | IO[str]) -> list[Document]:227 """Split HTML content from a file into a list of `Document` objects.228229 Args:230 file: A file path or a file-like object containing HTML content.231232 Returns:233 A list of split `Document` objects.234235 Each `Document` contains `page_content` holding the extracted text and236 `metadata` that maps the header hierarchy to their corresponding titles.237 """238 if isinstance(file, str):239 html_content = pathlib.Path(file).read_text(encoding="utf-8")240 else:241 html_content = file.read()242 return list(self._generate_documents(html_content))243244 def _generate_documents(self, html_content: str) -> Iterator[Document]:245 """Private method that performs a DFS traversal over the DOM and yields.246247 Document objects on-the-fly. This approach maintains the same splitting logic248 (headers vs. non-headers, chunking, etc.) while walking the DOM explicitly in249 code.250251 Args:252 html_content: The raw HTML content.253254 Yields:255 Document objects as they are created.256257 Raises:258 ImportError: If BeautifulSoup is not installed.259 """260 if not _HAS_BS4:261 msg = (262 "Unable to import BeautifulSoup. Please install via `pip install bs4`."263 )264 raise ImportError(msg)265266 soup = BeautifulSoup(html_content, "html.parser")267 body = soup.body or soup268269 # Dictionary of active headers:270 # key = user-defined header name (e.g. "Header 1")271 # value = tuple of header_text, level, dom_depth272 active_headers: dict[str, tuple[str, int, int]] = {}273 current_chunk: list[str] = []274275 def finalize_chunk() -> Document | None:276 """Finalize the accumulated chunk into a single Document."""277 if not current_chunk:278 return None279280 final_text = " \n".join(line for line in current_chunk if line.strip())281 current_chunk.clear()282 if not final_text.strip():283 return None284285 final_meta = {k: v[0] for k, v in active_headers.items()}286 return Document(page_content=final_text, metadata=final_meta)287288 # We'll use a stack for DFS traversal289 stack = [body]290 while stack:291 node = stack.pop()292 children = list(node.children)293294 stack.extend(295 child for child in reversed(children) if isinstance(child, Tag)296 )297298 tag = getattr(node, "name", None)299 if not tag:300 continue301302 text_elements = [303 str(child).strip() for child in _find_all_strings(node, recursive=False)304 ]305 node_text = " ".join(elem for elem in text_elements if elem)306 if not node_text:307 continue308309 dom_depth = len(list(node.parents))310311 # If this node is one of our headers312 if tag in self.header_tags:313 # If we're aggregating, finalize whatever chunk we had314 if not self.return_each_element:315 doc = finalize_chunk()316 if doc:317 yield doc318319 # Determine numeric level (h1->1, h2->2, etc.)320 try:321 level = int(tag[1:])322 except ValueError:323 level = 9999324325 # Remove any active headers that are at or deeper than this new level326 headers_to_remove = [327 k for k, (_, lvl, d) in active_headers.items() if lvl >= level328 ]329 for key in headers_to_remove:330 del active_headers[key]331332 # Add/Update the active header333 header_name = self.header_mapping[tag]334 active_headers[header_name] = (node_text, level, dom_depth)335336 # Always yield a Document for the header337 header_meta = {k: v[0] for k, v in active_headers.items()}338 yield Document(page_content=node_text, metadata=header_meta)339340 else:341 headers_out_of_scope = [342 k for k, (_, _, d) in active_headers.items() if dom_depth < d343 ]344 for key in headers_out_of_scope:345 del active_headers[key]346347 if self.return_each_element:348 # Yield each element's text as its own Document349 meta = {k: v[0] for k, v in active_headers.items()}350 yield Document(page_content=node_text, metadata=meta)351 else:352 # Accumulate text in our chunk353 current_chunk.append(node_text)354355 # If we're aggregating and have leftover chunk, yield it356 if not self.return_each_element:357 doc = finalize_chunk()358 if doc:359 yield doc360361362class HTMLSectionSplitter:363 """Splitting HTML files based on specified tag and font sizes.364365 Requires lxml package.366 """367368 def __init__(369 self,370 headers_to_split_on: list[tuple[str, str]],371 **kwargs: Any,372 ) -> None:373 """Create a new `HTMLSectionSplitter`.374375 Args:376 headers_to_split_on: List of tuples of headers we want to track mapped to377 (arbitrary) keys for metadata.378379 Allowed header values: `h1`, `h2`, `h3`, `h4`, `h5`, `h6`, e.g.:380 `[("h1", "Header 1"), ("h2", "Header 2"]`.381 **kwargs: Additional optional arguments for customizations.382383 """384 self.headers_to_split_on = dict(headers_to_split_on)385 self.xslt_path = (386 pathlib.Path(__file__).parent / "xsl/converting_to_header.xslt"387 ).absolute()388 self.kwargs = kwargs389390 def split_documents(self, documents: Iterable[Document]) -> list[Document]:391 """Split documents.392393 Args:394 documents: Iterable of `Document` objects to be split.395396 Returns:397 A list of split `Document` objects.398 """399 texts, metadatas = [], []400 for doc in documents:401 texts.append(doc.page_content)402 metadatas.append(doc.metadata)403 results = self.create_documents(texts, metadatas=metadatas)404405 text_splitter = RecursiveCharacterTextSplitter(**self.kwargs)406407 return text_splitter.split_documents(results)408409 def split_text(self, text: str) -> list[Document]:410 """Split HTML text string.411412 Args:413 text: HTML text414415 Returns:416 A list of split `Document` objects.417 """418 return self.split_text_from_file(StringIO(text))419420 def create_documents(421 self, texts: list[str], metadatas: list[dict[Any, Any]] | None = None422 ) -> list[Document]:423 """Create a list of `Document` objects from a list of texts.424425 Args:426 texts: A list of texts to be split and converted into documents.427 metadatas: Optional list of metadata to associate with each document.428429 Returns:430 A list of `Document` objects.431 """432 metadatas_ = metadatas or [{}] * len(texts)433 documents = []434 for i, text in enumerate(texts):435 for chunk in self.split_text(text):436 metadata = copy.deepcopy(metadatas_[i])437438 for key in chunk.metadata:439 if chunk.metadata[key] == "#TITLE#":440 chunk.metadata[key] = metadata["Title"]441 metadata = {**metadata, **chunk.metadata}442 new_doc = Document(page_content=chunk.page_content, metadata=metadata)443 documents.append(new_doc)444 return documents445446 def split_html_by_headers(self, html_doc: str) -> list[dict[str, str | None]]:447 """Split an HTML document into sections based on specified header tags.448449 This method uses BeautifulSoup to parse the HTML content and divides it into450 sections based on headers defined in `headers_to_split_on`. Each section451 contains the header text, content under the header, and the tag name.452453 Args:454 html_doc: The HTML document to be split into sections.455456 Returns:457 A list of dictionaries representing sections.458459 Each dictionary contains:460461 * `'header'`: The header text or a default title for the first section.462 * `'content'`: The content under the header.463 * `'tag_name'`: The name of the header tag (e.g., `h1`, `h2`).464465 Raises:466 ImportError: If BeautifulSoup is not installed.467 """468 if not _HAS_BS4:469 msg = "Unable to import BeautifulSoup/PageElement, \470 please install with `pip install \471 bs4`."472 raise ImportError(msg)473474 soup = BeautifulSoup(html_doc, "html.parser")475 header_names = list(self.headers_to_split_on.keys())476 sections: list[dict[str, str | None]] = []477478 headers = _find_all_tags(soup, name=["body", *header_names])479480 for i, header in enumerate(headers):481 if i == 0:482 current_header = "#TITLE#"483 current_header_tag = "h1"484 section_content: list[str] = []485 else:486 current_header = header.text.strip()487 current_header_tag = header.name488 section_content = []489 for element in header.next_elements:490 if i + 1 < len(headers) and element == headers[i + 1]:491 break492 if isinstance(element, str):493 section_content.append(element)494 content = " ".join(section_content).strip()495496 if content:497 sections.append(498 {499 "header": current_header,500 "content": content,501 "tag_name": current_header_tag,502 }503 )504505 return sections506507 def convert_possible_tags_to_header(self, html_content: str) -> str:508 """Convert specific HTML tags to headers using an XSLT transformation.509510 This method uses an XSLT file to transform the HTML content, converting511 certain tags into headers for easier parsing. If no XSLT path is provided,512 the HTML content is returned unchanged.513514 Args:515 html_content: The HTML content to be transformed.516517 Returns:518 The transformed HTML content as a string.519520 Raises:521 ImportError: If the `lxml` library is not installed.522 """523 if not _HAS_LXML:524 msg = "Unable to import lxml, please install with `pip install lxml`."525 raise ImportError(msg)526 # use lxml library to parse html document and return xml ElementTree527 # Create secure parsers to prevent XXE attacks528 html_parser = etree.HTMLParser(no_network=True)529 xslt_parser = etree.XMLParser(530 resolve_entities=False, no_network=True, load_dtd=False531 )532533 # Apply XSLT access control to prevent file/network access534 # DENY_ALL is a predefined access control that blocks all file/network access535 # Type ignore needed due to incomplete lxml type stubs536 ac = etree.XSLTAccessControl.DENY_ALL # type: ignore[attr-defined]537538 tree = etree.parse(StringIO(html_content), html_parser)539 xslt_tree = etree.parse(self.xslt_path, xslt_parser)540 transform = etree.XSLT(xslt_tree, access_control=ac)541 result = transform(tree)542 return str(result)543544 def split_text_from_file(self, file: StringIO) -> list[Document]:545 """Split HTML content from a file into a list of `Document` objects.546547 Args:548 file: A file path or a file-like object containing HTML content.549550 Returns:551 A list of split `Document` objects.552 """553 file_content = file.getvalue()554 file_content = self.convert_possible_tags_to_header(file_content)555 sections = self.split_html_by_headers(file_content)556557 return [558 Document(559 cast("str", section["content"]),560 metadata={561 self.headers_to_split_on[str(section["tag_name"])]: section[562 "header"563 ]564 },565 )566 for section in sections567 ]568569570@beta()571class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):572 """Split HTML content preserving semantic structure.573574 Splits HTML content by headers into generalized chunks, preserving semantic575 structure. If chunks exceed the maximum chunk size, it uses576 `RecursiveCharacterTextSplitter` for further splitting.577578 The splitter preserves full HTML elements and converts links to Markdown-like links.579 It can also preserve images, videos, and audio elements by converting them into580 Markdown format. Note that some chunks may exceed the maximum size to maintain581 semantic integrity.582583 !!! version-added "Added in `langchain-text-splitters` 0.3.5"584585 Example:586 ```python587 from langchain_text_splitters.html import HTMLSemanticPreservingSplitter588589 def custom_iframe_extractor(iframe_tag):590 ```591 Custom handler function to extract the 'src' attribute from an <iframe> tag.592 Converts the iframe to a Markdown-like link: [iframe:<src>](src).593594 Args:595 iframe_tag (bs4.element.Tag): The <iframe> tag to be processed.596597 Returns:598 str: A formatted string representing the iframe in Markdown-like format.599 ```600 iframe_src = iframe_tag.get('src', '')601 return f"[iframe:{iframe_src}]({iframe_src})"602603 text_splitter = HTMLSemanticPreservingSplitter(604 headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")],605 max_chunk_size=500,606 preserve_links=True,607 preserve_images=True,608 custom_handlers={"iframe": custom_iframe_extractor}609 )610 ```611 """ # noqa: D214612613 def __init__(614 self,615 headers_to_split_on: list[tuple[str, str]],616 *,617 max_chunk_size: int = 1000,618 chunk_overlap: int = 0,619 separators: list[str] | None = None,620 elements_to_preserve: list[str] | None = None,621 preserve_links: bool = False,622 preserve_images: bool = False,623 preserve_videos: bool = False,624 preserve_audio: bool = False,625 custom_handlers: dict[str, Callable[[Tag], str]] | None = None,626 stopword_removal: bool = False,627 stopword_lang: str = "english",628 normalize_text: bool = False,629 external_metadata: dict[str, str] | None = None,630 allowlist_tags: list[str] | None = None,631 denylist_tags: list[str] | None = None,632 preserve_parent_metadata: bool = False,633 keep_separator: bool | Literal["start", "end"] = True,634 ) -> None:635 """Initialize splitter.636637 Args:638 headers_to_split_on: HTML headers (e.g., `h1`, `h2`) that define content639 sections.640 max_chunk_size: Maximum size for each chunk, with allowance for exceeding641 this limit to preserve semantics.642 chunk_overlap: Number of characters to overlap between chunks to ensure643 contextual continuity.644 separators: Delimiters used by `RecursiveCharacterTextSplitter` for645 further splitting.646 elements_to_preserve: HTML tags (e.g., `table`, `ul`) to remain647 intact during splitting.648 preserve_links: Converts `a` tags to Markdown links (`[text](url)`).649 preserve_images: Converts `img` tags to Markdown images (``).650 preserve_videos: Converts `video` tags to Markdown video links651 (``).652 preserve_audio: Converts `audio` tags to Markdown audio links653 (``).654 custom_handlers: Optional custom handlers for specific HTML tags, allowing655 tailored extraction or processing.656 stopword_removal: Optionally remove stopwords from the text.657 stopword_lang: The language of stopwords to remove.658 normalize_text: Optionally normalize text (e.g., lowercasing, removing659 punctuation).660 external_metadata: Additional metadata to attach to the Document objects.661 allowlist_tags: Only these tags will be retained in the HTML.662 denylist_tags: These tags will be removed from the HTML.663 preserve_parent_metadata: Whether to pass through parent document metadata664 to split documents when calling665 `transform_documents/atransform_documents()`.666 keep_separator: Whether separators should be at the beginning of a chunk, at667 the end, or not at all.668669 Raises:670 ImportError: If BeautifulSoup or NLTK (when stopword removal is enabled)671 is not installed.672 """673 if not _HAS_BS4:674 msg = (675 "Could not import BeautifulSoup. "676 "Please install it with 'pip install bs4'."677 )678 raise ImportError(msg)679680 self._headers_to_split_on = sorted(headers_to_split_on)681 self._max_chunk_size = max_chunk_size682 self._elements_to_preserve = elements_to_preserve or []683 self._preserve_links = preserve_links684 self._preserve_images = preserve_images685 self._preserve_videos = preserve_videos686 self._preserve_audio = preserve_audio687 self._custom_handlers = custom_handlers or {}688 self._stopword_removal = stopword_removal689 self._stopword_lang = stopword_lang690 self._normalize_text = normalize_text691 self._external_metadata = external_metadata or {}692 self._allowlist_tags = allowlist_tags693 self._preserve_parent_metadata = preserve_parent_metadata694 self._keep_separator = keep_separator695 if allowlist_tags:696 self._allowlist_tags = list(697 set(allowlist_tags + [header[0] for header in headers_to_split_on])698 )699 self._denylist_tags = denylist_tags700 if denylist_tags:701 self._denylist_tags = [702 tag703 for tag in denylist_tags704 if tag not in [header[0] for header in headers_to_split_on]705 ]706 if separators:707 self._recursive_splitter = RecursiveCharacterTextSplitter(708 separators=separators,709 keep_separator=keep_separator,710 chunk_size=max_chunk_size,711 chunk_overlap=chunk_overlap,712 )713 else:714 self._recursive_splitter = RecursiveCharacterTextSplitter(715 keep_separator=keep_separator,716 chunk_size=max_chunk_size,717 chunk_overlap=chunk_overlap,718 )719720 if self._stopword_removal:721 if not _HAS_NLTK:722 msg = (723 "Could not import nltk. Please install it with 'pip install nltk'."724 )725 raise ImportError(msg)726 nltk.download("stopwords")727 self._stopwords = set(nltk.corpus.stopwords.words(self._stopword_lang))728729 def split_text(self, text: str) -> list[Document]:730 """Splits the provided HTML text into smaller chunks based on the configuration.731732 Args:733 text: The HTML content to be split.734735 Returns:736 A list of `Document` objects containing the split content.737 """738 soup = BeautifulSoup(text, "html.parser")739740 self._process_media(soup)741742 if self._preserve_links:743 self._process_links(soup)744745 if self._allowlist_tags or self._denylist_tags:746 self._filter_tags(soup)747748 return self._process_html(soup)749750 @override751 def transform_documents(752 self, documents: Sequence[Document], **kwargs: Any753 ) -> list[Document]:754 """Transform sequence of documents by splitting them.755756 Args:757 documents: A sequence of `Document` objects to be split.758759 Returns:760 A sequence of split `Document` objects.761 """762 transformed = []763 for doc in documents:764 splits = self.split_text(doc.page_content)765 if self._preserve_parent_metadata:766 splits = [767 Document(768 page_content=split_doc.page_content,769 metadata={**doc.metadata, **split_doc.metadata},770 )771 for split_doc in splits772 ]773 transformed.extend(splits)774 return transformed775776 def _process_media(self, soup: BeautifulSoup) -> None:777 """Processes the media elements.778779 Process elements in the HTML content by wrapping them in a <media-wrapper> tag780 and converting them to Markdown format.781782 Args:783 soup: Parsed HTML content using BeautifulSoup.784 """785 if self._preserve_images:786 for img_tag in _find_all_tags(soup, name="img"):787 img_src = img_tag.get("src", "")788 markdown_img = f""789 wrapper = soup.new_tag("media-wrapper")790 wrapper.string = markdown_img791 img_tag.replace_with(wrapper)792793 if self._preserve_videos:794 for video_tag in _find_all_tags(soup, name="video"):795 video_src = video_tag.get("src", "")796 markdown_video = f""797 wrapper = soup.new_tag("media-wrapper")798 wrapper.string = markdown_video799 video_tag.replace_with(wrapper)800801 if self._preserve_audio:802 for audio_tag in _find_all_tags(soup, name="audio"):803 audio_src = audio_tag.get("src", "")804 markdown_audio = f""805 wrapper = soup.new_tag("media-wrapper")806 wrapper.string = markdown_audio807 audio_tag.replace_with(wrapper)808809 @staticmethod810 def _process_links(soup: BeautifulSoup) -> None:811 """Processes the links in the HTML content.812813 Args:814 soup: Parsed HTML content using BeautifulSoup.815 """816 for a_tag in _find_all_tags(soup, name="a"):817 a_href = a_tag.get("href", "")818 a_text = a_tag.get_text(strip=True)819 markdown_link = f"[{a_text}]({a_href})"820 wrapper = soup.new_tag("link-wrapper")821 wrapper.string = markdown_link822 a_tag.replace_with(NavigableString(markdown_link))823824 def _filter_tags(self, soup: BeautifulSoup) -> None:825 """Filters the HTML content based on the allowlist and denylist tags.826827 Args:828 soup: Parsed HTML content using BeautifulSoup.829 """830 if self._allowlist_tags:831 for tag in _find_all_tags(soup, name=True):832 if tag.name not in self._allowlist_tags:833 tag.decompose()834835 if self._denylist_tags:836 for tag in _find_all_tags(soup, name=self._denylist_tags):837 tag.decompose()838839 def _normalize_and_clean_text(self, text: str) -> str:840 """Normalizes the text by removing extra spaces and newlines.841842 Args:843 text: The text to be normalized.844845 Returns:846 The normalized text.847 """848 if self._normalize_text:849 text = text.lower()850 text = re.sub(r"[^\w\s]", "", text)851 text = re.sub(r"\s+", " ", text).strip()852853 if self._stopword_removal:854 text = " ".join(855 [word for word in text.split() if word not in self._stopwords]856 )857858 return text859860 def _process_html(self, soup: BeautifulSoup) -> list[Document]:861 """Processes the HTML content using BeautifulSoup and splits it using headers.862863 Args:864 soup: Parsed HTML content using BeautifulSoup.865866 Returns:867 A list of `Document` objects containing the split content.868 """869 documents: list[Document] = []870 current_headers: dict[str, str] = {}871 current_content: list[str] = []872 preserved_elements: dict[str, str] = {}873 placeholder_count: int = 0874875 def _get_element_text(element: PageElement) -> str:876 """Recursively extracts and processes the text of an element.877878 Applies custom handlers where applicable, and ensures correct spacing.879880 Args:881 element: The HTML element to process.882883 Returns:884 The processed text of the element.885 """886 element = cast("Tag | NavigableString", element)887 if element.name in self._custom_handlers:888 return self._custom_handlers[element.name](element)889890 text = ""891892 if element.name is not None:893 for child in element.children:894 child_text = _get_element_text(child).strip()895 if text and child_text:896 text += " "897 text += child_text898 elif element.string:899 text += element.string900901 return self._normalize_and_clean_text(text)902903 elements = _find_all_tags(soup, recursive=False)904905 def _process_element(906 element: ResultSet[Tag],907 documents: list[Document],908 current_headers: dict[str, str],909 current_content: list[str],910 preserved_elements: dict[str, str],911 placeholder_count: int,912 ) -> tuple[list[Document], dict[str, str], list[str], dict[str, str], int]:913 for elem in element:914 if elem.name in [h[0] for h in self._headers_to_split_on]:915 if current_content:916 documents.extend(917 self._create_documents(918 current_headers,919 " ".join(current_content),920 preserved_elements,921 )922 )923 current_content.clear()924 preserved_elements.clear()925 header_name = elem.get_text(strip=True)926 current_headers = {927 dict(self._headers_to_split_on)[elem.name]: header_name928 }929 elif elem.name in self._elements_to_preserve:930 placeholder = f"PRESERVED_{placeholder_count}"931 preserved_elements[placeholder] = _get_element_text(elem)932 current_content.append(placeholder)933 placeholder_count += 1934 else:935 # Recursively process children to find nested headers or936 # preserved elements.937 children = _find_all_tags(elem, recursive=False)938 if children:939 # Element has children - recursively process them.940 (941 documents,942 current_headers,943 current_content,944 preserved_elements,945 placeholder_count,946 ) = _process_element(947 children,948 documents,949 current_headers,950 current_content,951 preserved_elements,952 placeholder_count,953 )954 # After processing children, extract only text955 # strings from this element (not its children). Used956 # recursive=False to avoid double-counting.957 content = " ".join(_find_all_strings(elem, recursive=False))958 if content:959 content = self._normalize_and_clean_text(content)960 current_content.append(content)961 else:962 # Leaf element with no children, so we extract its963 # text and add to current content. Handles964 # text-only elements like <p>, <span>, <div>965 content = _get_element_text(elem)966 if content:967 current_content.append(content)968969 return (970 documents,971 current_headers,972 current_content,973 preserved_elements,974 placeholder_count,975 )976977 # Process the elements978 (979 documents,980 current_headers,981 current_content,982 preserved_elements,983 placeholder_count,984 ) = _process_element(985 elements,986 documents,987 current_headers,988 current_content,989 preserved_elements,990 placeholder_count,991 )992993 # Handle any remaining content994 if current_content:995 documents.extend(996 self._create_documents(997 current_headers,998 " ".join(current_content),999 preserved_elements,1000 )1001 )10021003 return documents10041005 def _create_documents(1006 self, headers: dict[str, str], content: str, preserved_elements: dict[str, str]1007 ) -> list[Document]:1008 """Creates Document objects from the provided headers, content, and elements.10091010 Args:1011 headers: The headers to attach as metadata to the `Document`.1012 content: The content of the `Document`.1013 preserved_elements: Preserved elements to be reinserted into the content.10141015 Returns:1016 A list of `Document` objects.1017 """1018 content = re.sub(r"\s+", " ", content).strip()10191020 metadata = {**headers, **self._external_metadata}10211022 if len(content) <= self._max_chunk_size:1023 page_content = self._reinsert_preserved_elements(1024 content, preserved_elements1025 )1026 return [Document(page_content=page_content, metadata=metadata)]1027 return self._further_split_chunk(content, metadata, preserved_elements)10281029 def _further_split_chunk(1030 self, content: str, metadata: dict[Any, Any], preserved_elements: dict[str, str]1031 ) -> list[Document]:1032 """Further splits the content into smaller chunks.10331034 Args:1035 content: The content to be split.1036 metadata: Metadata to attach to each chunk.1037 preserved_elements: Preserved elements to be reinserted into each chunk.10381039 Returns:1040 A list of `Document` objects containing the split content.1041 """1042 splits = self._recursive_splitter.split_text(content)1043 result = []10441045 for split in splits:1046 split_with_preserved = self._reinsert_preserved_elements(1047 split, preserved_elements1048 )1049 if split_with_preserved.strip():1050 result.append(1051 Document(1052 page_content=split_with_preserved.strip(),1053 metadata=metadata,1054 )1055 )10561057 return result10581059 @staticmethod1060 def _reinsert_preserved_elements(1061 content: str, preserved_elements: dict[str, str]1062 ) -> str:1063 """Reinserts preserved elements into the content into their original positions.10641065 Args:1066 content: The content where placeholders need to be replaced.1067 preserved_elements: Preserved elements to be reinserted.10681069 Returns:1070 The content with placeholders replaced by preserved elements.1071 """1072 for placeholder, preserved_content in reversed(preserved_elements.items()):1073 content = content.replace(placeholder, preserved_content.strip())1074 return content107510761077# %%
Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.