libs/langchain/langchain_classic/chains/natbot/crawler.py · langchain-ai/langchain

1import logging2import time3from sys import platform4from typing import (5    TYPE_CHECKING,6    Any,7    TypedDict,8)910if TYPE_CHECKING:11    from playwright.sync_api import Browser, CDPSession, Page1213logger = logging.getLogger(__name__)1415black_listed_elements: set[str] = {16    "html",17    "head",18    "title",19    "meta",20    "iframe",21    "body",22    "script",23    "style",24    "path",25    "svg",26    "br",27    "::marker",28}293031class ElementInViewPort(TypedDict):32    """A typed dictionary containing information about elements in the viewport."""3334    node_index: str35    backend_node_id: int36    node_name: str | None37    node_value: str | None38    node_meta: list[str]39    is_clickable: bool40    origin_x: int41    origin_y: int42    center_x: int43    center_y: int444546class Crawler:47    """A crawler for web pages.4849    **Security Note**: This is an implementation of a crawler that uses a browser via50        Playwright.5152        This crawler can be used to load arbitrary webpages INCLUDING content53        from the local file system.5455        Control access to who can submit crawling requests and what network access56        the crawler has.5758        Make sure to scope permissions to the minimal permissions necessary for59        the application.6061        See https://docs.langchain.com/oss/python/security-policy for more information.62    """6364    def __init__(self) -> None:65        """Initialize the crawler."""66        try:67            from playwright.sync_api import sync_playwright68        except ImportError as e:69            msg = (70                "Could not import playwright python package. "71                "Please install it with `pip install playwright`."72            )73            raise ImportError(msg) from e74        self.browser: Browser = (75            sync_playwright().start().chromium.launch(headless=False)76        )77        self.page: Page = self.browser.new_page()78        self.page.set_viewport_size({"width": 1280, "height": 1080})79        self.page_element_buffer: dict[int, ElementInViewPort]80        self.client: CDPSession8182    def go_to_page(self, url: str) -> None:83        """Navigate to the given URL.8485        Args:86            url: The URL to navigate to. If it does not contain a scheme, it will be87                prefixed with "http://".88        """89        self.page.goto(url=url if "://" in url else "http://" + url)90        self.client = self.page.context.new_cdp_session(self.page)91        self.page_element_buffer = {}9293    def scroll(self, direction: str) -> None:94        """Scroll the page in the given direction.9596        Args:97            direction: The direction to scroll in, either "up" or "down".98        """99        if direction == "up":100            self.page.evaluate(101                "(document.scrollingElement || document.body).scrollTop = "102                "(document.scrollingElement || document.body).scrollTop - "103                "window.innerHeight;"104            )105        elif direction == "down":106            self.page.evaluate(107                "(document.scrollingElement || document.body).scrollTop = "108                "(document.scrollingElement || document.body).scrollTop + "109                "window.innerHeight;"110            )111112    def click(self, id_: str | int) -> None:113        """Click on an element with the given id.114115        Args:116            id_: The id of the element to click on.117        """118        # Inject javascript into the page which removes the target= attribute from links119        js = """120		links = document.getElementsByTagName("a");121		for (var i = 0; i < links.length; i++) {122			links[i].removeAttribute("target");123		}124		"""125        self.page.evaluate(js)126127        element = self.page_element_buffer.get(int(id_))128        if element:129            x: float = element["center_x"]130            y: float = element["center_y"]131132            self.page.mouse.click(x, y)133        else:134            print("Could not find element")  # noqa: T201135136    def type(self, id_: str | int, text: str) -> None:137        """Type text into an element with the given id.138139        Args:140            id_: The id of the element to type into.141            text: The text to type into the element.142        """143        self.click(id_)144        self.page.keyboard.type(text)145146    def enter(self) -> None:147        """Press the Enter key."""148        self.page.keyboard.press("Enter")149150    def crawl(self) -> list[str]:151        """Crawl the current page.152153        Returns:154            A list of the elements in the viewport.155        """156        page = self.page157        page_element_buffer = self.page_element_buffer158        start = time.time()159160        page_state_as_text = []161162        device_pixel_ratio: float = page.evaluate("window.devicePixelRatio")163        if platform == "darwin" and device_pixel_ratio == 1:  # lies164            device_pixel_ratio = 2165166        win_upper_bound: float = page.evaluate("window.pageYOffset")167        win_left_bound: float = page.evaluate("window.pageXOffset")168        win_width: float = page.evaluate("window.screen.width")169        win_height: float = page.evaluate("window.screen.height")170        win_right_bound: float = win_left_bound + win_width171        win_lower_bound: float = win_upper_bound + win_height172173        # 	percentage_progress_start = (win_upper_bound / document_scroll_height) * 100174        # 	percentage_progress_end = (175        # 		(win_height + win_upper_bound) / document_scroll_height176        # 	) * 100177        percentage_progress_start = 1178        percentage_progress_end = 2179180        page_state_as_text.append(181            {182                "x": 0,183                "y": 0,184                "text": f"[scrollbar {percentage_progress_start:0.2f}-"185                f"{percentage_progress_end:0.2f}%]",186            }187        )188189        tree = self.client.send(190            "DOMSnapshot.captureSnapshot",191            {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},192        )193        strings: dict[int, str] = tree["strings"]194        document: dict[str, Any] = tree["documents"][0]195        nodes: dict[str, Any] = document["nodes"]196        backend_node_id: dict[int, int] = nodes["backendNodeId"]197        attributes: dict[int, dict[int, Any]] = nodes["attributes"]198        node_value: dict[int, int] = nodes["nodeValue"]199        parent: dict[int, int] = nodes["parentIndex"]200        node_names: dict[int, int] = nodes["nodeName"]201        is_clickable: set[int] = set(nodes["isClickable"]["index"])202203        input_value: dict[str, Any] = nodes["inputValue"]204        input_value_index: list[int] = input_value["index"]205        input_value_values: list[int] = input_value["value"]206207        layout: dict[str, Any] = document["layout"]208        layout_node_index: list[int] = layout["nodeIndex"]209        bounds: dict[int, list[float]] = layout["bounds"]210211        cursor: int = 0212213        child_nodes: dict[str, list[dict[str, Any]]] = {}214        elements_in_view_port: list[ElementInViewPort] = []215216        anchor_ancestry: dict[str, tuple[bool, int | None]] = {"-1": (False, None)}217        button_ancestry: dict[str, tuple[bool, int | None]] = {"-1": (False, None)}218219        def convert_name(220            node_name: str | None,221            has_click_handler: bool | None,  # noqa: FBT001222        ) -> str:223            if node_name == "a":224                return "link"225            if node_name == "input":226                return "input"227            if node_name == "img":228                return "img"229            if (230                node_name == "button" or has_click_handler231            ):  # found pages that needed this quirk232                return "button"233            return "text"234235        def find_attributes(236            attributes: dict[int, Any], keys: list[str]237        ) -> dict[str, str]:238            values = {}239240            for [key_index, value_index] in zip(*(iter(attributes),) * 2, strict=False):241                if value_index < 0:242                    continue243                key = strings[key_index]244                value = strings[value_index]245246                if key in keys:247                    values[key] = value248                    keys.remove(key)249250                    if not keys:251                        return values252253            return values254255        def add_to_hash_tree(256            hash_tree: dict[str, tuple[bool, int | None]],257            tag: str,258            node_id: int,259            node_name: str | None,260            parent_id: int,261        ) -> tuple[bool, int | None]:262            parent_id_str = str(parent_id)263            if parent_id_str not in hash_tree:264                parent_name = strings[node_names[parent_id]].lower()265                grand_parent_id = parent[parent_id]266267                add_to_hash_tree(268                    hash_tree, tag, parent_id, parent_name, grand_parent_id269                )270271            is_parent_desc_anchor, anchor_id = hash_tree[parent_id_str]272273            # even if the anchor is nested in another anchor, we set the "root" for all274            # descendants to be ::Self275            if node_name == tag:276                value: tuple[bool, int | None] = (True, node_id)277            elif (278                is_parent_desc_anchor279            ):  # reuse the parent's anchor_id (which could be much higher in the tree)280                value = (True, anchor_id)281            else:282                value = (283                    False,284                    None,285                )286                # not a descendant of an anchor, most likely it will become text, an287                # interactive element or discarded288289            hash_tree[str(node_id)] = value290291            return value292293        for index, node_name_index in enumerate(node_names):294            node_parent = parent[index]295            node_name: str | None = strings[node_name_index].lower()296297            is_ancestor_of_anchor, anchor_id = add_to_hash_tree(298                anchor_ancestry, "a", index, node_name, node_parent299            )300301            is_ancestor_of_button, button_id = add_to_hash_tree(302                button_ancestry, "button", index, node_name, node_parent303            )304305            try:306                cursor = layout_node_index.index(index)307                # TODO: replace this with proper cursoring, ignoring the fact this is308                # O(n^2) for the moment309            except ValueError:310                continue311312            if node_name in black_listed_elements:313                continue314315            [x, y, width, height] = bounds[cursor]316            x /= device_pixel_ratio317            y /= device_pixel_ratio318            width /= device_pixel_ratio319            height /= device_pixel_ratio320321            elem_left_bound = x322            elem_top_bound = y323            elem_right_bound = x + width324            elem_lower_bound = y + height325326            partially_is_in_viewport = (327                elem_left_bound < win_right_bound328                and elem_right_bound >= win_left_bound329                and elem_top_bound < win_lower_bound330                and elem_lower_bound >= win_upper_bound331            )332333            if not partially_is_in_viewport:334                continue335336            meta_data: list[str] = []337338            # inefficient to grab the same set of keys for kinds of objects, but it's339            # fine for now340            element_attributes = find_attributes(341                attributes[index], ["type", "placeholder", "aria-label", "title", "alt"]342            )343344            ancestor_exception = is_ancestor_of_anchor or is_ancestor_of_button345            ancestor_node_key = (346                None347                if not ancestor_exception348                else str(anchor_id)349                if is_ancestor_of_anchor350                else str(button_id)351            )352            ancestor_node = (353                None354                if not ancestor_exception355                else child_nodes.setdefault(str(ancestor_node_key), [])356            )357358            if node_name == "#text" and ancestor_exception and ancestor_node:359                text = strings[node_value[index]]360                if text in {"|", "•"}:361                    continue362                ancestor_node.append({"type": "type", "value": text})363            else:364                if (365                    node_name == "input" and element_attributes.get("type") == "submit"366                ) or node_name == "button":367                    node_name = "button"368                    element_attributes.pop(369                        "type", None370                    )  # prevent [button ... (button)..]371372                for key in element_attributes:373                    if ancestor_exception and ancestor_node:374                        ancestor_node.append(375                            {376                                "type": "attribute",377                                "key": key,378                                "value": element_attributes[key],379                            }380                        )381                    else:382                        meta_data.append(element_attributes[key])383384            element_node_value = None385386            if node_value[index] >= 0:387                element_node_value = strings[node_value[index]]388                if (389                    element_node_value == "|"390                    # commonly used as a separator, does not add much context - lets391                    # save ourselves some token space392                ):393                    continue394            elif (395                node_name == "input"396                and index in input_value_index397                and element_node_value is None398            ):399                node_input_text_index = input_value_index.index(index)400                text_index = input_value_values[node_input_text_index]401                if node_input_text_index >= 0 and text_index >= 0:402                    element_node_value = strings[text_index]403404            # remove redundant elements405            if ancestor_exception and (node_name not in {"a", "button"}):406                continue407408            elements_in_view_port.append(409                {410                    "node_index": str(index),411                    "backend_node_id": backend_node_id[index],412                    "node_name": node_name,413                    "node_value": element_node_value,414                    "node_meta": meta_data,415                    "is_clickable": index in is_clickable,416                    "origin_x": int(x),417                    "origin_y": int(y),418                    "center_x": int(x + (width / 2)),419                    "center_y": int(y + (height / 2)),420                }421            )422423        # lets filter further to remove anything that does not hold any text nor has424        # click handlers + merge text from leaf#text nodes with the parent425        elements_of_interest = []426        id_counter = 0427428        for element in elements_in_view_port:429            node_index = element.get("node_index")430            node_name = element.get("node_name")431            element_node_value = element.get("node_value")432            node_is_clickable = element.get("is_clickable")433            node_meta_data: list[str] | None = element.get("node_meta")434435            inner_text = f"{element_node_value} " if element_node_value else ""436            meta = ""437438            if node_index in child_nodes:439                for child in child_nodes[node_index]:440                    entry_type = child.get("type")441                    entry_value = child.get("value")442443                    if entry_type == "attribute" and node_meta_data:444                        entry_key = child.get("key")445                        node_meta_data.append(f'{entry_key}="{entry_value}"')446                    else:447                        inner_text += f"{entry_value} "448449            if node_meta_data:450                meta_string = " ".join(node_meta_data)451                meta = f" {meta_string}"452453            if inner_text != "":454                inner_text = f"{inner_text.strip()}"455456            converted_node_name = convert_name(node_name, node_is_clickable)457458            # not very elegant, more like a placeholder459            if (460                (converted_node_name != "button" or meta == "")461                and converted_node_name not in {"link", "input", "img", "textarea"}462            ) and inner_text.strip() == "":463                continue464465            page_element_buffer[id_counter] = element466467            if inner_text != "":468                elements_of_interest.append(469                    f"<{converted_node_name} id={id_counter}{meta}>{inner_text}"470                    f"</{converted_node_name}>"471                )472            else:473                elements_of_interest.append(474                    f"""<{converted_node_name} id={id_counter}{meta}/>"""475                )476            id_counter += 1477478        print(f"Parsing time: {time.time() - start:0.2f} seconds")  # noqa: T201479        return elements_of_interest