Use logging module for better control and configurability
print("Could not find element") # noqa: T201
1import logging2import time3from sys import platform4from typing import (5 TYPE_CHECKING,6 Any,7 TypedDict,8)910if TYPE_CHECKING:11 from playwright.sync_api import Browser, CDPSession, Page1213logger = logging.getLogger(__name__)1415black_listed_elements: set[str] = {16 "html",17 "head",18 "title",19 "meta",20 "iframe",21 "body",22 "script",23 "style",24 "path",25 "svg",26 "br",27 "::marker",28}293031class ElementInViewPort(TypedDict):32 """A typed dictionary containing information about elements in the viewport."""3334 node_index: str35 backend_node_id: int36 node_name: str | None37 node_value: str | None38 node_meta: list[str]39 is_clickable: bool40 origin_x: int41 origin_y: int42 center_x: int43 center_y: int444546class Crawler:47 """A crawler for web pages.4849 **Security Note**: This is an implementation of a crawler that uses a browser via50 Playwright.5152 This crawler can be used to load arbitrary webpages INCLUDING content53 from the local file system.5455 Control access to who can submit crawling requests and what network access56 the crawler has.5758 Make sure to scope permissions to the minimal permissions necessary for59 the application.6061 See https://docs.langchain.com/oss/python/security-policy for more information.62 """6364 def __init__(self) -> None:65 """Initialize the crawler."""66 try:67 from playwright.sync_api import sync_playwright68 except ImportError as e:69 msg = (70 "Could not import playwright python package. "71 "Please install it with `pip install playwright`."72 )73 raise ImportError(msg) from e74 self.browser: Browser = (75 sync_playwright().start().chromium.launch(headless=False)76 )77 self.page: Page = self.browser.new_page()78 self.page.set_viewport_size({"width": 1280, "height": 1080})79 self.page_element_buffer: dict[int, ElementInViewPort]80 self.client: CDPSession8182 def go_to_page(self, url: str) -> None:83 """Navigate to the given URL.8485 Args:86 url: The URL to navigate to. If it does not contain a scheme, it will be87 prefixed with "http://".88 """89 self.page.goto(url=url if "://" in url else "http://" + url)90 self.client = self.page.context.new_cdp_session(self.page)91 self.page_element_buffer = {}9293 def scroll(self, direction: str) -> None:94 """Scroll the page in the given direction.9596 Args:97 direction: The direction to scroll in, either "up" or "down".98 """99 if direction == "up":100 self.page.evaluate(101 "(document.scrollingElement || document.body).scrollTop = "102 "(document.scrollingElement || document.body).scrollTop - "103 "window.innerHeight;"104 )105 elif direction == "down":106 self.page.evaluate(107 "(document.scrollingElement || document.body).scrollTop = "108 "(document.scrollingElement || document.body).scrollTop + "109 "window.innerHeight;"110 )111112 def click(self, id_: str | int) -> None:113 """Click on an element with the given id.114115 Args:116 id_: The id of the element to click on.117 """118 # Inject javascript into the page which removes the target= attribute from links119 js = """120 links = document.getElementsByTagName("a");121 for (var i = 0; i < links.length; i++) {122 links[i].removeAttribute("target");123 }124 """125 self.page.evaluate(js)126127 element = self.page_element_buffer.get(int(id_))128 if element:129 x: float = element["center_x"]130 y: float = element["center_y"]131132 self.page.mouse.click(x, y)133 else:134 print("Could not find element") # noqa: T201135136 def type(self, id_: str | int, text: str) -> None:137 """Type text into an element with the given id.138139 Args:140 id_: The id of the element to type into.141 text: The text to type into the element.142 """143 self.click(id_)144 self.page.keyboard.type(text)145146 def enter(self) -> None:147 """Press the Enter key."""148 self.page.keyboard.press("Enter")149150 def crawl(self) -> list[str]:151 """Crawl the current page.152153 Returns:154 A list of the elements in the viewport.155 """156 page = self.page157 page_element_buffer = self.page_element_buffer158 start = time.time()159160 page_state_as_text = []161162 device_pixel_ratio: float = page.evaluate("window.devicePixelRatio")163 if platform == "darwin" and device_pixel_ratio == 1: # lies164 device_pixel_ratio = 2165166 win_upper_bound: float = page.evaluate("window.pageYOffset")167 win_left_bound: float = page.evaluate("window.pageXOffset")168 win_width: float = page.evaluate("window.screen.width")169 win_height: float = page.evaluate("window.screen.height")170 win_right_bound: float = win_left_bound + win_width171 win_lower_bound: float = win_upper_bound + win_height172173 # percentage_progress_start = (win_upper_bound / document_scroll_height) * 100174 # percentage_progress_end = (175 # (win_height + win_upper_bound) / document_scroll_height176 # ) * 100177 percentage_progress_start = 1178 percentage_progress_end = 2179180 page_state_as_text.append(181 {182 "x": 0,183 "y": 0,184 "text": f"[scrollbar {percentage_progress_start:0.2f}-"185 f"{percentage_progress_end:0.2f}%]",186 }187 )188189 tree = self.client.send(190 "DOMSnapshot.captureSnapshot",191 {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},192 )193 strings: dict[int, str] = tree["strings"]194 document: dict[str, Any] = tree["documents"][0]195 nodes: dict[str, Any] = document["nodes"]196 backend_node_id: dict[int, int] = nodes["backendNodeId"]197 attributes: dict[int, dict[int, Any]] = nodes["attributes"]198 node_value: dict[int, int] = nodes["nodeValue"]199 parent: dict[int, int] = nodes["parentIndex"]200 node_names: dict[int, int] = nodes["nodeName"]201 is_clickable: set[int] = set(nodes["isClickable"]["index"])202203 input_value: dict[str, Any] = nodes["inputValue"]204 input_value_index: list[int] = input_value["index"]205 input_value_values: list[int] = input_value["value"]206207 layout: dict[str, Any] = document["layout"]208 layout_node_index: list[int] = layout["nodeIndex"]209 bounds: dict[int, list[float]] = layout["bounds"]210211 cursor: int = 0212213 child_nodes: dict[str, list[dict[str, Any]]] = {}214 elements_in_view_port: list[ElementInViewPort] = []215216 anchor_ancestry: dict[str, tuple[bool, int | None]] = {"-1": (False, None)}217 button_ancestry: dict[str, tuple[bool, int | None]] = {"-1": (False, None)}218219 def convert_name(220 node_name: str | None,221 has_click_handler: bool | None, # noqa: FBT001222 ) -> str:223 if node_name == "a":224 return "link"225 if node_name == "input":226 return "input"227 if node_name == "img":228 return "img"229 if (230 node_name == "button" or has_click_handler231 ): # found pages that needed this quirk232 return "button"233 return "text"234235 def find_attributes(236 attributes: dict[int, Any], keys: list[str]237 ) -> dict[str, str]:238 values = {}239240 for [key_index, value_index] in zip(*(iter(attributes),) * 2, strict=False):241 if value_index < 0:242 continue243 key = strings[key_index]244 value = strings[value_index]245246 if key in keys:247 values[key] = value248 keys.remove(key)249250 if not keys:251 return values252253 return values254255 def add_to_hash_tree(256 hash_tree: dict[str, tuple[bool, int | None]],257 tag: str,258 node_id: int,259 node_name: str | None,260 parent_id: int,261 ) -> tuple[bool, int | None]:262 parent_id_str = str(parent_id)263 if parent_id_str not in hash_tree:264 parent_name = strings[node_names[parent_id]].lower()265 grand_parent_id = parent[parent_id]266267 add_to_hash_tree(268 hash_tree, tag, parent_id, parent_name, grand_parent_id269 )270271 is_parent_desc_anchor, anchor_id = hash_tree[parent_id_str]272273 # even if the anchor is nested in another anchor, we set the "root" for all274 # descendants to be ::Self275 if node_name == tag:276 value: tuple[bool, int | None] = (True, node_id)277 elif (278 is_parent_desc_anchor279 ): # reuse the parent's anchor_id (which could be much higher in the tree)280 value = (True, anchor_id)281 else:282 value = (283 False,284 None,285 )286 # not a descendant of an anchor, most likely it will become text, an287 # interactive element or discarded288289 hash_tree[str(node_id)] = value290291 return value292293 for index, node_name_index in enumerate(node_names):294 node_parent = parent[index]295 node_name: str | None = strings[node_name_index].lower()296297 is_ancestor_of_anchor, anchor_id = add_to_hash_tree(298 anchor_ancestry, "a", index, node_name, node_parent299 )300301 is_ancestor_of_button, button_id = add_to_hash_tree(302 button_ancestry, "button", index, node_name, node_parent303 )304305 try:306 cursor = layout_node_index.index(index)307 # TODO: replace this with proper cursoring, ignoring the fact this is308 # O(n^2) for the moment309 except ValueError:310 continue311312 if node_name in black_listed_elements:313 continue314315 [x, y, width, height] = bounds[cursor]316 x /= device_pixel_ratio317 y /= device_pixel_ratio318 width /= device_pixel_ratio319 height /= device_pixel_ratio320321 elem_left_bound = x322 elem_top_bound = y323 elem_right_bound = x + width324 elem_lower_bound = y + height325326 partially_is_in_viewport = (327 elem_left_bound < win_right_bound328 and elem_right_bound >= win_left_bound329 and elem_top_bound < win_lower_bound330 and elem_lower_bound >= win_upper_bound331 )332333 if not partially_is_in_viewport:334 continue335336 meta_data: list[str] = []337338 # inefficient to grab the same set of keys for kinds of objects, but it's339 # fine for now340 element_attributes = find_attributes(341 attributes[index], ["type", "placeholder", "aria-label", "title", "alt"]342 )343344 ancestor_exception = is_ancestor_of_anchor or is_ancestor_of_button345 ancestor_node_key = (346 None347 if not ancestor_exception348 else str(anchor_id)349 if is_ancestor_of_anchor350 else str(button_id)351 )352 ancestor_node = (353 None354 if not ancestor_exception355 else child_nodes.setdefault(str(ancestor_node_key), [])356 )357358 if node_name == "#text" and ancestor_exception and ancestor_node:359 text = strings[node_value[index]]360 if text in {"|", "•"}:361 continue362 ancestor_node.append({"type": "type", "value": text})363 else:364 if (365 node_name == "input" and element_attributes.get("type") == "submit"366 ) or node_name == "button":367 node_name = "button"368 element_attributes.pop(369 "type", None370 ) # prevent [button ... (button)..]371372 for key in element_attributes:373 if ancestor_exception and ancestor_node:374 ancestor_node.append(375 {376 "type": "attribute",377 "key": key,378 "value": element_attributes[key],379 }380 )381 else:382 meta_data.append(element_attributes[key])383384 element_node_value = None385386 if node_value[index] >= 0:387 element_node_value = strings[node_value[index]]388 if (389 element_node_value == "|"390 # commonly used as a separator, does not add much context - lets391 # save ourselves some token space392 ):393 continue394 elif (395 node_name == "input"396 and index in input_value_index397 and element_node_value is None398 ):399 node_input_text_index = input_value_index.index(index)400 text_index = input_value_values[node_input_text_index]401 if node_input_text_index >= 0 and text_index >= 0:402 element_node_value = strings[text_index]403404 # remove redundant elements405 if ancestor_exception and (node_name not in {"a", "button"}):406 continue407408 elements_in_view_port.append(409 {410 "node_index": str(index),411 "backend_node_id": backend_node_id[index],412 "node_name": node_name,413 "node_value": element_node_value,414 "node_meta": meta_data,415 "is_clickable": index in is_clickable,416 "origin_x": int(x),417 "origin_y": int(y),418 "center_x": int(x + (width / 2)),419 "center_y": int(y + (height / 2)),420 }421 )422423 # lets filter further to remove anything that does not hold any text nor has424 # click handlers + merge text from leaf#text nodes with the parent425 elements_of_interest = []426 id_counter = 0427428 for element in elements_in_view_port:429 node_index = element.get("node_index")430 node_name = element.get("node_name")431 element_node_value = element.get("node_value")432 node_is_clickable = element.get("is_clickable")433 node_meta_data: list[str] | None = element.get("node_meta")434435 inner_text = f"{element_node_value} " if element_node_value else ""436 meta = ""437438 if node_index in child_nodes:439 for child in child_nodes[node_index]:440 entry_type = child.get("type")441 entry_value = child.get("value")442443 if entry_type == "attribute" and node_meta_data:444 entry_key = child.get("key")445 node_meta_data.append(f'{entry_key}="{entry_value}"')446 else:447 inner_text += f"{entry_value} "448449 if node_meta_data:450 meta_string = " ".join(node_meta_data)451 meta = f" {meta_string}"452453 if inner_text != "":454 inner_text = f"{inner_text.strip()}"455456 converted_node_name = convert_name(node_name, node_is_clickable)457458 # not very elegant, more like a placeholder459 if (460 (converted_node_name != "button" or meta == "")461 and converted_node_name not in {"link", "input", "img", "textarea"}462 ) and inner_text.strip() == "":463 continue464465 page_element_buffer[id_counter] = element466467 if inner_text != "":468 elements_of_interest.append(469 f"<{converted_node_name} id={id_counter}{meta}>{inner_text}"470 f"</{converted_node_name}>"471 )472 else:473 elements_of_interest.append(474 f"""<{converted_node_name} id={id_counter}{meta}/>"""475 )476 id_counter += 1477478 print(f"Parsing time: {time.time() - start:0.2f} seconds") # noqa: T201479 return elements_of_interest
Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.