Ensure functions have docstrings for documentation
def update(
1"""Base classes for indexing."""23from __future__ import annotations45import abc6import time7from abc import ABC, abstractmethod8from typing import TYPE_CHECKING, Any, TypedDict910from typing_extensions import override1112from langchain_core._api import beta13from langchain_core.retrievers import BaseRetriever14from langchain_core.runnables import run_in_executor1516if TYPE_CHECKING:17 from collections.abc import Sequence1819 from langchain_core.documents import Document202122class RecordManager(ABC):23 """Abstract base class representing the interface for a record manager.2425 The record manager abstraction is used by the langchain indexing API.2627 The record manager keeps track of which documents have been28 written into a `VectorStore` and when they were written.2930 The indexing API computes hashes for each document and stores the hash31 together with the write time and the source id in the record manager.3233 On subsequent indexing runs, the indexing API can check the record manager34 to determine which documents have already been indexed and which have not.3536 This allows the indexing API to avoid re-indexing documents that have37 already been indexed, and to only index new documents.3839 The main benefit of this abstraction is that it works across many vectorstores.40 To be supported, a `VectorStore` needs to only support the ability to add and41 delete documents by ID. Using the record manager, the indexing API will42 be able to delete outdated documents and avoid redundant indexing of documents43 that have already been indexed.4445 The main constraints of this abstraction are:4647 1. It relies on the time-stamps to determine which documents have been48 indexed and which have not. This means that the time-stamps must be49 monotonically increasing. The timestamp should be the timestamp50 as measured by the server to minimize issues.51 2. The record manager is currently implemented separately from the52 vectorstore, which means that the overall system becomes distributed53 and may create issues with consistency. For example, writing to54 record manager succeeds, but corresponding writing to `VectorStore` fails.55 """5657 def __init__(58 self,59 namespace: str,60 ) -> None:61 """Initialize the record manager.6263 Args:64 namespace: The namespace for the record manager.65 """66 self.namespace = namespace6768 @abstractmethod69 def create_schema(self) -> None:70 """Create the database schema for the record manager."""7172 @abstractmethod73 async def acreate_schema(self) -> None:74 """Asynchronously create the database schema for the record manager."""7576 @abstractmethod77 def get_time(self) -> float:78 """Get the current server time as a high resolution timestamp!7980 It's important to get this from the server to ensure a monotonic clock,81 otherwise there may be data loss when cleaning up old documents!8283 Returns:84 The current server time as a float timestamp.85 """8687 @abstractmethod88 async def aget_time(self) -> float:89 """Asynchronously get the current server time as a high resolution timestamp.9091 It's important to get this from the server to ensure a monotonic clock,92 otherwise there may be data loss when cleaning up old documents!9394 Returns:95 The current server time as a float timestamp.96 """9798 @abstractmethod99 def update(100 self,101 keys: Sequence[str],102 *,103 group_ids: Sequence[str | None] | None = None,104 time_at_least: float | None = None,105 ) -> None:106 """Upsert records into the database.107108 Args:109 keys: A list of record keys to upsert.110 group_ids: A list of group IDs corresponding to the keys.111 time_at_least: Optional timestamp. Implementation can use this112 to optionally verify that the timestamp IS at least this time113 in the system that stores the data.114115 e.g., use to validate that the time in the postgres database116 is equal to or larger than the given timestamp, if not117 raise an error.118119 This is meant to help prevent time-drift issues since120 time may not be monotonically increasing!121122 Raises:123 ValueError: If the length of keys doesn't match the length of group_ids.124 """125126 @abstractmethod127 async def aupdate(128 self,129 keys: Sequence[str],130 *,131 group_ids: Sequence[str | None] | None = None,132 time_at_least: float | None = None,133 ) -> None:134 """Asynchronously upsert records into the database.135136 Args:137 keys: A list of record keys to upsert.138 group_ids: A list of group IDs corresponding to the keys.139 time_at_least: Optional timestamp. Implementation can use this140 to optionally verify that the timestamp IS at least this time141 in the system that stores the data.142143 e.g., use to validate that the time in the postgres database144 is equal to or larger than the given timestamp, if not145 raise an error.146147 This is meant to help prevent time-drift issues since148 time may not be monotonically increasing!149150 Raises:151 ValueError: If the length of keys doesn't match the length of group_ids.152 """153154 @abstractmethod155 def exists(self, keys: Sequence[str]) -> list[bool]:156 """Check if the provided keys exist in the database.157158 Args:159 keys: A list of keys to check.160161 Returns:162 A list of boolean values indicating the existence of each key.163 """164165 @abstractmethod166 async def aexists(self, keys: Sequence[str]) -> list[bool]:167 """Asynchronously check if the provided keys exist in the database.168169 Args:170 keys: A list of keys to check.171172 Returns:173 A list of boolean values indicating the existence of each key.174 """175176 @abstractmethod177 def list_keys(178 self,179 *,180 before: float | None = None,181 after: float | None = None,182 group_ids: Sequence[str] | None = None,183 limit: int | None = None,184 ) -> list[str]:185 """List records in the database based on the provided filters.186187 Args:188 before: Filter to list records updated before this time.189 after: Filter to list records updated after this time.190 group_ids: Filter to list records with specific group IDs.191 limit: optional limit on the number of records to return.192193 Returns:194 A list of keys for the matching records.195 """196197 @abstractmethod198 async def alist_keys(199 self,200 *,201 before: float | None = None,202 after: float | None = None,203 group_ids: Sequence[str] | None = None,204 limit: int | None = None,205 ) -> list[str]:206 """Asynchronously list records in the database based on the provided filters.207208 Args:209 before: Filter to list records updated before this time.210 after: Filter to list records updated after this time.211 group_ids: Filter to list records with specific group IDs.212 limit: optional limit on the number of records to return.213214 Returns:215 A list of keys for the matching records.216 """217218 @abstractmethod219 def delete_keys(self, keys: Sequence[str]) -> None:220 """Delete specified records from the database.221222 Args:223 keys: A list of keys to delete.224 """225226 @abstractmethod227 async def adelete_keys(self, keys: Sequence[str]) -> None:228 """Asynchronously delete specified records from the database.229230 Args:231 keys: A list of keys to delete.232 """233234235class _Record(TypedDict):236 group_id: str | None237 updated_at: float238239240class InMemoryRecordManager(RecordManager):241 """An in-memory record manager for testing purposes."""242243 def __init__(self, namespace: str) -> None:244 """Initialize the in-memory record manager.245246 Args:247 namespace: The namespace for the record manager.248 """249 super().__init__(namespace)250 # Each key points to a dictionary251 # of {'group_id': group_id, 'updated_at': timestamp}252 self.records: dict[str, _Record] = {}253 self.namespace = namespace254255 def create_schema(self) -> None:256 """In-memory schema creation is simply ensuring the structure is initialized."""257258 async def acreate_schema(self) -> None:259 """In-memory schema creation is simply ensuring the structure is initialized."""260261 @override262 def get_time(self) -> float:263 return time.time()264265 @override266 async def aget_time(self) -> float:267 return self.get_time()268269 def update(270 self,271 keys: Sequence[str],272 *,273 group_ids: Sequence[str | None] | None = None,274 time_at_least: float | None = None,275 ) -> None:276 """Upsert records into the database.277278 Args:279 keys: A list of record keys to upsert.280 group_ids: A list of group IDs corresponding to the keys.281282 time_at_least: Optional timestamp. Implementation can use this283 to optionally verify that the timestamp IS at least this time284 in the system that stores.285 E.g., use to validate that the time in the postgres database286 is equal to or larger than the given timestamp, if not287 raise an error.288 This is meant to help prevent time-drift issues since289 time may not be monotonically increasing!290291 Raises:292 ValueError: If the length of keys doesn't match the length of group293 ids.294 ValueError: If time_at_least is in the future.295 """296 if group_ids and len(keys) != len(group_ids):297 msg = "Length of keys must match length of group_ids"298 raise ValueError(msg)299 for index, key in enumerate(keys):300 group_id = group_ids[index] if group_ids else None301 if time_at_least and time_at_least > self.get_time():302 msg = "time_at_least must be in the past"303 raise ValueError(msg)304 self.records[key] = {"group_id": group_id, "updated_at": self.get_time()}305306 async def aupdate(307 self,308 keys: Sequence[str],309 *,310 group_ids: Sequence[str | None] | None = None,311 time_at_least: float | None = None,312 ) -> None:313 """Async upsert records into the database.314315 Args:316 keys: A list of record keys to upsert.317 group_ids: A list of group IDs corresponding to the keys.318319 time_at_least: Optional timestamp. Implementation can use this320 to optionally verify that the timestamp IS at least this time321 in the system that stores.322 E.g., use to validate that the time in the postgres database323 is equal to or larger than the given timestamp, if not324 raise an error.325 This is meant to help prevent time-drift issues since326 time may not be monotonically increasing!327 """328 self.update(keys, group_ids=group_ids, time_at_least=time_at_least)329330 def exists(self, keys: Sequence[str]) -> list[bool]:331 """Check if the provided keys exist in the database.332333 Args:334 keys: A list of keys to check.335336 Returns:337 A list of boolean values indicating the existence of each key.338 """339 return [key in self.records for key in keys]340341 async def aexists(self, keys: Sequence[str]) -> list[bool]:342 """Async check if the provided keys exist in the database.343344 Args:345 keys: A list of keys to check.346347 Returns:348 A list of boolean values indicating the existence of each key.349 """350 return self.exists(keys)351352 def list_keys(353 self,354 *,355 before: float | None = None,356 after: float | None = None,357 group_ids: Sequence[str] | None = None,358 limit: int | None = None,359 ) -> list[str]:360 """List records in the database based on the provided filters.361362 Args:363 before: Filter to list records updated before this time.364365 after: Filter to list records updated after this time.366367 group_ids: Filter to list records with specific group IDs.368369 limit: optional limit on the number of records to return.370371372 Returns:373 A list of keys for the matching records.374 """375 result = []376 for key, data in self.records.items():377 if before and data["updated_at"] >= before:378 continue379 if after and data["updated_at"] <= after:380 continue381 if group_ids and data["group_id"] not in group_ids:382 continue383 result.append(key)384 if limit:385 return result[:limit]386 return result387388 async def alist_keys(389 self,390 *,391 before: float | None = None,392 after: float | None = None,393 group_ids: Sequence[str] | None = None,394 limit: int | None = None,395 ) -> list[str]:396 """Async list records in the database based on the provided filters.397398 Args:399 before: Filter to list records updated before this time.400401 after: Filter to list records updated after this time.402403 group_ids: Filter to list records with specific group IDs.404405 limit: optional limit on the number of records to return.406407408 Returns:409 A list of keys for the matching records.410 """411 return self.list_keys(412 before=before, after=after, group_ids=group_ids, limit=limit413 )414415 def delete_keys(self, keys: Sequence[str]) -> None:416 """Delete specified records from the database.417418 Args:419 keys: A list of keys to delete.420 """421 for key in keys:422 if key in self.records:423 del self.records[key]424425 async def adelete_keys(self, keys: Sequence[str]) -> None:426 """Async delete specified records from the database.427428 Args:429 keys: A list of keys to delete.430 """431 self.delete_keys(keys)432433434class UpsertResponse(TypedDict):435 """A generic response for upsert operations.436437 The upsert response will be used by abstractions that implement an upsert438 operation for content that can be upserted by ID.439440 Upsert APIs that accept inputs with IDs and generate IDs internally441 will return a response that includes the IDs that succeeded and the IDs442 that failed.443444 If there are no failures, the failed list will be empty, and the order445 of the IDs in the succeeded list will match the order of the input documents.446447 If there are failures, the response becomes ill defined, and a user of the API448 cannot determine which generated ID corresponds to which input document.449450 It is recommended for users explicitly attach the IDs to the items being451 indexed to avoid this issue.452 """453454 succeeded: list[str]455 """The IDs that were successfully indexed."""456 failed: list[str]457 """The IDs that failed to index."""458459460class DeleteResponse(TypedDict, total=False):461 """A generic response for delete operation.462463 The fields in this response are optional and whether the `VectorStore`464 returns them or not is up to the implementation.465 """466467 num_deleted: int468 """The number of items that were successfully deleted.469470 If returned, this should only include *actual* deletions.471472 If the ID did not exist to begin with,473 it should not be included in this count.474 """475476 succeeded: Sequence[str]477 """The IDs that were successfully deleted.478479 If returned, this should only include *actual* deletions.480481 If the ID did not exist to begin with,482 it should not be included in this list.483 """484485 failed: Sequence[str]486 """The IDs that failed to be deleted.487488 !!! warning489 Deleting an ID that does not exist is **NOT** considered a failure.490 """491492 num_failed: int493 """The number of items that failed to be deleted."""494495496@beta(message="Added in 0.2.29. The abstraction is subject to change.")497class DocumentIndex(BaseRetriever):498 """A document retriever that supports indexing operations.499500 This indexing interface is designed to be a generic abstraction for storing and501 querying documents that has an ID and metadata associated with it.502503 The interface is designed to be agnostic to the underlying implementation of the504 indexing system.505506 The interface is designed to support the following operations:507508 1. Storing document in the index.509 2. Fetching document by ID.510 3. Searching for document using a query.511 """512513 @abc.abstractmethod514 def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:515 """Upsert documents into the index.516517 The upsert functionality should utilize the ID field of the content object518 if it is provided. If the ID is not provided, the upsert method is free519 to generate an ID for the content.520521 When an ID is specified and the content already exists in the `VectorStore`,522 the upsert method should update the content with the new data. If the content523 does not exist, the upsert method should add the item to the `VectorStore`.524525 Args:526 items: Sequence of documents to add to the `VectorStore`.527 **kwargs: Additional keyword arguments.528529 Returns:530 A response object that contains the list of IDs that were531 successfully added or updated in the `VectorStore` and the list of IDs that532 failed to be added or updated.533 """534535 async def aupsert(536 self, items: Sequence[Document], /, **kwargs: Any537 ) -> UpsertResponse:538 """Add or update documents in the `VectorStore`. Async version of `upsert`.539540 The upsert functionality should utilize the ID field of the item541 if it is provided. If the ID is not provided, the upsert method is free542 to generate an ID for the item.543544 When an ID is specified and the item already exists in the `VectorStore`,545 the upsert method should update the item with the new data. If the item546 does not exist, the upsert method should add the item to the `VectorStore`.547548 Args:549 items: Sequence of documents to add to the `VectorStore`.550 **kwargs: Additional keyword arguments.551552 Returns:553 A response object that contains the list of IDs that were554 successfully added or updated in the `VectorStore` and the list of IDs that555 failed to be added or updated.556 """557 return await run_in_executor(558 None,559 self.upsert,560 items,561 **kwargs,562 )563564 @abc.abstractmethod565 def delete(self, ids: list[str] | None = None, **kwargs: Any) -> DeleteResponse:566 """Delete by IDs or other criteria.567568 Calling delete without any input parameters should raise a ValueError!569570 Args:571 ids: List of IDs to delete.572 **kwargs: Additional keyword arguments. This is up to the implementation.573 For example, can include an option to delete the entire index,574 or else issue a non-blocking delete etc.575576 Returns:577 A response object that contains the list of IDs that were578 successfully deleted and the list of IDs that failed to be deleted.579 """580581 async def adelete(582 self, ids: list[str] | None = None, **kwargs: Any583 ) -> DeleteResponse:584 """Delete by IDs or other criteria. Async variant.585586 Calling adelete without any input parameters should raise a ValueError!587588 Args:589 ids: List of IDs to delete.590 **kwargs: Additional keyword arguments. This is up to the implementation.591 For example, can include an option to delete the entire index.592593 Returns:594 A response object that contains the list of IDs that were595 successfully deleted and the list of IDs that failed to be deleted.596 """597 return await run_in_executor(598 None,599 self.delete,600 ids,601 **kwargs,602 )603604 @abc.abstractmethod605 def get(606 self,607 ids: Sequence[str],608 /,609 **kwargs: Any,610 ) -> list[Document]:611 """Get documents by id.612613 Fewer documents may be returned than requested if some IDs are not found or614 if there are duplicated IDs.615616 Users should not assume that the order of the returned documents matches617 the order of the input IDs. Instead, users should rely on the ID field of the618 returned documents.619620 This method should **NOT** raise exceptions if no documents are found for621 some IDs.622623 Args:624 ids: List of IDs to get.625 **kwargs: Additional keyword arguments. These are up to the implementation.626627 Returns:628 List of documents that were found.629 """630631 async def aget(632 self,633 ids: Sequence[str],634 /,635 **kwargs: Any,636 ) -> list[Document]:637 """Get documents by id.638639 Fewer documents may be returned than requested if some IDs are not found or640 if there are duplicated IDs.641642 Users should not assume that the order of the returned documents matches643 the order of the input IDs. Instead, users should rely on the ID field of the644 returned documents.645646 This method should **NOT** raise exceptions if no documents are found for647 some IDs.648649 Args:650 ids: List of IDs to get.651 **kwargs: Additional keyword arguments. These are up to the implementation.652653 Returns:654 List of documents that were found.655 """656 return await run_in_executor(657 None,658 self.get,659 ids,660 **kwargs,661 )
Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.