libs/core/langchain_core/indexing/base.py · langchain-ai/langchain

1"""Base classes for indexing."""23from __future__ import annotations45import abc6import time7from abc import ABC, abstractmethod8from typing import TYPE_CHECKING, Any, TypedDict910from typing_extensions import override1112from langchain_core._api import beta13from langchain_core.retrievers import BaseRetriever14from langchain_core.runnables import run_in_executor1516if TYPE_CHECKING:17    from collections.abc import Sequence1819    from langchain_core.documents import Document202122class RecordManager(ABC):23    """Abstract base class representing the interface for a record manager.2425    The record manager abstraction is used by the langchain indexing API.2627    The record manager keeps track of which documents have been28    written into a `VectorStore` and when they were written.2930    The indexing API computes hashes for each document and stores the hash31    together with the write time and the source id in the record manager.3233    On subsequent indexing runs, the indexing API can check the record manager34    to determine which documents have already been indexed and which have not.3536    This allows the indexing API to avoid re-indexing documents that have37    already been indexed, and to only index new documents.3839    The main benefit of this abstraction is that it works across many vectorstores.40    To be supported, a `VectorStore` needs to only support the ability to add and41    delete documents by ID. Using the record manager, the indexing API will42    be able to delete outdated documents and avoid redundant indexing of documents43    that have already been indexed.4445    The main constraints of this abstraction are:4647    1. It relies on the time-stamps to determine which documents have been48        indexed and which have not. This means that the time-stamps must be49        monotonically increasing. The timestamp should be the timestamp50        as measured by the server to minimize issues.51    2. The record manager is currently implemented separately from the52        vectorstore, which means that the overall system becomes distributed53        and may create issues with consistency. For example, writing to54        record manager succeeds, but corresponding writing to `VectorStore` fails.55    """5657    def __init__(58        self,59        namespace: str,60    ) -> None:61        """Initialize the record manager.6263        Args:64            namespace: The namespace for the record manager.65        """66        self.namespace = namespace6768    @abstractmethod69    def create_schema(self) -> None:70        """Create the database schema for the record manager."""7172    @abstractmethod73    async def acreate_schema(self) -> None:74        """Asynchronously create the database schema for the record manager."""7576    @abstractmethod77    def get_time(self) -> float:78        """Get the current server time as a high resolution timestamp!7980        It's important to get this from the server to ensure a monotonic clock,81        otherwise there may be data loss when cleaning up old documents!8283        Returns:84            The current server time as a float timestamp.85        """8687    @abstractmethod88    async def aget_time(self) -> float:89        """Asynchronously get the current server time as a high resolution timestamp.9091        It's important to get this from the server to ensure a monotonic clock,92        otherwise there may be data loss when cleaning up old documents!9394        Returns:95            The current server time as a float timestamp.96        """9798    @abstractmethod99    def update(100        self,101        keys: Sequence[str],102        *,103        group_ids: Sequence[str | None] | None = None,104        time_at_least: float | None = None,105    ) -> None:106        """Upsert records into the database.107108        Args:109            keys: A list of record keys to upsert.110            group_ids: A list of group IDs corresponding to the keys.111            time_at_least: Optional timestamp. Implementation can use this112                to optionally verify that the timestamp IS at least this time113                in the system that stores the data.114115                e.g., use to validate that the time in the postgres database116                is equal to or larger than the given timestamp, if not117                raise an error.118119                This is meant to help prevent time-drift issues since120                time may not be monotonically increasing!121122        Raises:123            ValueError: If the length of keys doesn't match the length of group_ids.124        """125126    @abstractmethod127    async def aupdate(128        self,129        keys: Sequence[str],130        *,131        group_ids: Sequence[str | None] | None = None,132        time_at_least: float | None = None,133    ) -> None:134        """Asynchronously upsert records into the database.135136        Args:137            keys: A list of record keys to upsert.138            group_ids: A list of group IDs corresponding to the keys.139            time_at_least: Optional timestamp. Implementation can use this140                to optionally verify that the timestamp IS at least this time141                in the system that stores the data.142143                e.g., use to validate that the time in the postgres database144                is equal to or larger than the given timestamp, if not145                raise an error.146147                This is meant to help prevent time-drift issues since148                time may not be monotonically increasing!149150        Raises:151            ValueError: If the length of keys doesn't match the length of group_ids.152        """153154    @abstractmethod155    def exists(self, keys: Sequence[str]) -> list[bool]:156        """Check if the provided keys exist in the database.157158        Args:159            keys: A list of keys to check.160161        Returns:162            A list of boolean values indicating the existence of each key.163        """164165    @abstractmethod166    async def aexists(self, keys: Sequence[str]) -> list[bool]:167        """Asynchronously check if the provided keys exist in the database.168169        Args:170            keys: A list of keys to check.171172        Returns:173            A list of boolean values indicating the existence of each key.174        """175176    @abstractmethod177    def list_keys(178        self,179        *,180        before: float | None = None,181        after: float | None = None,182        group_ids: Sequence[str] | None = None,183        limit: int | None = None,184    ) -> list[str]:185        """List records in the database based on the provided filters.186187        Args:188            before: Filter to list records updated before this time.189            after: Filter to list records updated after this time.190            group_ids: Filter to list records with specific group IDs.191            limit: optional limit on the number of records to return.192193        Returns:194            A list of keys for the matching records.195        """196197    @abstractmethod198    async def alist_keys(199        self,200        *,201        before: float | None = None,202        after: float | None = None,203        group_ids: Sequence[str] | None = None,204        limit: int | None = None,205    ) -> list[str]:206        """Asynchronously list records in the database based on the provided filters.207208        Args:209            before: Filter to list records updated before this time.210            after: Filter to list records updated after this time.211            group_ids: Filter to list records with specific group IDs.212            limit: optional limit on the number of records to return.213214        Returns:215            A list of keys for the matching records.216        """217218    @abstractmethod219    def delete_keys(self, keys: Sequence[str]) -> None:220        """Delete specified records from the database.221222        Args:223            keys: A list of keys to delete.224        """225226    @abstractmethod227    async def adelete_keys(self, keys: Sequence[str]) -> None:228        """Asynchronously delete specified records from the database.229230        Args:231            keys: A list of keys to delete.232        """233234235class _Record(TypedDict):236    group_id: str | None237    updated_at: float238239240class InMemoryRecordManager(RecordManager):241    """An in-memory record manager for testing purposes."""242243    def __init__(self, namespace: str) -> None:244        """Initialize the in-memory record manager.245246        Args:247            namespace: The namespace for the record manager.248        """249        super().__init__(namespace)250        # Each key points to a dictionary251        # of {'group_id': group_id, 'updated_at': timestamp}252        self.records: dict[str, _Record] = {}253        self.namespace = namespace254255    def create_schema(self) -> None:256        """In-memory schema creation is simply ensuring the structure is initialized."""257258    async def acreate_schema(self) -> None:259        """In-memory schema creation is simply ensuring the structure is initialized."""260261    @override262    def get_time(self) -> float:263        return time.time()264265    @override266    async def aget_time(self) -> float:267        return self.get_time()268269    def update(270        self,271        keys: Sequence[str],272        *,273        group_ids: Sequence[str | None] | None = None,274        time_at_least: float | None = None,275    ) -> None:276        """Upsert records into the database.277278        Args:279            keys: A list of record keys to upsert.280            group_ids: A list of group IDs corresponding to the keys.281282            time_at_least: Optional timestamp. Implementation can use this283                to optionally verify that the timestamp IS at least this time284                in the system that stores.285                E.g., use to validate that the time in the postgres database286                is equal to or larger than the given timestamp, if not287                raise an error.288                This is meant to help prevent time-drift issues since289                time may not be monotonically increasing!290291        Raises:292            ValueError: If the length of keys doesn't match the length of group293                ids.294            ValueError: If time_at_least is in the future.295        """296        if group_ids and len(keys) != len(group_ids):297            msg = "Length of keys must match length of group_ids"298            raise ValueError(msg)299        for index, key in enumerate(keys):300            group_id = group_ids[index] if group_ids else None301            if time_at_least and time_at_least > self.get_time():302                msg = "time_at_least must be in the past"303                raise ValueError(msg)304            self.records[key] = {"group_id": group_id, "updated_at": self.get_time()}305306    async def aupdate(307        self,308        keys: Sequence[str],309        *,310        group_ids: Sequence[str | None] | None = None,311        time_at_least: float | None = None,312    ) -> None:313        """Async upsert records into the database.314315        Args:316            keys: A list of record keys to upsert.317            group_ids: A list of group IDs corresponding to the keys.318319            time_at_least: Optional timestamp. Implementation can use this320                to optionally verify that the timestamp IS at least this time321                in the system that stores.322                E.g., use to validate that the time in the postgres database323                is equal to or larger than the given timestamp, if not324                raise an error.325                This is meant to help prevent time-drift issues since326                time may not be monotonically increasing!327        """328        self.update(keys, group_ids=group_ids, time_at_least=time_at_least)329330    def exists(self, keys: Sequence[str]) -> list[bool]:331        """Check if the provided keys exist in the database.332333        Args:334            keys: A list of keys to check.335336        Returns:337            A list of boolean values indicating the existence of each key.338        """339        return [key in self.records for key in keys]340341    async def aexists(self, keys: Sequence[str]) -> list[bool]:342        """Async check if the provided keys exist in the database.343344        Args:345            keys: A list of keys to check.346347        Returns:348            A list of boolean values indicating the existence of each key.349        """350        return self.exists(keys)351352    def list_keys(353        self,354        *,355        before: float | None = None,356        after: float | None = None,357        group_ids: Sequence[str] | None = None,358        limit: int | None = None,359    ) -> list[str]:360        """List records in the database based on the provided filters.361362        Args:363            before: Filter to list records updated before this time.364365            after: Filter to list records updated after this time.366367            group_ids: Filter to list records with specific group IDs.368369            limit: optional limit on the number of records to return.370371372        Returns:373            A list of keys for the matching records.374        """375        result = []376        for key, data in self.records.items():377            if before and data["updated_at"] >= before:378                continue379            if after and data["updated_at"] <= after:380                continue381            if group_ids and data["group_id"] not in group_ids:382                continue383            result.append(key)384        if limit:385            return result[:limit]386        return result387388    async def alist_keys(389        self,390        *,391        before: float | None = None,392        after: float | None = None,393        group_ids: Sequence[str] | None = None,394        limit: int | None = None,395    ) -> list[str]:396        """Async list records in the database based on the provided filters.397398        Args:399            before: Filter to list records updated before this time.400401            after: Filter to list records updated after this time.402403            group_ids: Filter to list records with specific group IDs.404405            limit: optional limit on the number of records to return.406407408        Returns:409            A list of keys for the matching records.410        """411        return self.list_keys(412            before=before, after=after, group_ids=group_ids, limit=limit413        )414415    def delete_keys(self, keys: Sequence[str]) -> None:416        """Delete specified records from the database.417418        Args:419            keys: A list of keys to delete.420        """421        for key in keys:422            if key in self.records:423                del self.records[key]424425    async def adelete_keys(self, keys: Sequence[str]) -> None:426        """Async delete specified records from the database.427428        Args:429            keys: A list of keys to delete.430        """431        self.delete_keys(keys)432433434class UpsertResponse(TypedDict):435    """A generic response for upsert operations.436437    The upsert response will be used by abstractions that implement an upsert438    operation for content that can be upserted by ID.439440    Upsert APIs that accept inputs with IDs and generate IDs internally441    will return a response that includes the IDs that succeeded and the IDs442    that failed.443444    If there are no failures, the failed list will be empty, and the order445    of the IDs in the succeeded list will match the order of the input documents.446447    If there are failures, the response becomes ill defined, and a user of the API448    cannot determine which generated ID corresponds to which input document.449450    It is recommended for users explicitly attach the IDs to the items being451    indexed to avoid this issue.452    """453454    succeeded: list[str]455    """The IDs that were successfully indexed."""456    failed: list[str]457    """The IDs that failed to index."""458459460class DeleteResponse(TypedDict, total=False):461    """A generic response for delete operation.462463    The fields in this response are optional and whether the `VectorStore`464    returns them or not is up to the implementation.465    """466467    num_deleted: int468    """The number of items that were successfully deleted.469470    If returned, this should only include *actual* deletions.471472    If the ID did not exist to begin with,473    it should not be included in this count.474    """475476    succeeded: Sequence[str]477    """The IDs that were successfully deleted.478479    If returned, this should only include *actual* deletions.480481    If the ID did not exist to begin with,482    it should not be included in this list.483    """484485    failed: Sequence[str]486    """The IDs that failed to be deleted.487488    !!! warning489        Deleting an ID that does not exist is **NOT** considered a failure.490    """491492    num_failed: int493    """The number of items that failed to be deleted."""494495496@beta(message="Added in 0.2.29. The abstraction is subject to change.")497class DocumentIndex(BaseRetriever):498    """A document retriever that supports indexing operations.499500    This indexing interface is designed to be a generic abstraction for storing and501    querying documents that has an ID and metadata associated with it.502503    The interface is designed to be agnostic to the underlying implementation of the504    indexing system.505506    The interface is designed to support the following operations:507508    1. Storing document in the index.509    2. Fetching document by ID.510    3. Searching for document using a query.511    """512513    @abc.abstractmethod514    def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:515        """Upsert documents into the index.516517        The upsert functionality should utilize the ID field of the content object518        if it is provided. If the ID is not provided, the upsert method is free519        to generate an ID for the content.520521        When an ID is specified and the content already exists in the `VectorStore`,522        the upsert method should update the content with the new data. If the content523        does not exist, the upsert method should add the item to the `VectorStore`.524525        Args:526            items: Sequence of documents to add to the `VectorStore`.527            **kwargs: Additional keyword arguments.528529        Returns:530            A response object that contains the list of IDs that were531            successfully added or updated in the `VectorStore` and the list of IDs that532            failed to be added or updated.533        """534535    async def aupsert(536        self, items: Sequence[Document], /, **kwargs: Any537    ) -> UpsertResponse:538        """Add or update documents in the `VectorStore`. Async version of `upsert`.539540        The upsert functionality should utilize the ID field of the item541        if it is provided. If the ID is not provided, the upsert method is free542        to generate an ID for the item.543544        When an ID is specified and the item already exists in the `VectorStore`,545        the upsert method should update the item with the new data. If the item546        does not exist, the upsert method should add the item to the `VectorStore`.547548        Args:549            items: Sequence of documents to add to the `VectorStore`.550            **kwargs: Additional keyword arguments.551552        Returns:553            A response object that contains the list of IDs that were554            successfully added or updated in the `VectorStore` and the list of IDs that555            failed to be added or updated.556        """557        return await run_in_executor(558            None,559            self.upsert,560            items,561            **kwargs,562        )563564    @abc.abstractmethod565    def delete(self, ids: list[str] | None = None, **kwargs: Any) -> DeleteResponse:566        """Delete by IDs or other criteria.567568        Calling delete without any input parameters should raise a ValueError!569570        Args:571            ids: List of IDs to delete.572            **kwargs: Additional keyword arguments. This is up to the implementation.573                For example, can include an option to delete the entire index,574                or else issue a non-blocking delete etc.575576        Returns:577            A response object that contains the list of IDs that were578            successfully deleted and the list of IDs that failed to be deleted.579        """580581    async def adelete(582        self, ids: list[str] | None = None, **kwargs: Any583    ) -> DeleteResponse:584        """Delete by IDs or other criteria. Async variant.585586        Calling adelete without any input parameters should raise a ValueError!587588        Args:589            ids: List of IDs to delete.590            **kwargs: Additional keyword arguments. This is up to the implementation.591                For example, can include an option to delete the entire index.592593        Returns:594            A response object that contains the list of IDs that were595            successfully deleted and the list of IDs that failed to be deleted.596        """597        return await run_in_executor(598            None,599            self.delete,600            ids,601            **kwargs,602        )603604    @abc.abstractmethod605    def get(606        self,607        ids: Sequence[str],608        /,609        **kwargs: Any,610    ) -> list[Document]:611        """Get documents by id.612613        Fewer documents may be returned than requested if some IDs are not found or614        if there are duplicated IDs.615616        Users should not assume that the order of the returned documents matches617        the order of the input IDs. Instead, users should rely on the ID field of the618        returned documents.619620        This method should **NOT** raise exceptions if no documents are found for621        some IDs.622623        Args:624            ids: List of IDs to get.625            **kwargs: Additional keyword arguments. These are up to the implementation.626627        Returns:628            List of documents that were found.629        """630631    async def aget(632        self,633        ids: Sequence[str],634        /,635        **kwargs: Any,636    ) -> list[Document]:637        """Get documents by id.638639        Fewer documents may be returned than requested if some IDs are not found or640        if there are duplicated IDs.641642        Users should not assume that the order of the returned documents matches643        the order of the input IDs. Instead, users should rely on the ID field of the644        returned documents.645646        This method should **NOT** raise exceptions if no documents are found for647        some IDs.648649        Args:650            ids: List of IDs to get.651            **kwargs: Additional keyword arguments. These are up to the implementation.652653        Returns:654            List of documents that were found.655        """656        return await run_in_executor(657            None,658            self.get,659            ids,660            **kwargs,661        )