libs/core/langchain_core/indexing/api.py · langchain-ai/langchain

1"""Module contains logic for indexing documents into vector stores."""23from __future__ import annotations45import hashlib6import json7import uuid8import warnings9from itertools import islice10from typing import (11    TYPE_CHECKING,12    Any,13    Literal,14    TypedDict,15    TypeVar,16    cast,17)1819from langchain_core.document_loaders.base import BaseLoader20from langchain_core.documents import Document21from langchain_core.exceptions import LangChainException22from langchain_core.indexing.base import DocumentIndex, RecordManager23from langchain_core.vectorstores import VectorStore2425if TYPE_CHECKING:26    from collections.abc import (27        AsyncIterable,28        AsyncIterator,29        Callable,30        Iterable,31        Iterator,32        Sequence,33    )3435# Magic UUID to use as a namespace for hashing.36# Used to try and generate a unique UUID for each document37# from hashing the document content and metadata.38NAMESPACE_UUID = uuid.UUID(int=1984)394041T = TypeVar("T")424344def _hash_string_to_uuid(input_string: str) -> str:45    """Hashes a string and returns the corresponding UUID."""46    hash_value = hashlib.sha1(47        input_string.encode("utf-8"), usedforsecurity=False48    ).hexdigest()49    return str(uuid.uuid5(NAMESPACE_UUID, hash_value))505152_WARNED_ABOUT_SHA1: bool = False535455def _warn_about_sha1() -> None:56    """Emit a one-time warning about SHA-1 collision weaknesses."""57    # Global variable OK in this case58    global _WARNED_ABOUT_SHA1  # noqa: PLW060359    if not _WARNED_ABOUT_SHA1:60        warnings.warn(61            "Using SHA-1 for document hashing. SHA-1 is *not* "62            "collision-resistant; a motivated attacker can construct distinct inputs "63            "that map to the same fingerprint. If this matters in your "64            "threat model, switch to a stronger algorithm such "65            "as 'blake2b', 'sha256', or 'sha512' by specifying "66            " `key_encoder` parameter in the `index` or `aindex` function. ",67            category=UserWarning,68            stacklevel=2,69        )70        _WARNED_ABOUT_SHA1 = True717273def _hash_string(74    input_string: str, *, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]75) -> uuid.UUID:76    """Hash *input_string* to a deterministic UUID using the configured algorithm."""77    if algorithm == "sha1":78        _warn_about_sha1()79    hash_value = _calculate_hash(input_string, algorithm)80    return uuid.uuid5(NAMESPACE_UUID, hash_value)818283def _hash_nested_dict(84    data: dict[Any, Any], *, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]85) -> uuid.UUID:86    """Hash a nested dictionary to a UUID using the configured algorithm."""87    serialized_data = json.dumps(data, sort_keys=True)88    return _hash_string(serialized_data, algorithm=algorithm)899091def _batch(size: int, iterable: Iterable[T]) -> Iterator[list[T]]:92    """Utility batching function."""93    if size <= 0:94        msg = f"Batch size must be a positive integer, got {size}."95        raise ValueError(msg)96    it = iter(iterable)97    while True:98        chunk = list(islice(it, size))99        if not chunk:100            return101        yield chunk102103104async def _abatch(size: int, iterable: AsyncIterable[T]) -> AsyncIterator[list[T]]:105    """Utility batching function."""106    if size <= 0:107        msg = f"Batch size must be a positive integer, got {size}."108        raise ValueError(msg)109    batch: list[T] = []110    async for element in iterable:111        if len(batch) < size:112            batch.append(element)113114        if len(batch) >= size:115            yield batch116            batch = []117118    if batch:119        yield batch120121122def _get_source_id_assigner(123    source_id_key: str | Callable[[Document], str] | None,124) -> Callable[[Document], str | None]:125    """Get the source id from the document."""126    if source_id_key is None:127        return lambda _doc: None128    if isinstance(source_id_key, str):129        return lambda doc: doc.metadata[source_id_key]130    if callable(source_id_key):131        return source_id_key132    msg = (  # type: ignore[unreachable]133        f"source_id_key should be either None, a string or a callable. "134        f"Got {source_id_key} of type {type(source_id_key)}."135    )136    raise ValueError(msg)137138139def _deduplicate_in_order(140    hashed_documents: Iterable[Document],141) -> Iterator[Document]:142    """Deduplicate a list of hashed documents while preserving order."""143    seen: set[str] = set()144145    for hashed_doc in hashed_documents:146        if hashed_doc.id not in seen:147            # At this stage, the id is guaranteed to be a string.148            # Avoiding unnecessary run time checks.149            seen.add(cast("str", hashed_doc.id))150            yield hashed_doc151152153class IndexingException(LangChainException):154    """Raised when an indexing operation fails."""155156157def _calculate_hash(158    text: str, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]159) -> str:160    """Return a hexadecimal digest of *text* using *algorithm*."""161    if algorithm == "sha1":162        # Calculate the SHA-1 hash and return it as a UUID.163        digest = hashlib.sha1(text.encode("utf-8"), usedforsecurity=False).hexdigest()164        return str(uuid.uuid5(NAMESPACE_UUID, digest))165    if algorithm == "blake2b":166        return hashlib.blake2b(text.encode("utf-8")).hexdigest()167    if algorithm == "sha256":168        return hashlib.sha256(text.encode("utf-8")).hexdigest()169    if algorithm == "sha512":170        return hashlib.sha512(text.encode("utf-8")).hexdigest()171    msg = f"Unsupported hashing algorithm: {algorithm}"  # type: ignore[unreachable]172    raise ValueError(msg)173174175def _get_document_with_hash(176    document: Document,177    *,178    key_encoder: Callable[[Document], str]179    | Literal["sha1", "sha256", "sha512", "blake2b"],180) -> Document:181    """Calculate a hash of the document, and assign it to the uid.182183    When using one of the predefined hashing algorithms, the hash is calculated184    by hashing the content and the metadata of the document.185186    Args:187        document: Document to hash.188        key_encoder: Hashing algorithm to use for hashing the document.189            If not provided, a default encoder using SHA-1 will be used.190            SHA-1 is not collision-resistant, and a motivated attacker191            could craft two different texts that hash to the192            same cache key.193194            New applications should use one of the alternative encoders195            or provide a custom and strong key encoder function to avoid this risk.196197            When changing the key encoder, you must change the198            index as well to avoid duplicated documents in the cache.199200    Raises:201        ValueError: If the metadata cannot be serialized using json.202203    Returns:204        Document with a unique identifier based on the hash of the content and metadata.205    """206    metadata: dict[str, Any] = dict(document.metadata or {})207208    if callable(key_encoder):209        # If key_encoder is a callable, we use it to generate the hash.210        hash_ = key_encoder(document)211    else:212        # The hashes are calculated separate for the content and the metadata.213        content_hash = _calculate_hash(document.page_content, algorithm=key_encoder)214        try:215            serialized_meta = json.dumps(metadata, sort_keys=True)216        except Exception as e:217            msg = (218                f"Failed to hash metadata: {e}. "219                f"Please use a dict that can be serialized using json."220            )221            raise ValueError(msg) from e222        metadata_hash = _calculate_hash(serialized_meta, algorithm=key_encoder)223        hash_ = _calculate_hash(content_hash + metadata_hash, algorithm=key_encoder)224225    return Document(226        # Assign a unique identifier based on the hash.227        id=hash_,228        page_content=document.page_content,229        metadata=document.metadata,230    )231232233# This internal abstraction was imported by the langchain package internally, so234# we keep it here for backwards compatibility.235class _HashedDocument:236    def __init__(self, *args: Any, **kwargs: Any) -> None:237        """Raise an error if this class is instantiated."""238        msg = (239            "_HashedDocument is an internal abstraction that was deprecated in "240            " langchain-core 0.3.63. This abstraction is marked as private and "241            " should not have been used directly. If you are seeing this error, please "242            " update your code appropriately."243        )244        raise NotImplementedError(msg)245246247def _delete(248    vector_store: VectorStore | DocumentIndex,249    ids: list[str],250) -> None:251    """Delete documents from a vector store or document index by their IDs.252253    Args:254        vector_store: The vector store or document index to delete from.255        ids: List of document IDs to delete.256257    Raises:258        IndexingException: If the delete operation fails.259        TypeError: If the `vector_store` is neither a `VectorStore` nor a260            `DocumentIndex`.261    """262    if isinstance(vector_store, VectorStore):263        delete_ok = vector_store.delete(ids)264        if delete_ok is not None and delete_ok is False:265            msg = "The delete operation to VectorStore failed."266            raise IndexingException(msg)267    elif isinstance(vector_store, DocumentIndex):268        delete_response = vector_store.delete(ids)269        if "num_failed" in delete_response and delete_response["num_failed"] > 0:270            msg = "The delete operation to DocumentIndex failed."271            raise IndexingException(msg)272    else:273        msg = (  # type: ignore[unreachable]274            f"Vectorstore should be either a VectorStore or a DocumentIndex. "275            f"Got {type(vector_store)}."276        )277        raise TypeError(msg)278279280# PUBLIC API281282283class IndexingResult(TypedDict):284    """Return a detailed a breakdown of the result of the indexing operation."""285286    num_added: int287    """Number of added documents."""288    num_updated: int289    """Number of updated documents because they were not up to date."""290    num_deleted: int291    """Number of deleted documents."""292    num_skipped: int293    """Number of skipped documents because they were already up to date."""294295296def index(297    docs_source: BaseLoader | Iterable[Document],298    record_manager: RecordManager,299    vector_store: VectorStore | DocumentIndex,300    *,301    batch_size: int = 100,302    cleanup: Literal["incremental", "full", "scoped_full"] | None = None,303    source_id_key: str | Callable[[Document], str] | None = None,304    cleanup_batch_size: int = 1_000,305    force_update: bool = False,306    key_encoder: Literal["sha1", "sha256", "sha512", "blake2b"]307    | Callable[[Document], str] = "sha1",308    upsert_kwargs: dict[str, Any] | None = None,309) -> IndexingResult:310    """Index data from the loader into the vector store.311312    Indexing functionality uses a manager to keep track of which documents313    are in the vector store.314315    This allows us to keep track of which documents were updated, and which316    documents were deleted, which documents should be skipped.317318    For the time being, documents are indexed using their hashes, and users319    are not able to specify the uid of the document.320321    !!! warning "Behavior changed in `langchain-core` 0.3.25"322323        Added `scoped_full` cleanup mode.324325    !!! warning326327        * In full mode, the loader should be returning328            the entire dataset, and not just a subset of the dataset.329            Otherwise, the auto_cleanup will remove documents that it is not330            supposed to.331        * In incremental mode, if documents associated with a particular332            source id appear across different batches, the indexing API333            will do some redundant work. This will still result in the334            correct end state of the index, but will unfortunately not be335            100% efficient. For example, if a given document is split into 15336            chunks, and we index them using a batch size of 5, we'll have 3 batches337            all with the same source id. In general, to avoid doing too much338            redundant work select as big a batch size as possible.339        * The `scoped_full` mode is suitable if determining an appropriate batch size340            is challenging or if your data loader cannot return the entire dataset at341            once. This mode keeps track of source IDs in memory, which should be fine342            for most use cases. If your dataset is large (10M+ docs), you will likely343            need to parallelize the indexing process regardless.344345    Args:346        docs_source: Data loader or iterable of documents to index.347        record_manager: Timestamped set to keep track of which documents were348            updated.349        vector_store: `VectorStore` or DocumentIndex to index the documents into.350        batch_size: Batch size to use when indexing.351        cleanup: How to handle clean up of documents.352353            - incremental: Cleans up all documents that haven't been updated AND354                that are associated with source IDs that were seen during indexing.355                Clean up is done continuously during indexing helping to minimize the356                probability of users seeing duplicated content.357            - full: Delete all documents that have not been returned by the loader358                during this run of indexing.359                Clean up runs after all documents have been indexed.360                This means that users may see duplicated content during indexing.361            - scoped_full: Similar to Full, but only deletes all documents362                that haven't been updated AND that are associated with363                source IDs that were seen during indexing.364            - None: Do not delete any documents.365        source_id_key: Optional key that helps identify the original source366            of the document.367        cleanup_batch_size: Batch size to use when cleaning up documents.368        force_update: Force update documents even if they are present in the369            record manager. Useful if you are re-indexing with updated embeddings.370        key_encoder: Hashing algorithm to use for hashing the document content and371            metadata. Options include "blake2b", "sha256", and "sha512".372373            !!! version-added "Added in `langchain-core` 0.3.66"374375        key_encoder: Hashing algorithm to use for hashing the document.376            If not provided, a default encoder using SHA-1 will be used.377            SHA-1 is not collision-resistant, and a motivated attacker378            could craft two different texts that hash to the379            same cache key.380381            New applications should use one of the alternative encoders382            or provide a custom and strong key encoder function to avoid this risk.383384            When changing the key encoder, you must change the385            index as well to avoid duplicated documents in the cache.386        upsert_kwargs: Additional keyword arguments to pass to the add_documents387            method of the `VectorStore` or the upsert method of the DocumentIndex.388            For example, you can use this to specify a custom vector_field:389            upsert_kwargs={"vector_field": "embedding"}390            !!! version-added "Added in `langchain-core` 0.3.10"391392    Returns:393        Indexing result which contains information about how many documents394        were added, updated, deleted, or skipped.395396    Raises:397        ValueError: If cleanup mode is not one of 'incremental', 'full' or None398        ValueError: If cleanup mode is incremental and source_id_key is None.399        ValueError: If `VectorStore` does not have400            "delete" and "add_documents" required methods.401        ValueError: If source_id_key is not None, but is not a string or callable.402        TypeError: If `vectorstore` is not a `VectorStore` or a DocumentIndex.403        AssertionError: If `source_id` is None when cleanup mode is incremental.404            (should be unreachable code).405    """406    # Behavior is deprecated, but we keep it for backwards compatibility.407    # # Warn only once per process.408    if key_encoder == "sha1":409        _warn_about_sha1()410411    if cleanup not in {"incremental", "full", "scoped_full", None}:412        msg = (413            f"cleanup should be one of 'incremental', 'full', 'scoped_full' or None. "414            f"Got {cleanup}."415        )416        raise ValueError(msg)417418    if (cleanup in {"incremental", "scoped_full"}) and source_id_key is None:419        msg = (420            "Source id key is required when cleanup mode is incremental or scoped_full."421        )422        raise ValueError(msg)423424    destination = vector_store  # Renaming internally for clarity425426    # If it's a vectorstore, let's check if it has the required methods.427    if isinstance(destination, VectorStore):428        # Check that the Vectorstore has required methods implemented429        methods = ["delete", "add_documents"]430431        for method in methods:432            if not hasattr(destination, method):433                msg = (434                    f"Vectorstore {destination} does not have required method {method}"435                )436                raise ValueError(msg)437438        if type(destination).delete == VectorStore.delete:439            # Checking if the VectorStore has overridden the default delete method440            # implementation which just raises a NotImplementedError441            msg = "Vectorstore has not implemented the delete method"442            raise ValueError(msg)443    elif isinstance(destination, DocumentIndex):444        pass445    else:446        msg = (  # type: ignore[unreachable]447            f"Vectorstore should be either a VectorStore or a DocumentIndex. "448            f"Got {type(destination)}."449        )450        raise TypeError(msg)451452    if isinstance(docs_source, BaseLoader):453        try:454            doc_iterator = docs_source.lazy_load()455        except NotImplementedError:456            doc_iterator = iter(docs_source.load())457    else:458        doc_iterator = iter(docs_source)459460    source_id_assigner = _get_source_id_assigner(source_id_key)461462    # Mark when the update started.463    index_start_dt = record_manager.get_time()464    num_added = 0465    num_skipped = 0466    num_updated = 0467    num_deleted = 0468    scoped_full_cleanup_source_ids: set[str] = set()469470    for doc_batch in _batch(batch_size, doc_iterator):471        # Track original batch size before deduplication472        original_batch_size = len(doc_batch)473474        hashed_docs = list(475            _deduplicate_in_order(476                [477                    _get_document_with_hash(doc, key_encoder=key_encoder)478                    for doc in doc_batch479                ]480            )481        )482        # Count documents removed by within-batch deduplication483        num_skipped += original_batch_size - len(hashed_docs)484485        source_ids: Sequence[str | None] = [486            source_id_assigner(hashed_doc) for hashed_doc in hashed_docs487        ]488489        if cleanup in {"incremental", "scoped_full"}:490            # Source IDs are required.491            for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):492                if source_id is None:493                    msg = (494                        f"Source IDs are required when cleanup mode is "495                        f"incremental or scoped_full. "496                        f"Document that starts with "497                        f"content: {hashed_doc.page_content[:100]} "498                        f"was not assigned as source id."499                    )500                    raise ValueError(msg)501                if cleanup == "scoped_full":502                    scoped_full_cleanup_source_ids.add(source_id)503            # Source IDs cannot be None after for loop above.504            source_ids = cast("Sequence[str]", source_ids)505506        exists_batch = record_manager.exists(507            cast("Sequence[str]", [doc.id for doc in hashed_docs])508        )509510        # Filter out documents that already exist in the record store.511        uids = []512        docs_to_index = []513        uids_to_refresh = []514        seen_docs: set[str] = set()515        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch, strict=False):516            hashed_id = cast("str", hashed_doc.id)517            if doc_exists:518                if force_update:519                    seen_docs.add(hashed_id)520                else:521                    uids_to_refresh.append(hashed_id)522                    continue523            uids.append(hashed_id)524            docs_to_index.append(hashed_doc)525526        # Update refresh timestamp527        if uids_to_refresh:528            record_manager.update(uids_to_refresh, time_at_least=index_start_dt)529            num_skipped += len(uids_to_refresh)530531        # Be pessimistic and assume that all vector store write will fail.532        # First write to vector store533        if docs_to_index:534            if isinstance(destination, VectorStore):535                destination.add_documents(536                    docs_to_index,537                    ids=uids,538                    batch_size=batch_size,539                    **(upsert_kwargs or {}),540                )541            elif isinstance(destination, DocumentIndex):542                destination.upsert(543                    docs_to_index,544                    **(upsert_kwargs or {}),545                )546547            num_added += len(docs_to_index) - len(seen_docs)548            num_updated += len(seen_docs)549550        # And only then update the record store.551        # Update ALL records, even if they already exist since we want to refresh552        # their timestamp.553        record_manager.update(554            cast("Sequence[str]", [doc.id for doc in hashed_docs]),555            group_ids=source_ids,556            time_at_least=index_start_dt,557        )558559        # If source IDs are provided, we can do the deletion incrementally!560        if cleanup == "incremental":561            # Get the uids of the documents that were not returned by the loader.562            # mypy isn't good enough to determine that source IDs cannot be None563            # here due to a check that's happening above, so we check again.564            for source_id in source_ids:565                if source_id is None:566                    msg = (567                        "source_id cannot be None at this point. "568                        "Reached unreachable code."569                    )570                    raise AssertionError(msg)571572            source_ids_ = cast("Sequence[str]", source_ids)573574            while uids_to_delete := record_manager.list_keys(575                group_ids=source_ids_, before=index_start_dt, limit=cleanup_batch_size576            ):577                # Then delete from vector store.578                _delete(destination, uids_to_delete)579                # First delete from record store.580                record_manager.delete_keys(uids_to_delete)581                num_deleted += len(uids_to_delete)582583    if cleanup == "full" or (584        cleanup == "scoped_full" and scoped_full_cleanup_source_ids585    ):586        delete_group_ids: Sequence[str] | None = None587        if cleanup == "scoped_full":588            delete_group_ids = list(scoped_full_cleanup_source_ids)589        while uids_to_delete := record_manager.list_keys(590            group_ids=delete_group_ids, before=index_start_dt, limit=cleanup_batch_size591        ):592            # First delete from record store.593            _delete(destination, uids_to_delete)594            # Then delete from record manager.595            record_manager.delete_keys(uids_to_delete)596            num_deleted += len(uids_to_delete)597598    return {599        "num_added": num_added,600        "num_updated": num_updated,601        "num_skipped": num_skipped,602        "num_deleted": num_deleted,603    }604605606# Define an asynchronous generator function607async def _to_async_iterator(iterator: Iterable[T]) -> AsyncIterator[T]:608    """Convert an iterable to an async iterator."""609    for item in iterator:610        yield item611612613async def _adelete(614    vector_store: VectorStore | DocumentIndex,615    ids: list[str],616) -> None:617    if isinstance(vector_store, VectorStore):618        delete_ok = await vector_store.adelete(ids)619        if delete_ok is not None and delete_ok is False:620            msg = "The delete operation to VectorStore failed."621            raise IndexingException(msg)622    elif isinstance(vector_store, DocumentIndex):623        delete_response = await vector_store.adelete(ids)624        if "num_failed" in delete_response and delete_response["num_failed"] > 0:625            msg = "The delete operation to DocumentIndex failed."626            raise IndexingException(msg)627    else:628        msg = (  # type: ignore[unreachable]629            f"Vectorstore should be either a VectorStore or a DocumentIndex. "630            f"Got {type(vector_store)}."631        )632        raise TypeError(msg)633634635async def aindex(636    docs_source: BaseLoader | Iterable[Document] | AsyncIterator[Document],637    record_manager: RecordManager,638    vector_store: VectorStore | DocumentIndex,639    *,640    batch_size: int = 100,641    cleanup: Literal["incremental", "full", "scoped_full"] | None = None,642    source_id_key: str | Callable[[Document], str] | None = None,643    cleanup_batch_size: int = 1_000,644    force_update: bool = False,645    key_encoder: Literal["sha1", "sha256", "sha512", "blake2b"]646    | Callable[[Document], str] = "sha1",647    upsert_kwargs: dict[str, Any] | None = None,648) -> IndexingResult:649    """Async index data from the loader into the vector store.650651    Indexing functionality uses a manager to keep track of which documents652    are in the vector store.653654    This allows us to keep track of which documents were updated, and which655    documents were deleted, which documents should be skipped.656657    For the time being, documents are indexed using their hashes, and users658    are not able to specify the uid of the document.659660    !!! warning "Behavior changed in `langchain-core` 0.3.25"661662        Added `scoped_full` cleanup mode.663664    !!! warning665666        * In full mode, the loader should be returning667            the entire dataset, and not just a subset of the dataset.668            Otherwise, the auto_cleanup will remove documents that it is not669            supposed to.670        * In incremental mode, if documents associated with a particular671            source id appear across different batches, the indexing API672            will do some redundant work. This will still result in the673            correct end state of the index, but will unfortunately not be674            100% efficient. For example, if a given document is split into 15675            chunks, and we index them using a batch size of 5, we'll have 3 batches676            all with the same source id. In general, to avoid doing too much677            redundant work select as big a batch size as possible.678        * The `scoped_full` mode is suitable if determining an appropriate batch size679            is challenging or if your data loader cannot return the entire dataset at680            once. This mode keeps track of source IDs in memory, which should be fine681            for most use cases. If your dataset is large (10M+ docs), you will likely682            need to parallelize the indexing process regardless.683684    Args:685        docs_source: Data loader or iterable of documents to index.686        record_manager: Timestamped set to keep track of which documents were687            updated.688        vector_store: `VectorStore` or DocumentIndex to index the documents into.689        batch_size: Batch size to use when indexing.690        cleanup: How to handle clean up of documents.691692            - incremental: Cleans up all documents that haven't been updated AND693                that are associated with source IDs that were seen during indexing.694                Clean up is done continuously during indexing helping to minimize the695                probability of users seeing duplicated content.696            - full: Delete all documents that have not been returned by the loader697                during this run of indexing.698                Clean up runs after all documents have been indexed.699                This means that users may see duplicated content during indexing.700            - scoped_full: Similar to Full, but only deletes all documents701                that haven't been updated AND that are associated with702                source IDs that were seen during indexing.703            - None: Do not delete any documents.704        source_id_key: Optional key that helps identify the original source705            of the document.706        cleanup_batch_size: Batch size to use when cleaning up documents.707        force_update: Force update documents even if they are present in the708            record manager. Useful if you are re-indexing with updated embeddings.709        key_encoder: Hashing algorithm to use for hashing the document content and710            metadata. Options include "blake2b", "sha256", and "sha512".711712            !!! version-added "Added in `langchain-core` 0.3.66"713714        key_encoder: Hashing algorithm to use for hashing the document.715            If not provided, a default encoder using SHA-1 will be used.716            SHA-1 is not collision-resistant, and a motivated attacker717            could craft two different texts that hash to the718            same cache key.719720            New applications should use one of the alternative encoders721            or provide a custom and strong key encoder function to avoid this risk.722723            When changing the key encoder, you must change the724            index as well to avoid duplicated documents in the cache.725        upsert_kwargs: Additional keyword arguments to pass to the add_documents726            method of the `VectorStore` or the upsert method of the DocumentIndex.727            For example, you can use this to specify a custom vector_field:728            upsert_kwargs={"vector_field": "embedding"}729            !!! version-added "Added in `langchain-core` 0.3.10"730731    Returns:732        Indexing result which contains information about how many documents733        were added, updated, deleted, or skipped.734735    Raises:736        ValueError: If cleanup mode is not one of 'incremental', 'full' or None737        ValueError: If cleanup mode is incremental and source_id_key is None.738        ValueError: If `VectorStore` does not have739            "adelete" and "aadd_documents" required methods.740        ValueError: If source_id_key is not None, but is not a string or callable.741        TypeError: If `vector_store` is not a `VectorStore` or DocumentIndex.742        AssertionError: If `source_id_key` is None when cleanup mode is743            incremental or `scoped_full` (should be unreachable).744    """745    # Behavior is deprecated, but we keep it for backwards compatibility.746    # # Warn only once per process.747    if key_encoder == "sha1":748        _warn_about_sha1()749750    if cleanup not in {"incremental", "full", "scoped_full", None}:751        msg = (752            f"cleanup should be one of 'incremental', 'full', 'scoped_full' or None. "753            f"Got {cleanup}."754        )755        raise ValueError(msg)756757    if (cleanup in {"incremental", "scoped_full"}) and source_id_key is None:758        msg = (759            "Source id key is required when cleanup mode is incremental or scoped_full."760        )761        raise ValueError(msg)762763    destination = vector_store  # Renaming internally for clarity764765    # If it's a vectorstore, let's check if it has the required methods.766    if isinstance(destination, VectorStore):767        # Check that the Vectorstore has required methods implemented768        # Check that the Vectorstore has required methods implemented769        methods = ["adelete", "aadd_documents"]770771        for method in methods:772            if not hasattr(destination, method):773                msg = (774                    f"Vectorstore {destination} does not have required method {method}"775                )776                raise ValueError(msg)777778        if (779            type(destination).adelete == VectorStore.adelete780            and type(destination).delete == VectorStore.delete781        ):782            # Checking if the VectorStore has overridden the default adelete or delete783            # methods implementation which just raises a NotImplementedError784            msg = "Vectorstore has not implemented the adelete or delete method"785            raise ValueError(msg)786    elif isinstance(destination, DocumentIndex):787        pass788    else:789        msg = (  # type: ignore[unreachable]790            f"Vectorstore should be either a VectorStore or a DocumentIndex. "791            f"Got {type(destination)}."792        )793        raise TypeError(msg)794    async_doc_iterator: AsyncIterator[Document]795    if isinstance(docs_source, BaseLoader):796        try:797            async_doc_iterator = docs_source.alazy_load()798        except NotImplementedError:799            # Exception triggered when neither lazy_load nor alazy_load are implemented.800            # * The default implementation of alazy_load uses lazy_load.801            # * The default implementation of lazy_load raises NotImplementedError.802            # In such a case, we use the load method and convert it to an async803            # iterator.804            async_doc_iterator = _to_async_iterator(docs_source.load())805    elif hasattr(docs_source, "__aiter__"):806        async_doc_iterator = docs_source  # type: ignore[assignment]807    else:808        async_doc_iterator = _to_async_iterator(docs_source)809810    source_id_assigner = _get_source_id_assigner(source_id_key)811812    # Mark when the update started.813    index_start_dt = await record_manager.aget_time()814    num_added = 0815    num_skipped = 0816    num_updated = 0817    num_deleted = 0818    scoped_full_cleanup_source_ids: set[str] = set()819820    async for doc_batch in _abatch(batch_size, async_doc_iterator):821        # Track original batch size before deduplication822        original_batch_size = len(doc_batch)823824        hashed_docs = list(825            _deduplicate_in_order(826                [827                    _get_document_with_hash(doc, key_encoder=key_encoder)828                    for doc in doc_batch829                ]830            )831        )832        # Count documents removed by within-batch deduplication833        num_skipped += original_batch_size - len(hashed_docs)834835        source_ids: Sequence[str | None] = [836            source_id_assigner(doc) for doc in hashed_docs837        ]838839        if cleanup in {"incremental", "scoped_full"}:840            # If the cleanup mode is incremental, source IDs are required.841            for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):842                if source_id is None:843                    msg = (844                        f"Source IDs are required when cleanup mode is "845                        f"incremental or scoped_full. "846                        f"Document that starts with "847                        f"content: {hashed_doc.page_content[:100]} "848                        f"was not assigned as source id."849                    )850                    raise ValueError(msg)851                if cleanup == "scoped_full":852                    scoped_full_cleanup_source_ids.add(source_id)853            # Source IDs cannot be None after for loop above.854            source_ids = cast("Sequence[str]", source_ids)855856        exists_batch = await record_manager.aexists(857            cast("Sequence[str]", [doc.id for doc in hashed_docs])858        )859860        # Filter out documents that already exist in the record store.861        uids: list[str] = []862        docs_to_index: list[Document] = []863        uids_to_refresh = []864        seen_docs: set[str] = set()865        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch, strict=False):866            hashed_id = cast("str", hashed_doc.id)867            if doc_exists:868                if force_update:869                    seen_docs.add(hashed_id)870                else:871                    uids_to_refresh.append(hashed_id)872                    continue873            uids.append(hashed_id)874            docs_to_index.append(hashed_doc)875876        if uids_to_refresh:877            # Must be updated to refresh timestamp.878            await record_manager.aupdate(uids_to_refresh, time_at_least=index_start_dt)879            num_skipped += len(uids_to_refresh)880881        # Be pessimistic and assume that all vector store write will fail.882        # First write to vector store883        if docs_to_index:884            if isinstance(destination, VectorStore):885                await destination.aadd_documents(886                    docs_to_index,887                    ids=uids,888                    batch_size=batch_size,889                    **(upsert_kwargs or {}),890                )891            elif isinstance(destination, DocumentIndex):892                await destination.aupsert(893                    docs_to_index,894                    **(upsert_kwargs or {}),895                )896            num_added += len(docs_to_index) - len(seen_docs)897            num_updated += len(seen_docs)898899        # And only then update the record store.900        # Update ALL records, even if they already exist since we want to refresh901        # their timestamp.902        await record_manager.aupdate(903            cast("Sequence[str]", [doc.id for doc in hashed_docs]),904            group_ids=source_ids,905            time_at_least=index_start_dt,906        )907908        # If source IDs are provided, we can do the deletion incrementally!909910        if cleanup == "incremental":911            # Get the uids of the documents that were not returned by the loader.912913            # mypy isn't good enough to determine that source IDs cannot be None914            # here due to a check that's happening above, so we check again.915            for source_id in source_ids:916                if source_id is None:917                    msg = (918                        "source_id cannot be None at this point. "919                        "Reached unreachable code."920                    )921                    raise AssertionError(msg)922923            source_ids_ = cast("Sequence[str]", source_ids)924925            while uids_to_delete := await record_manager.alist_keys(926                group_ids=source_ids_, before=index_start_dt, limit=cleanup_batch_size927            ):928                # Then delete from vector store.929                await _adelete(destination, uids_to_delete)930                # First delete from record store.931                await record_manager.adelete_keys(uids_to_delete)932                num_deleted += len(uids_to_delete)933934    if cleanup == "full" or (935        cleanup == "scoped_full" and scoped_full_cleanup_source_ids936    ):937        delete_group_ids: Sequence[str] | None = None938        if cleanup == "scoped_full":939            delete_group_ids = list(scoped_full_cleanup_source_ids)940        while uids_to_delete := await record_manager.alist_keys(941            group_ids=delete_group_ids, before=index_start_dt, limit=cleanup_batch_size942        ):943            # First delete from record store.944            await _adelete(destination, uids_to_delete)945            # Then delete from record manager.946            await record_manager.adelete_keys(uids_to_delete)947            num_deleted += len(uids_to_delete)948949    return {950        "num_added": num_added,951        "num_updated": num_updated,952        "num_skipped": num_skipped,953        "num_deleted": num_deleted,954    }
Code quality findings 26

Avoid global variables; use function parameters or class attributes for better scope management
L58
global-variable
global _WARNED_ABOUT_SHA1 # noqa: PLW0603
Avoid unnecessary list conversions; use generators where possible
L98
unnecessary-list
chunk = list(islice(it, size))
Overuse may indicate design issues; consider polymorphism
L128
isinstance-overuse
if isinstance(source_id_key, str):
Overuse may indicate design issues; consider polymorphism
L262
isinstance-overuse
if isinstance(vector_store, VectorStore):
Overuse may indicate design issues; consider polymorphism
L267
isinstance-overuse
elif isinstance(vector_store, DocumentIndex):
Ensure functions have docstrings for documentation
L296
missing-docstring
def index(
Overuse may indicate design issues; consider polymorphism
L427
isinstance-overuse
if isinstance(destination, VectorStore):
Use isinstance() for type checking instead of type()
L438
type-check
if type(destination).delete == VectorStore.delete:
Overuse may indicate design issues; consider polymorphism
L443
isinstance-overuse
elif isinstance(destination, DocumentIndex):
Overuse may indicate design issues; consider polymorphism
L452
isinstance-overuse
if isinstance(docs_source, BaseLoader):
Avoid unnecessary list conversions; use generators where possible
L474
unnecessary-list
hashed_docs = list(
Overuse may indicate design issues; consider polymorphism
L534
isinstance-overuse
if isinstance(destination, VectorStore):
Overuse may indicate design issues; consider polymorphism
L541
isinstance-overuse
elif isinstance(destination, DocumentIndex):
Avoid unnecessary list conversions; use generators where possible
L588
unnecessary-list
delete_group_ids = list(scoped_full_cleanup_source_ids)
Overuse may indicate design issues; consider polymorphism
L617
isinstance-overuse
if isinstance(vector_store, VectorStore):
Overuse may indicate design issues; consider polymorphism
L622
isinstance-overuse
elif isinstance(vector_store, DocumentIndex):
Ensure functions have docstrings for documentation
L635
missing-docstring
async def aindex(
Overuse may indicate design issues; consider polymorphism
L766
isinstance-overuse
if isinstance(destination, VectorStore):
Use isinstance() for type checking instead of type()
L779
type-check
type(destination).adelete == VectorStore.adelete
Use isinstance() for type checking instead of type()
L780
type-check
and type(destination).delete == VectorStore.delete
Overuse may indicate design issues; consider polymorphism
L786
isinstance-overuse
elif isinstance(destination, DocumentIndex):
Overuse may indicate design issues; consider polymorphism
L795
isinstance-overuse
if isinstance(docs_source, BaseLoader):
Avoid unnecessary list conversions; use generators where possible
L824
unnecessary-list
hashed_docs = list(
Overuse may indicate design issues; consider polymorphism
L884
isinstance-overuse
if isinstance(destination, VectorStore):
Overuse may indicate design issues; consider polymorphism
L891
isinstance-overuse
elif isinstance(destination, DocumentIndex):
Avoid unnecessary list conversions; use generators where possible
L939
unnecessary-list
delete_group_ids = list(scoped_full_cleanup_source_ids)
Code quality findings 26

Get this view in your editor