# Review finding: avoid global variables; use function parameters or class attributes for better scope management.
# Flagged statement: `global _WARNED_ABOUT_SHA1  # noqa: PLW0603`
1"""Module contains logic for indexing documents into vector stores."""23from __future__ import annotations45import hashlib6import json7import uuid8import warnings9from itertools import islice10from typing import (11 TYPE_CHECKING,12 Any,13 Literal,14 TypedDict,15 TypeVar,16 cast,17)1819from langchain_core.document_loaders.base import BaseLoader20from langchain_core.documents import Document21from langchain_core.exceptions import LangChainException22from langchain_core.indexing.base import DocumentIndex, RecordManager23from langchain_core.vectorstores import VectorStore2425if TYPE_CHECKING:26 from collections.abc import (27 AsyncIterable,28 AsyncIterator,29 Callable,30 Iterable,31 Iterator,32 Sequence,33 )3435# Magic UUID to use as a namespace for hashing.36# Used to try and generate a unique UUID for each document37# from hashing the document content and metadata.38NAMESPACE_UUID = uuid.UUID(int=1984)394041T = TypeVar("T")424344def _hash_string_to_uuid(input_string: str) -> str:45 """Hashes a string and returns the corresponding UUID."""46 hash_value = hashlib.sha1(47 input_string.encode("utf-8"), usedforsecurity=False48 ).hexdigest()49 return str(uuid.uuid5(NAMESPACE_UUID, hash_value))505152_WARNED_ABOUT_SHA1: bool = False535455def _warn_about_sha1() -> None:56 """Emit a one-time warning about SHA-1 collision weaknesses."""57 # Global variable OK in this case58 global _WARNED_ABOUT_SHA1 # noqa: PLW060359 if not _WARNED_ABOUT_SHA1:60 warnings.warn(61 "Using SHA-1 for document hashing. SHA-1 is *not* "62 "collision-resistant; a motivated attacker can construct distinct inputs "63 "that map to the same fingerprint. If this matters in your "64 "threat model, switch to a stronger algorithm such "65 "as 'blake2b', 'sha256', or 'sha512' by specifying "66 " `key_encoder` parameter in the `index` or `aindex` function. 
",67 category=UserWarning,68 stacklevel=2,69 )70 _WARNED_ABOUT_SHA1 = True717273def _hash_string(74 input_string: str, *, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]75) -> uuid.UUID:76 """Hash *input_string* to a deterministic UUID using the configured algorithm."""77 if algorithm == "sha1":78 _warn_about_sha1()79 hash_value = _calculate_hash(input_string, algorithm)80 return uuid.uuid5(NAMESPACE_UUID, hash_value)818283def _hash_nested_dict(84 data: dict[Any, Any], *, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]85) -> uuid.UUID:86 """Hash a nested dictionary to a UUID using the configured algorithm."""87 serialized_data = json.dumps(data, sort_keys=True)88 return _hash_string(serialized_data, algorithm=algorithm)899091def _batch(size: int, iterable: Iterable[T]) -> Iterator[list[T]]:92 """Utility batching function."""93 if size <= 0:94 msg = f"Batch size must be a positive integer, got {size}."95 raise ValueError(msg)96 it = iter(iterable)97 while True:98 chunk = list(islice(it, size))99 if not chunk:100 return101 yield chunk102103104async def _abatch(size: int, iterable: AsyncIterable[T]) -> AsyncIterator[list[T]]:105 """Utility batching function."""106 if size <= 0:107 msg = f"Batch size must be a positive integer, got {size}."108 raise ValueError(msg)109 batch: list[T] = []110 async for element in iterable:111 if len(batch) < size:112 batch.append(element)113114 if len(batch) >= size:115 yield batch116 batch = []117118 if batch:119 yield batch120121122def _get_source_id_assigner(123 source_id_key: str | Callable[[Document], str] | None,124) -> Callable[[Document], str | None]:125 """Get the source id from the document."""126 if source_id_key is None:127 return lambda _doc: None128 if isinstance(source_id_key, str):129 return lambda doc: doc.metadata[source_id_key]130 if callable(source_id_key):131 return source_id_key132 msg = (133 f"source_id_key should be either None, a string or a callable. 
"134 f"Got {source_id_key} of type {type(source_id_key)}."135 )136 raise ValueError(msg)137138139def _deduplicate_in_order(140 hashed_documents: Iterable[Document],141) -> Iterator[Document]:142 """Deduplicate a list of hashed documents while preserving order."""143 seen: set[str] = set()144145 for hashed_doc in hashed_documents:146 if hashed_doc.id not in seen:147 # At this stage, the id is guaranteed to be a string.148 # Avoiding unnecessary run time checks.149 seen.add(cast("str", hashed_doc.id))150 yield hashed_doc151152153class IndexingException(LangChainException):154 """Raised when an indexing operation fails."""155156157def _calculate_hash(158 text: str, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]159) -> str:160 """Return a hexadecimal digest of *text* using *algorithm*."""161 if algorithm == "sha1":162 # Calculate the SHA-1 hash and return it as a UUID.163 digest = hashlib.sha1(text.encode("utf-8"), usedforsecurity=False).hexdigest()164 return str(uuid.uuid5(NAMESPACE_UUID, digest))165 if algorithm == "blake2b":166 return hashlib.blake2b(text.encode("utf-8")).hexdigest()167 if algorithm == "sha256":168 return hashlib.sha256(text.encode("utf-8")).hexdigest()169 if algorithm == "sha512":170 return hashlib.sha512(text.encode("utf-8")).hexdigest()171 msg = f"Unsupported hashing algorithm: {algorithm}"172 raise ValueError(msg)173174175def _get_document_with_hash(176 document: Document,177 *,178 key_encoder: Callable[[Document], str]179 | Literal["sha1", "sha256", "sha512", "blake2b"],180) -> Document:181 """Calculate a hash of the document, and assign it to the uid.182183 When using one of the predefined hashing algorithms, the hash is calculated184 by hashing the content and the metadata of the document.185186 Args:187 document: Document to hash.188 key_encoder: Hashing algorithm to use for hashing the document.189 If not provided, a default encoder using SHA-1 will be used.190 SHA-1 is not collision-resistant, and a motivated attacker191 could 
craft two different texts that hash to the192 same cache key.193194 New applications should use one of the alternative encoders195 or provide a custom and strong key encoder function to avoid this risk.196197 When changing the key encoder, you must change the198 index as well to avoid duplicated documents in the cache.199200 Raises:201 ValueError: If the metadata cannot be serialized using json.202203 Returns:204 Document with a unique identifier based on the hash of the content and metadata.205 """206 metadata: dict[str, Any] = dict(document.metadata or {})207208 if callable(key_encoder):209 # If key_encoder is a callable, we use it to generate the hash.210 hash_ = key_encoder(document)211 else:212 # The hashes are calculated separate for the content and the metadata.213 content_hash = _calculate_hash(document.page_content, algorithm=key_encoder)214 try:215 serialized_meta = json.dumps(metadata, sort_keys=True)216 except Exception as e:217 msg = (218 f"Failed to hash metadata: {e}. "219 f"Please use a dict that can be serialized using json."220 )221 raise ValueError(msg) from e222 metadata_hash = _calculate_hash(serialized_meta, algorithm=key_encoder)223 hash_ = _calculate_hash(content_hash + metadata_hash, algorithm=key_encoder)224225 return Document(226 # Assign a unique identifier based on the hash.227 id=hash_,228 page_content=document.page_content,229 metadata=document.metadata,230 )231232233# This internal abstraction was imported by the langchain package internally, so234# we keep it here for backwards compatibility.235class _HashedDocument:236 def __init__(self, *args: Any, **kwargs: Any) -> None:237 """Raise an error if this class is instantiated."""238 msg = (239 "_HashedDocument is an internal abstraction that was deprecated in "240 " langchain-core 0.3.63. This abstraction is marked as private and "241 " should not have been used directly. 
If you are seeing this error, please "242 " update your code appropriately."243 )244 raise NotImplementedError(msg)245246247def _delete(248 vector_store: VectorStore | DocumentIndex,249 ids: list[str],250) -> None:251 """Delete documents from a vector store or document index by their IDs.252253 Args:254 vector_store: The vector store or document index to delete from.255 ids: List of document IDs to delete.256257 Raises:258 IndexingException: If the delete operation fails.259 TypeError: If the `vector_store` is neither a `VectorStore` nor a260 `DocumentIndex`.261 """262 if isinstance(vector_store, VectorStore):263 delete_ok = vector_store.delete(ids)264 if delete_ok is not None and delete_ok is False:265 msg = "The delete operation to VectorStore failed."266 raise IndexingException(msg)267 elif isinstance(vector_store, DocumentIndex):268 delete_response = vector_store.delete(ids)269 if "num_failed" in delete_response and delete_response["num_failed"] > 0:270 msg = "The delete operation to DocumentIndex failed."271 raise IndexingException(msg)272 else:273 msg = (274 f"Vectorstore should be either a VectorStore or a DocumentIndex. 
"275 f"Got {type(vector_store)}."276 )277 raise TypeError(msg)278279280# PUBLIC API281282283class IndexingResult(TypedDict):284 """Return a detailed a breakdown of the result of the indexing operation."""285286 num_added: int287 """Number of added documents."""288 num_updated: int289 """Number of updated documents because they were not up to date."""290 num_deleted: int291 """Number of deleted documents."""292 num_skipped: int293 """Number of skipped documents because they were already up to date."""294295296def index(297 docs_source: BaseLoader | Iterable[Document],298 record_manager: RecordManager,299 vector_store: VectorStore | DocumentIndex,300 *,301 batch_size: int = 100,302 cleanup: Literal["incremental", "full", "scoped_full"] | None = None,303 source_id_key: str | Callable[[Document], str] | None = None,304 cleanup_batch_size: int = 1_000,305 force_update: bool = False,306 key_encoder: Literal["sha1", "sha256", "sha512", "blake2b"]307 | Callable[[Document], str] = "sha1",308 upsert_kwargs: dict[str, Any] | None = None,309) -> IndexingResult:310 """Index data from the loader into the vector store.311312 Indexing functionality uses a manager to keep track of which documents313 are in the vector store.314315 This allows us to keep track of which documents were updated, and which316 documents were deleted, which documents should be skipped.317318 For the time being, documents are indexed using their hashes, and users319 are not able to specify the uid of the document.320321 !!! warning "Behavior changed in `langchain-core` 0.3.25"322323 Added `scoped_full` cleanup mode.324325 !!! warning326327 * In full mode, the loader should be returning328 the entire dataset, and not just a subset of the dataset.329 Otherwise, the auto_cleanup will remove documents that it is not330 supposed to.331 * In incremental mode, if documents associated with a particular332 source id appear across different batches, the indexing API333 will do some redundant work. 
This will still result in the334 correct end state of the index, but will unfortunately not be335 100% efficient. For example, if a given document is split into 15336 chunks, and we index them using a batch size of 5, we'll have 3 batches337 all with the same source id. In general, to avoid doing too much338 redundant work select as big a batch size as possible.339 * The `scoped_full` mode is suitable if determining an appropriate batch size340 is challenging or if your data loader cannot return the entire dataset at341 once. This mode keeps track of source IDs in memory, which should be fine342 for most use cases. If your dataset is large (10M+ docs), you will likely343 need to parallelize the indexing process regardless.344345 Args:346 docs_source: Data loader or iterable of documents to index.347 record_manager: Timestamped set to keep track of which documents were348 updated.349 vector_store: `VectorStore` or DocumentIndex to index the documents into.350 batch_size: Batch size to use when indexing.351 cleanup: How to handle clean up of documents.352353 - incremental: Cleans up all documents that haven't been updated AND354 that are associated with source IDs that were seen during indexing.355 Clean up is done continuously during indexing helping to minimize the356 probability of users seeing duplicated content.357 - full: Delete all documents that have not been returned by the loader358 during this run of indexing.359 Clean up runs after all documents have been indexed.360 This means that users may see duplicated content during indexing.361 - scoped_full: Similar to Full, but only deletes all documents362 that haven't been updated AND that are associated with363 source IDs that were seen during indexing.364 - None: Do not delete any documents.365 source_id_key: Optional key that helps identify the original source366 of the document.367 cleanup_batch_size: Batch size to use when cleaning up documents.368 force_update: Force update documents even if they are 
present in the369 record manager. Useful if you are re-indexing with updated embeddings.370 key_encoder: Hashing algorithm to use for hashing the document content and371 metadata. Options include "blake2b", "sha256", and "sha512".372373 !!! version-added "Added in `langchain-core` 0.3.66"374375 key_encoder: Hashing algorithm to use for hashing the document.376 If not provided, a default encoder using SHA-1 will be used.377 SHA-1 is not collision-resistant, and a motivated attacker378 could craft two different texts that hash to the379 same cache key.380381 New applications should use one of the alternative encoders382 or provide a custom and strong key encoder function to avoid this risk.383384 When changing the key encoder, you must change the385 index as well to avoid duplicated documents in the cache.386 upsert_kwargs: Additional keyword arguments to pass to the add_documents387 method of the `VectorStore` or the upsert method of the DocumentIndex.388 For example, you can use this to specify a custom vector_field:389 upsert_kwargs={"vector_field": "embedding"}390 !!! 
version-added "Added in `langchain-core` 0.3.10"391392 Returns:393 Indexing result which contains information about how many documents394 were added, updated, deleted, or skipped.395396 Raises:397 ValueError: If cleanup mode is not one of 'incremental', 'full' or None398 ValueError: If cleanup mode is incremental and source_id_key is None.399 ValueError: If `VectorStore` does not have400 "delete" and "add_documents" required methods.401 ValueError: If source_id_key is not None, but is not a string or callable.402 TypeError: If `vectorstore` is not a `VectorStore` or a DocumentIndex.403 AssertionError: If `source_id` is None when cleanup mode is incremental.404 (should be unreachable code).405 """406 # Behavior is deprecated, but we keep it for backwards compatibility.407 # # Warn only once per process.408 if key_encoder == "sha1":409 _warn_about_sha1()410411 if cleanup not in {"incremental", "full", "scoped_full", None}:412 msg = (413 f"cleanup should be one of 'incremental', 'full', 'scoped_full' or None. 
"414 f"Got {cleanup}."415 )416 raise ValueError(msg)417418 if (cleanup in {"incremental", "scoped_full"}) and source_id_key is None:419 msg = (420 "Source id key is required when cleanup mode is incremental or scoped_full."421 )422 raise ValueError(msg)423424 destination = vector_store # Renaming internally for clarity425426 # If it's a vectorstore, let's check if it has the required methods.427 if isinstance(destination, VectorStore):428 # Check that the Vectorstore has required methods implemented429 methods = ["delete", "add_documents"]430431 for method in methods:432 if not hasattr(destination, method):433 msg = (434 f"Vectorstore {destination} does not have required method {method}"435 )436 raise ValueError(msg)437438 if type(destination).delete == VectorStore.delete:439 # Checking if the VectorStore has overridden the default delete method440 # implementation which just raises a NotImplementedError441 msg = "Vectorstore has not implemented the delete method"442 raise ValueError(msg)443 elif isinstance(destination, DocumentIndex):444 pass445 else:446 msg = (447 f"Vectorstore should be either a VectorStore or a DocumentIndex. 
"448 f"Got {type(destination)}."449 )450 raise TypeError(msg)451452 if isinstance(docs_source, BaseLoader):453 try:454 doc_iterator = docs_source.lazy_load()455 except NotImplementedError:456 doc_iterator = iter(docs_source.load())457 else:458 doc_iterator = iter(docs_source)459460 source_id_assigner = _get_source_id_assigner(source_id_key)461462 # Mark when the update started.463 index_start_dt = record_manager.get_time()464 num_added = 0465 num_skipped = 0466 num_updated = 0467 num_deleted = 0468 scoped_full_cleanup_source_ids: set[str] = set()469470 for doc_batch in _batch(batch_size, doc_iterator):471 # Track original batch size before deduplication472 original_batch_size = len(doc_batch)473474 hashed_docs = list(475 _deduplicate_in_order(476 [477 _get_document_with_hash(doc, key_encoder=key_encoder)478 for doc in doc_batch479 ]480 )481 )482 # Count documents removed by within-batch deduplication483 num_skipped += original_batch_size - len(hashed_docs)484485 source_ids: Sequence[str | None] = [486 source_id_assigner(hashed_doc) for hashed_doc in hashed_docs487 ]488489 if cleanup in {"incremental", "scoped_full"}:490 # Source IDs are required.491 for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):492 if source_id is None:493 msg = (494 f"Source IDs are required when cleanup mode is "495 f"incremental or scoped_full. 
"496 f"Document that starts with "497 f"content: {hashed_doc.page_content[:100]} "498 f"was not assigned as source id."499 )500 raise ValueError(msg)501 if cleanup == "scoped_full":502 scoped_full_cleanup_source_ids.add(source_id)503 # Source IDs cannot be None after for loop above.504 source_ids = cast("Sequence[str]", source_ids)505506 exists_batch = record_manager.exists(507 cast("Sequence[str]", [doc.id for doc in hashed_docs])508 )509510 # Filter out documents that already exist in the record store.511 uids = []512 docs_to_index = []513 uids_to_refresh = []514 seen_docs: set[str] = set()515 for hashed_doc, doc_exists in zip(hashed_docs, exists_batch, strict=False):516 hashed_id = cast("str", hashed_doc.id)517 if doc_exists:518 if force_update:519 seen_docs.add(hashed_id)520 else:521 uids_to_refresh.append(hashed_id)522 continue523 uids.append(hashed_id)524 docs_to_index.append(hashed_doc)525526 # Update refresh timestamp527 if uids_to_refresh:528 record_manager.update(uids_to_refresh, time_at_least=index_start_dt)529 num_skipped += len(uids_to_refresh)530531 # Be pessimistic and assume that all vector store write will fail.532 # First write to vector store533 if docs_to_index:534 if isinstance(destination, VectorStore):535 destination.add_documents(536 docs_to_index,537 ids=uids,538 batch_size=batch_size,539 **(upsert_kwargs or {}),540 )541 elif isinstance(destination, DocumentIndex):542 destination.upsert(543 docs_to_index,544 **(upsert_kwargs or {}),545 )546547 num_added += len(docs_to_index) - len(seen_docs)548 num_updated += len(seen_docs)549550 # And only then update the record store.551 # Update ALL records, even if they already exist since we want to refresh552 # their timestamp.553 record_manager.update(554 cast("Sequence[str]", [doc.id for doc in hashed_docs]),555 group_ids=source_ids,556 time_at_least=index_start_dt,557 )558559 # If source IDs are provided, we can do the deletion incrementally!560 if cleanup == "incremental":561 # Get the uids of the 
documents that were not returned by the loader.562 # mypy isn't good enough to determine that source IDs cannot be None563 # here due to a check that's happening above, so we check again.564 for source_id in source_ids:565 if source_id is None:566 msg = (567 "source_id cannot be None at this point. "568 "Reached unreachable code."569 )570 raise AssertionError(msg)571572 source_ids_ = cast("Sequence[str]", source_ids)573574 while uids_to_delete := record_manager.list_keys(575 group_ids=source_ids_, before=index_start_dt, limit=cleanup_batch_size576 ):577 # Then delete from vector store.578 _delete(destination, uids_to_delete)579 # First delete from record store.580 record_manager.delete_keys(uids_to_delete)581 num_deleted += len(uids_to_delete)582583 if cleanup == "full" or (584 cleanup == "scoped_full" and scoped_full_cleanup_source_ids585 ):586 delete_group_ids: Sequence[str] | None = None587 if cleanup == "scoped_full":588 delete_group_ids = list(scoped_full_cleanup_source_ids)589 while uids_to_delete := record_manager.list_keys(590 group_ids=delete_group_ids, before=index_start_dt, limit=cleanup_batch_size591 ):592 # First delete from record store.593 _delete(destination, uids_to_delete)594 # Then delete from record manager.595 record_manager.delete_keys(uids_to_delete)596 num_deleted += len(uids_to_delete)597598 return {599 "num_added": num_added,600 "num_updated": num_updated,601 "num_skipped": num_skipped,602 "num_deleted": num_deleted,603 }604605606# Define an asynchronous generator function607async def _to_async_iterator(iterator: Iterable[T]) -> AsyncIterator[T]:608 """Convert an iterable to an async iterator."""609 for item in iterator:610 yield item611612613async def _adelete(614 vector_store: VectorStore | DocumentIndex,615 ids: list[str],616) -> None:617 if isinstance(vector_store, VectorStore):618 delete_ok = await vector_store.adelete(ids)619 if delete_ok is not None and delete_ok is False:620 msg = "The delete operation to VectorStore failed."621 
raise IndexingException(msg)622 elif isinstance(vector_store, DocumentIndex):623 delete_response = await vector_store.adelete(ids)624 if "num_failed" in delete_response and delete_response["num_failed"] > 0:625 msg = "The delete operation to DocumentIndex failed."626 raise IndexingException(msg)627 else:628 msg = (629 f"Vectorstore should be either a VectorStore or a DocumentIndex. "630 f"Got {type(vector_store)}."631 )632 raise TypeError(msg)633634635async def aindex(636 docs_source: BaseLoader | Iterable[Document] | AsyncIterator[Document],637 record_manager: RecordManager,638 vector_store: VectorStore | DocumentIndex,639 *,640 batch_size: int = 100,641 cleanup: Literal["incremental", "full", "scoped_full"] | None = None,642 source_id_key: str | Callable[[Document], str] | None = None,643 cleanup_batch_size: int = 1_000,644 force_update: bool = False,645 key_encoder: Literal["sha1", "sha256", "sha512", "blake2b"]646 | Callable[[Document], str] = "sha1",647 upsert_kwargs: dict[str, Any] | None = None,648) -> IndexingResult:649 """Async index data from the loader into the vector store.650651 Indexing functionality uses a manager to keep track of which documents652 are in the vector store.653654 This allows us to keep track of which documents were updated, and which655 documents were deleted, which documents should be skipped.656657 For the time being, documents are indexed using their hashes, and users658 are not able to specify the uid of the document.659660 !!! warning "Behavior changed in `langchain-core` 0.3.25"661662 Added `scoped_full` cleanup mode.663664 !!! warning665666 * In full mode, the loader should be returning667 the entire dataset, and not just a subset of the dataset.668 Otherwise, the auto_cleanup will remove documents that it is not669 supposed to.670 * In incremental mode, if documents associated with a particular671 source id appear across different batches, the indexing API672 will do some redundant work. 
This will still result in the673 correct end state of the index, but will unfortunately not be674 100% efficient. For example, if a given document is split into 15675 chunks, and we index them using a batch size of 5, we'll have 3 batches676 all with the same source id. In general, to avoid doing too much677 redundant work select as big a batch size as possible.678 * The `scoped_full` mode is suitable if determining an appropriate batch size679 is challenging or if your data loader cannot return the entire dataset at680 once. This mode keeps track of source IDs in memory, which should be fine681 for most use cases. If your dataset is large (10M+ docs), you will likely682 need to parallelize the indexing process regardless.683684 Args:685 docs_source: Data loader or iterable of documents to index.686 record_manager: Timestamped set to keep track of which documents were687 updated.688 vector_store: `VectorStore` or DocumentIndex to index the documents into.689 batch_size: Batch size to use when indexing.690 cleanup: How to handle clean up of documents.691692 - incremental: Cleans up all documents that haven't been updated AND693 that are associated with source IDs that were seen during indexing.694 Clean up is done continuously during indexing helping to minimize the695 probability of users seeing duplicated content.696 - full: Delete all documents that have not been returned by the loader697 during this run of indexing.698 Clean up runs after all documents have been indexed.699 This means that users may see duplicated content during indexing.700 - scoped_full: Similar to Full, but only deletes all documents701 that haven't been updated AND that are associated with702 source IDs that were seen during indexing.703 - None: Do not delete any documents.704 source_id_key: Optional key that helps identify the original source705 of the document.706 cleanup_batch_size: Batch size to use when cleaning up documents.707 force_update: Force update documents even if they are 
present in the708 record manager. Useful if you are re-indexing with updated embeddings.709 key_encoder: Hashing algorithm to use for hashing the document content and710 metadata. Options include "blake2b", "sha256", and "sha512".711712 !!! version-added "Added in `langchain-core` 0.3.66"713714 key_encoder: Hashing algorithm to use for hashing the document.715 If not provided, a default encoder using SHA-1 will be used.716 SHA-1 is not collision-resistant, and a motivated attacker717 could craft two different texts that hash to the718 same cache key.719720 New applications should use one of the alternative encoders721 or provide a custom and strong key encoder function to avoid this risk.722723 When changing the key encoder, you must change the724 index as well to avoid duplicated documents in the cache.725 upsert_kwargs: Additional keyword arguments to pass to the add_documents726 method of the `VectorStore` or the upsert method of the DocumentIndex.727 For example, you can use this to specify a custom vector_field:728 upsert_kwargs={"vector_field": "embedding"}729 !!! 
version-added "Added in `langchain-core` 0.3.10"730731 Returns:732 Indexing result which contains information about how many documents733 were added, updated, deleted, or skipped.734735 Raises:736 ValueError: If cleanup mode is not one of 'incremental', 'full' or None737 ValueError: If cleanup mode is incremental and source_id_key is None.738 ValueError: If `VectorStore` does not have739 "adelete" and "aadd_documents" required methods.740 ValueError: If source_id_key is not None, but is not a string or callable.741 TypeError: If `vector_store` is not a `VectorStore` or DocumentIndex.742 AssertionError: If `source_id_key` is None when cleanup mode is743 incremental or `scoped_full` (should be unreachable).744 """745 # Behavior is deprecated, but we keep it for backwards compatibility.746 # # Warn only once per process.747 if key_encoder == "sha1":748 _warn_about_sha1()749750 if cleanup not in {"incremental", "full", "scoped_full", None}:751 msg = (752 f"cleanup should be one of 'incremental', 'full', 'scoped_full' or None. 
"753 f"Got {cleanup}."754 )755 raise ValueError(msg)756757 if (cleanup in {"incremental", "scoped_full"}) and source_id_key is None:758 msg = (759 "Source id key is required when cleanup mode is incremental or scoped_full."760 )761 raise ValueError(msg)762763 destination = vector_store # Renaming internally for clarity764765 # If it's a vectorstore, let's check if it has the required methods.766 if isinstance(destination, VectorStore):767 # Check that the Vectorstore has required methods implemented768 # Check that the Vectorstore has required methods implemented769 methods = ["adelete", "aadd_documents"]770771 for method in methods:772 if not hasattr(destination, method):773 msg = (774 f"Vectorstore {destination} does not have required method {method}"775 )776 raise ValueError(msg)777778 if (779 type(destination).adelete == VectorStore.adelete780 and type(destination).delete == VectorStore.delete781 ):782 # Checking if the VectorStore has overridden the default adelete or delete783 # methods implementation which just raises a NotImplementedError784 msg = "Vectorstore has not implemented the adelete or delete method"785 raise ValueError(msg)786 elif isinstance(destination, DocumentIndex):787 pass788 else:789 msg = (790 f"Vectorstore should be either a VectorStore or a DocumentIndex. 
"791 f"Got {type(destination)}."792 )793 raise TypeError(msg)794 async_doc_iterator: AsyncIterator[Document]795 if isinstance(docs_source, BaseLoader):796 try:797 async_doc_iterator = docs_source.alazy_load()798 except NotImplementedError:799 # Exception triggered when neither lazy_load nor alazy_load are implemented.800 # * The default implementation of alazy_load uses lazy_load.801 # * The default implementation of lazy_load raises NotImplementedError.802 # In such a case, we use the load method and convert it to an async803 # iterator.804 async_doc_iterator = _to_async_iterator(docs_source.load())805 elif hasattr(docs_source, "__aiter__"):806 async_doc_iterator = docs_source # type: ignore[assignment]807 else:808 async_doc_iterator = _to_async_iterator(docs_source)809810 source_id_assigner = _get_source_id_assigner(source_id_key)811812 # Mark when the update started.813 index_start_dt = await record_manager.aget_time()814 num_added = 0815 num_skipped = 0816 num_updated = 0817 num_deleted = 0818 scoped_full_cleanup_source_ids: set[str] = set()819820 async for doc_batch in _abatch(batch_size, async_doc_iterator):821 # Track original batch size before deduplication822 original_batch_size = len(doc_batch)823824 hashed_docs = list(825 _deduplicate_in_order(826 [827 _get_document_with_hash(doc, key_encoder=key_encoder)828 for doc in doc_batch829 ]830 )831 )832 # Count documents removed by within-batch deduplication833 num_skipped += original_batch_size - len(hashed_docs)834835 source_ids: Sequence[str | None] = [836 source_id_assigner(doc) for doc in hashed_docs837 ]838839 if cleanup in {"incremental", "scoped_full"}:840 # If the cleanup mode is incremental, source IDs are required.841 for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):842 if source_id is None:843 msg = (844 f"Source IDs are required when cleanup mode is "845 f"incremental or scoped_full. 
"846 f"Document that starts with "847 f"content: {hashed_doc.page_content[:100]} "848 f"was not assigned as source id."849 )850 raise ValueError(msg)851 if cleanup == "scoped_full":852 scoped_full_cleanup_source_ids.add(source_id)853 # Source IDs cannot be None after for loop above.854 source_ids = cast("Sequence[str]", source_ids)855856 exists_batch = await record_manager.aexists(857 cast("Sequence[str]", [doc.id for doc in hashed_docs])858 )859860 # Filter out documents that already exist in the record store.861 uids: list[str] = []862 docs_to_index: list[Document] = []863 uids_to_refresh = []864 seen_docs: set[str] = set()865 for hashed_doc, doc_exists in zip(hashed_docs, exists_batch, strict=False):866 hashed_id = cast("str", hashed_doc.id)867 if doc_exists:868 if force_update:869 seen_docs.add(hashed_id)870 else:871 uids_to_refresh.append(hashed_id)872 continue873 uids.append(hashed_id)874 docs_to_index.append(hashed_doc)875876 if uids_to_refresh:877 # Must be updated to refresh timestamp.878 await record_manager.aupdate(uids_to_refresh, time_at_least=index_start_dt)879 num_skipped += len(uids_to_refresh)880881 # Be pessimistic and assume that all vector store write will fail.882 # First write to vector store883 if docs_to_index:884 if isinstance(destination, VectorStore):885 await destination.aadd_documents(886 docs_to_index,887 ids=uids,888 batch_size=batch_size,889 **(upsert_kwargs or {}),890 )891 elif isinstance(destination, DocumentIndex):892 await destination.aupsert(893 docs_to_index,894 **(upsert_kwargs or {}),895 )896 num_added += len(docs_to_index) - len(seen_docs)897 num_updated += len(seen_docs)898899 # And only then update the record store.900 # Update ALL records, even if they already exist since we want to refresh901 # their timestamp.902 await record_manager.aupdate(903 cast("Sequence[str]", [doc.id for doc in hashed_docs]),904 group_ids=source_ids,905 time_at_least=index_start_dt,906 )907908 # If source IDs are provided, we can do the deletion 
incrementally!909910 if cleanup == "incremental":911 # Get the uids of the documents that were not returned by the loader.912913 # mypy isn't good enough to determine that source IDs cannot be None914 # here due to a check that's happening above, so we check again.915 for source_id in source_ids:916 if source_id is None:917 msg = (918 "source_id cannot be None at this point. "919 "Reached unreachable code."920 )921 raise AssertionError(msg)922923 source_ids_ = cast("Sequence[str]", source_ids)924925 while uids_to_delete := await record_manager.alist_keys(926 group_ids=source_ids_, before=index_start_dt, limit=cleanup_batch_size927 ):928 # Then delete from vector store.929 await _adelete(destination, uids_to_delete)930 # First delete from record store.931 await record_manager.adelete_keys(uids_to_delete)932 num_deleted += len(uids_to_delete)933934 if cleanup == "full" or (935 cleanup == "scoped_full" and scoped_full_cleanup_source_ids936 ):937 delete_group_ids: Sequence[str] | None = None938 if cleanup == "scoped_full":939 delete_group_ids = list(scoped_full_cleanup_source_ids)940 while uids_to_delete := await record_manager.alist_keys(941 group_ids=delete_group_ids, before=index_start_dt, limit=cleanup_batch_size942 ):943 # First delete from record store.944 await _adelete(destination, uids_to_delete)945 # Then delete from record manager.946 await record_manager.adelete_keys(uids_to_delete)947 num_deleted += len(uids_to_delete)948949 return {950 "num_added": num_added,951 "num_updated": num_updated,952 "num_skipped": num_skipped,953 "num_deleted": num_deleted,954 }
# Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.