# Review finding: use the `logging` module instead of `print` for better control and configurability.
# Flagged line (from the class docstring example): print(f"{id}: {doc['text']}")
"""In-memory vector store."""

from __future__ import annotations

import json
import uuid
from pathlib import Path
from typing import (
    TYPE_CHECKING,
    Any,
)

from typing_extensions import override

from langchain_core.documents import Document
from langchain_core.load import dumpd, load
from langchain_core.vectorstores import VectorStore
from langchain_core.vectorstores.utils import _cosine_similarity as cosine_similarity
from langchain_core.vectorstores.utils import maximal_marginal_relevance

if TYPE_CHECKING:
    from collections.abc import Callable, Iterator, Sequence

    from langchain_core.embeddings import Embeddings

try:
    import numpy as np

    _HAS_NUMPY = True
except ImportError:
    _HAS_NUMPY = False


class InMemoryVectorStore(VectorStore):
    """In-memory vector store implementation.

    Uses a dictionary, and computes cosine similarity for search using numpy.

    Setup:
        Install `langchain-core`.

        ```bash
        pip install -U langchain-core
        ```

    Key init args — indexing params:

    * embedding: Embeddings
        Embedding function to use.

    Instantiate:
        ```python
        from langchain_core.vectorstores import InMemoryVectorStore
        from langchain_openai import OpenAIEmbeddings

        vector_store = InMemoryVectorStore(OpenAIEmbeddings())
        ```

    Add Documents:
        ```python
        from langchain_core.documents import Document

        document_1 = Document(id="1", page_content="foo", metadata={"baz": "bar"})
        document_2 = Document(id="2", page_content="thud", metadata={"bar": "baz"})
        document_3 = Document(id="3", page_content="i will be deleted :(")

        documents = [document_1, document_2, document_3]
        vector_store.add_documents(documents=documents)
        ```

    Inspect documents:
        ```python
        top_n = 10
        for index, (doc_id, doc) in enumerate(vector_store.store.items()):
            if index < top_n:
                # docs have keys 'id', 'vector', 'text', 'metadata'
                print(f"{doc_id}: {doc['text']}")
            else:
                break
        ```

    Delete Documents:
        ```python
        vector_store.delete(ids=["3"])
        ```

    Search:
        ```python
        results = vector_store.similarity_search(query="thud", k=1)
        for doc in results:
            print(f"* {doc.page_content} [{doc.metadata}]")
        ```

        ```txt
        * thud [{'bar': 'baz'}]
        ```

    Search with filter:
        ```python
        def _filter_function(doc: Document) -> bool:
            return doc.metadata.get("bar") == "baz"


        results = vector_store.similarity_search(
            query="thud", k=1, filter=_filter_function
        )
        for doc in results:
            print(f"* {doc.page_content} [{doc.metadata}]")
        ```

        ```txt
        * thud [{'bar': 'baz'}]
        ```

    Search with score:
        ```python
        results = vector_store.similarity_search_with_score(query="qux", k=1)
        for doc, score in results:
            print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]")
        ```

        ```txt
        * [SIM=0.832268] foo [{'baz': 'bar'}]
        ```

    Async:
        ```python
        # add documents
        # await vector_store.aadd_documents(documents=documents)

        # delete documents
        # await vector_store.adelete(ids=["3"])

        # search
        # results = await vector_store.asimilarity_search(query="thud", k=1)

        # search with score
        results = await vector_store.asimilarity_search_with_score(query="qux", k=1)
        for doc, score in results:
            print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]")
        ```

        ```txt
        * [SIM=0.832268] foo [{'baz': 'bar'}]
        ```

    Use as Retriever:
        ```python
        retriever = vector_store.as_retriever(
            search_type="mmr",
            search_kwargs={"k": 1, "fetch_k": 2, "lambda_mult": 0.5},
        )
        retriever.invoke("thud")
        ```

        ```txt
        [Document(id='2', metadata={'bar': 'baz'}, page_content='thud')]
        ```
    """

    def __init__(self, embedding: Embeddings) -> None:
        """Initialize with the given embedding function.

        Args:
            embedding: embedding function to use.
        """
        # TODO: would be nice to change to
        # dict[str, Document] at some point (will be a breaking change)
        self.store: dict[str, dict[str, Any]] = {}
        self.embedding = embedding

    @property
    @override
    def embeddings(self) -> Embeddings:
        """Return the embedding function this store was created with."""
        return self.embedding

    @override
    def delete(self, ids: Sequence[str] | None = None, **kwargs: Any) -> None:
        """Remove the given ids from the store.

        Unknown ids are ignored; passing `None` or an empty sequence is a no-op.

        Args:
            ids: The IDs of the documents to delete.
            **kwargs: Accepted for interface compatibility; unused here.
        """
        if ids:
            for id_ in ids:
                self.store.pop(id_, None)

    @override
    async def adelete(self, ids: Sequence[str] | None = None, **kwargs: Any) -> None:
        """Async variant of `delete`; delegates to the synchronous method."""
        self.delete(ids)

    @staticmethod
    def _to_document(record: dict[str, Any]) -> Document:
        """Rebuild a `Document` from a stored record dict."""
        return Document(
            id=record["id"],
            page_content=record["text"],
            metadata=record["metadata"],
        )

    @staticmethod
    def _check_ids_length(ids: list[str] | None, documents: list[Document]) -> None:
        """Raise `ValueError` when explicit ids do not match the document count."""
        if ids and len(ids) != len(documents):
            msg = (
                f"ids must be the same length as texts. "
                f"Got {len(ids)} ids and {len(documents)} texts."
            )
            raise ValueError(msg)

    def _store_documents(
        self,
        documents: list[Document],
        vectors: list[list[float]],
        ids: list[str] | None,
    ) -> list[str]:
        """Write documents and their vectors into the store and return their ids.

        Explicit `ids` win over `Document.id`; a missing/empty id is replaced by
        a freshly generated UUID4 string.
        """
        id_iterator: Iterator[str | None] = (
            iter(ids) if ids else iter(doc.id for doc in documents)
        )

        stored_ids: list[str] = []
        for doc, vector in zip(documents, vectors, strict=False):
            doc_id = next(id_iterator) or str(uuid.uuid4())
            stored_ids.append(doc_id)
            self.store[doc_id] = {
                "id": doc_id,
                "vector": vector,
                "text": doc.page_content,
                "metadata": doc.metadata,
            }
        return stored_ids

    @override
    def add_documents(
        self,
        documents: list[Document],
        ids: list[str] | None = None,
        **kwargs: Any,
    ) -> list[str]:
        """Embed and store documents, overwriting entries with the same id.

        Args:
            documents: Documents to add.
            ids: Optional ids, one per document.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The ids under which the documents were stored.

        Raises:
            ValueError: If `ids` is given and its length differs from `documents`.
        """
        # Validate before embedding so a bad call does not pay for the embeddings.
        self._check_ids_length(ids, documents)

        texts = [doc.page_content for doc in documents]
        vectors = self.embedding.embed_documents(texts)
        return self._store_documents(documents, vectors, ids)

    @override
    async def aadd_documents(
        self, documents: list[Document], ids: list[str] | None = None, **kwargs: Any
    ) -> list[str]:
        """Async variant of `add_documents` using the async embedding call.

        Args:
            documents: Documents to add.
            ids: Optional ids, one per document.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The ids under which the documents were stored.

        Raises:
            ValueError: If `ids` is given and its length differs from `documents`.
        """
        # Validate before embedding so a bad call does not pay for the embeddings.
        self._check_ids_length(ids, documents)

        texts = [doc.page_content for doc in documents]
        vectors = await self.embedding.aembed_documents(texts)
        return self._store_documents(documents, vectors, ids)

    @override
    def get_by_ids(self, ids: Sequence[str], /) -> list[Document]:
        """Get documents by their ids.

        Ids that are not present in the store are skipped.

        Args:
            ids: The IDs of the documents to get.

        Returns:
            A list of `Document` objects.
        """
        documents = []

        for doc_id in ids:
            record = self.store.get(doc_id)
            if record:
                documents.append(self._to_document(record))
        return documents

    @override
    async def aget_by_ids(self, ids: Sequence[str], /) -> list[Document]:
        """Async get documents by their ids.

        Args:
            ids: The IDs of the documents to get.

        Returns:
            A list of `Document` objects.
        """
        return self.get_by_ids(ids)

    def _similarity_search_with_score_by_vector(
        self,
        embedding: list[float],
        k: int = 4,
        filter: Callable[[Document], bool] | None = None,  # noqa: A002
    ) -> list[tuple[Document, float, list[float]]]:
        """Rank stored documents by cosine similarity to `embedding`.

        Args:
            embedding: The query vector.
            k: Maximum number of hits to return.
            filter: Optional predicate; only documents it accepts are ranked.

        Returns:
            Up to `k` tuples of (document, similarity score, stored vector),
            most similar first.
        """
        # Get all docs with fixed order in list
        records = list(self.store.values())

        if filter is not None:
            records = [
                record for record in records if filter(self._to_document(record))
            ]

        if not records:
            return []

        similarity = cosine_similarity(
            [embedding], [record["vector"] for record in records]
        )[0]

        # Get the indices ordered by similarity score
        top_k_idx = similarity.argsort()[::-1][:k]

        hits: list[tuple[Document, float, list[float]]] = []
        for idx in top_k_idx:
            record = records[idx]
            hits.append(
                (
                    self._to_document(record),
                    float(similarity[idx].item()),
                    record["vector"],
                )
            )
        return hits

    def similarity_search_with_score_by_vector(
        self,
        embedding: list[float],
        k: int = 4,
        filter: Callable[[Document], bool] | None = None,  # noqa: A002
        **_kwargs: Any,
    ) -> list[tuple[Document, float]]:
        """Search for the most similar documents to the given embedding.

        Args:
            embedding: The embedding to search for.
            k: The number of documents to return.
            filter: A function to filter the documents.

        Returns:
            A list of tuples of `Document` objects and their similarity scores.
        """
        return [
            (doc, similarity)
            for doc, similarity, _ in self._similarity_search_with_score_by_vector(
                embedding=embedding, k=k, filter=filter
            )
        ]

    @override
    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> list[tuple[Document, float]]:
        """Embed `query` and return the top `k` documents with their scores."""
        embedding = self.embedding.embed_query(query)
        return self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )

    @override
    async def asimilarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> list[tuple[Document, float]]:
        """Async variant of `similarity_search_with_score` (async query embedding)."""
        embedding = await self.embedding.aembed_query(query)
        return self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )

    @override
    def similarity_search_by_vector(
        self,
        embedding: list[float],
        k: int = 4,
        **kwargs: Any,
    ) -> list[Document]:
        """Return the top `k` documents for a query vector, without scores."""
        docs_and_scores = self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )
        return [doc for doc, _ in docs_and_scores]

    @override
    async def asimilarity_search_by_vector(
        self, embedding: list[float], k: int = 4, **kwargs: Any
    ) -> list[Document]:
        """Async variant of `similarity_search_by_vector`; delegates to it."""
        return self.similarity_search_by_vector(embedding, k, **kwargs)

    @override
    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> list[Document]:
        """Embed `query` and return the top `k` documents, without scores."""
        return [doc for doc, _ in self.similarity_search_with_score(query, k, **kwargs)]

    @override
    async def asimilarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> list[Document]:
        """Async variant of `similarity_search`."""
        return [
            doc
            for doc, _ in await self.asimilarity_search_with_score(query, k, **kwargs)
        ]

    @override
    def max_marginal_relevance_search_by_vector(
        self,
        embedding: list[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        *,
        filter: Callable[[Document], bool] | None = None,
        **kwargs: Any,
    ) -> list[Document]:
        """Select documents by maximal marginal relevance for a query vector.

        Args:
            embedding: The query vector.
            k: Number of documents to return.
            fetch_k: Number of nearest documents fetched before MMR re-ranking.
            lambda_mult: Passed through to `maximal_marginal_relevance`.
            filter: Optional predicate; only documents it accepts are considered.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The documents chosen by `maximal_marginal_relevance`.

        Raises:
            ImportError: If numpy is not installed.
        """
        # Fail fast: checking before the prefetch avoids doing search work that
        # would be thrown away when numpy is missing.
        if not _HAS_NUMPY:
            msg = (
                "numpy must be installed to use max_marginal_relevance_search "
                "pip install numpy"
            )
            raise ImportError(msg)

        prefetch_hits = self._similarity_search_with_score_by_vector(
            embedding=embedding,
            k=fetch_k,
            filter=filter,
        )

        mmr_chosen_indices = maximal_marginal_relevance(
            np.array(embedding, dtype=np.float32),
            [vector for _, _, vector in prefetch_hits],
            k=k,
            lambda_mult=lambda_mult,
        )
        return [prefetch_hits[idx][0] for idx in mmr_chosen_indices]

    @override
    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> list[Document]:
        """Embed `query` and run maximal-marginal-relevance search on the result."""
        embedding_vector = self.embedding.embed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding_vector,
            k,
            fetch_k,
            lambda_mult=lambda_mult,
            **kwargs,
        )

    @override
    async def amax_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> list[Document]:
        """Async variant of `max_marginal_relevance_search` (async query embedding)."""
        embedding_vector = await self.embedding.aembed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding_vector,
            k,
            fetch_k,
            lambda_mult=lambda_mult,
            **kwargs,
        )

    @classmethod
    @override
    def from_texts(
        cls,
        texts: list[str],
        embedding: Embeddings,
        metadatas: list[dict] | None = None,
        **kwargs: Any,
    ) -> InMemoryVectorStore:
        """Create a store and populate it via `add_texts`.

        Args:
            texts: Texts to add.
            embedding: The embedding to use.
            metadatas: Optional metadata dicts forwarded to `add_texts`.
            **kwargs: Forwarded to `add_texts`.

        Returns:
            The populated store.
        """
        store = cls(
            embedding=embedding,
        )
        store.add_texts(texts=texts, metadatas=metadatas, **kwargs)
        return store

    @classmethod
    @override
    async def afrom_texts(
        cls,
        texts: list[str],
        embedding: Embeddings,
        metadatas: list[dict] | None = None,
        **kwargs: Any,
    ) -> InMemoryVectorStore:
        """Async variant of `from_texts`; populates the store via `aadd_texts`."""
        store = cls(
            embedding=embedding,
        )
        await store.aadd_texts(texts=texts, metadatas=metadatas, **kwargs)
        return store

    @classmethod
    def load(
        cls, path: str, embedding: Embeddings, **kwargs: Any
    ) -> InMemoryVectorStore:
        """Load a vector store from a file.

        Args:
            path: The path to load the vector store from.
            embedding: The embedding to use.
            **kwargs: Additional arguments to pass to the constructor.

        Returns:
            A `VectorStore` object.
        """
        path_: Path = Path(path)
        with path_.open("r", encoding="utf-8") as f:
            # `load` here is the module-level langchain_core.load.load import,
            # not this classmethod; only `Document` objects may be revived.
            store = load(json.load(f), allowed_objects=[Document])
        vectorstore = cls(embedding=embedding, **kwargs)
        vectorstore.store = store
        return vectorstore

    def dump(self, path: str) -> None:
        """Dump the vector store to a file.

        Missing parent directories are created.

        Args:
            path: The path to dump the vector store to.
        """
        path_: Path = Path(path)
        path_.parent.mkdir(exist_ok=True, parents=True)
        with path_.open("w", encoding="utf-8") as f:
            json.dump(dumpd(self.store), f, indent=2)
Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.