libs/core/langchain_core/vectorstores/in_memory.py · langchain-ai/langchain

"""In-memory vector store."""

from __future__ import annotations

import json
import uuid
from pathlib import Path
from typing import (
    TYPE_CHECKING,
    Any,
)

from typing_extensions import override

from langchain_core.documents import Document
from langchain_core.load import dumpd, load
from langchain_core.vectorstores import VectorStore
from langchain_core.vectorstores.utils import _cosine_similarity as cosine_similarity
from langchain_core.vectorstores.utils import maximal_marginal_relevance

if TYPE_CHECKING:
    from collections.abc import Callable, Iterator, Sequence

    from langchain_core.embeddings import Embeddings

try:
    import numpy as np

    _HAS_NUMPY = True
except ImportError:
    _HAS_NUMPY = False


class InMemoryVectorStore(VectorStore):
    """In-memory vector store implementation.

    Uses a dictionary, and computes cosine similarity for search using numpy.

    Setup:
        Install `langchain-core`.

        ```bash
        pip install -U langchain-core
        ```

    Key init args — indexing params:

        * embedding: Embeddings
            Embedding function to use.

    Instantiate:
        ```python
        from langchain_core.vectorstores import InMemoryVectorStore
        from langchain_openai import OpenAIEmbeddings

        vector_store = InMemoryVectorStore(OpenAIEmbeddings())
        ```

    Add Documents:
        ```python
        from langchain_core.documents import Document

        document_1 = Document(id="1", page_content="foo", metadata={"baz": "bar"})
        document_2 = Document(id="2", page_content="thud", metadata={"bar": "baz"})
        document_3 = Document(id="3", page_content="i will be deleted :(")

        documents = [document_1, document_2, document_3]
        vector_store.add_documents(documents=documents)
        ```

    Inspect documents:
        ```python
        top_n = 10
        for index, (doc_id, doc) in enumerate(vector_store.store.items()):
            if index < top_n:
                # docs have keys 'id', 'vector', 'text', 'metadata'
                print(f"{doc_id}: {doc['text']}")
            else:
                break
        ```

    Delete Documents:
        ```python
        vector_store.delete(ids=["3"])
        ```

    Search:
        ```python
        results = vector_store.similarity_search(query="thud", k=1)
        for doc in results:
            print(f"* {doc.page_content} [{doc.metadata}]")
        ```

        ```txt
        * thud [{'bar': 'baz'}]
        ```

    Search with filter:
        ```python
        def _filter_function(doc: Document) -> bool:
            return doc.metadata.get("bar") == "baz"


        results = vector_store.similarity_search(
            query="thud", k=1, filter=_filter_function
        )
        for doc in results:
            print(f"* {doc.page_content} [{doc.metadata}]")
        ```

        ```txt
        * thud [{'bar': 'baz'}]
        ```

    Search with score:
        ```python
        results = vector_store.similarity_search_with_score(query="qux", k=1)
        for doc, score in results:
            print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]")
        ```

        ```txt
        * [SIM=0.832268] foo [{'baz': 'bar'}]
        ```

    Async:
        ```python
        # add documents
        # await vector_store.aadd_documents(documents=documents)

        # delete documents
        # await vector_store.adelete(ids=["3"])

        # search
        # results = await vector_store.asimilarity_search(query="thud", k=1)

        # search with score
        results = await vector_store.asimilarity_search_with_score(query="qux", k=1)
        for doc, score in results:
            print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]")
        ```

        ```txt
        * [SIM=0.832268] foo [{'baz': 'bar'}]
        ```

    Use as Retriever:
        ```python
        retriever = vector_store.as_retriever(
            search_type="mmr",
            search_kwargs={"k": 1, "fetch_k": 2, "lambda_mult": 0.5},
        )
        retriever.invoke("thud")
        ```

        ```txt
        [Document(id='2', metadata={'bar': 'baz'}, page_content='thud')]
        ```
    """

    def __init__(self, embedding: Embeddings) -> None:
        """Initialize with the given embedding function.

        Args:
            embedding: embedding function to use.
        """
        # TODO: would be nice to change to
        # dict[str, Document] at some point (will be a breaking change)
        self.store: dict[str, dict[str, Any]] = {}
        self.embedding = embedding

    @property
    @override
    def embeddings(self) -> Embeddings:
        """Return the embedding object this store was initialized with."""
        return self.embedding

    @staticmethod
    def _record_to_document(record: dict[str, Any]) -> Document:
        """Build a `Document` from a stored record's id, text and metadata."""
        return Document(
            id=record["id"],
            page_content=record["text"],
            metadata=record["metadata"],
        )

    @staticmethod
    def _validate_ids(documents: list[Document], ids: list[str] | None) -> None:
        """Raise `ValueError` when `ids` is non-empty but not one per document."""
        if ids and len(ids) != len(documents):
            msg = (
                f"ids must be the same length as texts. "
                f"Got {len(ids)} ids and {len(documents)} texts."
            )
            raise ValueError(msg)

    def _store_embedded_documents(
        self,
        documents: list[Document],
        vectors: list[list[float]],
        ids: list[str] | None,
    ) -> list[str]:
        """Write documents and their vectors into `self.store`.

        Shared by the sync and async add paths so both store records the same way.

        Args:
            documents: The documents to store.
            vectors: The embedding vectors, paired with `documents` by position.
            ids: Explicit ids to use. When empty or `None`, each document's own
                `id` is used instead.

        Returns:
            The ids under which the documents were stored, in order.
        """
        candidate_ids: Iterator[str | None] = (
            iter(ids) if ids else (doc.id for doc in documents)
        )
        stored_ids: list[str] = []

        for doc, vector, candidate in zip(
            documents, vectors, candidate_ids, strict=False
        ):
            # Fall back to a random UUID when neither `ids` nor the document
            # supplies a usable id.
            doc_id = candidate or str(uuid.uuid4())
            stored_ids.append(doc_id)
            # Assigning by key overwrites any record already stored under this id.
            self.store[doc_id] = {
                "id": doc_id,
                "vector": vector,
                "text": doc.page_content,
                "metadata": doc.metadata,
            }

        return stored_ids

    @override
    def delete(self, ids: Sequence[str] | None = None, **kwargs: Any) -> None:
        """Remove the records stored under the given ids.

        Ids that are not present are ignored. Nothing happens when `ids` is
        `None` or empty.

        Args:
            ids: The ids of the records to remove.
            **kwargs: Accepted for interface compatibility; not used here.
        """
        if ids:
            for id_ in ids:
                self.store.pop(id_, None)

    @override
    async def adelete(self, ids: Sequence[str] | None = None, **kwargs: Any) -> None:
        """Async variant of `delete`; calls `delete(ids)` directly.

        Args:
            ids: The ids of the records to remove.
            **kwargs: Accepted for interface compatibility; not used here.
        """
        self.delete(ids)

    @override
    def add_documents(
        self,
        documents: list[Document],
        ids: list[str] | None = None,
        **kwargs: Any,
    ) -> list[str]:
        """Embed the documents and store them.

        Args:
            documents: The documents to add.
            ids: Optional ids, one per document. When omitted, each document's
                own `id` is used, or a random UUID if it has none.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The ids under which the documents were stored.

        Raises:
            ValueError: If `ids` is given and its length differs from the
                number of documents.
        """
        # Validate before embedding so a bad call does not pay for an
        # embedding request that would be thrown away.
        self._validate_ids(documents, ids)
        texts = [doc.page_content for doc in documents]
        vectors = self.embedding.embed_documents(texts)
        return self._store_embedded_documents(documents, vectors, ids)

    @override
    async def aadd_documents(
        self, documents: list[Document], ids: list[str] | None = None, **kwargs: Any
    ) -> list[str]:
        """Async variant of `add_documents` using `aembed_documents`.

        Args:
            documents: The documents to add.
            ids: Optional ids, one per document. When omitted, each document's
                own `id` is used, or a random UUID if it has none.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The ids under which the documents were stored.

        Raises:
            ValueError: If `ids` is given and its length differs from the
                number of documents.
        """
        # Validate before embedding so a bad call does not pay for an
        # embedding request that would be thrown away.
        self._validate_ids(documents, ids)
        texts = [doc.page_content for doc in documents]
        vectors = await self.embedding.aembed_documents(texts)
        return self._store_embedded_documents(documents, vectors, ids)

    @override
    def get_by_ids(self, ids: Sequence[str], /) -> list[Document]:
        """Get documents by their ids.

        Ids with no stored record are skipped.

        Args:
            ids: The IDs of the documents to get.

        Returns:
            A list of `Document` objects.
        """
        documents = []

        for doc_id in ids:
            record = self.store.get(doc_id)
            if record:
                documents.append(self._record_to_document(record))
        return documents

    @override
    async def aget_by_ids(self, ids: Sequence[str], /) -> list[Document]:
        """Async get documents by their ids.

        Args:
            ids: The IDs of the documents to get.

        Returns:
            A list of `Document` objects.
        """
        return self.get_by_ids(ids)

    def _similarity_search_with_score_by_vector(
        self,
        embedding: list[float],
        k: int = 4,
        filter: Callable[[Document], bool] | None = None,  # noqa: A002
    ) -> list[tuple[Document, float, list[float]]]:
        """Rank stored records against `embedding`.

        Args:
            embedding: The query vector.
            k: The maximum number of hits to return.
            filter: Optional predicate; only records whose `Document` form
                passes it are ranked.

        Returns:
            Up to `k` tuples of (document, similarity score, stored vector),
            ordered from highest to lowest score. Empty when no record is
            stored or none passes the filter.
        """
        # Materialize a list (not a view) because hits are looked up by index below.
        docs = list(self.store.values())

        if filter is not None:
            docs = [doc for doc in docs if filter(self._record_to_document(doc))]

        if not docs:
            return []

        similarity = cosine_similarity([embedding], [doc["vector"] for doc in docs])[0]

        # Get the indices ordered by similarity score
        top_k_idx = similarity.argsort()[::-1][:k]

        return [
            (
                self._record_to_document(doc_dict),
                float(similarity[idx].item()),
                doc_dict["vector"],
            )
            for idx in top_k_idx
            # Assign using walrus operator to avoid multiple lookups
            if (doc_dict := docs[idx])
        ]

    def similarity_search_with_score_by_vector(
        self,
        embedding: list[float],
        k: int = 4,
        filter: Callable[[Document], bool] | None = None,  # noqa: A002
        **_kwargs: Any,
    ) -> list[tuple[Document, float]]:
        """Search for the most similar documents to the given embedding.

        Args:
            embedding: The embedding to search for.
            k: The number of documents to return.
            filter: A function to filter the documents.

        Returns:
            A list of tuples of `Document` objects and their similarity scores.
        """
        return [
            (doc, similarity)
            for doc, similarity, _ in self._similarity_search_with_score_by_vector(
                embedding=embedding, k=k, filter=filter
            )
        ]

    @override
    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> list[tuple[Document, float]]:
        """Embed `query` and search for the most similar documents.

        Args:
            query: The query text; embedded with `embed_query`.
            k: The number of documents to return.
            **kwargs: Forwarded to `similarity_search_with_score_by_vector`
                (for example `filter`).

        Returns:
            A list of tuples of `Document` objects and their similarity scores.
        """
        embedding = self.embedding.embed_query(query)
        return self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )

    @override
    async def asimilarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> list[tuple[Document, float]]:
        """Async variant of `similarity_search_with_score` using `aembed_query`.

        Args:
            query: The query text.
            k: The number of documents to return.
            **kwargs: Forwarded to `similarity_search_with_score_by_vector`
                (for example `filter`).

        Returns:
            A list of tuples of `Document` objects and their similarity scores.
        """
        embedding = await self.embedding.aembed_query(query)
        return self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )

    @override
    def similarity_search_by_vector(
        self,
        embedding: list[float],
        k: int = 4,
        **kwargs: Any,
    ) -> list[Document]:
        """Search by embedding and return the documents without their scores.

        Args:
            embedding: The embedding to search for.
            k: The number of documents to return.
            **kwargs: Forwarded to `similarity_search_with_score_by_vector`
                (for example `filter`).

        Returns:
            A list of `Document` objects.
        """
        docs_and_scores = self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )
        return [doc for doc, _ in docs_and_scores]

    @override
    async def asimilarity_search_by_vector(
        self, embedding: list[float], k: int = 4, **kwargs: Any
    ) -> list[Document]:
        """Async variant of `similarity_search_by_vector`; calls it directly.

        Args:
            embedding: The embedding to search for.
            k: The number of documents to return.
            **kwargs: Forwarded to `similarity_search_by_vector`.

        Returns:
            A list of `Document` objects.
        """
        return self.similarity_search_by_vector(embedding, k, **kwargs)

    @override
    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> list[Document]:
        """Embed `query` and return the most similar documents without scores.

        Args:
            query: The query text.
            k: The number of documents to return.
            **kwargs: Forwarded to `similarity_search_with_score`.

        Returns:
            A list of `Document` objects.
        """
        return [doc for doc, _ in self.similarity_search_with_score(query, k, **kwargs)]

    @override
    async def asimilarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> list[Document]:
        """Async variant of `similarity_search`.

        Args:
            query: The query text.
            k: The number of documents to return.
            **kwargs: Forwarded to `asimilarity_search_with_score`.

        Returns:
            A list of `Document` objects.
        """
        return [
            doc
            for doc, _ in await self.asimilarity_search_with_score(query, k, **kwargs)
        ]

    @override
    def max_marginal_relevance_search_by_vector(
        self,
        embedding: list[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        *,
        filter: Callable[[Document], bool] | None = None,
        **kwargs: Any,
    ) -> list[Document]:
        """Select documents with `maximal_marginal_relevance` over a prefetch.

        Args:
            embedding: The embedding to search for.
            k: Passed to `maximal_marginal_relevance` as the number to select.
            fetch_k: The number of top-similarity hits to prefetch as candidates.
            lambda_mult: Passed through to `maximal_marginal_relevance`.
            filter: A function to filter the documents before ranking.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The selected `Document` objects, in the order chosen.

        Raises:
            ImportError: If numpy is not installed.
        """
        # Fail fast: there is no point running the prefetch search when the
        # re-ranking step below cannot run.
        if not _HAS_NUMPY:
            msg = (
                "numpy must be installed to use max_marginal_relevance_search "
                "pip install numpy"
            )
            raise ImportError(msg)

        prefetch_hits = self._similarity_search_with_score_by_vector(
            embedding=embedding,
            k=fetch_k,
            filter=filter,
        )

        mmr_chosen_indices = maximal_marginal_relevance(
            np.array(embedding, dtype=np.float32),
            [vector for _, _, vector in prefetch_hits],
            k=k,
            lambda_mult=lambda_mult,
        )
        return [prefetch_hits[idx][0] for idx in mmr_chosen_indices]

    @override
    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> list[Document]:
        """Embed `query` and run `max_marginal_relevance_search_by_vector`.

        Args:
            query: The query text; embedded with `embed_query`.
            k: The number of documents to select.
            fetch_k: The number of candidates to prefetch.
            lambda_mult: Passed through to the by-vector search.
            **kwargs: Forwarded to `max_marginal_relevance_search_by_vector`
                (for example `filter`).

        Returns:
            The selected `Document` objects.
        """
        embedding_vector = self.embedding.embed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding_vector,
            k,
            fetch_k,
            lambda_mult=lambda_mult,
            **kwargs,
        )

    @override
    async def amax_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> list[Document]:
        """Async variant of `max_marginal_relevance_search` using `aembed_query`.

        Args:
            query: The query text.
            k: The number of documents to select.
            fetch_k: The number of candidates to prefetch.
            lambda_mult: Passed through to the by-vector search.
            **kwargs: Forwarded to `max_marginal_relevance_search_by_vector`
                (for example `filter`).

        Returns:
            The selected `Document` objects.
        """
        embedding_vector = await self.embedding.aembed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding_vector,
            k,
            fetch_k,
            lambda_mult=lambda_mult,
            **kwargs,
        )

    @classmethod
    @override
    def from_texts(
        cls,
        texts: list[str],
        embedding: Embeddings,
        metadatas: list[dict] | None = None,
        **kwargs: Any,
    ) -> InMemoryVectorStore:
        """Create a store and populate it through `add_texts`.

        Args:
            texts: The texts to add.
            embedding: The embedding to use.
            metadatas: Optional metadata, forwarded to `add_texts`.
            **kwargs: Forwarded to `add_texts`.

        Returns:
            The newly created store.
        """
        store = cls(
            embedding=embedding,
        )
        store.add_texts(texts=texts, metadatas=metadatas, **kwargs)
        return store

    @classmethod
    @override
    async def afrom_texts(
        cls,
        texts: list[str],
        embedding: Embeddings,
        metadatas: list[dict] | None = None,
        **kwargs: Any,
    ) -> InMemoryVectorStore:
        """Async variant of `from_texts`; populates through `aadd_texts`.

        Args:
            texts: The texts to add.
            embedding: The embedding to use.
            metadatas: Optional metadata, forwarded to `aadd_texts`.
            **kwargs: Forwarded to `aadd_texts`.

        Returns:
            The newly created store.
        """
        store = cls(
            embedding=embedding,
        )
        await store.aadd_texts(texts=texts, metadatas=metadatas, **kwargs)
        return store

    @classmethod
    def load(
        cls, path: str, embedding: Embeddings, **kwargs: Any
    ) -> InMemoryVectorStore:
        """Load a vector store from a file.

        Args:
            path: The path to load the vector store from.
            embedding: The embedding to use.
            **kwargs: Additional arguments to pass to the constructor.

        Returns:
            A `VectorStore` object.
        """
        path_: Path = Path(path)
        with path_.open("r", encoding="utf-8") as f:
            store = load(json.load(f), allowed_objects=[Document])
        vectorstore = cls(embedding=embedding, **kwargs)
        vectorstore.store = store
        return vectorstore

    def dump(self, path: str) -> None:
        """Dump the vector store to a file.

        Parent directories are created if they do not exist.

        Args:
            path: The path to dump the vector store to.
        """
        path_: Path = Path(path)
        path_.parent.mkdir(exist_ok=True, parents=True)
        with path_.open("w", encoding="utf-8") as f:
            json.dump(dumpd(self.store), f, indent=2)

Code quality findings 24

Use logging module for better control and configurability

L77

print-statement

print(f"{id}: {doc['text']}")

Use logging module for better control and configurability

L91

print-statement

print(f"* {doc.page_content} [{doc.metadata}]")

Use logging module for better control and configurability

L108

print-statement

print(f"* {doc.page_content} [{doc.metadata}]")

Use logging module for better control and configurability

L119

print-statement

print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]")

Use logging module for better control and configurability

L140

print-statement

print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]")

Ensure functions have docstrings for documentation

L174

missing-docstring

def embeddings(self) -> Embeddings:

Ensure functions have docstrings for documentation

L178

missing-docstring

def delete(self, ids: Sequence[str] | None = None, **kwargs: Any) -> None:

Ensure functions have docstrings for documentation

L184

missing-docstring

async def adelete(self, ids: Sequence[str] | None = None, **kwargs: Any) -> None:

Ensure functions have docstrings for documentation

L188

missing-docstring

def add_documents(

Ensure functions have docstrings for documentation

L224

missing-docstring

async def aadd_documents(

Avoid unnecessary list conversions; use generators where possible

L298

unnecessary-list

docs = list(self.store.values())

Ensure functions have docstrings for documentation

L334

missing-docstring

def similarity_search_with_score_by_vector(

Ensure functions have docstrings for documentation

L359

missing-docstring

def similarity_search_with_score(

Ensure functions have docstrings for documentation

L373

missing-docstring

async def asimilarity_search_with_score(

Ensure functions have docstrings for documentation

L384

missing-docstring

def similarity_search_by_vector(

Ensure functions have docstrings for documentation

L398

missing-docstring

async def asimilarity_search_by_vector(

Ensure functions have docstrings for documentation

L404

missing-docstring

def similarity_search(

Ensure functions have docstrings for documentation

L410

missing-docstring

async def asimilarity_search(

Ensure functions have docstrings for documentation

L419

missing-docstring

def max_marginal_relevance_search_by_vector(

Ensure functions have docstrings for documentation

L451

missing-docstring

def max_marginal_relevance_search(

Ensure functions have docstrings for documentation

L469

missing-docstring

async def amax_marginal_relevance_search(

Ensure functions have docstrings for documentation

L488

missing-docstring

def from_texts(

Ensure functions have docstrings for documentation

L503

missing-docstring

async def afrom_texts(

Ensure functions have docstrings for documentation

L517

missing-docstring

def load(

Code quality findings 24

Get this view in your editor