# libs/core/langchain_core/documents/base.py · langchain-ai/langchain

"""Base classes for media and documents.

This module contains core abstractions for **data retrieval and processing workflows**:

- `BaseMedia`: Base class providing `id` and `metadata` fields
- `Blob`: Raw data loading (files, binary data) - used by document loaders
- `Document`: Text content for retrieval (RAG, vector stores, semantic search)

!!! note "Not for LLM chat messages"

    These classes are for data processing pipelines, not LLM I/O. For multimodal
    content in chat messages (images, audio in conversations), see
    `langchain.messages` content blocks instead.
"""

from __future__ import annotations

import contextlib
import mimetypes
from io import BufferedReader, BytesIO
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Any, Literal, cast

from pydantic import ConfigDict, Field, model_validator

from langchain_core.load.serializable import Serializable

if TYPE_CHECKING:
    from collections.abc import Generator

PathLike = str | PurePath


class BaseMedia(Serializable):
    """Base class for content used in retrieval and data processing workflows.

    Provides common fields for content that needs to be stored, indexed, or searched.

    !!! note

        For multimodal content in **chat messages** (images, audio sent to/from LLMs),
        use `langchain.messages` content blocks instead.
    """

    # The ID field is optional at the moment.
    # It will likely become required in a future major release after
    # it has been adopted by enough VectorStore implementations.
    id: str | None = Field(default=None, coerce_numbers_to_str=True)
    """An optional identifier for the document.

    Ideally this should be unique across the document collection and formatted
    as a UUID, but this will not be enforced.
    """

    metadata: dict[Any, Any] = Field(default_factory=dict)
    """Arbitrary metadata associated with the content."""


class Blob(BaseMedia):
    """Raw data abstraction for document loading and file processing.

    Represents raw bytes or text, either in-memory or by file reference. Used
    primarily by document loaders to decouple data loading from parsing.

    Inspired by [Mozilla's `Blob`](https://developer.mozilla.org/en-US/docs/Web/API/Blob)

    ???+ example "Initialize a blob from in-memory data"

        ```python
        from langchain_core.documents import Blob

        blob = Blob.from_data("Hello, world!")

        # Read the blob as a string
        print(blob.as_string())

        # Read the blob as bytes
        print(blob.as_bytes())

        # Read the blob as a byte stream
        with blob.as_bytes_io() as f:
            print(f.read())
        ```

    ??? example "Load from memory and specify MIME type and metadata"

        ```python
        from langchain_core.documents import Blob

        blob = Blob.from_data(
            data="Hello, world!",
            mime_type="text/plain",
            metadata={"source": "https://example.com"},
        )
        ```

    ??? example "Load the blob from a file"

        ```python
        from langchain_core.documents import Blob

        blob = Blob.from_path("path/to/file.txt")

        # Read the blob as a string
        print(blob.as_string())

        # Read the blob as bytes
        print(blob.as_bytes())

        # Read the blob as a byte stream
        with blob.as_bytes_io() as f:
            print(f.read())
        ```
    """

    data: bytes | str | None = None
    """Raw data associated with the `Blob`."""

    mimetype: str | None = None
    """MIME type, not to be confused with a file extension."""

    encoding: str = "utf-8"
    """Encoding to use if decoding the bytes into a string.

    Uses `utf-8` as default encoding if decoding to string.
    """

    path: PathLike | None = None
    """Location where the original content was found."""

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        frozen=True,
    )

    @property
    def source(self) -> str | None:
        """The source location of the blob as string if known otherwise none.

        If a path is associated with the `Blob`, it will default to the path location.

        Unless explicitly set via a metadata field called `'source'`, in which
        case that value will be used instead.
        """
        # An explicit metadata entry wins over the path-derived default.
        if self.metadata and "source" in self.metadata:
            return cast("str | None", self.metadata["source"])
        return str(self.path) if self.path else None

    @model_validator(mode="before")
    @classmethod
    def check_blob_is_valid(cls, values: dict[str, Any]) -> Any:
        """Verify that either data or path is provided.

        Args:
            values: The raw field values passed to the constructor.

        Raises:
            ValueError: If neither `data` nor `path` is present in `values`.

        Returns:
            The unmodified input values.
        """
        if "data" not in values and "path" not in values:
            msg = "Either data or path must be provided"
            raise ValueError(msg)
        return values

    def as_string(self) -> str:
        """Read data as a string.

        Raises:
            ValueError: If the blob cannot be represented as a string.

        Returns:
            The data as a string.
        """
        # No in-memory data: fall back to reading the referenced file lazily.
        if self.data is None and self.path:
            return Path(self.path).read_text(encoding=self.encoding)
        if isinstance(self.data, bytes):
            return self.data.decode(self.encoding)
        if isinstance(self.data, str):
            return self.data
        msg = f"Unable to get string for blob {self}"
        raise ValueError(msg)

    def as_bytes(self) -> bytes:
        """Read data as bytes.

        Raises:
            ValueError: If the blob cannot be represented as bytes.

        Returns:
            The data as bytes.
        """
        if isinstance(self.data, bytes):
            return self.data
        if isinstance(self.data, str):
            return self.data.encode(self.encoding)
        # No in-memory data: fall back to reading the referenced file lazily.
        if self.data is None and self.path:
            return Path(self.path).read_bytes()
        msg = f"Unable to get bytes for blob {self}"
        raise ValueError(msg)

    @contextlib.contextmanager
    def as_bytes_io(self) -> Generator[BytesIO | BufferedReader, None, None]:
        """Read data as a byte stream.

        In-memory `bytes` are wrapped in a `BytesIO`; in-memory `str` data is
        first encoded with the blob's `encoding` (mirroring `as_bytes`); a
        path-only blob is opened in binary mode and closed when the context
        manager exits.

        Raises:
            NotImplementedError: If the blob cannot be represented as a byte stream.

        Yields:
            The data as a byte stream.
        """
        if isinstance(self.data, bytes):
            yield BytesIO(self.data)
        elif isinstance(self.data, str):
            # Keep parity with `as_bytes`: text data is encoded rather than
            # rejected, so blobs created via `from_data("...")` can be streamed.
            yield BytesIO(self.data.encode(self.encoding))
        elif self.data is None and self.path:
            with Path(self.path).open("rb") as f:
                yield f
        else:
            msg = f"Unable to convert blob {self}"
            raise NotImplementedError(msg)

    @classmethod
    def from_path(
        cls,
        path: PathLike,
        *,
        encoding: str = "utf-8",
        mime_type: str | None = None,
        guess_type: bool = True,
        metadata: dict[Any, Any] | None = None,
    ) -> Blob:
        """Load the blob from a path like object.

        Args:
            path: Path-like object to file to be read
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: If provided, will be set as the MIME type of the data
            guess_type: If `True`, the MIME type will be guessed from the file
                extension, if a MIME type was not provided
            metadata: Metadata to associate with the `Blob`

        Returns:
            `Blob` instance
        """
        if mime_type is None and guess_type:
            mimetype = mimetypes.guess_type(path)[0]
        else:
            mimetype = mime_type
        # We do not load the data immediately, instead we treat the blob as a
        # reference to the underlying data.
        return cls(
            data=None,
            mimetype=mimetype,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    @classmethod
    def from_data(
        cls,
        data: str | bytes,
        *,
        encoding: str = "utf-8",
        mime_type: str | None = None,
        path: PathLike | None = None,
        metadata: dict[Any, Any] | None = None,
    ) -> Blob:
        """Initialize the `Blob` from in-memory data.

        Args:
            data: The in-memory data associated with the `Blob`
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: If provided, will be set as the MIME type of the data
            path: If provided, will be set as the source from which the data came
            metadata: Metadata to associate with the `Blob`

        Returns:
            `Blob` instance
        """
        return cls(
            data=data,
            mimetype=mime_type,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    def __repr__(self) -> str:
        """Return the blob representation."""
        # Deliberately omits the payload; only identity and source are shown.
        str_repr = f"Blob {id(self)}"
        if self.source:
            str_repr += f" {self.source}"
        return str_repr


class Document(BaseMedia):
    """Class for storing a piece of text and associated metadata.

    !!! note

        `Document` is for **retrieval workflows**, not chat I/O. For sending text
        to an LLM in a conversation, use message types from `langchain.messages`.

    Example:
        ```python
        from langchain_core.documents import Document

        document = Document(
            page_content="Hello, world!", metadata={"source": "https://example.com"}
        )
        ```
    """

    page_content: str
    """String text."""

    type: Literal["Document"] = "Document"

    def __init__(self, page_content: str, **kwargs: Any) -> None:
        """Pass page_content in as positional or named arg."""
        # mypy is complaining that page_content is not defined on the base class.
        # Here, we're relying on pydantic base class to handle the validation.
        super().__init__(page_content=page_content, **kwargs)  # type: ignore[call-arg,unused-ignore]

    @classmethod
    def is_lc_serializable(cls) -> bool:
        """Return `True` as this class is serializable."""
        return True

    @classmethod
    def get_lc_namespace(cls) -> list[str]:
        """Get the namespace of the LangChain object.

        Returns:
            `["langchain", "schema", "document"]`
        """
        return ["langchain", "schema", "document"]

    def __str__(self) -> str:
        """Override `__str__` to restrict it to page_content and metadata.

        Returns:
            A string representation of the `Document`.
        """
        # The format matches pydantic format for __str__.
        #
        # The purpose of this change is to make sure that user code that feeds
        # Document objects directly into prompts remains unchanged due to the addition
        # of the id field (or any other fields in the future).
        #
        # This override will likely be removed in the future in favor of a more general
        # solution of formatting content directly inside the prompts.
        if self.metadata:
            return f"page_content='{self.page_content}' metadata={self.metadata}"
        return f"page_content='{self.page_content}'"