Use logging module for better control and configurability
print(blob.as_string())
"""Base classes for media and documents.

This module contains core abstractions for **data retrieval and processing workflows**:

- `BaseMedia`: Base class providing `id` and `metadata` fields
- `Blob`: Raw data loading (files, binary data) - used by document loaders
- `Document`: Text content for retrieval (RAG, vector stores, semantic search)

!!! note "Not for LLM chat messages"

    These classes are for data processing pipelines, not LLM I/O. For multimodal
    content in chat messages (images, audio in conversations), see
    `langchain.messages` content blocks instead.
"""

from __future__ import annotations

import contextlib
import mimetypes
from io import BufferedReader, BytesIO
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Any, Literal, cast

from pydantic import ConfigDict, Field, model_validator

from langchain_core.load.serializable import Serializable

if TYPE_CHECKING:
    from collections.abc import Generator

PathLike = str | PurePath


class BaseMedia(Serializable):
    """Base class for content used in retrieval and data processing workflows.

    Provides common fields for content that needs to be stored, indexed, or searched.

    !!! note

        For multimodal content in **chat messages** (images, audio sent to/from LLMs),
        use `langchain.messages` content blocks instead.
    """

    # The ID field is optional at the moment.
    # It will likely become required in a future major release after
    # it has been adopted by enough VectorStore implementations.
    id: str | None = Field(default=None, coerce_numbers_to_str=True)
    """An optional identifier for the document.

    Ideally this should be unique across the document collection and formatted
    as a UUID, but this will not be enforced.
    """

    metadata: dict[Any, Any] = Field(default_factory=dict)
    """Arbitrary metadata associated with the content."""


class Blob(BaseMedia):
    """Raw data abstraction for document loading and file processing.

    Represents raw bytes or text, either in-memory or by file reference. Used
    primarily by document loaders to decouple data loading from parsing.

    Inspired by [Mozilla's `Blob`](https://developer.mozilla.org/en-US/docs/Web/API/Blob)

    ???+ example "Initialize a blob from in-memory data"

        ```python
        from langchain_core.documents import Blob

        blob = Blob.from_data("Hello, world!")

        # Read the blob as a string
        print(blob.as_string())

        # Read the blob as bytes
        print(blob.as_bytes())

        # Read the blob as a byte stream
        with blob.as_bytes_io() as f:
            print(f.read())
        ```

    ??? example "Load from memory and specify MIME type and metadata"

        ```python
        from langchain_core.documents import Blob

        blob = Blob.from_data(
            data="Hello, world!",
            mime_type="text/plain",
            metadata={"source": "https://example.com"},
        )
        ```

    ??? example "Load the blob from a file"

        ```python
        from langchain_core.documents import Blob

        blob = Blob.from_path("path/to/file.txt")

        # Read the blob as a string
        print(blob.as_string())

        # Read the blob as bytes
        print(blob.as_bytes())

        # Read the blob as a byte stream
        with blob.as_bytes_io() as f:
            print(f.read())
        ```
    """

    data: bytes | str | None = None
    """Raw data associated with the `Blob`."""

    mimetype: str | None = None
    """MIME type, not to be confused with a file extension."""

    encoding: str = "utf-8"
    """Encoding to use if decoding the bytes into a string.

    Uses `utf-8` as default encoding if decoding to string.
    """

    path: PathLike | None = None
    """Location where the original content was found."""

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        frozen=True,
    )

    @property
    def source(self) -> str | None:
        """The source location of the blob as string if known otherwise none.

        If a path is associated with the `Blob`, it will default to the path location.

        Unless explicitly set via a metadata field called `'source'`, in which
        case that value will be used instead.
        """
        if self.metadata and "source" in self.metadata:
            return cast("str | None", self.metadata["source"])
        return str(self.path) if self.path else None

    @model_validator(mode="before")
    @classmethod
    def check_blob_is_valid(cls, values: dict[str, Any]) -> Any:
        """Verify that either data or path is provided.

        Args:
            values: The raw field values passed to the constructor.

        Raises:
            ValueError: If neither a `data` nor a `path` key is present.

        Returns:
            The unmodified input values.
        """
        if "data" not in values and "path" not in values:
            msg = "Either data or path must be provided"
            raise ValueError(msg)
        return values

    def as_string(self) -> str:
        """Read data as a string.

        Raises:
            ValueError: If the blob cannot be represented as a string.

        Returns:
            The data as a string.
        """
        if self.data is None and self.path:
            return Path(self.path).read_text(encoding=self.encoding)
        if isinstance(self.data, bytes):
            return self.data.decode(self.encoding)
        if isinstance(self.data, str):
            return self.data
        msg = f"Unable to get string for blob {self}"
        raise ValueError(msg)

    def as_bytes(self) -> bytes:
        """Read data as bytes.

        Raises:
            ValueError: If the blob cannot be represented as bytes.

        Returns:
            The data as bytes.
        """
        if isinstance(self.data, bytes):
            return self.data
        if isinstance(self.data, str):
            return self.data.encode(self.encoding)
        if self.data is None and self.path:
            return Path(self.path).read_bytes()
        msg = f"Unable to get bytes for blob {self}"
        raise ValueError(msg)

    @contextlib.contextmanager
    def as_bytes_io(self) -> Generator[BytesIO | BufferedReader, None, None]:
        """Read data as a byte stream.

        In-memory `bytes` are wrapped in a `BytesIO`; in-memory `str` data is
        first encoded with the blob's `encoding` (mirroring `as_bytes`); a blob
        that only references a path yields the opened file, which is closed
        when the context exits.

        Raises:
            NotImplementedError: If the blob cannot be represented as a byte stream
                (it has neither in-memory data nor a path).

        Yields:
            The data as a byte stream.
        """
        if isinstance(self.data, bytes):
            yield BytesIO(self.data)
        elif isinstance(self.data, str):
            # Keep parity with `as_bytes`/`as_string`, which both accept `str`
            # data; without this branch `Blob.from_data("text").as_bytes_io()`
            # raised even though the class docstring advertises it.
            yield BytesIO(self.data.encode(self.encoding))
        elif self.data is None and self.path:
            with Path(self.path).open("rb") as f:
                yield f
        else:
            msg = f"Unable to convert blob {self}"
            raise NotImplementedError(msg)

    @classmethod
    def from_path(
        cls,
        path: PathLike,
        *,
        encoding: str = "utf-8",
        mime_type: str | None = None,
        guess_type: bool = True,
        metadata: dict[Any, Any] | None = None,
    ) -> Blob:
        """Load the blob from a path like object.

        Args:
            path: Path-like object to file to be read
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: If provided, will be set as the MIME type of the data
            guess_type: If `True`, the MIME type will be guessed from the file
                extension, if a MIME type was not provided
            metadata: Metadata to associate with the `Blob`

        Returns:
            `Blob` instance
        """
        if mime_type is None and guess_type:
            mimetype = mimetypes.guess_type(path)[0]
        else:
            mimetype = mime_type
        # We do not load the data immediately, instead we treat the blob as a
        # reference to the underlying data.
        return cls(
            data=None,
            mimetype=mimetype,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    @classmethod
    def from_data(
        cls,
        data: str | bytes,
        *,
        encoding: str = "utf-8",
        mime_type: str | None = None,
        path: PathLike | None = None,
        metadata: dict[Any, Any] | None = None,
    ) -> Blob:
        """Initialize the `Blob` from in-memory data.

        Args:
            data: The in-memory data associated with the `Blob`
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: If provided, will be set as the MIME type of the data
            path: If provided, will be set as the source from which the data came
            metadata: Metadata to associate with the `Blob`

        Returns:
            `Blob` instance
        """
        return cls(
            data=data,
            mimetype=mime_type,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    def __repr__(self) -> str:
        """Return the blob representation."""
        str_repr = f"Blob {id(self)}"
        if self.source:
            str_repr += f" {self.source}"
        return str_repr


class Document(BaseMedia):
    """Class for storing a piece of text and associated metadata.

    !!! note

        `Document` is for **retrieval workflows**, not chat I/O. For sending text
        to an LLM in a conversation, use message types from `langchain.messages`.

    Example:
        ```python
        from langchain_core.documents import Document

        document = Document(
            page_content="Hello, world!", metadata={"source": "https://example.com"}
        )
        ```
    """

    page_content: str
    """String text."""

    type: Literal["Document"] = "Document"

    def __init__(self, page_content: str, **kwargs: Any) -> None:
        """Pass page_content in as positional or named arg."""
        # mypy is complaining that page_content is not defined on the base class.
        # Here, we're relying on pydantic base class to handle the validation.
        super().__init__(page_content=page_content, **kwargs)  # type: ignore[call-arg,unused-ignore]

    @classmethod
    def is_lc_serializable(cls) -> bool:
        """Return `True` as this class is serializable."""
        return True

    @classmethod
    def get_lc_namespace(cls) -> list[str]:
        """Get the namespace of the LangChain object.

        Returns:
            `["langchain", "schema", "document"]`
        """
        return ["langchain", "schema", "document"]

    def __str__(self) -> str:
        """Override `__str__` to restrict it to page_content and metadata.

        Returns:
            A string representation of the `Document`.
        """
        # The format matches pydantic format for __str__.
        #
        # The purpose of this change is to make sure that user code that feeds
        # Document objects directly into prompts remains unchanged due to the addition
        # of the id field (or any other fields in the future).
        #
        # This override will likely be removed in the future in favor of a more general
        # solution of formatting content directly inside the prompts.
        if self.metadata:
            return f"page_content='{self.page_content}' metadata={self.metadata}"
        return f"page_content='{self.page_content}'"
Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.