libs/core/langchain_core/document_loaders/langsmith.py PYTHON 144 lines View on github.com → Search inside
"""LangSmith document loader."""

import datetime
import json
import uuid
from collections.abc import Callable, Iterator, Sequence
from typing import Any

from langsmith import Client as LangSmithClient
from typing_extensions import override

from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from langchain_core.tracers._compat import pydantic_to_dict


class LangSmithLoader(BaseLoader):
    """Load LangSmith Dataset examples as `Document` objects.

    Loads the example inputs as the `Document` page content and places the entire
    example into the `Document` metadata. This allows you to easily create few-shot
    example retrievers from the loaded documents.

    ??? example "Lazy loading"

        ```python
        from langchain_core.document_loaders import LangSmithLoader

        loader = LangSmithLoader(dataset_id="...", limit=100)
        docs = []
        for doc in loader.lazy_load():
            docs.append(doc)
        ```

        ```python
        # -> [Document("...", metadata={"inputs": {...}, "outputs": {...}, ...}), ...]
        ```
    """

    def __init__(
        self,
        *,
        dataset_id: uuid.UUID | str | None = None,
        dataset_name: str | None = None,
        example_ids: Sequence[uuid.UUID | str] | None = None,
        as_of: datetime.datetime | str | None = None,
        splits: Sequence[str] | None = None,
        inline_s3_urls: bool = True,
        offset: int = 0,
        limit: int | None = None,
        metadata: dict[str, Any] | None = None,
        filter: str | None = None,  # noqa: A002
        content_key: str = "",
        format_content: Callable[..., str] | None = None,
        client: LangSmithClient | None = None,
        **client_kwargs: Any,
    ) -> None:
        """Create a LangSmith loader.

        Args:
            dataset_id: The ID of the dataset to filter by.
            dataset_name: The name of the dataset to filter by.
            example_ids: The IDs of the examples to filter by.
            as_of: The dataset version tag or timestamp to retrieve the examples as of.

                Response examples will only be those that were present at the time of
                the tagged (or timestamped) version.
            splits: A list of dataset splits, which are divisions of your dataset such
                as `train`, `test`, or `validation`.

                Returns examples only from the specified splits.
            inline_s3_urls: Whether to inline S3 URLs.
            offset: The offset to start from.
            limit: The maximum number of examples to return.
            metadata: Metadata to filter by.
            filter: A structured filter string to apply to the examples.
            content_key: The inputs key to set as `Document` page content.

                `'.'` characters are interpreted as nested keys, e.g.
                `content_key="first.second"` will result in
                `Document(page_content=format_content(example.inputs["first"]["second"]))`
            format_content: Function for converting the content extracted from the example
                inputs into a string.

                Defaults to JSON-encoding the contents.
            client: LangSmith Client.

                If not provided will be initialized from below args.
            client_kwargs: Keyword args to pass to LangSmith client init.

                Should only be specified if `client` isn't.

        Raises:
            ValueError: If both `client` and `client_kwargs` are provided.
        """  # noqa: E501
        if client is not None and client_kwargs:
            # The kwargs would be silently ignored if an existing client were
            # used, so reject the ambiguous call with an explanatory message.
            msg = (
                "Received both `client` and `client_kwargs`; pass either an "
                "existing client or keyword args to build one, not both."
            )
            raise ValueError(msg)
        self._client = (
            client if client is not None else LangSmithClient(**client_kwargs)
        )
        # An empty `content_key` means "use the whole inputs mapping", which is
        # represented as an empty path. `str.split` already returns a list.
        self.content_key = content_key.split(".") if content_key else []
        self.format_content = format_content or _stringify
        self.dataset_id = dataset_id
        self.dataset_name = dataset_name
        self.example_ids = example_ids
        self.as_of = as_of
        self.splits = splits
        self.inline_s3_urls = inline_s3_urls
        self.offset = offset
        self.limit = limit
        self.metadata = metadata
        self.filter = filter

    @override
    def lazy_load(self) -> Iterator[Document]:
        """Lazily load the matching LangSmith examples as `Document` objects.

        Passes the filters given at construction time to the client's
        `list_examples` call. For each example, walks `content_key` into the
        example inputs, converts the result to a string with `format_content`,
        and uses the example (converted to a dict) as the metadata.

        Yields:
            One `Document` per example returned by the client.
        """
        for example in self._client.list_examples(
            dataset_id=self.dataset_id,
            dataset_name=self.dataset_name,
            example_ids=self.example_ids,
            as_of=self.as_of,
            splits=self.splits,
            inline_s3_urls=self.inline_s3_urls,
            offset=self.offset,
            limit=self.limit,
            metadata=self.metadata,
            filter=self.filter,
        ):
            # Follow the (possibly empty) key path into the example inputs.
            content: Any = example.inputs
            for key in self.content_key:
                content = content[key]
            content_str = self.format_content(content)
            metadata = pydantic_to_dict(example)
            # Stringify datetime and UUID types. Empty values are left as-is,
            # and a key that is absent from the dict is skipped instead of
            # raising `KeyError`.
            for k in ("dataset_id", "created_at", "modified_at", "source_run_id", "id"):
                if metadata.get(k):
                    metadata[k] = str(metadata[k])
            yield Document(content_str, metadata=metadata)


def _stringify(x: str | dict[str, Any]) -> str:
    """Convert example content to a string for use as page content.

    Args:
        x: The content extracted from the example inputs.

    Returns:
        `x` unchanged if it is already a string, otherwise its JSON encoding
        indented by two spaces, or `str(x)` if JSON encoding fails.
    """
    if isinstance(x, str):
        return x
    try:
        return json.dumps(x, indent=2)
    except Exception:
        # Deliberate best-effort fallback: content that cannot be JSON-encoded
        # must still produce page content rather than abort the whole load.
        return str(x)

Code quality findings 4

Avoid unnecessary list conversions; use generators where possible
unnecessary-list
self.content_key = list(content_key.split(".")) if content_key else []
Ensure functions have docstrings for documentation
missing-docstring
def lazy_load(self) -> Iterator[Document]:
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(x, str):
Catch specific exceptions instead of Exception to avoid masking bugs
broad-except
except Exception:

Get this view in your editor

Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.