libs/core/langchain_core/document_loaders/langsmith.py · langchain-ai/langchain

"""LangSmith document loader."""

import datetime
import json
import uuid
from collections.abc import Callable, Iterator, Mapping, Sequence
from typing import Any

from langsmith import Client as LangSmithClient
from typing_extensions import override

from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from langchain_core.tracers._compat import pydantic_to_dict


class LangSmithLoader(BaseLoader):
    """Load LangSmith Dataset examples as `Document` objects.

    Loads the example inputs as the `Document` page content and places the entire
    example into the `Document` metadata. This allows you to easily create few-shot
    example retrievers from the loaded documents.

    ??? example "Lazy loading"

        ```python
        from langchain_core.document_loaders import LangSmithLoader

        loader = LangSmithLoader(dataset_id="...", limit=100)
        docs = []
        for doc in loader.lazy_load():
            docs.append(doc)
        ```

        ```python
        # -> [Document("...", metadata={"inputs": {...}, "outputs": {...}, ...}), ...]
        ```
    """

    def __init__(
        self,
        *,
        dataset_id: uuid.UUID | str | None = None,
        dataset_name: str | None = None,
        example_ids: Sequence[uuid.UUID | str] | None = None,
        as_of: datetime.datetime | str | None = None,
        splits: Sequence[str] | None = None,
        inline_s3_urls: bool = True,
        offset: int = 0,
        limit: int | None = None,
        metadata: dict[str, Any] | None = None,
        filter: str | None = None,  # noqa: A002
        content_key: str = "",
        format_content: Callable[..., str] | None = None,
        client: LangSmithClient | None = None,
        **client_kwargs: Any,
    ) -> None:
        """Create a LangSmith loader.

        Args:
            dataset_id: The ID of the dataset to filter by.
            dataset_name: The name of the dataset to filter by.
            content_key: The inputs key to set as `Document` page content.

                `'.'` characters are interpreted as nested keys, e.g.
                `content_key="first.second"` will result in
                `Document(page_content=format_content(example.inputs["first"]["second"]))`

                Defaults to `""`, in which case the entire inputs payload is passed
                to `format_content`.
            format_content: Function for converting the content extracted from the example
                inputs into a string.

                Defaults to JSON-encoding the contents.
            example_ids: The IDs of the examples to filter by.
            as_of: The dataset version tag or timestamp to retrieve the examples as of.

                Response examples will only be those that were present at the time of
                the tagged (or timestamped) version.
            splits: A list of dataset splits, which are divisions of your dataset such
                as `train`, `test`, or `validation`.

                Returns examples only from the specified splits.
            inline_s3_urls: Whether to inline S3 URLs.
            offset: The offset to start from.
            limit: The maximum number of examples to return.
            metadata: Metadata to filter by.
            filter: A structured filter string to apply to the examples.
            client: LangSmith Client.

                If not provided will be initialized from below args.
            client_kwargs: Keyword args to pass to LangSmith client init.

                Should only be specified if `client` isn't.

        Raises:
            ValueError: If both `client` and `client_kwargs` are provided.
        """  # noqa: E501
        # Compare against `None` explicitly so that a supplied client is always
        # honoured, regardless of how the object evaluates in a boolean context.
        if client is not None and client_kwargs:
            msg = (
                "Received both `client` and `client_kwargs`. "
                "Pass `client_kwargs` only when `client` is not provided."
            )
            raise ValueError(msg)
        self._client = (
            client if client is not None else LangSmithClient(**client_kwargs)
        )
        # Pre-split the dotted path once; an empty key means "use the whole inputs".
        self.content_key = content_key.split(".") if content_key else []
        self.format_content = format_content or _stringify
        self.dataset_id = dataset_id
        self.dataset_name = dataset_name
        self.example_ids = example_ids
        self.as_of = as_of
        self.splits = splits
        self.inline_s3_urls = inline_s3_urls
        self.offset = offset
        self.limit = limit
        self.metadata = metadata
        self.filter = filter

    @override
    def lazy_load(self) -> Iterator[Document]:
        """Lazily load dataset examples as `Document` objects.

        Calls the client's `list_examples` with the filters given at construction
        and yields one `Document` per returned example.

        Yields:
            A `Document` whose page content is the formatted value found at
            `content_key` in the example inputs, and whose metadata is the example
            as returned by `pydantic_to_dict`.

        Raises:
            ValueError: If `content_key` cannot be resolved against an example's
                inputs.
        """
        for example in self._client.list_examples(
            dataset_id=self.dataset_id,
            dataset_name=self.dataset_name,
            example_ids=self.example_ids,
            as_of=self.as_of,
            splits=self.splits,
            inline_s3_urls=self.inline_s3_urls,
            offset=self.offset,
            limit=self.limit,
            metadata=self.metadata,
            filter=self.filter,
        ):
            content = _get_content_from_inputs(example.inputs, self.content_key)
            content_str = self.format_content(content)
            metadata = pydantic_to_dict(example)
            # Stringify datetime and UUID types. Absent or empty (falsy) values are
            # left untouched so that a missing field does not abort the whole load.
            # NOTE(review): assumes `pydantic_to_dict` returns a plain `dict`.
            for k in ("dataset_id", "created_at", "modified_at", "source_run_id", "id"):
                if metadata.get(k):
                    metadata[k] = str(metadata[k])
            yield Document(content_str, metadata=metadata)


def _get_content_from_inputs(inputs: Any, content_key: Sequence[str]) -> Any:
    """Resolve nested example input content for `LangSmithLoader`.

    Args:
        inputs: Example input payload returned by LangSmith.
        content_key: Ordered key path used to extract the document content.

            An empty path returns `inputs` unchanged.

    Returns:
        The extracted content value.

    Raises:
        ValueError: If a key in `content_key` is missing, or a value along the path
            (including `inputs` itself) is not a mapping.
    """
    content = inputs

    for i, key in enumerate(content_key):
        if isinstance(content, Mapping) and key in content:
            content = content[key]
            continue

        # Failure path only: the human-readable paths are needed solely for the
        # error message, so they are not built while the lookup is succeeding.
        full_path = ".".join(content_key)
        current_path = ".".join(content_key[:i]) or "<root>"
        if not isinstance(content, Mapping):
            msg = (
                f"Could not resolve content_key {full_path!r}: expected a mapping at "
                f"{current_path!r}, but found {type(content).__name__}."
            )
            # A too-deep `content_key` is an invalid-argument error, not a runtime
            # type bug, so it is unified with the missing-key case as `ValueError`.
            raise ValueError(msg)  # noqa: TRY004
        msg = (
            f"Could not resolve content_key {full_path!r}: missing key {key!r} "
            f"under {current_path!r}."
        )
        raise ValueError(msg)

    return content


def _stringify(x: Any) -> str:
    """Convert extracted example content into `Document` page content.

    Args:
        x: The content to convert. This is whatever value the content key path
            resolved to, so it is not necessarily a `str` or `dict`.

    Returns:
        `x` itself if it is already a string, otherwise its JSON encoding with a
        two-space indent, otherwise `str(x)` if JSON encoding raises.
    """
    if isinstance(x, str):
        return x
    try:
        return json.dumps(x, indent=2)
    except Exception:
        # Deliberate best-effort fallback: content that cannot be JSON-encoded
        # still produces page content instead of failing the load.
        return str(x)
Findings

✓ No findings reported for this file.
Findings

Get this view in your editor