Avoid unnecessary list conversions; use generators where possible
self.content_key = list(content_key.split(".")) if content_key else []
"""LangSmith document loader."""

import datetime
import json
import uuid
from collections.abc import Callable, Iterator, Sequence
from typing import Any

from langsmith import Client as LangSmithClient
from typing_extensions import override

from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from langchain_core.tracers._compat import pydantic_to_dict


class LangSmithLoader(BaseLoader):
    """Load LangSmith Dataset examples as `Document` objects.

    Loads the example inputs as the `Document` page content and places the entire
    example into the `Document` metadata. This allows you to easily create few-shot
    example retrievers from the loaded documents.

    ??? example "Lazy loading"

        ```python
        from langchain_core.document_loaders import LangSmithLoader

        loader = LangSmithLoader(dataset_id="...", limit=100)
        docs = []
        for doc in loader.lazy_load():
            docs.append(doc)
        ```

        ```python
        # -> [Document("...", metadata={"inputs": {...}, "outputs": {...}, ...}), ...]
        ```
    """

    def __init__(
        self,
        *,
        dataset_id: uuid.UUID | str | None = None,
        dataset_name: str | None = None,
        example_ids: Sequence[uuid.UUID | str] | None = None,
        as_of: datetime.datetime | str | None = None,
        splits: Sequence[str] | None = None,
        inline_s3_urls: bool = True,
        offset: int = 0,
        limit: int | None = None,
        metadata: dict[str, Any] | None = None,
        filter: str | None = None,  # noqa: A002
        content_key: str = "",
        format_content: Callable[..., str] | None = None,
        client: LangSmithClient | None = None,
        **client_kwargs: Any,
    ) -> None:
        """Create a LangSmith loader.

        Args:
            dataset_id: The ID of the dataset to filter by.
            dataset_name: The name of the dataset to filter by.
            content_key: The inputs key to set as `Document` page content.

                `'.'` characters are interpreted as nested keys, e.g.
                `content_key="first.second"` will result in
                `Document(page_content=format_content(example.inputs["first"]["second"]))`

                An empty string (the default) uses the whole `inputs` value.
            format_content: Function for converting the content extracted from the example
                inputs into a string.

                Defaults to JSON-encoding the contents.
            example_ids: The IDs of the examples to filter by.
            as_of: The dataset version tag or timestamp to retrieve the examples as of.

                Response examples will only be those that were present at the time of
                the tagged (or timestamped) version.
            splits: A list of dataset splits, which are divisions of your dataset such
                as `train`, `test`, or `validation`.

                Returns examples only from the specified splits.
            inline_s3_urls: Whether to inline S3 URLs.
            offset: The offset to start from.
            limit: The maximum number of examples to return.
            metadata: Metadata to filter by.
            filter: A structured filter string to apply to the examples.
            client: LangSmith Client.

                If not provided will be initialized from below args.
            client_kwargs: Keyword args to pass to LangSmith client init.

                Should only be specified if `client` isn't.

        Raises:
            ValueError: If both `client` and `client_kwargs` are provided.
        """  # noqa: E501
        if client and client_kwargs:
            msg = (
                "Received both `client` and `client_kwargs`; "
                "pass `client_kwargs` only when `client` is not provided."
            )
            raise ValueError(msg)
        self._client = client or LangSmithClient(**client_kwargs)
        # `str.split` already returns a list, so no extra `list()` copy is needed.
        self.content_key: list[str] = content_key.split(".") if content_key else []
        self.format_content = format_content or _stringify
        self.dataset_id = dataset_id
        self.dataset_name = dataset_name
        self.example_ids = example_ids
        self.as_of = as_of
        self.splits = splits
        self.inline_s3_urls = inline_s3_urls
        self.offset = offset
        self.limit = limit
        self.metadata = metadata
        self.filter = filter

    @override
    def lazy_load(self) -> Iterator[Document]:
        """Yield one `Document` per example returned by the LangSmith client.

        The page content is the example's `inputs`, narrowed by `content_key` and
        rendered with `format_content`; the metadata is the example converted to a
        dict.
        """
        for example in self._client.list_examples(
            dataset_id=self.dataset_id,
            dataset_name=self.dataset_name,
            example_ids=self.example_ids,
            as_of=self.as_of,
            splits=self.splits,
            inline_s3_urls=self.inline_s3_urls,
            offset=self.offset,
            limit=self.limit,
            metadata=self.metadata,
            filter=self.filter,
        ):
            # Walk the nested keys of `content_key` down into the example inputs.
            content: Any = example.inputs
            for key in self.content_key:
                content = content[key]
            content_str = self.format_content(content)
            metadata = pydantic_to_dict(example)
            # Stringify datetime and UUID types. Absent or falsy values (such as
            # `None`) are left untouched instead of being turned into strings.
            for k in ("dataset_id", "created_at", "modified_at", "source_run_id", "id"):
                if metadata.get(k):
                    metadata[k] = str(metadata[k])
            yield Document(content_str, metadata=metadata)


def _stringify(x: str | dict[str, Any]) -> str:
    """Return `x` as a string, JSON-encoding anything that is not already one.

    Strings are returned unchanged. Other values are dumped as indented JSON; if
    that fails for any reason the result falls back to `str(x)`.
    """
    if isinstance(x, str):
        return x
    try:
        return json.dumps(x, indent=2)
    except Exception:
        # Best-effort fallback: never fail a load because content isn't JSON-able.
        return str(x)
Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.