Ensure functions have docstrings for documentation
def get_aggregate_feedback(
1"""Utilities for running language models or Chains over datasets."""23from __future__ import annotations45import concurrent.futures6import dataclasses7import functools8import inspect9import logging10import uuid11from collections.abc import Callable12from datetime import datetime, timezone13from typing import (14 TYPE_CHECKING,15 Any,16 cast,17)1819from langchain_core._api import warn_deprecated20from langchain_core.callbacks import Callbacks21from langchain_core.language_models import BaseLanguageModel22from langchain_core.messages import BaseMessage, messages_from_dict23from langchain_core.outputs import ChatResult, LLMResult24from langchain_core.runnables import Runnable, RunnableConfig, RunnableLambda25from langchain_core.runnables import config as runnable_config26from langchain_core.runnables import utils as runnable_utils27from langchain_core.tracers.evaluation import (28 EvaluatorCallbackHandler,29 wait_for_all_evaluators,30)31from langchain_core.tracers.langchain import LangChainTracer32from langsmith.client import Client33from langsmith.env import get_git_info, get_langchain_env_var_metadata34from langsmith.evaluation import (35 EvaluationResult,36 RunEvaluator,37)38from langsmith.evaluation import (39 run_evaluator as run_evaluator_dec,40)41from langsmith.run_helpers import as_runnable, is_traceable_function42from langsmith.schemas import Dataset, DataType, Example, Run, TracerSession43from langsmith.utils import LangSmithError44from requests import HTTPError45from typing_extensions import TypedDict4647from langchain_classic.chains.base import Chain48from langchain_classic.evaluation.loading import load_evaluator49from langchain_classic.evaluation.schema import (50 EvaluatorType,51 PairwiseStringEvaluator,52 StringEvaluator,53)54from langchain_classic.smith import evaluation as smith_eval55from langchain_classic.smith.evaluation import config as smith_eval_config56from langchain_classic.smith.evaluation import name_generation, progress5758if TYPE_CHECKING:59 import pandas as pd6061logger = logging.getLogger(__name__)6263MODEL_OR_CHAIN_FACTORY = (64 Callable[[], Chain | Runnable]65 | BaseLanguageModel66 | Callable[[dict], Any]67 | Runnable68 | Chain69)70MCF = Callable[[], Chain | Runnable] | BaseLanguageModel717273class InputFormatError(Exception):74 """Raised when the input format is invalid."""757677## Shared Utilities787980class TestResult(dict):81 """A dictionary of the results of a single test run."""8283 def get_aggregate_feedback(84 self,85 ) -> pd.DataFrame:86 """Return quantiles for the feedback scores.8788 This method calculates and prints the quantiles for the feedback scores89 across all feedback keys.9091 Returns:92 A DataFrame containing the quantiles for each feedback key.93 """94 df = self.to_dataframe()95 # Drop all things starting with inputs., outputs., and reference96 to_drop = [97 col98 for col in df.columns99 if col.startswith(("inputs.", "outputs.", "reference"))100 or col in {"input", "output"}101 ]102 return df.describe(include="all").drop(to_drop, axis=1)103104 def to_dataframe(self) -> pd.DataFrame:105 """Convert the results to a dataframe."""106 try:107 import pandas as pd108 except ImportError as e:109 msg = (110 "Pandas is required to convert the results to a dataframe."111 " to install pandas, run `pip install pandas`."112 )113 raise ImportError(msg) from e114115 indices = []116 records = []117 for example_id, result in self["results"].items():118 feedback = result["feedback"]119 output_ = result.get("output")120 if isinstance(output_, dict):121 output = {f"outputs.{k}": v for k, v in output_.items()}122 elif output_ is None:123 output = {}124 else:125 output = {"output": output_}126127 r = {128 **{f"inputs.{k}": v for k, v in result["input"].items()},129 **output,130 }131 if "reference" in result:132 if isinstance(result["reference"], dict):133 r.update(134 {f"reference.{k}": v for k, v in result["reference"].items()},135 )136 else:137 r["reference"] = result["reference"]138 r.update(139 {140 **{f"feedback.{f.key}": f.score for f in feedback},141 "error": result.get("Error"),142 "execution_time": result["execution_time"],143 "run_id": result.get("run_id"),144 },145 )146 records.append(r)147 indices.append(example_id)148149 return pd.DataFrame(records, index=indices)150151152class EvalError(dict):153 """Your architecture raised an error."""154155 def __init__(self, Error: BaseException, **kwargs: Any) -> None: # noqa: N803156 """Initialize the `EvalError` with an error and additional attributes.157158 Args:159 Error: The error that occurred.160 **kwargs: Additional attributes to include in the error.161 """162 super().__init__(Error=Error, **kwargs)163164 def __getattr__(self, name: str) -> Any:165 """Get an attribute from the `EvalError`.166167 Args:168 name: The name of the attribute to get.169170 Returns:171 The value of the attribute.172173 Raises:174 AttributeError: If the attribute does not exist.175 """176 try:177 return self[name]178 except KeyError as e:179 msg = f"'EvalError' object has no attribute '{name}'"180 raise AttributeError(msg) from e181182183def _wrap_in_chain_factory(184 llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,185 dataset_name: str = "<my_dataset>",186) -> MCF:187 """Wrap in a chain factory.188189 Forgive the user if they pass in a chain without memory instead of a chain190 factory. It's a common mistake. Raise a more helpful error message as well.191 """192 if isinstance(llm_or_chain_factory, Chain):193 chain = llm_or_chain_factory194 chain_class = chain.__class__.__name__195 if llm_or_chain_factory.memory is not None:196 memory_class = chain.memory.__class__.__name__197 msg = (198 "Cannot directly evaluate a chain with stateful memory."199 " To evaluate this chain, pass in a chain constructor"200 " that initializes fresh memory each time it is called."201 " This will safeguard against information"202 " leakage between dataset examples."203 "\nFor example:\n\n"204 "def chain_constructor():\n"205 f" new_memory = {memory_class}(...)\n"206 f" return {chain_class}"207 "(memory=new_memory, ...)\n\n"208 f'run_on_dataset("{dataset_name}", chain_constructor, ...)'209 )210 raise ValueError(msg)211 return lambda: chain212 if isinstance(llm_or_chain_factory, BaseLanguageModel):213 return llm_or_chain_factory214 if isinstance(llm_or_chain_factory, Runnable):215 # Memory may exist here, but it's not elegant to check all those cases.216 lcf = llm_or_chain_factory217 return lambda: lcf218 if callable(llm_or_chain_factory):219 if is_traceable_function(llm_or_chain_factory):220 runnable_ = as_runnable(cast("Callable", llm_or_chain_factory))221 return lambda: runnable_222 try:223 _model = llm_or_chain_factory() # type: ignore[call-arg]224 except TypeError:225 # It's an arbitrary function, wrap it in a RunnableLambda226 user_func = cast("Callable", llm_or_chain_factory)227 sig = inspect.signature(user_func)228 logger.info("Wrapping function %s as RunnableLambda.", sig)229 wrapped = RunnableLambda(user_func)230 return lambda: wrapped231 constructor = cast("Callable", llm_or_chain_factory)232 if isinstance(_model, BaseLanguageModel):233 # It's not uncommon to do an LLM constructor instead of raw LLM,234 # so we'll unpack it for the user.235 return _model236 if is_traceable_function(cast("Callable", _model)):237 runnable_ = as_runnable(cast("Callable", _model))238 return lambda: runnable_239 if not isinstance(_model, Runnable):240 # This is unlikely to happen - a constructor for a model function241 return lambda: RunnableLambda(constructor)242 # Typical correct case243 return constructor244 return llm_or_chain_factory # type: ignore[unreachable]245246247def _get_prompt(inputs: dict[str, Any]) -> str:248 """Get prompt from inputs.249250 Args:251 inputs: The input dictionary.252253 Returns:254 A string prompt.255256 Raises:257 InputFormatError: If the input format is invalid.258 """259 if not inputs:260 msg = "Inputs should not be empty."261 raise InputFormatError(msg)262263 prompts = []264 if "prompt" in inputs:265 if not isinstance(inputs["prompt"], str):266 msg = f"Expected string for 'prompt', got {type(inputs['prompt']).__name__}"267 raise InputFormatError(msg)268 prompts = [inputs["prompt"]]269 elif "prompts" in inputs:270 if not isinstance(inputs["prompts"], list) or not all(271 isinstance(i, str) for i in inputs["prompts"]272 ):273 msg = (274 "Expected list of strings for 'prompts',"275 f" got {type(inputs['prompts']).__name__}"276 )277 raise InputFormatError(msg)278 prompts = inputs["prompts"]279 elif len(inputs) == 1:280 prompt_ = next(iter(inputs.values()))281 if isinstance(prompt_, str):282 prompts = [prompt_]283 elif isinstance(prompt_, list) and all(isinstance(i, str) for i in prompt_):284 prompts = prompt_285 else:286 msg = f"LLM Run expects string prompt input. Got {inputs}"287 raise InputFormatError(msg)288 else:289 msg = f"LLM Run expects 'prompt' or 'prompts' in inputs. Got {inputs}"290 raise InputFormatError(msg)291 if len(prompts) == 1:292 return prompts[0]293 msg = f"LLM Run expects single prompt input. Got {len(prompts)} prompts."294 raise InputFormatError(msg)295296297class ChatModelInput(TypedDict):298 """Input for a chat model."""299300 messages: list[BaseMessage]301302303def _get_messages(inputs: dict[str, Any]) -> dict:304 """Get Chat Messages from inputs.305306 Args:307 inputs: The input dictionary.308309 Returns:310 A list of chat messages.311312 Raises:313 InputFormatError: If the input format is invalid.314 """315 if not inputs:316 msg = "Inputs should not be empty."317 raise InputFormatError(msg)318 input_copy = inputs.copy()319 if "messages" in inputs:320 input_copy["input"] = input_copy.pop("messages")321 elif len(inputs) == 1:322 input_copy["input"] = next(iter(inputs.values()))323 if "input" in input_copy:324 raw_messages = input_copy["input"]325 if isinstance(raw_messages, list) and all(326 isinstance(i, dict) for i in raw_messages327 ):328 raw_messages = [raw_messages]329 if len(raw_messages) == 1:330 input_copy["input"] = messages_from_dict(raw_messages[0])331 else:332 msg = (333 "Batch messages not supported. Please provide a"334 " single list of messages."335 )336 raise InputFormatError(msg)337 return input_copy338 msg = (339 f"Chat Run expects single List[dict] or List[List[dict]] 'messages'"340 f" input. Got {inputs}"341 )342 raise InputFormatError(msg)343344345## Shared data validation utilities346def _validate_example_inputs_for_language_model(347 first_example: Example,348 input_mapper: Callable[[dict], Any] | None,349) -> None:350 if input_mapper:351 prompt_input = input_mapper(first_example.inputs or {})352 if not isinstance(prompt_input, str) and not (353 isinstance(prompt_input, list)354 and all(isinstance(msg, BaseMessage) for msg in prompt_input)355 ):356 msg = (357 "When using an input_mapper to prepare dataset example inputs"358 " for an LLM or chat model, the output must a single string or"359 " a list of chat messages."360 f"\nGot: {prompt_input} of type {type(prompt_input)}."361 )362 raise InputFormatError(msg)363 else:364 try:365 _get_prompt(first_example.inputs or {})366 except InputFormatError:367 try:368 _get_messages(first_example.inputs or {})369 except InputFormatError as err2:370 msg = (371 "Example inputs do not match language model input format. "372 "Expected a dictionary with messages or a single prompt."373 f" Got: {first_example.inputs}"374 " Please update your dataset OR provide an input_mapper"375 " to convert the example.inputs to a compatible format"376 " for the llm or chat model you wish to evaluate."377 )378 raise InputFormatError(msg) from err2379380381def _validate_example_inputs_for_chain(382 first_example: Example,383 chain: Chain,384 input_mapper: Callable[[dict], Any] | None,385) -> None:386 """Validate that the example inputs match the chain input keys."""387 if input_mapper:388 first_inputs = input_mapper(first_example.inputs or {})389 missing_keys = set(chain.input_keys).difference(first_inputs)390 if not isinstance(first_inputs, dict):391 msg = (392 "When using an input_mapper to prepare dataset example"393 " inputs for a chain, the mapped value must be a dictionary."394 f"\nGot: {first_inputs} of type {type(first_inputs)}."395 )396 raise InputFormatError(msg)397 if missing_keys:398 msg = (399 "Missing keys after loading example using input_mapper."400 f"\nExpected: {chain.input_keys}. Got: {first_inputs.keys()}"401 )402 raise InputFormatError(msg)403 else:404 first_inputs = first_example.inputs or {}405 missing_keys = set(chain.input_keys).difference(first_inputs)406 if len(first_inputs) == 1 and len(chain.input_keys) == 1:407 # We can pass this through the run method.408 # Refrain from calling to validate.409 pass410 elif missing_keys:411 msg = (412 "Example inputs missing expected chain input keys."413 " Please provide an input_mapper to convert the example.inputs"414 " to a compatible format for the chain you wish to evaluate."415 f"Expected: {chain.input_keys}. "416 f"Got: {first_inputs.keys()}"417 )418 raise InputFormatError(msg)419420421def _validate_example_inputs(422 example: Example,423 llm_or_chain_factory: MCF,424 input_mapper: Callable[[dict], Any] | None,425) -> None:426 """Validate that the example inputs are valid for the model."""427 if isinstance(llm_or_chain_factory, BaseLanguageModel):428 _validate_example_inputs_for_language_model(example, input_mapper)429 else:430 chain = llm_or_chain_factory()431 if isinstance(chain, Chain):432 # Otherwise it's a runnable433 _validate_example_inputs_for_chain(example, chain, input_mapper)434 elif isinstance(chain, Runnable):435 logger.debug("Skipping input validation for %s", chain)436437438## Shared Evaluator Setup Utilities439440441def _setup_evaluation(442 llm_or_chain_factory: MCF,443 examples: list[Example],444 evaluation: smith_eval.RunEvalConfig | None,445 data_type: DataType,446) -> list[RunEvaluator] | None:447 """Configure the evaluators to run on the results of the chain."""448 if evaluation:449 if isinstance(llm_or_chain_factory, BaseLanguageModel):450 run_inputs, run_outputs = None, None451 run_type = "llm"452 else:453 run_type = "chain"454 chain = llm_or_chain_factory()455 run_inputs = chain.input_keys if isinstance(chain, Chain) else None456 run_outputs = chain.output_keys if isinstance(chain, Chain) else None457 run_evaluators = _load_run_evaluators(458 evaluation,459 run_type,460 data_type,461 list(examples[0].outputs) if examples[0].outputs else None,462 run_inputs,463 run_outputs,464 )465 else:466 # TODO: Create a default helpfulness evaluator467 run_evaluators = None468 return run_evaluators469470471def _determine_input_key(472 config: smith_eval.RunEvalConfig,473 run_inputs: list[str] | None,474) -> str | None:475 input_key = None476 if config.input_key:477 input_key = config.input_key478 if run_inputs and input_key not in run_inputs:479 logger.warning(480 "Input key %s not in chain's specified input keys %s. "481 "Evaluation behavior may be undefined.",482 input_key,483 run_inputs,484 )485 elif run_inputs and len(run_inputs) == 1:486 input_key = run_inputs[0]487 elif run_inputs is not None and len(run_inputs) > 1:488 logger.warning(489 "Chain expects multiple input keys: %s,"490 " Evaluator is likely to fail. Evaluation behavior may be undefined."491 " Specify an input_key in the RunEvalConfig to avoid this warning.",492 run_inputs,493 )494495 return input_key496497498def _determine_prediction_key(499 config: smith_eval.RunEvalConfig,500 run_outputs: list[str] | None,501) -> str | None:502 prediction_key = None503 if config.prediction_key:504 prediction_key = config.prediction_key505 if run_outputs and prediction_key not in run_outputs:506 logger.warning(507 "Prediction key %s not in chain's specified output keys %s. "508 "Evaluation behavior may be undefined.",509 prediction_key,510 run_outputs,511 )512 elif run_outputs and len(run_outputs) == 1:513 prediction_key = run_outputs[0]514 elif run_outputs is not None and len(run_outputs) > 1:515 logger.warning(516 "Chain expects multiple output keys: %s,"517 " Evaluation behavior may be undefined. Specify a prediction_key"518 " in the RunEvalConfig to avoid this warning.",519 run_outputs,520 )521 return prediction_key522523524def _determine_reference_key(525 config: smith_eval.RunEvalConfig,526 example_outputs: list[str] | None,527) -> str | None:528 if config.reference_key:529 reference_key = config.reference_key530 if example_outputs and reference_key not in example_outputs:531 msg = (532 f"Reference key {reference_key} not in Dataset"533 f" example outputs: {example_outputs}"534 )535 raise ValueError(msg)536 elif example_outputs and len(example_outputs) == 1:537 reference_key = next(iter(example_outputs))538 else:539 reference_key = None540 return reference_key541542543def _construct_run_evaluator(544 eval_config: smith_eval_config.SINGLE_EVAL_CONFIG_TYPE545 | smith_eval_config.CUSTOM_EVALUATOR_TYPE,546 eval_llm: BaseLanguageModel | None,547 run_type: str,548 data_type: DataType,549 example_outputs: list[str] | None,550 reference_key: str | None,551 input_key: str | None,552 prediction_key: str | None,553) -> RunEvaluator:554 if isinstance(eval_config, RunEvaluator):555 return eval_config556 if isinstance(eval_config, (EvaluatorType, str)):557 if not isinstance(eval_config, EvaluatorType):558 eval_config = EvaluatorType(eval_config)559 evaluator_ = load_evaluator(eval_config, llm=eval_llm)560 eval_type_tag = eval_config.value561 elif isinstance(eval_config, smith_eval_config.EvalConfig):562 kwargs = {"llm": eval_llm, **eval_config.get_kwargs()}563 evaluator_ = load_evaluator(eval_config.evaluator_type, **kwargs)564 eval_type_tag = eval_config.evaluator_type.value565 # Override keys if specified in the config566 if isinstance(eval_config, smith_eval_config.SingleKeyEvalConfig):567 input_key = eval_config.input_key or input_key568 prediction_key = eval_config.prediction_key or prediction_key569 reference_key = eval_config.reference_key or reference_key570 elif callable(eval_config):571 # Assume we can decorate572 return run_evaluator_dec(eval_config)573 else:574 msg = f"Unknown evaluator type: {type(eval_config)}"575 raise ValueError(msg) # noqa: TRY004576577 if isinstance(evaluator_, StringEvaluator):578 if evaluator_.requires_reference and reference_key is None:579 msg = (580 f"Must specify reference_key in smith_eval.RunEvalConfig to use"581 f" evaluator of type {eval_type_tag} with"582 f" dataset with multiple output keys: {example_outputs}."583 )584 raise ValueError(msg)585 run_evaluator = smith_eval.StringRunEvaluatorChain.from_run_and_data_type(586 evaluator_,587 run_type,588 data_type,589 input_key=input_key,590 prediction_key=prediction_key,591 reference_key=reference_key,592 tags=[eval_type_tag],593 )594 elif isinstance(evaluator_, PairwiseStringEvaluator):595 msg = (596 f"Run evaluator for {eval_type_tag} is not implemented."597 " PairwiseStringEvaluators compare the outputs of two different models"598 " rather than the output of a single model."599 " Did you mean to use a StringEvaluator instead?"600 "\nSee: https://python.langchain.com/docs/guides/evaluation/string/"601 )602 raise NotImplementedError(msg)603604 else:605 msg = f"Run evaluator for {eval_type_tag} is not implemented"606 raise NotImplementedError(msg)607 return run_evaluator608609610def _get_keys(611 config: smith_eval.RunEvalConfig,612 run_inputs: list[str] | None,613 run_outputs: list[str] | None,614 example_outputs: list[str] | None,615) -> tuple[str | None, str | None, str | None]:616 input_key = _determine_input_key(config, run_inputs)617 prediction_key = _determine_prediction_key(config, run_outputs)618 reference_key = _determine_reference_key(config, example_outputs)619 return input_key, prediction_key, reference_key620621622def _load_run_evaluators(623 config: smith_eval.RunEvalConfig,624 run_type: str,625 data_type: DataType,626 example_outputs: list[str] | None,627 run_inputs: list[str] | None,628 run_outputs: list[str] | None,629) -> list[RunEvaluator]:630 """Load run evaluators from a configuration.631632 Args:633 config: Configuration for the run evaluators.634 run_type: The type of run.635 data_type: The type of dataset used in the run.636 example_outputs: The example outputs.637 run_inputs: The input keys for the run.638 run_outputs: The output keys for the run.639640 Returns:641 A list of run evaluators.642 """643 run_evaluators = []644 input_key, prediction_key, reference_key = None, None, None645 if config.evaluators or (646 config.custom_evaluators647 and any(isinstance(e, StringEvaluator) for e in config.custom_evaluators)648 ):649 input_key, prediction_key, reference_key = _get_keys(650 config,651 run_inputs,652 run_outputs,653 example_outputs,654 )655 for eval_config in config.evaluators:656 run_evaluator = _construct_run_evaluator(657 eval_config,658 config.eval_llm,659 run_type,660 data_type,661 example_outputs,662 reference_key,663 input_key,664 prediction_key,665 )666 run_evaluators.append(run_evaluator)667 custom_evaluators = config.custom_evaluators or []668 for custom_evaluator in custom_evaluators:669 if isinstance(custom_evaluator, RunEvaluator):670 run_evaluators.append(custom_evaluator)671 elif isinstance(custom_evaluator, StringEvaluator):672 run_evaluators.append(673 smith_eval.StringRunEvaluatorChain.from_run_and_data_type(674 custom_evaluator,675 run_type,676 data_type,677 input_key=input_key,678 prediction_key=prediction_key,679 reference_key=reference_key,680 ),681 )682 elif callable(custom_evaluator):683 run_evaluators.append(run_evaluator_dec(custom_evaluator))684 else:685 msg = ( # type: ignore[unreachable]686 f"Unsupported custom evaluator: {custom_evaluator}."687 f" Expected RunEvaluator or StringEvaluator."688 )689 raise ValueError(msg) # noqa: TRY004690691 return run_evaluators692693694### Async Helpers695696697async def _arun_llm(698 llm: BaseLanguageModel,699 inputs: dict[str, Any],700 *,701 tags: list[str] | None = None,702 callbacks: Callbacks = None,703 input_mapper: Callable[[dict], Any] | None = None,704 metadata: dict[str, Any] | None = None,705) -> str | BaseMessage:706 """Asynchronously run the language model.707708 Args:709 llm: The language model to run.710 inputs: The input dictionary.711 tags: Optional tags to add to the run.712 callbacks: Optional callbacks to use during the run.713 input_mapper: Optional function to map inputs to the expected format.714 metadata: Optional metadata to add to the run.715716 Returns:717 The LLMResult or ChatResult.718719 Raises:720 ValueError: If the LLM type is unsupported.721 InputFormatError: If the input format is invalid.722 """723 if input_mapper is not None:724 prompt_or_messages = input_mapper(inputs)725 if isinstance(prompt_or_messages, str) or (726 isinstance(prompt_or_messages, list)727 and all(isinstance(msg, BaseMessage) for msg in prompt_or_messages)728 ):729 return await llm.ainvoke(730 prompt_or_messages,731 config=RunnableConfig(732 callbacks=callbacks,733 tags=tags or [],734 metadata=metadata or {},735 ),736 )737 msg = (738 "Input mapper returned invalid format"739 f" {prompt_or_messages}"740 "\nExpected a single string or list of chat messages."741 )742 raise InputFormatError(msg)743744 try:745 prompt = _get_prompt(inputs)746 llm_output: str | BaseMessage = await llm.ainvoke(747 prompt,748 config=RunnableConfig(749 callbacks=callbacks,750 tags=tags or [],751 metadata=metadata or {},752 ),753 )754 except InputFormatError:755 llm_inputs = _get_messages(inputs)756 llm_output = await llm.ainvoke(757 **llm_inputs,758 config=RunnableConfig(759 callbacks=callbacks,760 tags=tags or [],761 metadata=metadata or {},762 ),763 )764 return llm_output765766767async def _arun_chain(768 chain: Chain | Runnable,769 inputs: dict[str, Any],770 callbacks: Callbacks,771 *,772 tags: list[str] | None = None,773 input_mapper: Callable[[dict], Any] | None = None,774 metadata: dict[str, Any] | None = None,775) -> dict | str:776 """Run a chain asynchronously on inputs."""777 inputs_ = inputs if input_mapper is None else input_mapper(inputs)778 if (779 isinstance(chain, Chain)780 and isinstance(inputs_, dict)781 and len(inputs_) == 1782 and chain.input_keys783 ):784 val = next(iter(inputs_.values()))785 output = await chain.ainvoke(786 val,787 config=RunnableConfig(788 callbacks=callbacks,789 tags=tags or [],790 metadata=metadata or {},791 ),792 )793 else:794 runnable_config = RunnableConfig(795 tags=tags or [],796 callbacks=callbacks,797 metadata=metadata or {},798 )799 output = await chain.ainvoke(inputs_, config=runnable_config)800 return output801802803async def _arun_llm_or_chain(804 example: Example,805 config: RunnableConfig,806 *,807 llm_or_chain_factory: MCF,808 input_mapper: Callable[[dict], Any] | None = None,809) -> dict | str | LLMResult | ChatResult:810 """Asynchronously run the Chain or language model.811812 Args:813 example: The example to run.814 config: The configuration for the run.815 llm_or_chain_factory: The Chain or language model constructor to run.816 input_mapper: Optional function to map the input to the expected format.817818 Returns:819 A list of outputs.820 """821 chain_or_llm = (822 "LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"823 )824 result = None825 try:826 if isinstance(llm_or_chain_factory, BaseLanguageModel):827 output: Any = await _arun_llm(828 llm_or_chain_factory,829 example.inputs or {},830 tags=config["tags"],831 callbacks=config["callbacks"],832 input_mapper=input_mapper,833 metadata=config.get("metadata"),834 )835 else:836 chain = llm_or_chain_factory()837 output = await _arun_chain(838 chain,839 example.inputs or {},840 tags=config["tags"],841 callbacks=config["callbacks"],842 input_mapper=input_mapper,843 metadata=config.get("metadata"),844 )845 result = output846 except Exception as e: # noqa: BLE001847 logger.warning(848 "%s failed for example %s with inputs %s\n%s",849 chain_or_llm,850 example.id,851 example.inputs,852 e,853 )854 result = EvalError(Error=e)855 return result856857858## Sync Utilities859860861def _run_llm(862 llm: BaseLanguageModel,863 inputs: dict[str, Any],864 callbacks: Callbacks,865 *,866 tags: list[str] | None = None,867 input_mapper: Callable[[dict], Any] | None = None,868 metadata: dict[str, Any] | None = None,869) -> str | BaseMessage:870 """Run the language model on the example.871872 Args:873 llm: The language model to run.874 inputs: The input dictionary.875 callbacks: The callbacks to use during the run.876 tags: Optional tags to add to the run.877 input_mapper: function to map to the inputs dictionary from an Example878 metadata: Optional metadata to add to the run.879880 Returns:881 The LLMResult or ChatResult.882883 Raises:884 ValueError: If the LLM type is unsupported.885 InputFormatError: If the input format is invalid.886 """887 # Most of this is legacy code; we could probably remove a lot of it.888 if input_mapper is not None:889 prompt_or_messages = input_mapper(inputs)890 if isinstance(prompt_or_messages, str) or (891 isinstance(prompt_or_messages, list)892 and all(isinstance(msg, BaseMessage) for msg in prompt_or_messages)893 ):894 llm_output: str | BaseMessage = llm.invoke(895 prompt_or_messages,896 config=RunnableConfig(897 callbacks=callbacks,898 tags=tags or [],899 metadata=metadata or {},900 ),901 )902 else:903 msg = (904 "Input mapper returned invalid format: "905 f" {prompt_or_messages}"906 "\nExpected a single string or list of chat messages."907 )908 raise InputFormatError(msg)909 else:910 try:911 llm_prompts = _get_prompt(inputs)912 llm_output = llm.invoke(913 llm_prompts,914 config=RunnableConfig(915 callbacks=callbacks,916 tags=tags or [],917 metadata=metadata or {},918 ),919 )920 except InputFormatError:921 llm_inputs = _get_messages(inputs)922 llm_output = llm.invoke(923 **llm_inputs,924 config=RunnableConfig(callbacks=callbacks, metadata=metadata or {}),925 )926 return llm_output927928929def _run_chain(930 chain: Chain | Runnable,931 inputs: dict[str, Any],932 callbacks: Callbacks,933 *,934 tags: list[str] | None = None,935 input_mapper: Callable[[dict], Any] | None = None,936 metadata: dict[str, Any] | None = None,937) -> dict | str:938 """Run a chain on inputs."""939 inputs_ = inputs if input_mapper is None else input_mapper(inputs)940 if (941 isinstance(chain, Chain)942 and isinstance(inputs_, dict)943 and len(inputs_) == 1944 and chain.input_keys945 ):946 val = next(iter(inputs_.values()))947 output = chain.invoke(948 val,949 config=RunnableConfig(950 callbacks=callbacks,951 tags=tags or [],952 metadata=metadata or {},953 ),954 )955 else:956 runnable_config = RunnableConfig(957 tags=tags or [],958 callbacks=callbacks,959 metadata=metadata or {},960 )961 output = chain.invoke(inputs_, config=runnable_config)962 return output963964965def _run_llm_or_chain(966 example: Example,967 config: RunnableConfig,968 *,969 llm_or_chain_factory: MCF,970 input_mapper: Callable[[dict], Any] | None = None,971) -> dict | str | LLMResult | ChatResult:972 """Run the Chain or language model synchronously.973974 Args:975 example: The example to run.976 config: The configuration for the run.977 llm_or_chain_factory: The Chain or language model constructor to run.978 input_mapper: Optional function to map the input to the expected format.979980 Returns:981 The outputs of the model or chain.982 """983 chain_or_llm = (984 "LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"985 )986 result = None987 try:988 if isinstance(llm_or_chain_factory, BaseLanguageModel):989 output: Any = _run_llm(990 llm_or_chain_factory,991 example.inputs or {},992 config["callbacks"],993 tags=config["tags"],994 input_mapper=input_mapper,995 metadata=config.get("metadata"),996 )997 else:998 chain = llm_or_chain_factory()999 output = _run_chain(1000 chain,1001 example.inputs or {},1002 config["callbacks"],1003 tags=config["tags"],1004 input_mapper=input_mapper,1005 metadata=config.get("metadata"),1006 )1007 result = output1008 except Exception as e: # noqa: BLE0011009 error_type = type(e).__name__1010 logger.warning(1011 "%s failed for example %s with inputs %s\nError Type: %s, Message: %s",1012 chain_or_llm,1013 example.id,1014 example.inputs,1015 error_type,1016 e,1017 )1018 result = EvalError(Error=e)1019 return result102010211022def _prepare_eval_run(1023 client: Client,1024 dataset_name: str,1025 llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,1026 project_name: str,1027 project_metadata: dict[str, Any] | None = None,1028 tags: list[str] | None = None,1029 dataset_version: str | datetime | None = None,1030) -> tuple[MCF, TracerSession, Dataset, list[Example]]:1031 wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)1032 dataset = client.read_dataset(dataset_name=dataset_name)10331034 examples = list(client.list_examples(dataset_id=dataset.id, as_of=dataset_version))1035 if not examples:1036 msg = f"Dataset {dataset_name} has no example rows."1037 raise ValueError(msg)1038 modified_at = [ex.modified_at for ex in examples if ex.modified_at]1039 # Should always be defined in practice when fetched,1040 # but the typing permits None1041 max_modified_at = max(modified_at) if modified_at else None1042 inferred_version = max_modified_at.isoformat() if max_modified_at else None10431044 try:1045 project_metadata = project_metadata or {}1046 git_info = get_git_info()1047 if git_info:1048 project_metadata = {1049 **project_metadata,1050 "git": git_info,1051 }10521053 project_metadata["dataset_version"] = inferred_version1054 project = client.create_project(1055 project_name,1056 reference_dataset_id=dataset.id,1057 project_extra={"tags": tags} if tags else {},1058 metadata=project_metadata,1059 )1060 except (HTTPError, ValueError, LangSmithError) as e:1061 if "already exists " not in str(e):1062 raise1063 uid = uuid.uuid4()1064 example_msg = f"""1065run_on_dataset(1066 ...1067 project_name="{project_name} - {uid}", # Update since {project_name} already exists1068)1069"""1070 msg = (1071 f"Test project {project_name} already exists. Please use a different name:"1072 f"\n\n{example_msg}"1073 )1074 raise ValueError(msg) from e1075 comparison_url = dataset.url + f"/compare?selectedSessions={project.id}"1076 print( # noqa: T2011077 f"View the evaluation results for project '{project_name}'"1078 f" at:\n{comparison_url}\n\n"1079 f"View all tests for Dataset {dataset_name} at:\n{dataset.url}",1080 flush=True,1081 )1082 return wrapped_model, project, dataset, examples108310841085class _RowResult(TypedDict, total=False):1086 """A dictionary of the results for a single example row."""10871088 feedback: list[EvaluationResult] | None1089 execution_time: float | None1090 run_id: str | None109110921093@dataclasses.dataclass1094class _DatasetRunContainer:1095 """A container to help manage the state of a eval run."""10961097 client: Client1098 project: TracerSession1099 wrapped_model: MCF1100 examples: list[Example]1101 configs: list[RunnableConfig]1102 batch_evaluators: list[smith_eval_config.BATCH_EVALUATOR_LIKE] | None = None11031104 def _merge_test_outputs(1105 self,1106 batch_results: list,1107 all_eval_results: dict[str, _RowResult],1108 ) -> dict:1109 results: dict = {}1110 for example, output in zip(self.examples, batch_results, strict=False):1111 row_result = all_eval_results.get(str(example.id), {})1112 results[str(example.id)] = {1113 "input": example.inputs,1114 "feedback": row_result.get("feedback", []),1115 "execution_time": row_result.get("execution_time"),1116 "run_id": row_result.get("run_id"),1117 }1118 if isinstance(output, EvalError):1119 results[str(example.id)]["Error"] = output.Error1120 else:1121 results[str(example.id)]["output"] = output1122 if example.outputs:1123 results[str(example.id)]["reference"] = example.outputs1124 return results11251126 def _run_batch_evaluators(self, runs: dict[str, Run]) -> list[dict]:1127 evaluators = self.batch_evaluators1128 if not evaluators:1129 return []1130 runs_list = [runs[str(example.id)] for example in self.examples]1131 aggregate_feedback = []1132 with concurrent.futures.ThreadPoolExecutor() as executor:1133 for evaluator in evaluators:1134 try:1135 result = evaluator(runs_list, self.examples)1136 if isinstance(result, EvaluationResult):1137 result = result.model_dump()1138 aggregate_feedback.append(cast("dict", result))1139 executor.submit(1140 self.client.create_feedback,1141 **result,1142 run_id=None,1143 project_id=self.project.id,1144 )1145 except Exception:1146 logger.exception(1147 "Error running batch evaluator %s", repr(evaluator)1148 )1149 return aggregate_feedback11501151 def _collect_metrics(self) -> tuple[dict[str, _RowResult], dict[str, Run]]:1152 all_eval_results: dict = {}1153 all_runs: dict = {}1154 for c in self.configs:1155 for callback in cast("list", c["callbacks"]):1156 if isinstance(callback, EvaluatorCallbackHandler):1157 eval_results = callback.logged_eval_results1158 for (_, example_id), v in eval_results.items():1159 all_eval_results.setdefault(str(example_id), {}).update(1160 {"feedback": v},1161 )1162 elif isinstance(callback, LangChainTracer):1163 run = callback.latest_run1164 execution_time = (1165 (run.end_time - run.start_time).total_seconds()1166 if run and run.end_time1167 else None1168 )1169 run_id = str(run.id) if run else None1170 all_eval_results.setdefault(str(callback.example_id), {}).update(1171 {1172 "execution_time": execution_time,1173 "run_id": run_id,1174 "run": run,1175 },1176 )1177 all_runs[str(callback.example_id)] = run1178 return cast("dict[str, _RowResult]", all_eval_results), all_runs11791180 def _collect_test_results(1181 self,1182 batch_results: list[dict | str | LLMResult | ChatResult],1183 ) -> TestResult:1184 logger.info("Waiting for evaluators to complete.")1185 wait_for_all_evaluators()1186 all_eval_results, all_runs = self._collect_metrics()1187 aggregate_feedback = None1188 if self.batch_evaluators:1189 logger.info("Running session evaluators.")1190 aggregate_feedback = self._run_batch_evaluators(all_runs)1191 results = self._merge_test_outputs(batch_results, all_eval_results)1192 return TestResult(1193 project_name=self.project.name,1194 results=results,1195 aggregate_metrics=aggregate_feedback,1196 )11971198 def finish(1199 self,1200 batch_results: list,1201 verbose: bool = False, # noqa: FBT001,FBT0021202 ) -> TestResult:1203 results = self._collect_test_results(batch_results)1204 if verbose:1205 try:1206 agg_feedback = results.get_aggregate_feedback()1207 _display_aggregate_results(agg_feedback)1208 except Exception as e: # noqa: BLE0011209 logger.debug("Failed to print aggregate feedback: %s", e, exc_info=True)1210 try:1211 # Closing the project permits name changing and metric optimizations1212 self.client.update_project(1213 self.project.id,1214 end_time=datetime.now(timezone.utc),1215 )1216 except Exception as e: # noqa: BLE0011217 logger.debug("Failed to close project: %s", e, exc_info=True)1218 return results12191220 @classmethod1221 def prepare(1222 cls,1223 client: Client,1224 dataset_name: str,1225 llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,1226 project_name: str | None,1227 evaluation: smith_eval.RunEvalConfig | None = None,1228 tags: list[str] | None = None,1229 input_mapper: Callable[[dict], Any] | None = None,1230 concurrency_level: int = 5,1231 project_metadata: dict[str, Any] | None = None,1232 revision_id: str | None = None,1233 dataset_version: datetime | str | None = None,1234 ) -> _DatasetRunContainer:1235 project_name = project_name or name_generation.random_name()1236 if revision_id:1237 if not project_metadata:1238 project_metadata = {}1239 project_metadata.update({"revision_id": revision_id})1240 wrapped_model, project, dataset, examples = _prepare_eval_run(1241 client,1242 dataset_name,1243 llm_or_chain_factory,1244 project_name,1245 project_metadata=project_metadata,1246 tags=tags,1247 dataset_version=dataset_version,1248 )1249 tags = tags or []1250 for k, v in (project.metadata.get("git") or {}).items():1251 tags.append(f"git:{k}={v}")1252 run_metadata = {"dataset_version": project.metadata["dataset_version"]}1253 if revision_id:1254 run_metadata["revision_id"] = revision_id1255 wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory)1256 run_evaluators = _setup_evaluation(1257 wrapped_model,1258 examples,1259 evaluation,1260 dataset.data_type or DataType.kv,1261 )1262 _validate_example_inputs(examples[0], wrapped_model, input_mapper)1263 progress_bar = progress.ProgressBarCallback(len(examples))1264 configs = [1265 RunnableConfig(1266 callbacks=[1267 LangChainTracer(1268 project_name=project.name,1269 client=client,1270 example_id=example.id,1271 ),1272 EvaluatorCallbackHandler(1273 evaluators=run_evaluators or [],1274 client=client,1275 example_id=example.id,1276 max_concurrency=0,1277 ),1278 progress_bar,1279 ],1280 tags=tags,1281 max_concurrency=concurrency_level,1282 metadata=run_metadata,1283 )1284 for example in examples1285 ]1286 return cls(1287 client=client,1288 project=project,1289 wrapped_model=wrapped_model,1290 examples=examples,1291 configs=configs,1292 batch_evaluators=evaluation.batch_evaluators if evaluation else None,1293 )129412951296def _is_jupyter_environment() -> bool:1297 try:1298 from IPython.core.getipython import get_ipython12991300 res = get_ipython() # type: ignore[no-untyped-call]1301 return res is not None and "zmqshell" in str(type(res))1302 except ImportError:1303 return False130413051306def _display_aggregate_results(aggregate_results: pd.DataFrame) -> None:1307 if _is_jupyter_environment():1308 from IPython.display import HTML, display13091310 display(HTML("<h3>Experiment Results:</h3>")) # type: ignore[no-untyped-call]1311 display(aggregate_results) # type: ignore[no-untyped-call]1312 else:1313 formatted_string = aggregate_results.to_string(1314 float_format=lambda x: f"{x:.2f}",1315 justify="right",1316 )1317 print("\n Experiment Results:") # noqa: T2011318 print(formatted_string) # noqa: T201131913201321_INPUT_MAPPER_DEP_WARNING = (1322 "The input_mapper argument is deprecated and "1323 "will be removed in a future release. Please add a "1324 " RunnableLambda to your chain to map inputs to the expected format"1325 " instead. Example:\n"1326 "def construct_chain():\n"1327 " my_chain = ...\n"1328 " input_mapper = {'other_key': 'MyOtherInput', 'my_input_key': x}\n"1329 " return input_mapper | my_chain\n"1330 "run_on_dataset(..., llm_or_chain_factory=construct_chain)\n"1331 "(See https://api.python.langchain.com/en/latest/schema/"1332 "langchain.schema.runnable.base.RunnableLambda.html)"1333)13341335## Public API133613371338async def arun_on_dataset(1339 client: Client | None,1340 dataset_name: str,1341 llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,1342 *,1343 evaluation: smith_eval.RunEvalConfig | None = None,1344 dataset_version: datetime | str | None = None,1345 concurrency_level: int = 5,1346 project_name: str | None = None,1347 project_metadata: dict[str, Any] | None = None,1348 verbose: bool = False,1349 revision_id: str | None = None,1350 **kwargs: Any,1351) -> dict[str, Any]:1352 """Run on dataset.13531354 Run the Chain or language model on a dataset and store traces1355 to the specified project name.13561357 For the (usually faster) async version of this function,1358 see `arun_on_dataset`.13591360 Args:1361 dataset_name: Name of the dataset to run the chain on.1362 llm_or_chain_factory: Language model or Chain constructor to run1363 over the dataset. The Chain constructor is used to permit1364 independent calls on each example without carrying over state.1365 evaluation: Configuration for evaluators to run on the1366 results of the chain.1367 dataset_version: Optional version of the dataset.1368 concurrency_level: The number of async tasks to run concurrently.1369 project_name: Name of the project to store the traces in.1370 Defaults to `{dataset_name}-{chain class name}-{datetime}`.1371 project_metadata: Optional metadata to add to the project.1372 Useful for storing information the test variant.1373 (prompt version, model version, etc.)1374 client: LangSmith client to use to access the dataset and to1375 log feedback and run traces.1376 verbose: Whether to print progress.1377 revision_id: Optional revision identifier to assign this test run to1378 track the performance of different versions of your system.1379 **kwargs: Should not be used, but is provided for backwards compatibility.13801381 Returns:1382 `dict` containing the run's project name and the resulting model outputs.13831384 Examples:1385 ```python1386 from langsmith import Client1387 from langchain_openai import ChatOpenAI1388 from langchain_classic.chains import LLMChain1389 from langchain_classic.smith import smith_eval.RunEvalConfig, run_on_dataset13901391 # Chains may have memory. Passing in a constructor function lets the1392 # evaluation framework avoid cross-contamination between runs.1393 def construct_chain():1394 model = ChatOpenAI(temperature=0)1395 chain = LLMChain.from_string(1396 model,1397 "What's the answer to {your_input_key}"1398 )1399 return chain14001401 # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)1402 evaluation_config = smith_eval.RunEvalConfig(1403 evaluators=[1404 "qa", # "Correctness" against a reference answer1405 "embedding_distance",1406 smith_eval.RunEvalConfig.Criteria("helpfulness"),1407 smith_eval.RunEvalConfig.Criteria({1408 "fifth-grader-score": "Do you have to be smarter than a fifth "1409 "grader to answer this question?"1410 }),1411 ]1412 )14131414 client = Client()1415 await arun_on_dataset(1416 client,1417 dataset_name="<my_dataset_name>",1418 llm_or_chain_factory=construct_chain,1419 evaluation=evaluation_config,1420 )1421 ```1422 You can also create custom evaluators by subclassing the `StringEvaluator or1423 LangSmith's `RunEvaluator` classes.14241425 ```python1426 from typing import Optional1427 from langchain_classic.evaluation import StringEvaluator142814291430 class MyStringEvaluator(StringEvaluator):1431 @property1432 def requires_input(self) -> bool:1433 return False14341435 @property1436 def requires_reference(self) -> bool:1437 return True14381439 @property1440 def evaluation_name(self) -> str:1441 return "exact_match"14421443 def _evaluate_strings(1444 self, prediction, reference=None, input=None, **kwargs1445 ) -> dict:1446 return {"score": prediction == reference}144714481449 evaluation_config = smith_eval.RunEvalConfig(1450 custom_evaluators=[MyStringEvaluator()],1451 )14521453 await arun_on_dataset(1454 client,1455 dataset_name="<my_dataset_name>",1456 llm_or_chain_factory=construct_chain,1457 evaluation=evaluation_config,1458 )1459 ```1460 """1461 input_mapper = kwargs.pop("input_mapper", None)1462 if input_mapper:1463 warn_deprecated("0.0.305", message=_INPUT_MAPPER_DEP_WARNING, pending=True)1464 if revision_id is None:1465 revision_id = get_langchain_env_var_metadata().get("revision_id")1466 tags = kwargs.pop("tags", None)1467 if tags:1468 warn_deprecated(1469 "0.1.9",1470 message="The tags argument is deprecated and will be"1471 " removed in a future release. Please specify project_metadata instead.",1472 pending=True,1473 )14741475 if kwargs:1476 warn_deprecated(1477 "0.0.305",1478 message="The following arguments are deprecated and "1479 "will be removed in a future release: "1480 f"{kwargs.keys()}.",1481 removal="0.0.305",1482 )1483 client = client or Client()1484 container = _DatasetRunContainer.prepare(1485 client,1486 dataset_name,1487 llm_or_chain_factory,1488 project_name,1489 evaluation,1490 tags,1491 input_mapper,1492 concurrency_level,1493 project_metadata=project_metadata,1494 revision_id=revision_id,1495 dataset_version=dataset_version,1496 )1497 batch_results = await runnable_utils.gather_with_concurrency(1498 container.configs[0].get("max_concurrency"),1499 *map(1500 functools.partial(1501 _arun_llm_or_chain,1502 llm_or_chain_factory=container.wrapped_model,1503 input_mapper=input_mapper,1504 ),1505 container.examples,1506 container.configs,1507 ),1508 )1509 return container.finish(batch_results, verbose=verbose)151015111512def run_on_dataset(1513 client: Client | None,1514 dataset_name: str,1515 llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,1516 *,1517 evaluation: smith_eval.RunEvalConfig | None = None,1518 dataset_version: datetime | str | None = None,1519 concurrency_level: int = 5,1520 project_name: str | None = None,1521 project_metadata: dict[str, Any] | None = None,1522 verbose: bool = False,1523 revision_id: str | None = None,1524 **kwargs: Any,1525) -> dict[str, Any]:1526 """Run on dataset.15271528 Run the Chain or language model on a dataset and store traces1529 to the specified project name.15301531 For the (usually faster) async version of this function,1532 see `arun_on_dataset`.15331534 Args:1535 dataset_name: Name of the dataset to run the chain on.1536 llm_or_chain_factory: Language model or Chain constructor to run1537 over the dataset. The Chain constructor is used to permit1538 independent calls on each example without carrying over state.1539 evaluation: Configuration for evaluators to run on the1540 results of the chain.1541 dataset_version: Optional version of the dataset.1542 concurrency_level: The number of async tasks to run concurrently.1543 project_name: Name of the project to store the traces in.1544 Defaults to `{dataset_name}-{chain class name}-{datetime}`.1545 project_metadata: Optional metadata to add to the project.1546 Useful for storing information the test variant.1547 (prompt version, model version, etc.)1548 client: LangSmith client to use to access the dataset and to1549 log feedback and run traces.1550 verbose: Whether to print progress.1551 revision_id: Optional revision identifier to assign this test run to1552 track the performance of different versions of your system.1553 **kwargs: Should not be used, but is provided for backwards compatibility.15541555 Returns:1556 `dict` containing the run's project name and the resulting model outputs.15571558 Examples:1559 ```python1560 from langsmith import Client1561 from langchain_openai import ChatOpenAI1562 from langchain_classic.chains import LLMChain1563 from langchain_classic.smith import smith_eval.RunEvalConfig, run_on_dataset15641565 # Chains may have memory. Passing in a constructor function lets the1566 # evaluation framework avoid cross-contamination between runs.1567 def construct_chain():1568 model = ChatOpenAI(temperature=0)1569 chain = LLMChain.from_string(1570 model,1571 "What's the answer to {your_input_key}"1572 )1573 return chain15741575 # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)1576 evaluation_config = smith_eval.RunEvalConfig(1577 evaluators=[1578 "qa", # "Correctness" against a reference answer1579 "embedding_distance",1580 smith_eval.RunEvalConfig.Criteria("helpfulness"),1581 smith_eval.RunEvalConfig.Criteria({1582 "fifth-grader-score": "Do you have to be smarter than a fifth "1583 "grader to answer this question?"1584 }),1585 ]1586 )15871588 client = Client()1589 run_on_dataset(1590 client,1591 dataset_name="<my_dataset_name>",1592 llm_or_chain_factory=construct_chain,1593 evaluation=evaluation_config,1594 )1595 ```15961597 You can also create custom evaluators by subclassing the `StringEvaluator` or1598 LangSmith's `RunEvaluator` classes.15991600 ```python1601 from typing import Optional1602 from langchain_classic.evaluation import StringEvaluator160316041605 class MyStringEvaluator(StringEvaluator):1606 @property1607 def requires_input(self) -> bool:1608 return False16091610 @property1611 def requires_reference(self) -> bool:1612 return True16131614 @property1615 def evaluation_name(self) -> str:1616 return "exact_match"16171618 def _evaluate_strings(1619 self, prediction, reference=None, input=None, **kwargs1620 ) -> dict:1621 return {"score": prediction == reference}162216231624 evaluation_config = smith_eval.RunEvalConfig(1625 custom_evaluators=[MyStringEvaluator()],1626 )16271628 run_on_dataset(1629 client,1630 dataset_name="<my_dataset_name>",1631 llm_or_chain_factory=construct_chain,1632 evaluation=evaluation_config,1633 )1634 ```1635 """1636 input_mapper = kwargs.pop("input_mapper", None)1637 if input_mapper:1638 warn_deprecated("0.0.305", message=_INPUT_MAPPER_DEP_WARNING, pending=True)1639 tags = kwargs.pop("tags", None)1640 if tags:1641 warn_deprecated(1642 "0.1.9",1643 message="The tags argument is deprecated and will be"1644 " removed in a future release. Please specify project_metadata instead.",1645 pending=True,1646 )1647 if revision_id is None:1648 revision_id = get_langchain_env_var_metadata().get("revision_id")16491650 if kwargs:1651 warn_deprecated(1652 "0.0.305",1653 message="The following arguments are deprecated and "1654 "will be removed in a future release: "1655 f"{kwargs.keys()}.",1656 removal="0.0.305",1657 )1658 client = client or Client()1659 container = _DatasetRunContainer.prepare(1660 client,1661 dataset_name,1662 llm_or_chain_factory,1663 project_name,1664 evaluation,1665 tags,1666 input_mapper,1667 concurrency_level,1668 project_metadata=project_metadata,1669 revision_id=revision_id,1670 dataset_version=dataset_version,1671 )1672 if concurrency_level == 0:1673 batch_results = [1674 _run_llm_or_chain(1675 example,1676 config,1677 llm_or_chain_factory=container.wrapped_model,1678 input_mapper=input_mapper,1679 )1680 for example, config in zip(1681 container.examples, container.configs, strict=False1682 )1683 ]1684 else:1685 with runnable_config.get_executor_for_config(container.configs[0]) as executor:1686 batch_results = list(1687 executor.map(1688 functools.partial(1689 _run_llm_or_chain,1690 llm_or_chain_factory=container.wrapped_model,1691 input_mapper=input_mapper,1692 ),1693 container.examples,1694 container.configs,1695 ),1696 )16971698 return container.finish(batch_results, verbose=verbose)
Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.