libs/langchain/langchain_classic/smith/evaluation/runner_utils.py PYTHON 1,699 lines View on github.com → Search inside
1"""Utilities for running language models or Chains over datasets."""23from __future__ import annotations45import concurrent.futures6import dataclasses7import functools8import inspect9import logging10import uuid11from collections.abc import Callable12from datetime import datetime, timezone13from typing import (14    TYPE_CHECKING,15    Any,16    cast,17)1819from langchain_core._api import warn_deprecated20from langchain_core.callbacks import Callbacks21from langchain_core.language_models import BaseLanguageModel22from langchain_core.messages import BaseMessage, messages_from_dict23from langchain_core.outputs import ChatResult, LLMResult24from langchain_core.runnables import Runnable, RunnableConfig, RunnableLambda25from langchain_core.runnables import config as runnable_config26from langchain_core.runnables import utils as runnable_utils27from langchain_core.tracers.evaluation import (28    EvaluatorCallbackHandler,29    wait_for_all_evaluators,30)31from langchain_core.tracers.langchain import LangChainTracer32from langsmith.client import Client33from langsmith.env import get_git_info, get_langchain_env_var_metadata34from langsmith.evaluation import (35    EvaluationResult,36    RunEvaluator,37)38from langsmith.evaluation import (39    run_evaluator as run_evaluator_dec,40)41from langsmith.run_helpers import as_runnable, is_traceable_function42from langsmith.schemas import Dataset, DataType, Example, Run, TracerSession43from langsmith.utils import LangSmithError44from requests import HTTPError45from typing_extensions import TypedDict4647from langchain_classic.chains.base import Chain48from langchain_classic.evaluation.loading import load_evaluator49from langchain_classic.evaluation.schema import (50    EvaluatorType,51    PairwiseStringEvaluator,52    StringEvaluator,53)54from langchain_classic.smith import evaluation as smith_eval55from langchain_classic.smith.evaluation import config as smith_eval_config56from langchain_classic.smith.evaluation import name_generation, progress5758if TYPE_CHECKING:59    import pandas as pd6061logger = logging.getLogger(__name__)6263MODEL_OR_CHAIN_FACTORY = (64    Callable[[], Chain | Runnable]65    | BaseLanguageModel66    | Callable[[dict], Any]67    | Runnable68    | Chain69)70MCF = Callable[[], Chain | Runnable] | BaseLanguageModel717273class InputFormatError(Exception):74    """Raised when the input format is invalid."""757677## Shared Utilities787980class TestResult(dict):81    """A dictionary of the results of a single test run."""8283    def get_aggregate_feedback(84        self,85    ) -> pd.DataFrame:86        """Return quantiles for the feedback scores.8788        This method calculates and prints the quantiles for the feedback scores89        across all feedback keys.9091        Returns:92            A DataFrame containing the quantiles for each feedback key.93        """94        df = self.to_dataframe()95        # Drop all things starting with inputs., outputs., and reference96        to_drop = [97            col98            for col in df.columns99            if col.startswith(("inputs.", "outputs.", "reference"))100            or col in {"input", "output"}101        ]102        return df.describe(include="all").drop(to_drop, axis=1)103104    def to_dataframe(self) -> pd.DataFrame:105        """Convert the results to a dataframe."""106        try:107            import pandas as pd108        except ImportError as e:109            msg = (110                "Pandas is required to convert the results to a dataframe."111                " to install pandas, run `pip install pandas`."112            )113            raise ImportError(msg) from e114115        indices = []116        records = []117        for example_id, result in self["results"].items():118            feedback = result["feedback"]119            output_ = result.get("output")120            if isinstance(output_, dict):121                output = {f"outputs.{k}": v for k, v in output_.items()}122            elif output_ is None:123                output = {}124            else:125                output = {"output": output_}126127            r = {128                **{f"inputs.{k}": v for k, v in result["input"].items()},129                **output,130            }131            if "reference" in result:132                if isinstance(result["reference"], dict):133                    r.update(134                        {f"reference.{k}": v for k, v in result["reference"].items()},135                    )136                else:137                    r["reference"] = result["reference"]138            r.update(139                {140                    **{f"feedback.{f.key}": f.score for f in feedback},141                    "error": result.get("Error"),142                    "execution_time": result["execution_time"],143                    "run_id": result.get("run_id"),144                },145            )146            records.append(r)147            indices.append(example_id)148149        return pd.DataFrame(records, index=indices)150151152class EvalError(dict):153    """Your architecture raised an error."""154155    def __init__(self, Error: BaseException, **kwargs: Any) -> None:  # noqa: N803156        """Initialize the `EvalError` with an error and additional attributes.157158        Args:159            Error: The error that occurred.160            **kwargs: Additional attributes to include in the error.161        """162        super().__init__(Error=Error, **kwargs)163164    def __getattr__(self, name: str) -> Any:165        """Get an attribute from the `EvalError`.166167        Args:168            name: The name of the attribute to get.169170        Returns:171            The value of the attribute.172173        Raises:174            AttributeError: If the attribute does not exist.175        """176        try:177            return self[name]178        except KeyError as e:179            msg = f"'EvalError' object has no attribute '{name}'"180            raise AttributeError(msg) from e181182183def _wrap_in_chain_factory(184    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,185    dataset_name: str = "<my_dataset>",186) -> MCF:187    """Wrap in a chain factory.188189    Forgive the user if they pass in a chain without memory instead of a chain190    factory. It's a common mistake. Raise a more helpful error message as well.191    """192    if isinstance(llm_or_chain_factory, Chain):193        chain = llm_or_chain_factory194        chain_class = chain.__class__.__name__195        if llm_or_chain_factory.memory is not None:196            memory_class = chain.memory.__class__.__name__197            msg = (198                "Cannot directly evaluate a chain with stateful memory."199                " To evaluate this chain, pass in a chain constructor"200                " that initializes fresh memory each time it is called."201                "  This will safeguard against information"202                " leakage between dataset examples."203                "\nFor example:\n\n"204                "def chain_constructor():\n"205                f"    new_memory = {memory_class}(...)\n"206                f"    return {chain_class}"207                "(memory=new_memory, ...)\n\n"208                f'run_on_dataset("{dataset_name}", chain_constructor, ...)'209            )210            raise ValueError(msg)211        return lambda: chain212    if isinstance(llm_or_chain_factory, BaseLanguageModel):213        return llm_or_chain_factory214    if isinstance(llm_or_chain_factory, Runnable):215        # Memory may exist here, but it's not elegant to check all those cases.216        lcf = llm_or_chain_factory217        return lambda: lcf218    if callable(llm_or_chain_factory):219        if is_traceable_function(llm_or_chain_factory):220            runnable_ = as_runnable(cast("Callable", llm_or_chain_factory))221            return lambda: runnable_222        try:223            _model = llm_or_chain_factory()  # type: ignore[call-arg]224        except TypeError:225            # It's an arbitrary function, wrap it in a RunnableLambda226            user_func = cast("Callable", llm_or_chain_factory)227            sig = inspect.signature(user_func)228            logger.info("Wrapping function %s as RunnableLambda.", sig)229            wrapped = RunnableLambda(user_func)230            return lambda: wrapped231        constructor = cast("Callable", llm_or_chain_factory)232        if isinstance(_model, BaseLanguageModel):233            # It's not uncommon to do an LLM constructor instead of raw LLM,234            # so we'll unpack it for the user.235            return _model236        if is_traceable_function(cast("Callable", _model)):237            runnable_ = as_runnable(cast("Callable", _model))238            return lambda: runnable_239        if not isinstance(_model, Runnable):240            # This is unlikely to happen - a constructor for a model function241            return lambda: RunnableLambda(constructor)242        # Typical correct case243        return constructor244    return llm_or_chain_factory  # type: ignore[unreachable]245246247def _get_prompt(inputs: dict[str, Any]) -> str:248    """Get prompt from inputs.249250    Args:251        inputs: The input dictionary.252253    Returns:254        A string prompt.255256    Raises:257        InputFormatError: If the input format is invalid.258    """259    if not inputs:260        msg = "Inputs should not be empty."261        raise InputFormatError(msg)262263    prompts = []264    if "prompt" in inputs:265        if not isinstance(inputs["prompt"], str):266            msg = f"Expected string for 'prompt', got {type(inputs['prompt']).__name__}"267            raise InputFormatError(msg)268        prompts = [inputs["prompt"]]269    elif "prompts" in inputs:270        if not isinstance(inputs["prompts"], list) or not all(271            isinstance(i, str) for i in inputs["prompts"]272        ):273            msg = (274                "Expected list of strings for 'prompts',"275                f" got {type(inputs['prompts']).__name__}"276            )277            raise InputFormatError(msg)278        prompts = inputs["prompts"]279    elif len(inputs) == 1:280        prompt_ = next(iter(inputs.values()))281        if isinstance(prompt_, str):282            prompts = [prompt_]283        elif isinstance(prompt_, list) and all(isinstance(i, str) for i in prompt_):284            prompts = prompt_285        else:286            msg = f"LLM Run expects string prompt input. Got {inputs}"287            raise InputFormatError(msg)288    else:289        msg = f"LLM Run expects 'prompt' or 'prompts' in inputs. Got {inputs}"290        raise InputFormatError(msg)291    if len(prompts) == 1:292        return prompts[0]293    msg = f"LLM Run expects single prompt input. Got {len(prompts)} prompts."294    raise InputFormatError(msg)295296297class ChatModelInput(TypedDict):298    """Input for a chat model."""299300    messages: list[BaseMessage]301302303def _get_messages(inputs: dict[str, Any]) -> dict:304    """Get Chat Messages from inputs.305306    Args:307        inputs: The input dictionary.308309    Returns:310        A list of chat messages.311312    Raises:313        InputFormatError: If the input format is invalid.314    """315    if not inputs:316        msg = "Inputs should not be empty."317        raise InputFormatError(msg)318    input_copy = inputs.copy()319    if "messages" in inputs:320        input_copy["input"] = input_copy.pop("messages")321    elif len(inputs) == 1:322        input_copy["input"] = next(iter(inputs.values()))323    if "input" in input_copy:324        raw_messages = input_copy["input"]325        if isinstance(raw_messages, list) and all(326            isinstance(i, dict) for i in raw_messages327        ):328            raw_messages = [raw_messages]329        if len(raw_messages) == 1:330            input_copy["input"] = messages_from_dict(raw_messages[0])331        else:332            msg = (333                "Batch messages not supported. Please provide a"334                " single list of messages."335            )336            raise InputFormatError(msg)337        return input_copy338    msg = (339        f"Chat Run expects single List[dict] or List[List[dict]] 'messages'"340        f" input. Got {inputs}"341    )342    raise InputFormatError(msg)343344345## Shared data validation utilities346def _validate_example_inputs_for_language_model(347    first_example: Example,348    input_mapper: Callable[[dict], Any] | None,349) -> None:350    if input_mapper:351        prompt_input = input_mapper(first_example.inputs or {})352        if not isinstance(prompt_input, str) and not (353            isinstance(prompt_input, list)354            and all(isinstance(msg, BaseMessage) for msg in prompt_input)355        ):356            msg = (357                "When using an input_mapper to prepare dataset example inputs"358                " for an LLM or chat model, the output must a single string or"359                " a list of chat messages."360                f"\nGot: {prompt_input} of type {type(prompt_input)}."361            )362            raise InputFormatError(msg)363    else:364        try:365            _get_prompt(first_example.inputs or {})366        except InputFormatError:367            try:368                _get_messages(first_example.inputs or {})369            except InputFormatError as err2:370                msg = (371                    "Example inputs do not match language model input format. "372                    "Expected a dictionary with messages or a single prompt."373                    f" Got: {first_example.inputs}"374                    " Please update your dataset OR provide an input_mapper"375                    " to convert the example.inputs to a compatible format"376                    " for the llm or chat model you wish to evaluate."377                )378                raise InputFormatError(msg) from err2379380381def _validate_example_inputs_for_chain(382    first_example: Example,383    chain: Chain,384    input_mapper: Callable[[dict], Any] | None,385) -> None:386    """Validate that the example inputs match the chain input keys."""387    if input_mapper:388        first_inputs = input_mapper(first_example.inputs or {})389        missing_keys = set(chain.input_keys).difference(first_inputs)390        if not isinstance(first_inputs, dict):391            msg = (392                "When using an input_mapper to prepare dataset example"393                " inputs for a chain, the mapped value must be a dictionary."394                f"\nGot: {first_inputs} of type {type(first_inputs)}."395            )396            raise InputFormatError(msg)397        if missing_keys:398            msg = (399                "Missing keys after loading example using input_mapper."400                f"\nExpected: {chain.input_keys}. Got: {first_inputs.keys()}"401            )402            raise InputFormatError(msg)403    else:404        first_inputs = first_example.inputs or {}405        missing_keys = set(chain.input_keys).difference(first_inputs)406        if len(first_inputs) == 1 and len(chain.input_keys) == 1:407            # We can pass this through the run method.408            # Refrain from calling to validate.409            pass410        elif missing_keys:411            msg = (412                "Example inputs missing expected chain input keys."413                " Please provide an input_mapper to convert the example.inputs"414                " to a compatible format for the chain you wish to evaluate."415                f"Expected: {chain.input_keys}. "416                f"Got: {first_inputs.keys()}"417            )418            raise InputFormatError(msg)419420421def _validate_example_inputs(422    example: Example,423    llm_or_chain_factory: MCF,424    input_mapper: Callable[[dict], Any] | None,425) -> None:426    """Validate that the example inputs are valid for the model."""427    if isinstance(llm_or_chain_factory, BaseLanguageModel):428        _validate_example_inputs_for_language_model(example, input_mapper)429    else:430        chain = llm_or_chain_factory()431        if isinstance(chain, Chain):432            # Otherwise it's a runnable433            _validate_example_inputs_for_chain(example, chain, input_mapper)434        elif isinstance(chain, Runnable):435            logger.debug("Skipping input validation for %s", chain)436437438## Shared Evaluator Setup Utilities439440441def _setup_evaluation(442    llm_or_chain_factory: MCF,443    examples: list[Example],444    evaluation: smith_eval.RunEvalConfig | None,445    data_type: DataType,446) -> list[RunEvaluator] | None:447    """Configure the evaluators to run on the results of the chain."""448    if evaluation:449        if isinstance(llm_or_chain_factory, BaseLanguageModel):450            run_inputs, run_outputs = None, None451            run_type = "llm"452        else:453            run_type = "chain"454            chain = llm_or_chain_factory()455            run_inputs = chain.input_keys if isinstance(chain, Chain) else None456            run_outputs = chain.output_keys if isinstance(chain, Chain) else None457        run_evaluators = _load_run_evaluators(458            evaluation,459            run_type,460            data_type,461            list(examples[0].outputs) if examples[0].outputs else None,462            run_inputs,463            run_outputs,464        )465    else:466        # TODO: Create a default helpfulness evaluator467        run_evaluators = None468    return run_evaluators469470471def _determine_input_key(472    config: smith_eval.RunEvalConfig,473    run_inputs: list[str] | None,474) -> str | None:475    input_key = None476    if config.input_key:477        input_key = config.input_key478        if run_inputs and input_key not in run_inputs:479            logger.warning(480                "Input key %s not in chain's specified input keys %s. "481                "Evaluation behavior may be undefined.",482                input_key,483                run_inputs,484            )485    elif run_inputs and len(run_inputs) == 1:486        input_key = run_inputs[0]487    elif run_inputs is not None and len(run_inputs) > 1:488        logger.warning(489            "Chain expects multiple input keys: %s,"490            " Evaluator is likely to fail. Evaluation behavior may be undefined."491            " Specify an input_key in the RunEvalConfig to avoid this warning.",492            run_inputs,493        )494495    return input_key496497498def _determine_prediction_key(499    config: smith_eval.RunEvalConfig,500    run_outputs: list[str] | None,501) -> str | None:502    prediction_key = None503    if config.prediction_key:504        prediction_key = config.prediction_key505        if run_outputs and prediction_key not in run_outputs:506            logger.warning(507                "Prediction key %s not in chain's specified output keys %s. "508                "Evaluation behavior may be undefined.",509                prediction_key,510                run_outputs,511            )512    elif run_outputs and len(run_outputs) == 1:513        prediction_key = run_outputs[0]514    elif run_outputs is not None and len(run_outputs) > 1:515        logger.warning(516            "Chain expects multiple output keys: %s,"517            " Evaluation behavior may be undefined. Specify a prediction_key"518            " in the RunEvalConfig to avoid this warning.",519            run_outputs,520        )521    return prediction_key522523524def _determine_reference_key(525    config: smith_eval.RunEvalConfig,526    example_outputs: list[str] | None,527) -> str | None:528    if config.reference_key:529        reference_key = config.reference_key530        if example_outputs and reference_key not in example_outputs:531            msg = (532                f"Reference key {reference_key} not in Dataset"533                f" example outputs: {example_outputs}"534            )535            raise ValueError(msg)536    elif example_outputs and len(example_outputs) == 1:537        reference_key = next(iter(example_outputs))538    else:539        reference_key = None540    return reference_key541542543def _construct_run_evaluator(544    eval_config: smith_eval_config.SINGLE_EVAL_CONFIG_TYPE545    | smith_eval_config.CUSTOM_EVALUATOR_TYPE,546    eval_llm: BaseLanguageModel | None,547    run_type: str,548    data_type: DataType,549    example_outputs: list[str] | None,550    reference_key: str | None,551    input_key: str | None,552    prediction_key: str | None,553) -> RunEvaluator:554    if isinstance(eval_config, RunEvaluator):555        return eval_config556    if isinstance(eval_config, (EvaluatorType, str)):557        if not isinstance(eval_config, EvaluatorType):558            eval_config = EvaluatorType(eval_config)559        evaluator_ = load_evaluator(eval_config, llm=eval_llm)560        eval_type_tag = eval_config.value561    elif isinstance(eval_config, smith_eval_config.EvalConfig):562        kwargs = {"llm": eval_llm, **eval_config.get_kwargs()}563        evaluator_ = load_evaluator(eval_config.evaluator_type, **kwargs)564        eval_type_tag = eval_config.evaluator_type.value565        # Override keys if specified in the config566        if isinstance(eval_config, smith_eval_config.SingleKeyEvalConfig):567            input_key = eval_config.input_key or input_key568            prediction_key = eval_config.prediction_key or prediction_key569            reference_key = eval_config.reference_key or reference_key570    elif callable(eval_config):571        # Assume we can decorate572        return run_evaluator_dec(eval_config)573    else:574        msg = f"Unknown evaluator type: {type(eval_config)}"575        raise ValueError(msg)  # noqa: TRY004576577    if isinstance(evaluator_, StringEvaluator):578        if evaluator_.requires_reference and reference_key is None:579            msg = (580                f"Must specify reference_key in smith_eval.RunEvalConfig to use"581                f" evaluator of type {eval_type_tag} with"582                f" dataset with multiple output keys: {example_outputs}."583            )584            raise ValueError(msg)585        run_evaluator = smith_eval.StringRunEvaluatorChain.from_run_and_data_type(586            evaluator_,587            run_type,588            data_type,589            input_key=input_key,590            prediction_key=prediction_key,591            reference_key=reference_key,592            tags=[eval_type_tag],593        )594    elif isinstance(evaluator_, PairwiseStringEvaluator):595        msg = (596            f"Run evaluator for {eval_type_tag} is not implemented."597            " PairwiseStringEvaluators compare the outputs of two different models"598            " rather than the output of a single model."599            " Did you mean to use a StringEvaluator instead?"600            "\nSee: https://python.langchain.com/docs/guides/evaluation/string/"601        )602        raise NotImplementedError(msg)603604    else:605        msg = f"Run evaluator for {eval_type_tag} is not implemented"606        raise NotImplementedError(msg)607    return run_evaluator608609610def _get_keys(611    config: smith_eval.RunEvalConfig,612    run_inputs: list[str] | None,613    run_outputs: list[str] | None,614    example_outputs: list[str] | None,615) -> tuple[str | None, str | None, str | None]:616    input_key = _determine_input_key(config, run_inputs)617    prediction_key = _determine_prediction_key(config, run_outputs)618    reference_key = _determine_reference_key(config, example_outputs)619    return input_key, prediction_key, reference_key620621622def _load_run_evaluators(623    config: smith_eval.RunEvalConfig,624    run_type: str,625    data_type: DataType,626    example_outputs: list[str] | None,627    run_inputs: list[str] | None,628    run_outputs: list[str] | None,629) -> list[RunEvaluator]:630    """Load run evaluators from a configuration.631632    Args:633        config: Configuration for the run evaluators.634        run_type: The type of run.635        data_type: The type of dataset used in the run.636        example_outputs: The example outputs.637        run_inputs: The input keys for the run.638        run_outputs: The output keys for the run.639640    Returns:641        A list of run evaluators.642    """643    run_evaluators = []644    input_key, prediction_key, reference_key = None, None, None645    if config.evaluators or (646        config.custom_evaluators647        and any(isinstance(e, StringEvaluator) for e in config.custom_evaluators)648    ):649        input_key, prediction_key, reference_key = _get_keys(650            config,651            run_inputs,652            run_outputs,653            example_outputs,654        )655    for eval_config in config.evaluators:656        run_evaluator = _construct_run_evaluator(657            eval_config,658            config.eval_llm,659            run_type,660            data_type,661            example_outputs,662            reference_key,663            input_key,664            prediction_key,665        )666        run_evaluators.append(run_evaluator)667    custom_evaluators = config.custom_evaluators or []668    for custom_evaluator in custom_evaluators:669        if isinstance(custom_evaluator, RunEvaluator):670            run_evaluators.append(custom_evaluator)671        elif isinstance(custom_evaluator, StringEvaluator):672            run_evaluators.append(673                smith_eval.StringRunEvaluatorChain.from_run_and_data_type(674                    custom_evaluator,675                    run_type,676                    data_type,677                    input_key=input_key,678                    prediction_key=prediction_key,679                    reference_key=reference_key,680                ),681            )682        elif callable(custom_evaluator):683            run_evaluators.append(run_evaluator_dec(custom_evaluator))684        else:685            msg = (  # type: ignore[unreachable]686                f"Unsupported custom evaluator: {custom_evaluator}."687                f" Expected RunEvaluator or StringEvaluator."688            )689            raise ValueError(msg)  # noqa: TRY004690691    return run_evaluators692693694### Async Helpers695696697async def _arun_llm(698    llm: BaseLanguageModel,699    inputs: dict[str, Any],700    *,701    tags: list[str] | None = None,702    callbacks: Callbacks = None,703    input_mapper: Callable[[dict], Any] | None = None,704    metadata: dict[str, Any] | None = None,705) -> str | BaseMessage:706    """Asynchronously run the language model.707708    Args:709        llm: The language model to run.710        inputs: The input dictionary.711        tags: Optional tags to add to the run.712        callbacks: Optional callbacks to use during the run.713        input_mapper: Optional function to map inputs to the expected format.714        metadata: Optional metadata to add to the run.715716    Returns:717        The LLMResult or ChatResult.718719    Raises:720        ValueError: If the LLM type is unsupported.721        InputFormatError: If the input format is invalid.722    """723    if input_mapper is not None:724        prompt_or_messages = input_mapper(inputs)725        if isinstance(prompt_or_messages, str) or (726            isinstance(prompt_or_messages, list)727            and all(isinstance(msg, BaseMessage) for msg in prompt_or_messages)728        ):729            return await llm.ainvoke(730                prompt_or_messages,731                config=RunnableConfig(732                    callbacks=callbacks,733                    tags=tags or [],734                    metadata=metadata or {},735                ),736            )737        msg = (738            "Input mapper returned invalid format"739            f" {prompt_or_messages}"740            "\nExpected a single string or list of chat messages."741        )742        raise InputFormatError(msg)743744    try:745        prompt = _get_prompt(inputs)746        llm_output: str | BaseMessage = await llm.ainvoke(747            prompt,748            config=RunnableConfig(749                callbacks=callbacks,750                tags=tags or [],751                metadata=metadata or {},752            ),753        )754    except InputFormatError:755        llm_inputs = _get_messages(inputs)756        llm_output = await llm.ainvoke(757            **llm_inputs,758            config=RunnableConfig(759                callbacks=callbacks,760                tags=tags or [],761                metadata=metadata or {},762            ),763        )764    return llm_output765766767async def _arun_chain(768    chain: Chain | Runnable,769    inputs: dict[str, Any],770    callbacks: Callbacks,771    *,772    tags: list[str] | None = None,773    input_mapper: Callable[[dict], Any] | None = None,774    metadata: dict[str, Any] | None = None,775) -> dict | str:776    """Run a chain asynchronously on inputs."""777    inputs_ = inputs if input_mapper is None else input_mapper(inputs)778    if (779        isinstance(chain, Chain)780        and isinstance(inputs_, dict)781        and len(inputs_) == 1782        and chain.input_keys783    ):784        val = next(iter(inputs_.values()))785        output = await chain.ainvoke(786            val,787            config=RunnableConfig(788                callbacks=callbacks,789                tags=tags or [],790                metadata=metadata or {},791            ),792        )793    else:794        runnable_config = RunnableConfig(795            tags=tags or [],796            callbacks=callbacks,797            metadata=metadata or {},798        )799        output = await chain.ainvoke(inputs_, config=runnable_config)800    return output801802803async def _arun_llm_or_chain(804    example: Example,805    config: RunnableConfig,806    *,807    llm_or_chain_factory: MCF,808    input_mapper: Callable[[dict], Any] | None = None,809) -> dict | str | LLMResult | ChatResult:810    """Asynchronously run the Chain or language model.811812    Args:813        example: The example to run.814        config: The configuration for the run.815        llm_or_chain_factory: The Chain or language model constructor to run.816        input_mapper: Optional function to map the input to the expected format.817818    Returns:819        A list of outputs.820    """821    chain_or_llm = (822        "LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"823    )824    result = None825    try:826        if isinstance(llm_or_chain_factory, BaseLanguageModel):827            output: Any = await _arun_llm(828                llm_or_chain_factory,829                example.inputs or {},830                tags=config["tags"],831                callbacks=config["callbacks"],832                input_mapper=input_mapper,833                metadata=config.get("metadata"),834            )835        else:836            chain = llm_or_chain_factory()837            output = await _arun_chain(838                chain,839                example.inputs or {},840                tags=config["tags"],841                callbacks=config["callbacks"],842                input_mapper=input_mapper,843                metadata=config.get("metadata"),844            )845        result = output846    except Exception as e:  # noqa: BLE001847        logger.warning(848            "%s failed for example %s with inputs %s\n%s",849            chain_or_llm,850            example.id,851            example.inputs,852            e,853        )854        result = EvalError(Error=e)855    return result856857858## Sync Utilities859860861def _run_llm(862    llm: BaseLanguageModel,863    inputs: dict[str, Any],864    callbacks: Callbacks,865    *,866    tags: list[str] | None = None,867    input_mapper: Callable[[dict], Any] | None = None,868    metadata: dict[str, Any] | None = None,869) -> str | BaseMessage:870    """Run the language model on the example.871872    Args:873        llm: The language model to run.874        inputs: The input dictionary.875        callbacks: The callbacks to use during the run.876        tags: Optional tags to add to the run.877        input_mapper: function to map to the inputs dictionary from an Example878        metadata: Optional metadata to add to the run.879880    Returns:881        The LLMResult or ChatResult.882883    Raises:884        ValueError: If the LLM type is unsupported.885        InputFormatError: If the input format is invalid.886    """887    # Most of this is legacy code; we could probably remove a lot of it.888    if input_mapper is not None:889        prompt_or_messages = input_mapper(inputs)890        if isinstance(prompt_or_messages, str) or (891            isinstance(prompt_or_messages, list)892            and all(isinstance(msg, BaseMessage) for msg in prompt_or_messages)893        ):894            llm_output: str | BaseMessage = llm.invoke(895                prompt_or_messages,896                config=RunnableConfig(897                    callbacks=callbacks,898                    tags=tags or [],899                    metadata=metadata or {},900                ),901            )902        else:903            msg = (904                "Input mapper returned invalid format: "905                f" {prompt_or_messages}"906                "\nExpected a single string or list of chat messages."907            )908            raise InputFormatError(msg)909    else:910        try:911            llm_prompts = _get_prompt(inputs)912            llm_output = llm.invoke(913                llm_prompts,914                config=RunnableConfig(915                    callbacks=callbacks,916                    tags=tags or [],917                    metadata=metadata or {},918                ),919            )920        except InputFormatError:921            llm_inputs = _get_messages(inputs)922            llm_output = llm.invoke(923                **llm_inputs,924                config=RunnableConfig(callbacks=callbacks, metadata=metadata or {}),925            )926    return llm_output927928929def _run_chain(930    chain: Chain | Runnable,931    inputs: dict[str, Any],932    callbacks: Callbacks,933    *,934    tags: list[str] | None = None,935    input_mapper: Callable[[dict], Any] | None = None,936    metadata: dict[str, Any] | None = None,937) -> dict | str:938    """Run a chain on inputs."""939    inputs_ = inputs if input_mapper is None else input_mapper(inputs)940    if (941        isinstance(chain, Chain)942        and isinstance(inputs_, dict)943        and len(inputs_) == 1944        and chain.input_keys945    ):946        val = next(iter(inputs_.values()))947        output = chain.invoke(948            val,949            config=RunnableConfig(950                callbacks=callbacks,951                tags=tags or [],952                metadata=metadata or {},953            ),954        )955    else:956        runnable_config = RunnableConfig(957            tags=tags or [],958            callbacks=callbacks,959            metadata=metadata or {},960        )961        output = chain.invoke(inputs_, config=runnable_config)962    return output963964965def _run_llm_or_chain(966    example: Example,967    config: RunnableConfig,968    *,969    llm_or_chain_factory: MCF,970    input_mapper: Callable[[dict], Any] | None = None,971) -> dict | str | LLMResult | ChatResult:972    """Run the Chain or language model synchronously.973974    Args:975        example: The example to run.976        config: The configuration for the run.977        llm_or_chain_factory: The Chain or language model constructor to run.978        input_mapper: Optional function to map the input to the expected format.979980    Returns:981        The outputs of the model or chain.982    """983    chain_or_llm = (984        "LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"985    )986    result = None987    try:988        if isinstance(llm_or_chain_factory, BaseLanguageModel):989            output: Any = _run_llm(990                llm_or_chain_factory,991                example.inputs or {},992                config["callbacks"],993                tags=config["tags"],994                input_mapper=input_mapper,995                metadata=config.get("metadata"),996            )997        else:998            chain = llm_or_chain_factory()999            output = _run_chain(1000                chain,1001                example.inputs or {},1002                config["callbacks"],1003                tags=config["tags"],1004                input_mapper=input_mapper,1005                metadata=config.get("metadata"),1006            )1007        result = output1008    except Exception as e:  # noqa: BLE0011009        error_type = type(e).__name__1010        logger.warning(1011            "%s failed for example %s with inputs %s\nError Type: %s, Message: %s",1012            chain_or_llm,1013            example.id,1014            example.inputs,1015            error_type,1016            e,1017        )1018        result = EvalError(Error=e)1019    return result102010211022def _prepare_eval_run(1023    client: Client,1024    dataset_name: str,1025    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,1026    project_name: str,1027    project_metadata: dict[str, Any] | None = None,1028    tags: list[str] | None = None,1029    dataset_version: str | datetime | None = None,1030) -> tuple[MCF, TracerSession, Dataset, list[Example]]:1031    wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)1032    dataset = client.read_dataset(dataset_name=dataset_name)10331034    examples = list(client.list_examples(dataset_id=dataset.id, as_of=dataset_version))1035    if not examples:1036        msg = f"Dataset {dataset_name} has no example rows."1037        raise ValueError(msg)1038    modified_at = [ex.modified_at for ex in examples if ex.modified_at]1039    # Should always be defined in practice when fetched,1040    # but the typing permits None1041    max_modified_at = max(modified_at) if modified_at else None1042    inferred_version = max_modified_at.isoformat() if max_modified_at else None10431044    try:1045        project_metadata = project_metadata or {}1046        git_info = get_git_info()1047        if git_info:1048            project_metadata = {1049                **project_metadata,1050                "git": git_info,1051            }10521053        project_metadata["dataset_version"] = inferred_version1054        project = client.create_project(1055            project_name,1056            reference_dataset_id=dataset.id,1057            project_extra={"tags": tags} if tags else {},1058            metadata=project_metadata,1059        )1060    except (HTTPError, ValueError, LangSmithError) as e:1061        if "already exists " not in str(e):1062            raise1063        uid = uuid.uuid4()1064        example_msg = f"""1065run_on_dataset(1066    ...1067    project_name="{project_name} - {uid}", # Update since {project_name} already exists1068)1069"""1070        msg = (1071            f"Test project {project_name} already exists. Please use a different name:"1072            f"\n\n{example_msg}"1073        )1074        raise ValueError(msg) from e1075    comparison_url = dataset.url + f"/compare?selectedSessions={project.id}"1076    print(  # noqa: T2011077        f"View the evaluation results for project '{project_name}'"1078        f" at:\n{comparison_url}\n\n"1079        f"View all tests for Dataset {dataset_name} at:\n{dataset.url}",1080        flush=True,1081    )1082    return wrapped_model, project, dataset, examples108310841085class _RowResult(TypedDict, total=False):1086    """A dictionary of the results for a single example row."""10871088    feedback: list[EvaluationResult] | None1089    execution_time: float | None1090    run_id: str | None109110921093@dataclasses.dataclass1094class _DatasetRunContainer:1095    """A container to help manage the state of a eval run."""10961097    client: Client1098    project: TracerSession1099    wrapped_model: MCF1100    examples: list[Example]1101    configs: list[RunnableConfig]1102    batch_evaluators: list[smith_eval_config.BATCH_EVALUATOR_LIKE] | None = None11031104    def _merge_test_outputs(1105        self,1106        batch_results: list,1107        all_eval_results: dict[str, _RowResult],1108    ) -> dict:1109        results: dict = {}1110        for example, output in zip(self.examples, batch_results, strict=False):1111            row_result = all_eval_results.get(str(example.id), {})1112            results[str(example.id)] = {1113                "input": example.inputs,1114                "feedback": row_result.get("feedback", []),1115                "execution_time": row_result.get("execution_time"),1116                "run_id": row_result.get("run_id"),1117            }1118            if isinstance(output, EvalError):1119                results[str(example.id)]["Error"] = output.Error1120            else:1121                results[str(example.id)]["output"] = output1122            if example.outputs:1123                results[str(example.id)]["reference"] = example.outputs1124        return results11251126    def _run_batch_evaluators(self, runs: dict[str, Run]) -> list[dict]:1127        evaluators = self.batch_evaluators1128        if not evaluators:1129            return []1130        runs_list = [runs[str(example.id)] for example in self.examples]1131        aggregate_feedback = []1132        with concurrent.futures.ThreadPoolExecutor() as executor:1133            for evaluator in evaluators:1134                try:1135                    result = evaluator(runs_list, self.examples)1136                    if isinstance(result, EvaluationResult):1137                        result = result.model_dump()1138                    aggregate_feedback.append(cast("dict", result))1139                    executor.submit(1140                        self.client.create_feedback,1141                        **result,1142                        run_id=None,1143                        project_id=self.project.id,1144                    )1145                except Exception:1146                    logger.exception(1147                        "Error running batch evaluator %s", repr(evaluator)1148                    )1149        return aggregate_feedback11501151    def _collect_metrics(self) -> tuple[dict[str, _RowResult], dict[str, Run]]:1152        all_eval_results: dict = {}1153        all_runs: dict = {}1154        for c in self.configs:1155            for callback in cast("list", c["callbacks"]):1156                if isinstance(callback, EvaluatorCallbackHandler):1157                    eval_results = callback.logged_eval_results1158                    for (_, example_id), v in eval_results.items():1159                        all_eval_results.setdefault(str(example_id), {}).update(1160                            {"feedback": v},1161                        )1162                elif isinstance(callback, LangChainTracer):1163                    run = callback.latest_run1164                    execution_time = (1165                        (run.end_time - run.start_time).total_seconds()1166                        if run and run.end_time1167                        else None1168                    )1169                    run_id = str(run.id) if run else None1170                    all_eval_results.setdefault(str(callback.example_id), {}).update(1171                        {1172                            "execution_time": execution_time,1173                            "run_id": run_id,1174                            "run": run,1175                        },1176                    )1177                    all_runs[str(callback.example_id)] = run1178        return cast("dict[str, _RowResult]", all_eval_results), all_runs11791180    def _collect_test_results(1181        self,1182        batch_results: list[dict | str | LLMResult | ChatResult],1183    ) -> TestResult:1184        logger.info("Waiting for evaluators to complete.")1185        wait_for_all_evaluators()1186        all_eval_results, all_runs = self._collect_metrics()1187        aggregate_feedback = None1188        if self.batch_evaluators:1189            logger.info("Running session evaluators.")1190            aggregate_feedback = self._run_batch_evaluators(all_runs)1191        results = self._merge_test_outputs(batch_results, all_eval_results)1192        return TestResult(1193            project_name=self.project.name,1194            results=results,1195            aggregate_metrics=aggregate_feedback,1196        )11971198    def finish(1199        self,1200        batch_results: list,1201        verbose: bool = False,  # noqa: FBT001,FBT0021202    ) -> TestResult:1203        results = self._collect_test_results(batch_results)1204        if verbose:1205            try:1206                agg_feedback = results.get_aggregate_feedback()1207                _display_aggregate_results(agg_feedback)1208            except Exception as e:  # noqa: BLE0011209                logger.debug("Failed to print aggregate feedback: %s", e, exc_info=True)1210        try:1211            # Closing the project permits name changing and metric optimizations1212            self.client.update_project(1213                self.project.id,1214                end_time=datetime.now(timezone.utc),1215            )1216        except Exception as e:  # noqa: BLE0011217            logger.debug("Failed to close project: %s", e, exc_info=True)1218        return results12191220    @classmethod1221    def prepare(1222        cls,1223        client: Client,1224        dataset_name: str,1225        llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,1226        project_name: str | None,1227        evaluation: smith_eval.RunEvalConfig | None = None,1228        tags: list[str] | None = None,1229        input_mapper: Callable[[dict], Any] | None = None,1230        concurrency_level: int = 5,1231        project_metadata: dict[str, Any] | None = None,1232        revision_id: str | None = None,1233        dataset_version: datetime | str | None = None,1234    ) -> _DatasetRunContainer:1235        project_name = project_name or name_generation.random_name()1236        if revision_id:1237            if not project_metadata:1238                project_metadata = {}1239            project_metadata.update({"revision_id": revision_id})1240        wrapped_model, project, dataset, examples = _prepare_eval_run(1241            client,1242            dataset_name,1243            llm_or_chain_factory,1244            project_name,1245            project_metadata=project_metadata,1246            tags=tags,1247            dataset_version=dataset_version,1248        )1249        tags = tags or []1250        for k, v in (project.metadata.get("git") or {}).items():1251            tags.append(f"git:{k}={v}")1252        run_metadata = {"dataset_version": project.metadata["dataset_version"]}1253        if revision_id:1254            run_metadata["revision_id"] = revision_id1255        wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory)1256        run_evaluators = _setup_evaluation(1257            wrapped_model,1258            examples,1259            evaluation,1260            dataset.data_type or DataType.kv,1261        )1262        _validate_example_inputs(examples[0], wrapped_model, input_mapper)1263        progress_bar = progress.ProgressBarCallback(len(examples))1264        configs = [1265            RunnableConfig(1266                callbacks=[1267                    LangChainTracer(1268                        project_name=project.name,1269                        client=client,1270                        example_id=example.id,1271                    ),1272                    EvaluatorCallbackHandler(1273                        evaluators=run_evaluators or [],1274                        client=client,1275                        example_id=example.id,1276                        max_concurrency=0,1277                    ),1278                    progress_bar,1279                ],1280                tags=tags,1281                max_concurrency=concurrency_level,1282                metadata=run_metadata,1283            )1284            for example in examples1285        ]1286        return cls(1287            client=client,1288            project=project,1289            wrapped_model=wrapped_model,1290            examples=examples,1291            configs=configs,1292            batch_evaluators=evaluation.batch_evaluators if evaluation else None,1293        )129412951296def _is_jupyter_environment() -> bool:1297    try:1298        from IPython.core.getipython import get_ipython12991300        res = get_ipython()  # type: ignore[no-untyped-call]1301        return res is not None and "zmqshell" in str(type(res))1302    except ImportError:1303        return False130413051306def _display_aggregate_results(aggregate_results: pd.DataFrame) -> None:1307    if _is_jupyter_environment():1308        from IPython.display import HTML, display13091310        display(HTML("<h3>Experiment Results:</h3>"))  # type: ignore[no-untyped-call]1311        display(aggregate_results)  # type: ignore[no-untyped-call]1312    else:1313        formatted_string = aggregate_results.to_string(1314            float_format=lambda x: f"{x:.2f}",1315            justify="right",1316        )1317        print("\n Experiment Results:")  # noqa: T2011318        print(formatted_string)  # noqa: T201131913201321_INPUT_MAPPER_DEP_WARNING = (1322    "The input_mapper argument is deprecated and "1323    "will be removed in a future release. Please add a "1324    " RunnableLambda to your chain to map inputs to the expected format"1325    " instead. Example:\n"1326    "def construct_chain():\n"1327    "    my_chain = ...\n"1328    "    input_mapper = {'other_key': 'MyOtherInput', 'my_input_key': x}\n"1329    "    return input_mapper | my_chain\n"1330    "run_on_dataset(..., llm_or_chain_factory=construct_chain)\n"1331    "(See https://api.python.langchain.com/en/latest/schema/"1332    "langchain.schema.runnable.base.RunnableLambda.html)"1333)13341335## Public API133613371338async def arun_on_dataset(1339    client: Client | None,1340    dataset_name: str,1341    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,1342    *,1343    evaluation: smith_eval.RunEvalConfig | None = None,1344    dataset_version: datetime | str | None = None,1345    concurrency_level: int = 5,1346    project_name: str | None = None,1347    project_metadata: dict[str, Any] | None = None,1348    verbose: bool = False,1349    revision_id: str | None = None,1350    **kwargs: Any,1351) -> dict[str, Any]:1352    """Run on dataset.13531354    Run the Chain or language model on a dataset and store traces1355    to the specified project name.13561357    For the (usually faster) async version of this function,1358    see `arun_on_dataset`.13591360    Args:1361        dataset_name: Name of the dataset to run the chain on.1362        llm_or_chain_factory: Language model or Chain constructor to run1363            over the dataset. The Chain constructor is used to permit1364            independent calls on each example without carrying over state.1365        evaluation: Configuration for evaluators to run on the1366            results of the chain.1367        dataset_version: Optional version of the dataset.1368        concurrency_level: The number of async tasks to run concurrently.1369        project_name: Name of the project to store the traces in.1370            Defaults to `{dataset_name}-{chain class name}-{datetime}`.1371        project_metadata: Optional metadata to add to the project.1372            Useful for storing information the test variant.1373            (prompt version, model version, etc.)1374        client: LangSmith client to use to access the dataset and to1375            log feedback and run traces.1376        verbose: Whether to print progress.1377        revision_id: Optional revision identifier to assign this test run to1378            track the performance of different versions of your system.1379        **kwargs: Should not be used, but is provided for backwards compatibility.13801381    Returns:1382        `dict` containing the run's project name and the resulting model outputs.13831384    Examples:1385    ```python1386    from langsmith import Client1387    from langchain_openai import ChatOpenAI1388    from langchain_classic.chains import LLMChain1389    from langchain_classic.smith import smith_eval.RunEvalConfig, run_on_dataset13901391    # Chains may have memory. Passing in a constructor function lets the1392    # evaluation framework avoid cross-contamination between runs.1393    def construct_chain():1394        model = ChatOpenAI(temperature=0)1395        chain = LLMChain.from_string(1396            model,1397            "What's the answer to {your_input_key}"1398        )1399        return chain14001401    # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)1402    evaluation_config = smith_eval.RunEvalConfig(1403        evaluators=[1404            "qa",  # "Correctness" against a reference answer1405            "embedding_distance",1406            smith_eval.RunEvalConfig.Criteria("helpfulness"),1407            smith_eval.RunEvalConfig.Criteria({1408                "fifth-grader-score": "Do you have to be smarter than a fifth "1409                "grader to answer this question?"1410            }),1411        ]1412    )14131414    client = Client()1415    await arun_on_dataset(1416        client,1417        dataset_name="<my_dataset_name>",1418        llm_or_chain_factory=construct_chain,1419        evaluation=evaluation_config,1420    )1421    ```1422    You can also create custom evaluators by subclassing the `StringEvaluator or1423    LangSmith's `RunEvaluator` classes.14241425    ```python1426    from typing import Optional1427    from langchain_classic.evaluation import StringEvaluator142814291430    class MyStringEvaluator(StringEvaluator):1431        @property1432        def requires_input(self) -> bool:1433            return False14341435        @property1436        def requires_reference(self) -> bool:1437            return True14381439        @property1440        def evaluation_name(self) -> str:1441            return "exact_match"14421443        def _evaluate_strings(1444            self, prediction, reference=None, input=None, **kwargs1445        ) -> dict:1446            return {"score": prediction == reference}144714481449    evaluation_config = smith_eval.RunEvalConfig(1450        custom_evaluators=[MyStringEvaluator()],1451    )14521453    await arun_on_dataset(1454        client,1455        dataset_name="<my_dataset_name>",1456        llm_or_chain_factory=construct_chain,1457        evaluation=evaluation_config,1458    )1459    ```1460    """1461    input_mapper = kwargs.pop("input_mapper", None)1462    if input_mapper:1463        warn_deprecated("0.0.305", message=_INPUT_MAPPER_DEP_WARNING, pending=True)1464    if revision_id is None:1465        revision_id = get_langchain_env_var_metadata().get("revision_id")1466    tags = kwargs.pop("tags", None)1467    if tags:1468        warn_deprecated(1469            "0.1.9",1470            message="The tags argument is deprecated and will be"1471            " removed in a future release. Please specify project_metadata instead.",1472            pending=True,1473        )14741475    if kwargs:1476        warn_deprecated(1477            "0.0.305",1478            message="The following arguments are deprecated and "1479            "will be removed in a future release: "1480            f"{kwargs.keys()}.",1481            removal="0.0.305",1482        )1483    client = client or Client()1484    container = _DatasetRunContainer.prepare(1485        client,1486        dataset_name,1487        llm_or_chain_factory,1488        project_name,1489        evaluation,1490        tags,1491        input_mapper,1492        concurrency_level,1493        project_metadata=project_metadata,1494        revision_id=revision_id,1495        dataset_version=dataset_version,1496    )1497    batch_results = await runnable_utils.gather_with_concurrency(1498        container.configs[0].get("max_concurrency"),1499        *map(1500            functools.partial(1501                _arun_llm_or_chain,1502                llm_or_chain_factory=container.wrapped_model,1503                input_mapper=input_mapper,1504            ),1505            container.examples,1506            container.configs,1507        ),1508    )1509    return container.finish(batch_results, verbose=verbose)151015111512def run_on_dataset(1513    client: Client | None,1514    dataset_name: str,1515    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,1516    *,1517    evaluation: smith_eval.RunEvalConfig | None = None,1518    dataset_version: datetime | str | None = None,1519    concurrency_level: int = 5,1520    project_name: str | None = None,1521    project_metadata: dict[str, Any] | None = None,1522    verbose: bool = False,1523    revision_id: str | None = None,1524    **kwargs: Any,1525) -> dict[str, Any]:1526    """Run on dataset.15271528    Run the Chain or language model on a dataset and store traces1529    to the specified project name.15301531    For the (usually faster) async version of this function,1532    see `arun_on_dataset`.15331534    Args:1535        dataset_name: Name of the dataset to run the chain on.1536        llm_or_chain_factory: Language model or Chain constructor to run1537            over the dataset. The Chain constructor is used to permit1538            independent calls on each example without carrying over state.1539        evaluation: Configuration for evaluators to run on the1540            results of the chain.1541        dataset_version: Optional version of the dataset.1542        concurrency_level: The number of async tasks to run concurrently.1543        project_name: Name of the project to store the traces in.1544            Defaults to `{dataset_name}-{chain class name}-{datetime}`.1545        project_metadata: Optional metadata to add to the project.1546            Useful for storing information the test variant.1547            (prompt version, model version, etc.)1548        client: LangSmith client to use to access the dataset and to1549            log feedback and run traces.1550        verbose: Whether to print progress.1551        revision_id: Optional revision identifier to assign this test run to1552            track the performance of different versions of your system.1553        **kwargs: Should not be used, but is provided for backwards compatibility.15541555    Returns:1556        `dict` containing the run's project name and the resulting model outputs.15571558    Examples:1559    ```python1560    from langsmith import Client1561    from langchain_openai import ChatOpenAI1562    from langchain_classic.chains import LLMChain1563    from langchain_classic.smith import smith_eval.RunEvalConfig, run_on_dataset15641565    # Chains may have memory. Passing in a constructor function lets the1566    # evaluation framework avoid cross-contamination between runs.1567    def construct_chain():1568        model = ChatOpenAI(temperature=0)1569        chain = LLMChain.from_string(1570            model,1571            "What's the answer to {your_input_key}"1572        )1573        return chain15741575    # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)1576    evaluation_config = smith_eval.RunEvalConfig(1577        evaluators=[1578            "qa",  # "Correctness" against a reference answer1579            "embedding_distance",1580            smith_eval.RunEvalConfig.Criteria("helpfulness"),1581            smith_eval.RunEvalConfig.Criteria({1582                "fifth-grader-score": "Do you have to be smarter than a fifth "1583                "grader to answer this question?"1584            }),1585        ]1586    )15871588    client = Client()1589    run_on_dataset(1590        client,1591        dataset_name="<my_dataset_name>",1592        llm_or_chain_factory=construct_chain,1593        evaluation=evaluation_config,1594    )1595    ```15961597    You can also create custom evaluators by subclassing the `StringEvaluator` or1598    LangSmith's `RunEvaluator` classes.15991600    ```python1601    from typing import Optional1602    from langchain_classic.evaluation import StringEvaluator160316041605    class MyStringEvaluator(StringEvaluator):1606        @property1607        def requires_input(self) -> bool:1608            return False16091610        @property1611        def requires_reference(self) -> bool:1612            return True16131614        @property1615        def evaluation_name(self) -> str:1616            return "exact_match"16171618        def _evaluate_strings(1619            self, prediction, reference=None, input=None, **kwargs1620        ) -> dict:1621            return {"score": prediction == reference}162216231624    evaluation_config = smith_eval.RunEvalConfig(1625        custom_evaluators=[MyStringEvaluator()],1626    )16271628    run_on_dataset(1629        client,1630        dataset_name="<my_dataset_name>",1631        llm_or_chain_factory=construct_chain,1632        evaluation=evaluation_config,1633    )1634    ```1635    """1636    input_mapper = kwargs.pop("input_mapper", None)1637    if input_mapper:1638        warn_deprecated("0.0.305", message=_INPUT_MAPPER_DEP_WARNING, pending=True)1639    tags = kwargs.pop("tags", None)1640    if tags:1641        warn_deprecated(1642            "0.1.9",1643            message="The tags argument is deprecated and will be"1644            " removed in a future release. Please specify project_metadata instead.",1645            pending=True,1646        )1647    if revision_id is None:1648        revision_id = get_langchain_env_var_metadata().get("revision_id")16491650    if kwargs:1651        warn_deprecated(1652            "0.0.305",1653            message="The following arguments are deprecated and "1654            "will be removed in a future release: "1655            f"{kwargs.keys()}.",1656            removal="0.0.305",1657        )1658    client = client or Client()1659    container = _DatasetRunContainer.prepare(1660        client,1661        dataset_name,1662        llm_or_chain_factory,1663        project_name,1664        evaluation,1665        tags,1666        input_mapper,1667        concurrency_level,1668        project_metadata=project_metadata,1669        revision_id=revision_id,1670        dataset_version=dataset_version,1671    )1672    if concurrency_level == 0:1673        batch_results = [1674            _run_llm_or_chain(1675                example,1676                config,1677                llm_or_chain_factory=container.wrapped_model,1678                input_mapper=input_mapper,1679            )1680            for example, config in zip(1681                container.examples, container.configs, strict=False1682            )1683        ]1684    else:1685        with runnable_config.get_executor_for_config(container.configs[0]) as executor:1686            batch_results = list(1687                executor.map(1688                    functools.partial(1689                        _run_llm_or_chain,1690                        llm_or_chain_factory=container.wrapped_model,1691                        input_mapper=input_mapper,1692                    ),1693                    container.examples,1694                    container.configs,1695                ),1696            )16971698    return container.finish(batch_results, verbose=verbose)

Code quality findings 75

Ensure functions have docstrings for documentation
missing-docstring
def get_aggregate_feedback(
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(output_, dict):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(result["reference"], dict):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(llm_or_chain_factory, Chain):
Ensure functions have docstrings for documentation
missing-docstring
"def chain_constructor():\n"
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(llm_or_chain_factory, BaseLanguageModel):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(llm_or_chain_factory, Runnable):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(_model, BaseLanguageModel):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if not isinstance(_model, Runnable):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if not isinstance(inputs["prompt"], str):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if not isinstance(inputs["prompts"], list) or not all(
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
isinstance(i, str) for i in inputs["prompts"]
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(prompt_, str):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
elif isinstance(prompt_, list) and all(isinstance(i, str) for i in prompt_):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(raw_messages, list) and all(
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
isinstance(i, dict) for i in raw_messages
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if not isinstance(prompt_input, str) and not (
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
isinstance(prompt_input, list)
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if not isinstance(first_inputs, dict):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(llm_or_chain_factory, BaseLanguageModel):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(chain, Chain):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
elif isinstance(chain, Runnable):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(llm_or_chain_factory, BaseLanguageModel):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
run_inputs = chain.input_keys if isinstance(chain, Chain) else None
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
run_outputs = chain.output_keys if isinstance(chain, Chain) else None
Avoid unnecessary list conversions; use generators where possible
unnecessary-list
list(examples[0].outputs) if examples[0].outputs else None,
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(eval_config, RunEvaluator):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(eval_config, (EvaluatorType, str)):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if not isinstance(eval_config, EvaluatorType):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
elif isinstance(eval_config, smith_eval_config.EvalConfig):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(eval_config, smith_eval_config.SingleKeyEvalConfig):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(evaluator_, StringEvaluator):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
elif isinstance(evaluator_, PairwiseStringEvaluator):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(custom_evaluator, RunEvaluator):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
elif isinstance(custom_evaluator, StringEvaluator):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(prompt_or_messages, str) or (
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
isinstance(prompt_or_messages, list)
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
isinstance(chain, Chain)
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
and isinstance(inputs_, dict)
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
"LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"
Ensure try blocks have corresponding except or finally blocks
try-without-except
try:
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(llm_or_chain_factory, BaseLanguageModel):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(prompt_or_messages, str) or (
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
isinstance(prompt_or_messages, list)
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
isinstance(chain, Chain)
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
and isinstance(inputs_, dict)
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
"LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"
Ensure try blocks have corresponding except or finally blocks
try-without-except
try:
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(llm_or_chain_factory, BaseLanguageModel):
Use isinstance() for type checking instead of type()
type-check
error_type = type(e).__name__
Avoid unnecessary list conversions; use generators where possible
unnecessary-list
examples = list(client.list_examples(dataset_id=dataset.id, as_of=dataset_version))
Ensure try blocks have corresponding except or finally blocks
try-without-except
try:
Use logging module for better control and configurability
print-statement
print( # noqa: T201
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(output, EvalError):
Ensure try blocks have corresponding except or finally blocks
try-without-except
try:
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(result, EvaluationResult):
Catch specific exceptions instead of Exception to avoid masking bugs
broad-except
except Exception:
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(callback, EvaluatorCallbackHandler):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
elif isinstance(callback, LangChainTracer):
Ensure functions have docstrings for documentation
missing-docstring
def finish(
Ensure functions have docstrings for documentation
missing-docstring
def prepare(
Use logging module for better control and configurability
print-statement
print("\n Experiment Results:") # noqa: T201
Use logging module for better control and configurability
print-statement
print(formatted_string) # noqa: T201
Ensure functions have docstrings for documentation
missing-docstring
"def construct_chain():\n"
Ensure functions have docstrings for documentation
missing-docstring
async def arun_on_dataset(
Ensure functions have docstrings for documentation
missing-docstring
def construct_chain():
Ensure functions have docstrings for documentation
missing-docstring
def requires_input(self) -> bool:
Ensure functions have docstrings for documentation
missing-docstring
def requires_reference(self) -> bool:
Ensure functions have docstrings for documentation
missing-docstring
def evaluation_name(self) -> str:
Ensure functions have docstrings for documentation
missing-docstring
def run_on_dataset(
Ensure functions have docstrings for documentation
missing-docstring
def construct_chain():
Ensure functions have docstrings for documentation
missing-docstring
def requires_input(self) -> bool:
Ensure functions have docstrings for documentation
missing-docstring
def requires_reference(self) -> bool:
Ensure functions have docstrings for documentation
missing-docstring
def evaluation_name(self) -> str:
Avoid unnecessary list conversions; use generators where possible
unnecessary-list
batch_results = list(

Get this view in your editor

Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.