libs/langchain/langchain_classic/smith/evaluation/string_run_evaluator.py PYTHON 480 lines View on github.com → Search inside
"""Run evaluator wrapper for string evaluators."""

from __future__ import annotations

import logging
import uuid
from abc import abstractmethod
from typing import Any, cast

from langchain_core.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
)
from langchain_core.load.dump import dumpd
from langchain_core.load.load import load
from langchain_core.load.serializable import Serializable
from langchain_core.messages import BaseMessage, get_buffer_string, messages_from_dict
from langsmith import EvaluationResult, RunEvaluator
from langsmith.schemas import DataType, Example, Run
from typing_extensions import override

from langchain_classic.chains.base import Chain
from langchain_classic.evaluation.schema import StringEvaluator
from langchain_classic.schema import RUN_KEY

_logger = logging.getLogger(__name__)


def _get_messages_from_run_dict(messages: list[dict]) -> list[BaseMessage]:
    """Convert message dictionaries stored on a run into message objects.

    Args:
        messages: The message dictionaries to convert.

    Returns:
        The converted messages, or an empty list if `messages` is empty.
    """
    if not messages:
        return []
    first_message = messages[0]
    # Only the first entry is inspected to pick the decoding path; an "lc" key
    # is presumably the LangChain serialization marker (confirm against
    # `langchain_core.load`).
    if "lc" in first_message:
        return [
            load(dumpd(message), allowed_objects="messages") for message in messages
        ]
    return messages_from_dict(messages)


class StringRunMapper(Serializable):
    """Extract items to evaluate from the run object."""

    @property
    def output_keys(self) -> list[str]:
        """The keys to extract from the run."""
        return ["prediction", "input"]

    @abstractmethod
    def map(self, run: Run) -> dict[str, str]:
        """Maps the Run to a dictionary."""

    def __call__(self, run: Run) -> dict[str, str]:
        """Maps the Run to a dictionary.

        Raises:
            ValueError: If the run has no outputs.
        """
        if not run.outputs:
            msg = f"Run {run.id} has no outputs to evaluate."
            raise ValueError(msg)
        return self.map(run)


class LLMStringRunMapper(StringRunMapper):
    """Extract items to evaluate from the run object."""

    def serialize_chat_messages(self, messages: list[dict] | list[list[dict]]) -> str:
        """Render chat messages as a single buffer string.

        Args:
            messages: Either a list of message dicts, or a list of lists of
                message dicts (in which case only the first inner list is used).

        Returns:
            The messages formatted by `get_buffer_string`.

        Raises:
            ValueError: If `messages` is empty, is not a list, or its first
                element is neither a dict nor a list.
        """
        if isinstance(messages, list) and messages:
            if isinstance(messages[0], dict):
                chat_messages = _get_messages_from_run_dict(
                    cast("list[dict]", messages)
                )
            elif isinstance(messages[0], list):
                # Runs from Tracer have messages as a list of lists of dicts
                chat_messages = _get_messages_from_run_dict(messages[0])
            else:
                msg = f"Could not extract messages to evaluate {messages}"  # type: ignore[unreachable]
                raise ValueError(msg)
            return get_buffer_string(chat_messages)
        msg = f"Could not extract messages to evaluate {messages}"
        raise ValueError(msg)

    def serialize_inputs(self, inputs: dict) -> str:
        """Serialize inputs.

        Args:
            inputs: The inputs from the run, expected to contain prompts or messages.

        Returns:
            The serialized input text from the prompts or messages.

        Raises:
            ValueError: If neither prompts nor messages are found in the inputs.
        """
        if "prompts" in inputs:  # Should we even accept this?
            input_ = "\n\n".join(inputs["prompts"])
        elif "prompt" in inputs:
            input_ = inputs["prompt"]
        elif "messages" in inputs:
            input_ = self.serialize_chat_messages(inputs["messages"])
        else:
            msg = "LLM Run must have either messages or prompts as inputs."
            raise ValueError(msg)
        return input_

    def serialize_outputs(self, outputs: dict) -> str:
        """Serialize outputs.

        Args:
            outputs: The outputs from the run, expected to contain generations.

        Returns:
            The serialized output text from the first generation.

        Raises:
            ValueError: If no generations are found in the outputs or if the generations
                are empty.
        """
        generations: list[dict] | list[list[dict]] | None = outputs.get("generations")
        if not generations:
            msg = "Cannot evaluate LLM Run without generations."
            raise ValueError(msg)
        first_generation: dict | list[dict] = generations[0]
        if isinstance(first_generation, list):
            # Runs from Tracer have generations as a list of lists of dicts
            # Whereas Runs from the API have a list of dicts
            if not first_generation:
                # Raise the documented ValueError rather than a bare IndexError.
                msg = "Cannot evaluate LLM run with empty generations."
                raise ValueError(msg)
            first_generation = first_generation[0]
        if "message" in first_generation:
            output_ = self.serialize_chat_messages([first_generation["message"]])
        else:
            output_ = first_generation["text"]
        return output_

    def map(self, run: Run) -> dict[str, str]:
        """Maps the Run to a dictionary.

        Raises:
            ValueError: If the run is not an LLM run, has no outputs, or its
                inputs or outputs cannot be serialized.
        """
        if run.run_type != "llm":
            msg = "LLM RunMapper only supports LLM runs."
            raise ValueError(msg)
        if not run.outputs:
            if run.error:
                msg = f"Cannot evaluate errored LLM run {run.id}: {run.error}"
                raise ValueError(msg)
            msg = f"Run {run.id} has no outputs. Cannot evaluate this run."
            raise ValueError(msg)
        # Any failure while parsing is re-raised as ValueError with the
        # offending payload, keeping the original exception as the cause.
        try:
            inputs = self.serialize_inputs(run.inputs)
        except Exception as e:
            msg = f"Could not parse LM input from run inputs {run.inputs}"
            raise ValueError(msg) from e
        try:
            output_ = self.serialize_outputs(run.outputs)
        except Exception as e:
            msg = f"Could not parse LM prediction from run outputs {run.outputs}"
            raise ValueError(msg) from e
        return {"input": inputs, "prediction": output_}


class ChainStringRunMapper(StringRunMapper):
    """Extract items to evaluate from the run object from a chain."""

    input_key: str | None = None
    """The key from the model Run's inputs to use as the eval input.
    If not provided, will use the only input key or raise an
    error if there are multiple."""
    prediction_key: str | None = None
    """The key from the model Run's outputs to use as the eval prediction.
    If not provided, will use the only output key or raise an error
    if there are multiple."""

    def _get_key(self, source: dict, key: str | None, which: str) -> str:
        """Pick the value to evaluate from a run's inputs or outputs.

        Args:
            source: The dictionary to read from.
            key: The key to read, or `None` to use the only entry in `source`.
            which: Label used in the error message (e.g. `'input'`).

        Returns:
            The value stored under `key`, or the single value in `source`.

        Raises:
            ValueError: If `key` is `None` and `source` does not have exactly
                one entry.
        """
        if key is not None:
            return source[key]
        if len(source) == 1:
            return next(iter(source.values()))
        msg = (
            f"Could not map run {which} with multiple keys: "
            f"{source}\nPlease manually specify a {which}_key"
        )
        raise ValueError(msg)

    def map(self, run: Run) -> dict[str, str]:
        """Maps the Run to a dictionary.

        Raises:
            ValueError: If the run has no outputs, if a configured key is
                missing, or if no key is configured and there are multiple
                candidates.
        """
        if not run.outputs:
            msg = (
                f"Run with ID {run.id} lacks outputs required for evaluation."
                " Ensure the Run has valid outputs."
            )
            raise ValueError(msg)
        if self.input_key is not None and self.input_key not in run.inputs:
            # Format the keys the same way as the prediction-key message below.
            available_input_keys = ", ".join(map(str, run.inputs))
            msg = (
                f"Run with ID {run.id} is missing the expected input key"
                f" '{self.input_key}'.\nAvailable input keys in this Run"
                f" are: {available_input_keys}.\nAdjust the evaluator's"
                f" input_key or ensure your input data includes key"
                f" '{self.input_key}'."
            )
            raise ValueError(msg)
        if self.prediction_key is not None and self.prediction_key not in run.outputs:
            available_keys = ", ".join(run.outputs.keys())
            msg = (
                f"Run with ID {run.id} doesn't have the expected prediction key"
                f" '{self.prediction_key}'. Available prediction keys in this Run are:"
                f" {available_keys}. Adjust the evaluator's prediction_key or"
                " ensure the Run object's outputs the expected key."
            )
            raise ValueError(msg)

        input_ = self._get_key(run.inputs, self.input_key, "input")
        prediction = self._get_key(run.outputs, self.prediction_key, "prediction")
        return {
            "input": input_,
            "prediction": prediction,
        }


class ToolStringRunMapper(StringRunMapper):
    """Map an input to the tool."""

    @override
    def map(self, run: Run) -> dict[str, str]:
        """Maps the Run to a dictionary.

        Reads the `'input'` entry of the run inputs and the `'output'` entry of
        the run outputs.

        Raises:
            ValueError: If the run has no outputs.
        """
        if not run.outputs:
            msg = f"Run {run.id} has no outputs to evaluate."
            raise ValueError(msg)
        return {"input": run.inputs["input"], "prediction": run.outputs["output"]}


class StringExampleMapper(Serializable):
    """Map an example, or row in the dataset, to the inputs of an evaluation."""

    reference_key: str | None = None

    @property
    def output_keys(self) -> list[str]:
        """The keys to extract from the run."""
        return ["reference"]

    def serialize_chat_messages(self, messages: list[dict]) -> str:
        """Render message dictionaries as a single buffer string."""
        chat_messages = _get_messages_from_run_dict(messages)
        return get_buffer_string(chat_messages)

    def map(self, example: Example) -> dict[str, str]:
        """Maps the Example, or dataset row to a dictionary.

        Raises:
            ValueError: If the example has no outputs, if `reference_key` is
                unset and there are multiple outputs, or if `reference_key` is
                not among the outputs.
        """
        if not example.outputs:
            msg = f"Example {example.id} has no outputs to use as a reference."
            raise ValueError(msg)
        if self.reference_key is None:
            if len(example.outputs) > 1:
                msg = (
                    f"Example {example.id} has multiple outputs, so you must"
                    " specify a reference_key."
                )
                raise ValueError(msg)
            output = next(iter(example.outputs.values()))
        elif self.reference_key not in example.outputs:
            msg = (
                f"Example {example.id} does not have reference key"
                f" {self.reference_key}."
            )
            raise ValueError(msg)
        else:
            output = example.outputs[self.reference_key]
        # A dict with truthy "type" and "data" entries is rendered as a chat
        # message; any other value is passed through unchanged.
        return {
            "reference": self.serialize_chat_messages([output])
            if isinstance(output, dict) and output.get("type") and output.get("data")
            else output,
        }

    def __call__(self, example: Example) -> dict[str, str]:
        """Maps the Example to a dictionary.

        Raises:
            ValueError: If the example has no outputs.
        """
        if not example.outputs:
            msg = f"Example {example.id} has no outputs to use as a reference label."
            raise ValueError(msg)
        return self.map(example)


class StringRunEvaluatorChain(Chain, RunEvaluator):
    """Evaluate Run and optional examples."""

    run_mapper: StringRunMapper
    """Maps the Run to a dictionary with 'input' and 'prediction' strings."""
    example_mapper: StringExampleMapper | None = None
    """Maps the Example (dataset row) to a dictionary
    with a 'reference' string."""
    name: str
    """The name of the evaluation metric."""
    string_evaluator: StringEvaluator
    """The evaluation chain."""

    @property
    @override
    def input_keys(self) -> list[str]:
        """The input keys of the chain: `'run'` and `'example'`."""
        return ["run", "example"]

    @property
    @override
    def output_keys(self) -> list[str]:
        """The output keys of the chain: `'feedback'`."""
        return ["feedback"]

    def _prepare_input(self, inputs: dict[str, Any]) -> dict[str, str]:
        """Build the keyword arguments for the string evaluator.

        Args:
            inputs: The chain inputs, holding a `'run'` and optionally an
                `'example'`.

        Returns:
            The mapped run fields, plus the mapped example fields when the
            evaluator requires a reference.

        Raises:
            ValueError: If the evaluator requires a reference but no example
                or no example mapper is available.
        """
        run: Run = inputs["run"]
        example: Example | None = inputs.get("example")
        evaluate_strings_inputs = self.run_mapper(run)
        if not self.string_evaluator.requires_input:
            # Hide warning about unused input
            evaluate_strings_inputs.pop("input", None)
        if example and self.example_mapper and self.string_evaluator.requires_reference:
            evaluate_strings_inputs.update(self.example_mapper(example))
        elif self.string_evaluator.requires_reference:
            msg = (
                f"Evaluator {self.name} requires a reference"
                " example from the dataset,"
                f" but none was provided for run {run.id}."
            )
            raise ValueError(msg)
        return evaluate_strings_inputs

    def _prepare_output(self, output: dict[str, Any]) -> dict[str, Any]:
        """Wrap the evaluator output in an `EvaluationResult` under `'feedback'`."""
        evaluation_result = EvaluationResult(
            key=self.name,
            comment=output.get("reasoning"),
            **output,
        )
        if RUN_KEY in output:
            # TODO: Not currently surfaced. Update
            evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY]
        return {"feedback": evaluation_result}

    def _call(
        self,
        inputs: dict[str, Any],
        run_manager: CallbackManagerForChainRun | None = None,
    ) -> dict[str, Any]:
        """Call the evaluation chain."""
        evaluate_strings_inputs = self._prepare_input(inputs)
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        callbacks = _run_manager.get_child()
        chain_output = self.string_evaluator.evaluate_strings(
            **evaluate_strings_inputs,
            callbacks=callbacks,
            include_run_info=True,
        )
        return self._prepare_output(chain_output)

    async def _acall(
        self,
        inputs: dict[str, Any],
        run_manager: AsyncCallbackManagerForChainRun | None = None,
    ) -> dict[str, Any]:
        """Call the evaluation chain."""
        evaluate_strings_inputs = self._prepare_input(inputs)
        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
        callbacks = _run_manager.get_child()
        chain_output = await self.string_evaluator.aevaluate_strings(
            **evaluate_strings_inputs,
            callbacks=callbacks,
            include_run_info=True,
        )
        return self._prepare_output(chain_output)

    def _prepare_evaluator_output(self, output: dict[str, Any]) -> EvaluationResult:
        """Return the `'feedback'` result, filling in its run info if absent."""
        feedback: EvaluationResult = output["feedback"]
        if RUN_KEY not in feedback.evaluator_info:
            feedback.evaluator_info[RUN_KEY] = output[RUN_KEY]
        return feedback

    @override
    def evaluate_run(
        self,
        run: Run,
        example: Example | None = None,
        evaluator_run_id: uuid.UUID | None = None,
    ) -> EvaluationResult:
        """Evaluate an example.

        Args:
            run: The run to evaluate.
            example: The dataset example for the run, if any.
            evaluator_run_id: Not used by this implementation.

        Returns:
            The evaluation feedback. If evaluation raises, the error is logged
            and a result whose comment describes the error is returned.
        """
        try:
            result = self({"run": run, "example": example}, include_run_info=True)
            return self._prepare_evaluator_output(result)
        except Exception as e:
            _logger.exception("Error evaluating run %s", run.id)
            return EvaluationResult(
                key=self.string_evaluator.evaluation_name,
                comment=f"Error evaluating run {run.id}: {e}",
                # TODO: Add run ID once we can declare it via callbacks
            )

    @override
    async def aevaluate_run(
        self,
        run: Run,
        example: Example | None = None,
        evaluator_run_id: uuid.UUID | None = None,
    ) -> EvaluationResult:
        """Evaluate an example asynchronously.

        Args:
            run: The run to evaluate.
            example: The dataset example for the run, if any.
            evaluator_run_id: Not used by this implementation.

        Returns:
            The evaluation feedback. If evaluation raises, the error is logged
            and a result whose comment describes the error is returned.
        """
        try:
            result = await self.acall(
                {"run": run, "example": example},
                include_run_info=True,
            )
            return self._prepare_evaluator_output(result)
        except Exception as e:
            _logger.exception("Error evaluating run %s", run.id)
            return EvaluationResult(
                key=self.string_evaluator.evaluation_name,
                comment=f"Error evaluating run {run.id}: {e}",
            )

    @classmethod
    def from_run_and_data_type(
        cls,
        evaluator: StringEvaluator,
        run_type: str,
        data_type: DataType,
        input_key: str | None = None,
        prediction_key: str | None = None,
        reference_key: str | None = None,
        tags: list[str] | None = None,
    ) -> StringRunEvaluatorChain:
        """Create a StringRunEvaluatorChain.

        Create a StringRunEvaluatorChain from an evaluator and the run and dataset
        types.

        This method provides an easy way to instantiate a StringRunEvaluatorChain, by
        taking an evaluator and information about the type of run and the data.
        The method supports LLM and chain runs.

        Args:
            evaluator: The string evaluator to use.
            run_type: The type of run being evaluated.
                Supported types are LLM and Chain.
            data_type: The type of dataset used in the run.
            input_key: The key used to map the input from the run.
            prediction_key: The key used to map the prediction from the run.
            reference_key: The key used to map the reference from the dataset.
                When omitted, the example mapper uses the example's only
                output and raises at evaluation time if there are several.
            tags: List of tags to attach to the evaluation chain.

        Returns:
            The instantiated evaluation chain.

        Raises:
            ValueError: If the run type is not supported.

        """
        # Configure how run inputs/predictions are passed to the evaluator
        if run_type == "llm":
            run_mapper: StringRunMapper = LLMStringRunMapper()
        elif run_type == "chain":
            run_mapper = ChainStringRunMapper(
                input_key=input_key,
                prediction_key=prediction_key,
            )
        else:
            msg = f"Unsupported run type {run_type}. Expected one of 'llm' or 'chain'."
            raise ValueError(msg)

        # Configure how example rows are fed as a reference string to the evaluator
        example_mapper: StringExampleMapper | None
        if (
            reference_key is not None
            or data_type in (DataType.llm, DataType.chat)
            or evaluator.requires_reference
        ):
            example_mapper = StringExampleMapper(reference_key=reference_key)
        else:
            example_mapper = None
        return cls(
            name=evaluator.evaluation_name,
            run_mapper=run_mapper,
            example_mapper=example_mapper,
            string_evaluator=evaluator,
            tags=tags,
        )

Code quality findings 11

Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(messages, list) and messages:
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(messages[0], dict):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
elif isinstance(messages[0], list):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(first_generation, list):
Ensure functions have docstrings for documentation
missing-docstring
def map(self, run: Run) -> dict[str, str]:
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(output, dict) and output.get("type") and output.get("data")
Ensure functions have docstrings for documentation
missing-docstring
def input_keys(self) -> list[str]:
Ensure functions have docstrings for documentation
missing-docstring
def output_keys(self) -> list[str]:
Ensure functions have docstrings for documentation
missing-docstring
def evaluate_run(
Ensure functions have docstrings for documentation
missing-docstring
async def aevaluate_run(
Ensure functions have docstrings for documentation
missing-docstring
def from_run_and_data_type(

Get this view in your editor

Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.