# libs/langchain/langchain_classic/smith/evaluation/string_run_evaluator.py · langchain-ai/langchain

"""Run evaluator wrapper for string evaluators."""

from __future__ import annotations

import logging
import uuid
from abc import abstractmethod
from typing import Any

from langchain_core.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
)
from langchain_core.load.dump import dumpd
from langchain_core.load.load import load
from langchain_core.load.serializable import Serializable
from langchain_core.messages import BaseMessage, get_buffer_string, messages_from_dict
from langsmith import EvaluationResult, RunEvaluator
from langsmith.schemas import DataType, Example, Run
from typing_extensions import override

from langchain_classic.chains.base import Chain
from langchain_classic.evaluation.schema import StringEvaluator
from langchain_classic.schema import RUN_KEY

_logger = logging.getLogger(__name__)


def _get_messages_from_run_dict(messages: list[dict]) -> list[BaseMessage]:
    """Convert message dicts taken from a run into message objects.

    Args:
        messages: The message dicts to convert.

    Returns:
        The converted messages, or an empty list if `messages` is empty.
    """
    if not messages:
        return []
    first_message = messages[0]
    # The first entry decides the format for the whole list: dicts carrying an
    # "lc" key are revived through `load`, anything else is handed to
    # `messages_from_dict`.
    if "lc" in first_message:
        return [
            load(dumpd(message), allowed_objects="messages") for message in messages
        ]
    return messages_from_dict(messages)


class StringRunMapper(Serializable):
    """Extract items to evaluate from the run object."""

    @property
    def output_keys(self) -> list[str]:
        """The keys to extract from the run."""
        return ["prediction", "input"]

    @abstractmethod
    def map(self, run: Run) -> dict[str, str]:
        """Maps the Run to a dictionary."""

    def __call__(self, run: Run) -> dict[str, str]:
        """Maps the Run to a dictionary.

        Args:
            run: The run to map.

        Returns:
            The result of `map` for the run.

        Raises:
            ValueError: If the run has no outputs.
        """
        if not run.outputs:
            msg = f"Run {run.id} has no outputs to evaluate."
            raise ValueError(msg)
        return self.map(run)


class LLMStringRunMapper(StringRunMapper):
    """Extract items to evaluate from the run object."""

    def serialize_chat_messages(self, messages: list[dict] | list[list[dict]]) -> str:
        """Extract the input messages from the run.

        Args:
            messages: Either a list of message dicts or a list of lists of
                message dicts. For the nested form only the first inner list
                is used.

        Returns:
            The messages rendered as a single buffer string.

        Raises:
            ValueError: If `messages` is empty, is not a list, or its first
                element is neither a dict nor a list.
        """
        if isinstance(messages, list) and messages:
            if isinstance(messages[0], dict):
                chat_messages = _get_messages_from_run_dict(messages)
            elif isinstance(messages[0], list):
                # Runs from Tracer have messages as a list of lists of dicts
                chat_messages = _get_messages_from_run_dict(messages[0])
            else:
                msg = f"Could not extract messages to evaluate {messages}"  # type: ignore[unreachable]
                raise ValueError(msg)
            return get_buffer_string(chat_messages)
        msg = f"Could not extract messages to evaluate {messages}"
        raise ValueError(msg)

    def serialize_inputs(self, inputs: dict) -> str:
        """Serialize inputs.

        Args:
            inputs: The inputs from the run, expected to contain prompts or messages.

        Returns:
            The serialized input text from the prompts or messages.

        Raises:
            ValueError: If neither prompts nor messages are found in the inputs.
        """
        if "prompts" in inputs:  # Should we even accept this?
            input_ = "\n\n".join(inputs["prompts"])
        elif "prompt" in inputs:
            input_ = inputs["prompt"]
        elif "messages" in inputs:
            input_ = self.serialize_chat_messages(inputs["messages"])
        else:
            msg = "LLM Run must have either messages or prompts as inputs."
            raise ValueError(msg)
        return input_

    def serialize_outputs(self, outputs: dict) -> str:
        """Serialize outputs.

        Args:
            outputs: The outputs from the run, expected to contain generations.

        Returns:
            The serialized output text from the first generation.

        Raises:
            ValueError: If no generations are found in the outputs or if the
                generations are empty.
        """
        # A missing key and an empty list are both falsy, so this single check
        # covers "no generations at all".
        if not outputs.get("generations"):
            msg = "Cannot evaluate LLM Run without generations."
            raise ValueError(msg)
        generations: list[dict] | list[list[dict]] = outputs["generations"]
        first_generation: dict | list[dict] = generations[0]
        if isinstance(first_generation, list):
            # Runs from Tracer have generations as a list of lists of dicts
            # Whereas Runs from the API have a list of dicts
            if not first_generation:
                # Guard the nested form explicitly instead of letting the
                # index below fail with a bare IndexError.
                msg = "Cannot evaluate LLM run with empty generations."
                raise ValueError(msg)
            first_generation = first_generation[0]
        if "message" in first_generation:
            output_ = self.serialize_chat_messages([first_generation["message"]])
        else:
            output_ = first_generation["text"]
        return output_

    def map(self, run: Run) -> dict[str, str]:
        """Maps the Run to a dictionary.

        Args:
            run: The LLM run to map.

        Returns:
            A dictionary with the serialized `input` and `prediction`.

        Raises:
            ValueError: If the run is not an LLM run, has no outputs, or its
                inputs or outputs cannot be serialized.
        """
        if run.run_type != "llm":
            msg = "LLM RunMapper only supports LLM runs."
            raise ValueError(msg)
        if not run.outputs:
            if run.error:
                msg = f"Cannot evaluate errored LLM run {run.id}: {run.error}"
                raise ValueError(msg)
            msg = f"Run {run.id} has no outputs. Cannot evaluate this run."
            raise ValueError(msg)
        # Any failure while serializing is re-raised as a ValueError that
        # includes the offending payload, with the original error chained.
        try:
            inputs = self.serialize_inputs(run.inputs)
        except Exception as e:
            msg = f"Could not parse LM input from run inputs {run.inputs}"
            raise ValueError(msg) from e
        try:
            output_ = self.serialize_outputs(run.outputs)
        except Exception as e:
            msg = f"Could not parse LM prediction from run outputs {run.outputs}"
            raise ValueError(msg) from e
        return {"input": inputs, "prediction": output_}


class ChainStringRunMapper(StringRunMapper):
    """Extract items to evaluate from the run object from a chain."""

    input_key: str | None = None
    """The key from the model Run's inputs to use as the eval input.
    If not provided, will use the only input key or raise an
    error if there are multiple."""
    prediction_key: str | None = None
    """The key from the model Run's outputs to use as the eval prediction.
    If not provided, will use the only output key or raise an error
    if there are multiple."""

    def _get_key(self, source: dict, key: str | None, which: str) -> str:
        """Pick a single value out of a run's inputs or outputs.

        Args:
            source: The dictionary to read from.
            key: The key to read, or `None` to use the only value in `source`.
            which: Label used in the error message (e.g. `input`).

        Returns:
            The selected value.

        Raises:
            ValueError: If `key` is `None` and `source` does not contain
                exactly one entry.
        """
        if key is not None:
            return source[key]
        if len(source) == 1:
            return next(iter(source.values()))
        msg = (
            f"Could not map run {which} with multiple keys: "
            f"{source}\nPlease manually specify a {which}_key"
        )
        raise ValueError(msg)

    def map(self, run: Run) -> dict[str, str]:
        """Maps the Run to a dictionary.

        Args:
            run: The chain run to map.

        Returns:
            A dictionary with the selected `input` and `prediction`.

        Raises:
            ValueError: If the run has no outputs, if a configured key is
                missing from the run, or if no key is configured and the run
                has more than one candidate.
        """
        if not run.outputs:
            msg = (
                f"Run with ID {run.id} lacks outputs required for evaluation."
                " Ensure the Run has valid outputs."
            )
            raise ValueError(msg)
        if self.input_key is not None and self.input_key not in run.inputs:
            available_keys = ", ".join(run.inputs.keys())
            msg = (
                f"Run with ID {run.id} is missing the expected input key"
                f" '{self.input_key}'.\nAvailable input keys in this Run"
                f" are: {available_keys}.\nAdjust the evaluator's"
                " input_key or ensure your input data includes key"
                f" '{self.input_key}'."
            )
            raise ValueError(msg)
        if self.prediction_key is not None and self.prediction_key not in run.outputs:
            available_keys = ", ".join(run.outputs.keys())
            msg = (
                f"Run with ID {run.id} doesn't have the expected prediction key"
                f" '{self.prediction_key}'. Available prediction keys in this Run are:"
                f" {available_keys}. Adjust the evaluator's prediction_key or"
                " ensure the Run object's outputs the expected key."
            )
            raise ValueError(msg)

        input_ = self._get_key(run.inputs, self.input_key, "input")
        prediction = self._get_key(run.outputs, self.prediction_key, "prediction")
        return {
            "input": input_,
            "prediction": prediction,
        }


class ToolStringRunMapper(StringRunMapper):
    """Map an input to the tool."""

    @override
    def map(self, run: Run) -> dict[str, str]:
        """Maps the Run to a dictionary.

        Reads the `input` key of the run inputs and the `output` key of the
        run outputs.

        Raises:
            ValueError: If the run has no outputs.
        """
        if not run.outputs:
            msg = f"Run {run.id} has no outputs to evaluate."
            raise ValueError(msg)
        return {"input": run.inputs["input"], "prediction": run.outputs["output"]}


class StringExampleMapper(Serializable):
    """Map an example, or row in the dataset, to the inputs of an evaluation."""

    reference_key: str | None = None

    @property
    def output_keys(self) -> list[str]:
        """The keys to extract from the run."""
        return ["reference"]

    def serialize_chat_messages(self, messages: list[dict]) -> str:
        """Extract the input messages from the run."""
        chat_messages = _get_messages_from_run_dict(messages)
        return get_buffer_string(chat_messages)

    def map(self, example: Example) -> dict[str, str]:
        """Maps the Example, or dataset row to a dictionary.

        Args:
            example: The dataset example to read the reference from.

        Returns:
            A dictionary with a single `reference` entry.

        Raises:
            ValueError: If the example has no outputs, if `reference_key` is
                unset and the example has several outputs, or if
                `reference_key` is not among the example outputs.
        """
        if not example.outputs:
            msg = f"Example {example.id} has no outputs to use as a reference."
            raise ValueError(msg)
        if self.reference_key is None:
            if len(example.outputs) > 1:
                msg = (
                    f"Example {example.id} has multiple outputs, so you must"
                    " specify a reference_key."
                )
                raise ValueError(msg)
            output = next(iter(example.outputs.values()))
        elif self.reference_key not in example.outputs:
            msg = (
                f"Example {example.id} does not have reference key"
                f" {self.reference_key}."
            )
            raise ValueError(msg)
        else:
            output = example.outputs[self.reference_key]
        # A dict with truthy "type" and "data" entries is treated as a
        # serialized chat message and rendered to text; anything else is
        # passed through unchanged.
        return {
            "reference": self.serialize_chat_messages([output])
            if isinstance(output, dict) and output.get("type") and output.get("data")
            else output,
        }

    def __call__(self, example: Example) -> dict[str, str]:
        """Maps the Run and Example to a dictionary.

        Raises:
            ValueError: If the example has no outputs.
        """
        if not example.outputs:
            msg = f"Example {example.id} has no outputs to use as a reference label."
            raise ValueError(msg)
        return self.map(example)


class StringRunEvaluatorChain(Chain, RunEvaluator):
    """Evaluate Run and optional examples."""

    run_mapper: StringRunMapper
    """Maps the Run to a dictionary with 'input' and 'prediction' strings."""
    example_mapper: StringExampleMapper | None = None
    """Maps the Example (dataset row) to a dictionary
    with a 'reference' string."""
    name: str
    """The name of the evaluation metric."""
    string_evaluator: StringEvaluator
    """The evaluation chain."""

    @property
    @override
    def input_keys(self) -> list[str]:
        return ["run", "example"]

    @property
    @override
    def output_keys(self) -> list[str]:
        return ["feedback"]

    def _prepare_input(self, inputs: dict[str, Any]) -> dict[str, str]:
        """Build the keyword arguments for the string evaluator.

        Args:
            inputs: The chain inputs, holding a `run` and optionally an
                `example`.

        Returns:
            The mapped run values, plus the mapped reference when the
            evaluator requires one.

        Raises:
            ValueError: If the evaluator requires a reference but either the
                example or the example mapper is missing.
        """
        run: Run = inputs["run"]
        example: Example | None = inputs.get("example")
        evaluate_strings_inputs = self.run_mapper(run)
        if not self.string_evaluator.requires_input:
            # Hide warning about unused input
            evaluate_strings_inputs.pop("input", None)
        if example and self.example_mapper and self.string_evaluator.requires_reference:
            evaluate_strings_inputs.update(self.example_mapper(example))
        elif self.string_evaluator.requires_reference:
            msg = (
                f"Evaluator {self.name} requires a reference"
                " example from the dataset,"
                f" but none was provided for run {run.id}."
            )
            raise ValueError(msg)
        return evaluate_strings_inputs

    def _prepare_output(self, output: dict[str, Any]) -> dict[str, Any]:
        """Wrap the evaluator output in an `EvaluationResult`.

        Args:
            output: The dictionary returned by the string evaluator.

        Returns:
            A dictionary with the result stored under `feedback`.
        """
        # The evaluator's "reasoning" entry, if any, becomes the comment; the
        # remaining entries are forwarded as keyword arguments.
        evaluation_result = EvaluationResult(
            key=self.name,
            comment=output.get("reasoning"),
            **output,
        )
        if RUN_KEY in output:
            # TODO: Not currently surfaced. Update
            evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY]
        return {"feedback": evaluation_result}

    def _call(
        self,
        inputs: dict[str, Any],
        run_manager: CallbackManagerForChainRun | None = None,
    ) -> dict[str, Any]:
        """Call the evaluation chain."""
        evaluate_strings_inputs = self._prepare_input(inputs)
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        callbacks = _run_manager.get_child()
        chain_output = self.string_evaluator.evaluate_strings(
            **evaluate_strings_inputs,
            callbacks=callbacks,
            include_run_info=True,
        )
        return self._prepare_output(chain_output)

    async def _acall(
        self,
        inputs: dict[str, Any],
        run_manager: AsyncCallbackManagerForChainRun | None = None,
    ) -> dict[str, Any]:
        """Call the evaluation chain."""
        evaluate_strings_inputs = self._prepare_input(inputs)
        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
        callbacks = _run_manager.get_child()
        chain_output = await self.string_evaluator.aevaluate_strings(
            **evaluate_strings_inputs,
            callbacks=callbacks,
            include_run_info=True,
        )
        return self._prepare_output(chain_output)

    def _prepare_evaluator_output(self, output: dict[str, Any]) -> EvaluationResult:
        """Extract the feedback from the chain output.

        Args:
            output: The chain output, holding the result under `feedback`.

        Returns:
            The feedback, with the chain's run info copied into its
            `evaluator_info` when not already present.
        """
        feedback: EvaluationResult = output["feedback"]
        if RUN_KEY not in feedback.evaluator_info:
            feedback.evaluator_info[RUN_KEY] = output[RUN_KEY]
        return feedback

    @override
    def evaluate_run(
        self,
        run: Run,
        example: Example | None = None,
        evaluator_run_id: uuid.UUID | None = None,
    ) -> EvaluationResult:
        """Evaluate an example.

        Failures are logged and reported as an `EvaluationResult` whose
        comment carries the error, rather than being raised.
        """
        try:
            result = self({"run": run, "example": example}, include_run_info=True)
            return self._prepare_evaluator_output(result)
        except Exception as e:
            _logger.exception("Error evaluating run %s", run.id)
            # NOTE(review): the error result is keyed by the evaluator's
            # evaluation_name while successful results use self.name; confirm
            # this difference is intended.
            return EvaluationResult(
                key=self.string_evaluator.evaluation_name,
                comment=f"Error evaluating run {run.id}: {e}",
                # TODO: Add run ID once we can declare it via callbacks
            )

    @override
    async def aevaluate_run(
        self,
        run: Run,
        example: Example | None = None,
        evaluator_run_id: uuid.UUID | None = None,
    ) -> EvaluationResult:
        """Evaluate an example.

        Failures are logged and reported as an `EvaluationResult` whose
        comment carries the error, rather than being raised.
        """
        try:
            result = await self.acall(
                {"run": run, "example": example},
                include_run_info=True,
            )
            return self._prepare_evaluator_output(result)
        except Exception as e:
            _logger.exception("Error evaluating run %s", run.id)
            # NOTE(review): the error result is keyed by the evaluator's
            # evaluation_name while successful results use self.name; confirm
            # this difference is intended.
            return EvaluationResult(
                key=self.string_evaluator.evaluation_name,
                comment=f"Error evaluating run {run.id}: {e}",
            )

    @classmethod
    def from_run_and_data_type(
        cls,
        evaluator: StringEvaluator,
        run_type: str,
        data_type: DataType,
        input_key: str | None = None,
        prediction_key: str | None = None,
        reference_key: str | None = None,
        tags: list[str] | None = None,
    ) -> StringRunEvaluatorChain:
        """Create a StringRunEvaluatorChain.

        Create a StringRunEvaluatorChain from an evaluator and the run and dataset
        types.

        This method provides an easy way to instantiate a StringRunEvaluatorChain, by
        taking an evaluator and information about the type of run and the data.
        The method supports LLM and chain runs.

        Args:
            evaluator: The string evaluator to use.
            run_type: The type of run being evaluated.
                Supported types are LLM and Chain.
            data_type: The type of dataset used in the run.
            input_key: The key used to map the input from the run.
            prediction_key: The key used to map the prediction from the run.
            reference_key: The key used to map the reference from the dataset.
            tags: List of tags to attach to the evaluation chain.

        Returns:
            The instantiated evaluation chain.

        Raises:
            ValueError: If the run type is not supported.

        """
        # Configure how run inputs/predictions are passed to the evaluator
        if run_type == "llm":
            run_mapper: StringRunMapper = LLMStringRunMapper()
        elif run_type == "chain":
            run_mapper = ChainStringRunMapper(
                input_key=input_key,
                prediction_key=prediction_key,
            )
        else:
            msg = f"Unsupported run type {run_type}. Expected one of 'llm' or 'chain'."
            raise ValueError(msg)

        # Configure how example rows are fed as a reference string to the evaluator.
        # An evaluator that requires a reference always gets a mapper, even
        # without a reference_key: the mapper then falls back to the example's
        # only output and raises at evaluation time if there are several.
        example_mapper: StringExampleMapper | None = None
        if (
            reference_key is not None
            or data_type in (DataType.llm, DataType.chat)
            or evaluator.requires_reference
        ):
            example_mapper = StringExampleMapper(reference_key=reference_key)
        return cls(
            name=evaluator.evaluation_name,
            run_mapper=run_mapper,
            example_mapper=example_mapper,
            string_evaluator=evaluator,
            tags=tags,
        )