Overuse may indicate design issues; consider polymorphism
if isinstance(messages, list) and messages:
"""Run evaluator wrapper for string evaluators."""

from __future__ import annotations

import logging
import uuid
from abc import abstractmethod
from typing import Any, cast

from langchain_core.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
)
from langchain_core.load.dump import dumpd
from langchain_core.load.load import load
from langchain_core.load.serializable import Serializable
from langchain_core.messages import BaseMessage, get_buffer_string, messages_from_dict
from langsmith import EvaluationResult, RunEvaluator
from langsmith.schemas import DataType, Example, Run
from typing_extensions import override

from langchain_classic.chains.base import Chain
from langchain_classic.evaluation.schema import StringEvaluator
from langchain_classic.schema import RUN_KEY

_logger = logging.getLogger(__name__)


def _get_messages_from_run_dict(messages: list[dict]) -> list[BaseMessage]:
    """Convert serialized message dicts taken from a run into message objects.

    Args:
        messages: The serialized messages. When the first entry contains an
            ``"lc"`` key, every entry is round-tripped through ``dumpd`` and
            ``load``; otherwise the list is handed to ``messages_from_dict``.

    Returns:
        The deserialized messages, or an empty list if ``messages`` is empty.
    """
    if not messages:
        return []
    first_message = messages[0]
    if "lc" in first_message:
        return [
            load(dumpd(message), allowed_objects="messages") for message in messages
        ]
    return messages_from_dict(messages)


class StringRunMapper(Serializable):
    """Extract items to evaluate from the run object."""

    @property
    def output_keys(self) -> list[str]:
        """The keys to extract from the run."""
        return ["prediction", "input"]

    @abstractmethod
    def map(self, run: Run) -> dict[str, str]:
        """Maps the Run to a dictionary."""

    def __call__(self, run: Run) -> dict[str, str]:
        """Maps the Run to a dictionary.

        Raises:
            ValueError: If the run has no outputs.
        """
        if not run.outputs:
            msg = f"Run {run.id} has no outputs to evaluate."
            raise ValueError(msg)
        return self.map(run)


class LLMStringRunMapper(StringRunMapper):
    """Extract items to evaluate from the run object."""

    def serialize_chat_messages(self, messages: list[dict] | list[list[dict]]) -> str:
        """Extract the input messages from the run.

        Args:
            messages: Either a list of message dicts, or a list whose first
                element is a list of message dicts.

        Returns:
            The messages rendered as a single string via ``get_buffer_string``.

        Raises:
            ValueError: If ``messages`` is empty, is not a list, or its first
                element is neither a dict nor a list.
        """
        if isinstance(messages, list) and messages:
            if isinstance(messages[0], dict):
                chat_messages = _get_messages_from_run_dict(
                    cast("list[dict]", messages)
                )
            elif isinstance(messages[0], list):
                # Runs from Tracer have messages as a list of lists of dicts
                chat_messages = _get_messages_from_run_dict(messages[0])
            else:
                msg = f"Could not extract messages to evaluate {messages}"  # type: ignore[unreachable]
                raise ValueError(msg)
            return get_buffer_string(chat_messages)
        msg = f"Could not extract messages to evaluate {messages}"
        raise ValueError(msg)

    def serialize_inputs(self, inputs: dict) -> str:
        """Serialize inputs.

        Args:
            inputs: The inputs from the run, expected to contain prompts or messages.

        Returns:
            The serialized input text from the prompts or messages.

        Raises:
            ValueError: If neither prompts nor messages are found in the inputs.
        """
        if "prompts" in inputs:  # Should we even accept this?
            input_ = "\n\n".join(inputs["prompts"])
        elif "prompt" in inputs:
            input_ = inputs["prompt"]
        elif "messages" in inputs:
            input_ = self.serialize_chat_messages(inputs["messages"])
        else:
            msg = "LLM Run must have either messages or prompts as inputs."
            raise ValueError(msg)
        return input_

    def serialize_outputs(self, outputs: dict) -> str:
        """Serialize outputs.

        Args:
            outputs: The outputs from the run, expected to contain generations.

        Returns:
            The serialized output text from the first generation.

        Raises:
            ValueError: If no generations are found in the outputs or if the generations
                are empty.
        """
        generations: list[dict] | list[list[dict]] | None = outputs.get("generations")
        if not generations:
            # Covers a missing key, ``None`` and an empty outer list.
            msg = "Cannot evaluate LLM Run without generations."
            raise ValueError(msg)
        first_generation: dict | list[dict] = generations[0]
        if isinstance(first_generation, list):
            # Runs from Tracer have generations as a list of lists of dicts
            # Whereas Runs from the API have a list of dicts
            if not first_generation:
                # Raise the documented ValueError instead of a bare IndexError.
                msg = "Cannot evaluate LLM run with empty generations."
                raise ValueError(msg)
            first_generation = first_generation[0]
        if "message" in first_generation:
            output_ = self.serialize_chat_messages([first_generation["message"]])
        else:
            output_ = first_generation["text"]
        return output_

    def map(self, run: Run) -> dict[str, str]:
        """Maps the Run to a dictionary.

        Raises:
            ValueError: If the run is not an LLM run, has no outputs, or its
                inputs or outputs cannot be serialized.
        """
        if run.run_type != "llm":
            msg = "LLM RunMapper only supports LLM runs."
            raise ValueError(msg)
        if not run.outputs:
            if run.error:
                msg = f"Cannot evaluate errored LLM run {run.id}: {run.error}"
                raise ValueError(msg)
            msg = f"Run {run.id} has no outputs. Cannot evaluate this run."
            raise ValueError(msg)
        try:
            inputs = self.serialize_inputs(run.inputs)
        except Exception as e:
            msg = f"Could not parse LM input from run inputs {run.inputs}"
            raise ValueError(msg) from e
        try:
            output_ = self.serialize_outputs(run.outputs)
        except Exception as e:
            msg = f"Could not parse LM prediction from run outputs {run.outputs}"
            raise ValueError(msg) from e
        return {"input": inputs, "prediction": output_}


class ChainStringRunMapper(StringRunMapper):
    """Extract items to evaluate from the run object from a chain."""

    input_key: str | None = None
    """The key from the model Run's inputs to use as the eval input.
    If not provided, will use the only input key or raise an
    error if there are multiple."""
    prediction_key: str | None = None
    """The key from the model Run's outputs to use as the eval prediction.
    If not provided, will use the only output key or raise an error
    if there are multiple."""

    def _get_key(self, source: dict, key: str | None, which: str) -> str:
        """Pick a value from ``source`` by ``key``, or its only value if ``key`` is None.

        Raises:
            ValueError: If ``key`` is None and ``source`` does not hold exactly
                one entry.
        """
        if key is not None:
            return source[key]
        if len(source) == 1:
            return next(iter(source.values()))
        msg = (
            f"Could not map run {which} with multiple keys: "
            f"{source}\nPlease manually specify a {which}_key"
        )
        raise ValueError(msg)

    def map(self, run: Run) -> dict[str, str]:
        """Maps the Run to a dictionary.

        Raises:
            ValueError: If the run has no outputs, or a configured input or
                prediction key is missing from the run.
        """
        if not run.outputs:
            msg = (
                f"Run with ID {run.id} lacks outputs required for evaluation."
                " Ensure the Run has valid outputs."
            )
            raise ValueError(msg)
        if self.input_key is not None and self.input_key not in run.inputs:
            msg = (
                f"Run with ID {run.id} is missing the expected input key"
                f" '{self.input_key}'.\nAvailable input keys in this Run"
                f" are: {run.inputs.keys()}.\nAdjust the evaluator's"
                f" input_key or ensure your input data includes key"
                f" '{self.input_key}'."
            )
            raise ValueError(msg)
        if self.prediction_key is not None and self.prediction_key not in run.outputs:
            available_keys = ", ".join(run.outputs.keys())
            msg = (
                f"Run with ID {run.id} doesn't have the expected prediction key"
                f" '{self.prediction_key}'. Available prediction keys in this Run are:"
                f" {available_keys}. Adjust the evaluator's prediction_key or"
                " ensure the Run object's outputs the expected key."
            )
            raise ValueError(msg)

        input_ = self._get_key(run.inputs, self.input_key, "input")
        prediction = self._get_key(run.outputs, self.prediction_key, "prediction")
        return {
            "input": input_,
            "prediction": prediction,
        }


class ToolStringRunMapper(StringRunMapper):
    """Map an input to the tool."""

    @override
    def map(self, run: Run) -> dict[str, str]:
        """Maps the Run's ``"input"`` and ``"output"`` entries to a dictionary.

        Raises:
            ValueError: If the run has no outputs.
        """
        if not run.outputs:
            msg = f"Run {run.id} has no outputs to evaluate."
            raise ValueError(msg)
        return {"input": run.inputs["input"], "prediction": run.outputs["output"]}


class StringExampleMapper(Serializable):
    """Map an example, or row in the dataset, to the inputs of an evaluation."""

    reference_key: str | None = None

    @property
    def output_keys(self) -> list[str]:
        """The keys to extract from the run."""
        return ["reference"]

    def serialize_chat_messages(self, messages: list[dict]) -> str:
        """Extract the input messages from the run."""
        chat_messages = _get_messages_from_run_dict(messages)
        return get_buffer_string(chat_messages)

    def map(self, example: Example) -> dict[str, str]:
        """Maps the Example, or dataset row to a dictionary.

        Raises:
            ValueError: If the example has no outputs, has several outputs while
                no ``reference_key`` is set, or lacks the configured key.
        """
        if not example.outputs:
            msg = f"Example {example.id} has no outputs to use as a reference."
            raise ValueError(msg)
        if self.reference_key is None:
            if len(example.outputs) > 1:
                msg = (
                    f"Example {example.id} has multiple outputs, so you must"
                    " specify a reference_key."
                )
                raise ValueError(msg)
            output = next(iter(example.outputs.values()))
        elif self.reference_key not in example.outputs:
            msg = (
                f"Example {example.id} does not have reference key"
                f" {self.reference_key}."
            )
            raise ValueError(msg)
        else:
            output = example.outputs[self.reference_key]
        return {
            # A dict with truthy "type" and "data" entries is treated as a
            # serialized chat message; anything else is passed through as-is.
            "reference": self.serialize_chat_messages([output])
            if isinstance(output, dict) and output.get("type") and output.get("data")
            else output,
        }

    def __call__(self, example: Example) -> dict[str, str]:
        """Maps the Run and Example to a dictionary.

        Raises:
            ValueError: If the example has no outputs.
        """
        if not example.outputs:
            msg = f"Example {example.id} has no outputs to use as a reference label."
            raise ValueError(msg)
        return self.map(example)


class StringRunEvaluatorChain(Chain, RunEvaluator):
    """Evaluate Run and optional examples."""

    run_mapper: StringRunMapper
    """Maps the Run to a dictionary with 'input' and 'prediction' strings."""
    example_mapper: StringExampleMapper | None = None
    """Maps the Example (dataset row) to a dictionary
    with a 'reference' string."""
    name: str
    """The name of the evaluation metric."""
    string_evaluator: StringEvaluator
    """The evaluation chain."""

    @property
    @override
    def input_keys(self) -> list[str]:
        return ["run", "example"]

    @property
    @override
    def output_keys(self) -> list[str]:
        return ["feedback"]

    def _prepare_input(self, inputs: dict[str, Any]) -> dict[str, str]:
        """Build the keyword arguments passed to the string evaluator.

        Raises:
            ValueError: If the evaluator requires a reference but no example
                (or no example mapper) is available.
        """
        run: Run = inputs["run"]
        example: Example | None = inputs.get("example")
        evaluate_strings_inputs = self.run_mapper(run)
        if not self.string_evaluator.requires_input:
            # Hide warning about unused input
            evaluate_strings_inputs.pop("input", None)
        if example and self.example_mapper and self.string_evaluator.requires_reference:
            evaluate_strings_inputs.update(self.example_mapper(example))
        elif self.string_evaluator.requires_reference:
            msg = (
                f"Evaluator {self.name} requires a reference"
                " example from the dataset,"
                f" but none was provided for run {run.id}."
            )
            raise ValueError(msg)
        return evaluate_strings_inputs

    def _prepare_output(self, output: dict[str, Any]) -> dict[str, Any]:
        """Wrap the evaluator output in an ``EvaluationResult`` under ``"feedback"``."""
        evaluation_result = EvaluationResult(
            key=self.name,
            comment=output.get("reasoning"),
            **output,
        )
        if RUN_KEY in output:
            # TODO: Not currently surfaced. Update
            evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY]
        return {"feedback": evaluation_result}

    def _call(
        self,
        inputs: dict[str, str],
        run_manager: CallbackManagerForChainRun | None = None,
    ) -> dict[str, Any]:
        """Call the evaluation chain."""
        evaluate_strings_inputs = self._prepare_input(inputs)
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        callbacks = _run_manager.get_child()
        chain_output = self.string_evaluator.evaluate_strings(
            **evaluate_strings_inputs,
            callbacks=callbacks,
            include_run_info=True,
        )
        return self._prepare_output(chain_output)

    async def _acall(
        self,
        inputs: dict[str, str],
        run_manager: AsyncCallbackManagerForChainRun | None = None,
    ) -> dict[str, Any]:
        """Call the evaluation chain."""
        evaluate_strings_inputs = self._prepare_input(inputs)
        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
        callbacks = _run_manager.get_child()
        chain_output = await self.string_evaluator.aevaluate_strings(
            **evaluate_strings_inputs,
            callbacks=callbacks,
            include_run_info=True,
        )
        return self._prepare_output(chain_output)

    def _prepare_evaluator_output(self, output: dict[str, Any]) -> EvaluationResult:
        """Return the feedback, attaching the run info if it is not already set."""
        feedback: EvaluationResult = output["feedback"]
        if RUN_KEY not in feedback.evaluator_info:
            feedback.evaluator_info[RUN_KEY] = output[RUN_KEY]
        return feedback

    @override
    def evaluate_run(
        self,
        run: Run,
        example: Example | None = None,
        evaluator_run_id: uuid.UUID | None = None,
    ) -> EvaluationResult:
        """Evaluate an example.

        Any exception raised while evaluating is logged and reported in the
        comment of the returned ``EvaluationResult`` instead of propagating.
        """
        try:
            result = self({"run": run, "example": example}, include_run_info=True)
            return self._prepare_evaluator_output(result)
        except Exception as e:
            _logger.exception("Error evaluating run %s", run.id)
            return EvaluationResult(
                key=self.string_evaluator.evaluation_name,
                comment=f"Error evaluating run {run.id}: {e}",
                # TODO: Add run ID once we can declare it via callbacks
            )

    @override
    async def aevaluate_run(
        self,
        run: Run,
        example: Example | None = None,
        evaluator_run_id: uuid.UUID | None = None,
    ) -> EvaluationResult:
        """Evaluate an example.

        Any exception raised while evaluating is logged and reported in the
        comment of the returned ``EvaluationResult`` instead of propagating.
        """
        try:
            result = await self.acall(
                {"run": run, "example": example},
                include_run_info=True,
            )
            return self._prepare_evaluator_output(result)
        except Exception as e:
            _logger.exception("Error evaluating run %s", run.id)
            return EvaluationResult(
                key=self.string_evaluator.evaluation_name,
                comment=f"Error evaluating run {run.id}: {e}",
            )

    @classmethod
    def from_run_and_data_type(
        cls,
        evaluator: StringEvaluator,
        run_type: str,
        data_type: DataType,
        input_key: str | None = None,
        prediction_key: str | None = None,
        reference_key: str | None = None,
        tags: list[str] | None = None,
    ) -> StringRunEvaluatorChain:
        """Create a StringRunEvaluatorChain.

        Create a StringRunEvaluatorChain from an evaluator and the run and dataset
        types.

        This method provides an easy way to instantiate a StringRunEvaluatorChain, by
        taking an evaluator and information about the type of run and the data.
        The method supports LLM and chain runs.

        Args:
            evaluator: The string evaluator to use.
            run_type: The type of run being evaluated.
                Supported types are LLM and Chain.
            data_type: The type of dataset used in the run.
            input_key: The key used to map the input from the run.
            prediction_key: The key used to map the prediction from the run.
            reference_key: The key used to map the reference from the dataset.
            tags: List of tags to attach to the evaluation chain.

        Returns:
            The instantiated evaluation chain.

        Raises:
            ValueError: If the run type is not supported.

        """
        # Configure how run inputs/predictions are passed to the evaluator
        if run_type == "llm":
            run_mapper: StringRunMapper = LLMStringRunMapper()
        elif run_type == "chain":
            run_mapper = ChainStringRunMapper(
                input_key=input_key,
                prediction_key=prediction_key,
            )
        else:
            msg = f"Unsupported run type {run_type}. Expected one of 'llm' or 'chain'."
            raise ValueError(msg)

        # Configure how example rows are fed as a reference string to the evaluator.
        # An evaluator that requires a reference always gets a mapper, even without
        # an explicit reference_key; the mapper then falls back to the example's
        # only output.
        example_mapper: StringExampleMapper | None
        if (
            reference_key is not None
            or data_type in (DataType.llm, DataType.chat)
            or evaluator.requires_reference
        ):
            example_mapper = StringExampleMapper(reference_key=reference_key)
        else:
            example_mapper = None
        return cls(
            name=evaluator.evaluation_name,
            run_mapper=run_mapper,
            example_mapper=example_mapper,
            string_evaluator=evaluator,
            tags=tags,
        )
Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.