libs/langchain_v1/langchain/agents/middleware/pii.py · langchain-ai/langchain

1"""PII detection and handling middleware for agents."""23from __future__ import annotations45from functools import partial6from typing import TYPE_CHECKING, Any, ClassVar, Literal78from langchain_core.messages import AIMessage, AnyMessage, BaseMessage, HumanMessage, ToolMessage9from langgraph.stream import StreamTransformer10from typing_extensions import override1112from langchain.agents.middleware._redaction import (13    PIIDetectionError,14    PIIMatch,15    RedactionRule,16    ResolvedRedactionRule,17    apply_strategy,18    detect_credit_card,19    detect_email,20    detect_ip,21    detect_mac_address,22    detect_url,23)24from langchain.agents.middleware.types import (25    AgentMiddleware,26    AgentState,27    ContextT,28    ResponseT,29    hook_config,30)3132if TYPE_CHECKING:33    from collections.abc import Callable3435    from langgraph.runtime import Runtime36    from langgraph.stream._types import ProtocolEvent373839_DEFAULT_STREAM_LOOKBACK = 12840"""Default trailing-buffer size for cross-delta PII detection.4142The transformer always holds the last `lookback` characters in a per-content43block buffer so that PII patterns straddling delta boundaries are detected44before any text is released downstream. 128 comfortably covers the built-in45detectors (the credit-card regex tops out at 19 characters; URLs and emails46are typically well under 100) while bounding first-token latency.47"""484950class _PIIStreamTransformer(StreamTransformer):51    """Mutates `content-block-delta` text on `messages` events in flight.5253    Runs before built-in stream transformers so the redacted text is what54    every downstream consumer sees — both the main protocol event log and55    the `run.messages` projection that `MessagesTransformer` snapshots into.5657    Holds a sliding buffer of the most recent text per (run_id, content58    block index) so PII patterns that straddle delta boundaries are caught.59    Anything older than `lookback` characters is redacted with the resolved60    rule's strategy and emitted as the new delta text; the trailing tail61    stays in the buffer until a later delta extends it past the cap or the62    block's finish event flushes the snapshot.63    """6465    before_builtins: ClassVar[bool] = True66    required_stream_modes: ClassVar[tuple[str, ...]] = ("messages", "tools", "values")6768    def __init__(69        self,70        scope: tuple[str, ...] = (),71        *,72        rule: ResolvedRedactionRule,73        lookback: int = _DEFAULT_STREAM_LOOKBACK,74    ) -> None:75        super().__init__(scope)76        self._rule = rule77        self._lookback = lookback78        # Text/reasoning deltas keyed by `(run_id, content_block_index)`.79        self._buffers: dict[tuple[str, int], str] = {}80        # Tool-output-delta buffers keyed by `tool_call_id`. Held in a81        # separate dict so `_drop_run` on the messages channel can't82        # sweep active tool-output state.83        self._tool_buffers: dict[str, str] = {}8485    def init(self) -> dict[str, Any]:86        # No projection — this transformer mutates events in place rather87        # than building a derived view.88        return {}8990    def process(self, event: ProtocolEvent) -> bool:91        method = event["method"]92        if method == "messages":93            return self._process_messages_event(event)94        if method == "tools":95            return self._process_tools_event(event)96        if method == "values":97            return self._process_values_event(event)98        return True99100    def _process_values_event(self, event: ProtocolEvent) -> bool:101        """Redact the state snapshot on the `values` channel.102103        State snapshots emitted between nodes carry the full state dict,104        which typically includes the messages list. Walking the snapshot105        with `_redact_value` returns a fresh structure where every106        message has a redacted copy of its content — the original107        objects in graph state remain intact for the state-level108        enforcer (`apply_to_tool_results` via `before_model`) to act on109        independently when the agent loops back.110        """111        data = event["params"].get("data")112        if data is None:113            return True114        event["params"]["data"] = self._redact_value(data)115        return True116117    def _process_messages_event(self, event: ProtocolEvent) -> bool:118        params = event["params"]119        data = params.get("data")120        if not isinstance(data, tuple) or len(data) != 2:  # noqa: PLR2004121            return True122        payload, metadata = data123124        # Legacy `(BaseMessage, metadata)` shape: the langgraph→langchain125        # integration emits this when a model only implements `_generate`126        # (or when its `_astream` falls back), producing a single event127        # carrying the full message rather than streamed content-block128        # deltas. Swap in a redacted copy so the consumer sees scrubbed129        # text on the wire while the original stays intact in graph state130        # for `after_model` to act on independently. Under `block`,131        # `_redact_base_message` raises `PIIDetectionError` via132        # `apply_strategy` before we get here.133        if isinstance(payload, BaseMessage):134            redacted = self._redact_base_message(payload)135            if redacted is not payload:136                params["data"] = (redacted, metadata)137            return True138139        if not isinstance(payload, dict):140            return True141        kind = payload.get("event")142        run_id = str(metadata.get("run_id") or "") if metadata else ""143144        if kind == "content-block-delta":145            self._mutate_delta(payload, run_id)146        elif kind == "content-block-finish":147            self._finalize_block(payload, run_id)148        elif kind in {"message-finish", "error"}:149            self._drop_run(run_id)150        return True151152    def _process_tools_event(self, event: ProtocolEvent) -> bool:153        data = event["params"].get("data")154        if not isinstance(data, dict):155            return True156        kind = data.get("event")157        tool_call_id = data.get("tool_call_id")158159        if kind == "tool-started":160            # Tool inputs may be a dict (multi-arg tools), a string161            # (single-arg tools — `BaseTool._parse_input` passes the162            # raw string through), or a list (array-input tools).163            # `_redact_value` handles all three uniformly.164            if "input" in data:165                data["input"] = self._redact_value(data["input"])166        elif kind == "tool-output-delta":167            # Use the tool_call_id as buffer key when present; fall back168            # to a None-keyed slot for the rare malformed/custom emitter169            # case (the buffer becomes shared but at least redaction runs).170            self._mutate_tool_output_delta(171                data, tool_call_id if isinstance(tool_call_id, str) else ""172            )173        elif kind == "tool-finished":174            if "output" in data:175                data["output"] = self._redact_value(data["output"])176            if isinstance(tool_call_id, str):177                self._tool_buffers.pop(tool_call_id, None)178        elif kind == "tool-error":179            msg = data.get("message")180            if isinstance(msg, str) and msg:181                matches = self._rule.detector(msg)182                if matches:183                    data["message"] = apply_strategy(msg, matches, self._rule.strategy)184            if isinstance(tool_call_id, str):185                self._tool_buffers.pop(tool_call_id, None)186187        return True188189    def _mutate_tool_output_delta(self, data: dict[str, Any], tool_call_id: str) -> None:190        """Redact a `tool-output-delta` payload.191192        String deltas go through the same lookback machinery as193        text-deltas, keyed by `tool_call_id` in the disjoint194        `_tool_buffers` dict so `_drop_run` on the messages channel195        can't sweep active tool-output state.196197        Structured deltas (dict/list) walk recursively without198        buffering — they don't have a position-stable shape across199        deltas to buffer against.200        """201        delta = data.get("delta")202        if isinstance(delta, str):203            held = self._tool_buffers.get(tool_call_id, "")204            combined = held + delta205206            matches = self._rule.detector(combined)207            if matches:208                # `apply_strategy` raises `PIIDetectionError` under209                # `strategy="block"`, failing the run immediately —210                # cleaner than withholding deltas until `after_model`211                # raises later.212                combined = apply_strategy(combined, matches, self._rule.strategy)213214            emit_end = max(0, len(combined) - self._lookback)215            self._tool_buffers[tool_call_id] = combined[emit_end:]216            data["delta"] = combined[:emit_end]217        elif isinstance(delta, (dict, list)):218            data["delta"] = self._redact_value(delta)219220    def _redact_tool_call_list(self, calls: list[Any] | None) -> tuple[list[Any], bool]:221        """Walk a list of tool-call (or invalid-tool-call) dicts.222223        Returns `(new_list, changed)`. Each element's `args` is run224        through `_redact_value` regardless of its type — `tool_call.args`225        is a dict, `invalid_tool_call.args` is a raw JSON string, and226        `_redact_value` handles both shapes uniformly. If nothing227        changed, returns the input list and `changed=False`.228        """229        if not calls:230            return calls or [], False231        new_calls: list[Any] = []232        changed = False233        for tc in calls:234            if isinstance(tc, dict) and "args" in tc and tc["args"] is not None:235                redacted = self._redact_value(tc["args"])236                if redacted != tc["args"]:237                    new_tc = dict(tc)238                    new_tc["args"] = redacted239                    new_calls.append(new_tc)240                    changed = True241                    continue242            new_calls.append(tc)243        return new_calls, changed244245    def _redact_value(self, value: Any) -> Any:246        """Recursively redact PII in string leaves of a nested structure.247248        Returns a new value where every `str` leaf that contains PII has249        been replaced (or emptied under `block`). Non-string leaves and250        the structure itself are preserved.251252        `BaseMessage` payloads (typically `ToolMessage` from253        `tool-finished.output`, or any message reached via the `values`254        channel) return a fresh copy with `.content` redacted plus255        `AIMessage.tool_calls[*].args` / `invalid_tool_calls[*].args`256        walked. The original object stays intact for state-level257        enforcers (`after_model`, `before_model` with258        `apply_to_tool_results`) to act on independently.259260        Scope mirrors the pre-streaming state-level surfaces:261        `.content` (string or list-of-content-blocks) and `tool_calls`262        args. Other message attributes (`additional_kwargs`,263        `response_metadata`, `ToolMessage.artifact`) are intentionally264        not walked here — they aren't scrubbed in graph state by the265        existing hooks, so scrubbing them on the wire would create266        a wire/state divergence.267        """268        if isinstance(value, str):269            if not value:270                return value271            matches = self._rule.detector(value)272            if not matches:273                return value274            # `apply_strategy` raises `PIIDetectionError` under `block`275            # — the run fails immediately rather than buffering until a276            # state-level hook can raise.277            return apply_strategy(value, matches, self._rule.strategy)278        if isinstance(value, BaseMessage):279            return self._redact_base_message(value)280        if isinstance(value, dict):281            return {k: self._redact_value(v) for k, v in value.items()}282        if isinstance(value, list):283            return [self._redact_value(v) for v in value]284        if isinstance(value, tuple):285            return tuple(self._redact_value(v) for v in value)286        return value287288    def _redact_base_message(self, value: BaseMessage) -> BaseMessage:289        """Return a fresh copy of `value` with PII-carrying surfaces redacted."""290        update: dict[str, Any] = {}291292        content = value.content293        if isinstance(content, str) and content:294            matches = self._rule.detector(content)295            if matches:296                update["content"] = apply_strategy(content, matches, self._rule.strategy)297        elif isinstance(content, list) and content:298            # Structured content-blocks shape:299            # `[{"type": "text", "text": "..."}, {"type": "tool_call", ...}, ...]`.300            redacted_content = self._redact_value(content)301            if redacted_content != content:302                update["content"] = redacted_content303304        # `AIMessage.tool_calls` and `.invalid_tool_calls` carry PII in305        # `args` independently of `.content`. `tool_call.args` is a306        # dict; `invalid_tool_call.args` is a raw JSON string —307        # `_redact_value` handles both shapes via the recursion.308        if isinstance(value, AIMessage):309            new_tc_list, tc_changed = self._redact_tool_call_list(value.tool_calls)310            if tc_changed:311                update["tool_calls"] = new_tc_list312            new_inv_list, inv_changed = self._redact_tool_call_list(value.invalid_tool_calls)313            if inv_changed:314                update["invalid_tool_calls"] = new_inv_list315316        if not update:317            return value318        return value.model_copy(update=update)319320    def _mutate_delta(self, payload: dict[str, Any], run_id: str) -> None:321        delta = payload.get("delta")322        if not isinstance(delta, dict):323            return324        delta_type = delta.get("type")325        if delta_type == "text-delta":326            self._mutate_string_field_delta(delta, payload, run_id, "text")327            return328        if delta_type == "reasoning-delta":329            # Reasoning content (chain-of-thought from extended-thinking330            # models) is a real PII surface — models echo back331            # user-supplied data or synthesize it from context. Run the332            # same lookback machinery as text-delta against the333            # `reasoning` field. Block indices are unique within a334            # message regardless of block type, so the buffer key335            # `(run_id, index)` naturally disjoint from text-delta keys.336            self._mutate_string_field_delta(delta, payload, run_id, "reasoning")337            return338        if delta_type == "block-delta":339            fields = delta.get("fields")340            if isinstance(fields, dict) and fields.get("type") in {341                "tool_call_chunk",342                "server_tool_call_chunk",343            }:344                self._mutate_tool_call_chunk_delta(fields)345        # Other delta types (`data-delta`, vendor block types) pass346        # through. The pre-streaming middleware scrubbed `.content` text347        # on state messages only; binary payloads and provider-specific348        # block shapes are out of scope for parity with that surface.349350    def _mutate_string_field_delta(351        self,352        delta: dict[str, Any],353        payload: dict[str, Any],354        run_id: str,355        field: str,356    ) -> None:357        """Apply the lookback-buffer redaction to a string field on a delta.358359        Shared by `text-delta` (`field="text"`) and `reasoning-delta`360        (`field="reasoning"`). Buffer is keyed by `(run_id, block_index)`;361        block indices are unique within a message so different block362        types share the same key space without collision.363        """364        text = delta.get(field)365        if not isinstance(text, str) or not text:366            return367        index = payload.get("index")368        if not isinstance(index, int):369            return370371        key = (run_id, index)372        held = self._buffers.get(key, "")373        combined = held + text374375        # Run detection on the full accumulated buffer before splitting.376        # Detecting only on the about-to-emit prefix would miss matches377        # that straddle the lookback boundary — the detector's regex378        # needs a complete, boundary-anchored hit, so a truncated prefix379        # would fail to match and the partial PII would leak on the380        # wire. Under `strategy="block"`, `apply_strategy` raises381        # `PIIDetectionError` here, failing the run as soon as PII382        # arrives rather than buffering until `after_model`.383        matches = self._rule.detector(combined)384        if matches:385            combined = apply_strategy(combined, matches, self._rule.strategy)386387        emit_end = max(0, len(combined) - self._lookback)388        self._buffers[key] = combined[emit_end:]389        delta[field] = combined[:emit_end]390391    def _mutate_tool_call_chunk_delta(self, fields: dict[str, Any]) -> None:392        """Redact cumulative tool-call args with lookback withholding.393394        Each `tool_call_chunk` `block-delta` event carries the full395        accumulated args string (verified against `_compat_bridge.py`396        — `delta_source = current` for these block types — and against397        the consumer-side `_merge_block_delta_into_store`, which398        replaces wholesale rather than appends).399400        Detection runs on the full cumulative args so any complete PII401        anywhere in the string is redacted before emission. Lookback402        withholding then trims the trailing the lookback window characters403        from what reaches the consumer — those characters might be the404        start of a partial PII match that completes in a future405        cumulative delta. The trimmed tail surfaces at `content-block-406        finish` where `_finalize_block` redacts the parsed args dict.407408        For args that fit within the lookback window (the typical case),409        this withholds the entire args string during streaming — the410        redacted args dict appears only at finalize. For args that411        exceed the lookback window, the safe prefix streams incrementally412        as the cumulative state grows. PII that appears more than413        the lookback window characters from the cumulative tail in a414        delta where it hasn't yet completed can still surface in the415        emit prefix — same residual exposure as PII longer than416        the lookback window on the text path. The `content-block-finish`417        snapshot redaction is the backstop.418        """419        args = fields.get("args")420        if not isinstance(args, str) or not args:421            return422423        matches = self._rule.detector(args)424        if matches:425            # `apply_strategy` raises `PIIDetectionError` under426            # `strategy="block"` — the run fails the moment a complete427            # PII pattern surfaces in the cumulative args string.428            args = apply_strategy(args, matches, self._rule.strategy)429430        emit_end = max(0, len(args) - self._lookback)431        fields["args"] = args[:emit_end]432433    def _finalize_block(self, payload: dict[str, Any], run_id: str) -> None:434        index = payload.get("index")435        if not isinstance(index, int):436            return437        key = (run_id, index)438        # The finalized block carries the model's original concatenation439        # of deltas, not what we emitted on the wire. Re-run detection over440        # its full text so the snapshot matches the redacted stream.441        content = payload.get("content")442        if isinstance(content, dict):443            ctype = content.get("type")444            if ctype == "text":445                self._finalize_string_field(content, "text")446            elif ctype == "reasoning":447                self._finalize_string_field(content, "reasoning")448            elif (449                ctype in {"tool_call", "server_tool_call", "invalid_tool_call"}450                and "args" in content451                and content["args"] is not None452            ):453                # `tool_call` / `server_tool_call` args are dicts;454                # `invalid_tool_call.args` is the raw unparsed JSON455                # string. `_redact_value` handles both shapes.456                content["args"] = self._redact_value(content["args"])457        self._buffers.pop(key, None)458459    def _finalize_string_field(self, content: dict[str, Any], field: str) -> None:460        """Re-redact a string content-block field on `content-block-finish`.461462        Used for `text` and `reasoning` content blocks. Under463        `strategy="block"` `apply_strategy` raises `PIIDetectionError`,464        failing the run immediately.465        """466        text = content.get(field)467        if not isinstance(text, str) or not text:468            return469        matches = self._rule.detector(text)470        if not matches:471            return472        content[field] = apply_strategy(text, matches, self._rule.strategy)473474    def _drop_run(self, run_id: str) -> None:475        # Release any buffered tails for this run_id — content-block-finish476        # should have already done so for normal completion, but message-finish477        # / error paths need an explicit sweep so abandoned blocks don't478        # accumulate in long-lived processes.479        stale = [key for key in self._buffers if key[0] == run_id]480        for key in stale:481            del self._buffers[key]482483    def finalize(self) -> None:484        self._buffers.clear()485        self._tool_buffers.clear()486487    def fail(self, err: BaseException) -> None:  # noqa: ARG002488        self._buffers.clear()489        self._tool_buffers.clear()490491492class PIIMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, ResponseT]):493    """Detect and handle Personally Identifiable Information (PII) in conversations.494495    This middleware detects common PII types and applies configurable strategies496    to handle them. It can detect emails, credit cards, IP addresses, MAC addresses, and497    URLs in both user input and agent output.498499    Built-in PII types:500501    - `email`: Email addresses502    - `credit_card`: Credit card numbers (validated with Luhn algorithm)503    - `ip`: IP addresses (validated with stdlib)504    - `mac_address`: MAC addresses505    - `url`: URLs (both `http`/`https` and bare URLs)506507    Strategies:508509    - `block`: Raise an exception when PII is detected510    - `redact`: Replace PII with `[REDACTED_TYPE]` placeholders511    - `mask`: Partially mask PII (e.g., `****-****-****-1234` for credit card)512    - `hash`: Replace PII with deterministic hash (e.g., `<email_hash:a1b2c3d4>`)513514    Strategy Selection Guide:515516    | Strategy | Preserves Identity? | Best For                                |517    | -------- | ------------------- | --------------------------------------- |518    | `block`  | N/A                 | Avoid PII completely                    |519    | `redact` | No                  | General compliance, log sanitization    |520    | `mask`   | No                  | Human readability, customer service UIs |521    | `hash`   | Yes (pseudonymous)  | Analytics, debugging                    |522523    Example:524        ```python525        from langchain.agents.middleware import PIIMiddleware526        from langchain.agents import create_agent527528        # Redact all emails in user input529        agent = create_agent(530            "openai:gpt-5.5",531            middleware=[532                PIIMiddleware("email", strategy="redact"),533            ],534        )535536        # Use different strategies for different PII types537        agent = create_agent(538            "openai:gpt-5.5",539            middleware=[540                PIIMiddleware("credit_card", strategy="mask"),541                PIIMiddleware("url", strategy="redact"),542                PIIMiddleware("ip", strategy="hash"),543            ],544        )545546        # Custom PII type with regex547        agent = create_agent(548            "openai:gpt-5.5",549            middleware=[550                PIIMiddleware("api_key", detector=r"sk-[a-zA-Z0-9]{32}", strategy="block"),551            ],552        )553        ```554    """555556    def __init__(557        self,558        # From a typing point of view, the literals are covered by 'str'.559        # Nonetheless, we escape PYI051 to keep hints and autocompletion for the caller.560        pii_type: Literal["email", "credit_card", "ip", "mac_address", "url"] | str,  # noqa: PYI051561        *,562        strategy: Literal["block", "redact", "mask", "hash"] = "redact",563        detector: Callable[[str], list[PIIMatch]] | str | None = None,564        apply_to_input: bool = True,565        apply_to_output: bool = False,566        apply_to_tool_results: bool = False,567    ) -> None:568        """Initialize the PII detection middleware.569570        Args:571            pii_type: Type of PII to detect.572573                Can be a built-in type (`email`, `credit_card`, `ip`, `mac_address`,574                `url`) or a custom type name.575            strategy: How to handle detected PII.576577                Options:578579                * `block`: Raise `PIIDetectionError` when PII is detected580                * `redact`: Replace with `[REDACTED_TYPE]` placeholders581                * `mask`: Partially mask PII (show last few characters)582                * `hash`: Replace with deterministic hash (format: `<type_hash:digest>`)583584            detector: Custom detector function or regex pattern.585586                * If `Callable`: Function that takes content string and returns587                    list of `PIIMatch` objects588                * If `str`: Regex pattern to match PII589                * If `None`: Uses built-in detector for the `pii_type`590            apply_to_input: Whether to check user messages before model call.591            apply_to_output: Whether to check AI messages after model call.592593                When `True`, a stream transformer is also installed so594                that every wire surface of an agent run is redacted in595                flight:596597                * Streamed AI text deltas (`content-block-delta` of type598                  `text-delta`)599                * Streamed tool-call arguments (`content-block-delta`600                  with `tool_call_chunk` / `server_tool_call_chunk`601                  fields, plus the finalized `tool_call` content block602                  on `content-block-finish`)603                * Tool execution events on the `tools` channel604                  (`tool-started.input`, `tool-output-delta`,605                  `tool-finished.output`, `tool-error.message`)606                * State snapshots on the `values` channel — message607                  lists are walked and each message's `.content` is608                  redacted on a fresh copy (state itself stays intact609                  for `before_model` / `after_model` to act on610                  independently)611612                State-level redaction via `after_model` (and613                `before_model` with `apply_to_tool_results`) remains the614                canonical enforcer; the streaming transformer ensures615                consumers reading `astream_events(version="v3")` or616                `run.messages` / `run.tool_calls` / `run.values` never617                see PII on the wire.618            apply_to_tool_results: Whether to check tool result messages after tool execution.619620        Raises:621            ValueError: If `pii_type` is not built-in and no detector is provided.622        """623        super().__init__()624625        self.apply_to_input = apply_to_input626        self.apply_to_output = apply_to_output627        self.apply_to_tool_results = apply_to_tool_results628629        self._resolved_rule: ResolvedRedactionRule = RedactionRule(630            pii_type=pii_type,631            strategy=strategy,632            detector=detector,633        ).resolve()634        self.pii_type = self._resolved_rule.pii_type635        self.strategy = self._resolved_rule.strategy636        self.detector = self._resolved_rule.detector637638        # Stream transformer scrubs the streamed surface of the same639        # messages that the state-level hooks scrub in graph state.640        # Installed whenever any output-side scrubbing is enabled —641        # `apply_to_output` covers AI messages (text, tool-call args,642        # reasoning), `apply_to_tool_results` covers tool execution643        # (the `tools` channel + ToolMessage content on `values` and644        # `messages`). For `block` the transformer raises645        # `PIIDetectionError` directly from its event handler the646        # moment a complete PII pattern is detected, failing the run647        # via langgraph's `StreamMux.afail` path. The state-level648        # `after_model` / `before_model` hooks remain a backstop for649        # non-streaming consumers.650        if self.apply_to_output or self.apply_to_tool_results:651            self.transformers = (652                partial(653                    _PIIStreamTransformer,654                    rule=self._resolved_rule,655                ),656            )657658    @property659    def name(self) -> str:660        """Name of the middleware."""661        return f"{self.__class__.__name__}[{self.pii_type}]"662663    def _process_content(self, content: str) -> tuple[str, list[PIIMatch]]:664        """Apply the configured redaction rule to the provided content."""665        matches = self.detector(content)666        if not matches:667            return content, []668        sanitized = apply_strategy(content, matches, self.strategy)669        return sanitized, matches670671    @hook_config(can_jump_to=["end"])672    @override673    def before_model(674        self,675        state: AgentState[Any],676        runtime: Runtime[ContextT],677    ) -> dict[str, Any] | None:678        """Check user messages and tool results for PII before model invocation.679680        Args:681            state: The current agent state.682            runtime: The langgraph runtime.683684        Returns:685            Updated state with PII handled according to strategy, or `None` if no PII686                detected.687688        Raises:689            PIIDetectionError: If PII is detected and strategy is `'block'`.690        """691        if not self.apply_to_input and not self.apply_to_tool_results:692            return None693694        messages = state["messages"]695        if not messages:696            return None697698        new_messages = list(messages)699        any_modified = False700701        # Check user input if enabled702        if self.apply_to_input:703            # Get last user message704            last_user_msg = None705            last_user_idx = None706            for i in range(len(messages) - 1, -1, -1):707                if isinstance(messages[i], HumanMessage):708                    last_user_msg = messages[i]709                    last_user_idx = i710                    break711712            if last_user_idx is not None and last_user_msg and last_user_msg.content:713                # Detect PII in message content714                content = str(last_user_msg.content)715                new_content, matches = self._process_content(content)716717                if matches:718                    updated_message: AnyMessage = HumanMessage(719                        content=new_content,720                        id=last_user_msg.id,721                        name=last_user_msg.name,722                    )723724                    new_messages[last_user_idx] = updated_message725                    any_modified = True726727        # Check tool results if enabled728        if self.apply_to_tool_results:729            # Find the last AIMessage, then process all `ToolMessage` objects after it730            last_ai_idx = None731            for i in range(len(messages) - 1, -1, -1):732                if isinstance(messages[i], AIMessage):733                    last_ai_idx = i734                    break735736            if last_ai_idx is not None:737                # Get all tool messages after the last AI message738                for i in range(last_ai_idx + 1, len(messages)):739                    msg = messages[i]740                    if isinstance(msg, ToolMessage):741                        tool_msg = msg742                        if not tool_msg.content:743                            continue744745                        content = str(tool_msg.content)746                        new_content, matches = self._process_content(content)747748                        if not matches:749                            continue750751                        # Create updated tool message752                        updated_message = ToolMessage(753                            content=new_content,754                            id=tool_msg.id,755                            name=tool_msg.name,756                            tool_call_id=tool_msg.tool_call_id,757                        )758759                        new_messages[i] = updated_message760                        any_modified = True761762        if any_modified:763            return {"messages": new_messages}764765        return None766767    @hook_config(can_jump_to=["end"])768    async def abefore_model(769        self,770        state: AgentState[Any],771        runtime: Runtime[ContextT],772    ) -> dict[str, Any] | None:773        """Async check user messages and tool results for PII before model invocation.774775        Args:776            state: The current agent state.777            runtime: The langgraph runtime.778779        Returns:780            Updated state with PII handled according to strategy, or `None` if no PII781                detected.782783        Raises:784            PIIDetectionError: If PII is detected and strategy is `'block'`.785        """786        return self.before_model(state, runtime)787788    @override789    def after_model(790        self,791        state: AgentState[Any],792        runtime: Runtime[ContextT],793    ) -> dict[str, Any] | None:794        """Check AI messages for PII after model invocation.795796        Args:797            state: The current agent state.798            runtime: The langgraph runtime.799800        Returns:801            Updated state with PII handled according to strategy, or None if no PII802                detected.803804        Raises:805            PIIDetectionError: If PII is detected and strategy is `'block'`.806        """807        if not self.apply_to_output:808            return None809810        messages = state["messages"]811        if not messages:812            return None813814        # Get last AI message815        last_ai_msg = None816        last_ai_idx = None817        for i in range(len(messages) - 1, -1, -1):818            msg = messages[i]819            if isinstance(msg, AIMessage):820                last_ai_msg = msg821                last_ai_idx = i822                break823824        if last_ai_idx is None or not last_ai_msg or not last_ai_msg.content:825            return None826827        # Detect PII in message content828        content = str(last_ai_msg.content)829        new_content, matches = self._process_content(content)830831        if not matches:832            return None833834        # Create updated message835        updated_message = AIMessage(836            content=new_content,837            id=last_ai_msg.id,838            name=last_ai_msg.name,839            tool_calls=last_ai_msg.tool_calls,840        )841842        # Return updated messages843        new_messages = list(messages)844        new_messages[last_ai_idx] = updated_message845846        return {"messages": new_messages}847848    async def aafter_model(849        self,850        state: AgentState[Any],851        runtime: Runtime[ContextT],852    ) -> dict[str, Any] | None:853        """Async check AI messages for PII after model invocation.854855        Args:856            state: The current agent state.857            runtime: The langgraph runtime.858859        Returns:860            Updated state with PII handled according to strategy, or None if no PII861                detected.862863        Raises:864            PIIDetectionError: If PII is detected and strategy is `'block'`.865        """866        return self.after_model(state, runtime)867868869__all__ = [870    "PIIDetectionError",871    "PIIMatch",872    "PIIMiddleware",873    "detect_credit_card",874    "detect_email",875    "detect_ip",876    "detect_mac_address",877    "detect_url",878]

Code quality findings 42

Ensure functions have docstrings for documentation

L85

missing-docstring

def init(self) -> dict[str, Any]:

Ensure functions have docstrings for documentation

L90

missing-docstring

def process(self, event: ProtocolEvent) -> bool:

Overuse may indicate design issues; consider polymorphism

L120

isinstance-overuse

if not isinstance(data, tuple) or len(data) != 2: # noqa: PLR2004

Overuse may indicate design issues; consider polymorphism

L133

isinstance-overuse

if isinstance(payload, BaseMessage):

Overuse may indicate design issues; consider polymorphism

L139

isinstance-overuse

if not isinstance(payload, dict):

Overuse may indicate design issues; consider polymorphism

L154

isinstance-overuse

if not isinstance(data, dict):

Overuse may indicate design issues; consider polymorphism

L171

isinstance-overuse

data, tool_call_id if isinstance(tool_call_id, str) else ""

Overuse may indicate design issues; consider polymorphism

L176

isinstance-overuse

if isinstance(tool_call_id, str):

Overuse may indicate design issues; consider polymorphism

L180

isinstance-overuse

if isinstance(msg, str) and msg:

Overuse may indicate design issues; consider polymorphism

L184

isinstance-overuse

if isinstance(tool_call_id, str):

Overuse may indicate design issues; consider polymorphism

L202

isinstance-overuse

if isinstance(delta, str):

Overuse may indicate design issues; consider polymorphism

L217

isinstance-overuse

elif isinstance(delta, (dict, list)):

Overuse may indicate design issues; consider polymorphism

L234

isinstance-overuse

if isinstance(tc, dict) and "args" in tc and tc["args"] is not None:

Overuse may indicate design issues; consider polymorphism

L268

isinstance-overuse

if isinstance(value, str):

Overuse may indicate design issues; consider polymorphism

L278

isinstance-overuse

if isinstance(value, BaseMessage):

Overuse may indicate design issues; consider polymorphism

L280

isinstance-overuse

if isinstance(value, dict):

Overuse may indicate design issues; consider polymorphism

L282

isinstance-overuse

if isinstance(value, list):

Overuse may indicate design issues; consider polymorphism

L284

isinstance-overuse

if isinstance(value, tuple):

Overuse may indicate design issues; consider polymorphism

L293

isinstance-overuse

if isinstance(content, str) and content:

Overuse may indicate design issues; consider polymorphism

L297

isinstance-overuse

elif isinstance(content, list) and content:

Overuse may indicate design issues; consider polymorphism

L308

isinstance-overuse

if isinstance(value, AIMessage):

Overuse may indicate design issues; consider polymorphism

L322

isinstance-overuse

if not isinstance(delta, dict):

Overuse may indicate design issues; consider polymorphism

L340

isinstance-overuse

if isinstance(fields, dict) and fields.get("type") in {

Overuse may indicate design issues; consider polymorphism

L365

isinstance-overuse

if not isinstance(text, str) or not text:

Overuse may indicate design issues; consider polymorphism

L368

isinstance-overuse

if not isinstance(index, int):

Overuse may indicate design issues; consider polymorphism

L420

isinstance-overuse

if not isinstance(args, str) or not args:

Overuse may indicate design issues; consider polymorphism

L435

isinstance-overuse

if not isinstance(index, int):

Overuse may indicate design issues; consider polymorphism

L442

isinstance-overuse

if isinstance(content, dict):

Overuse may indicate design issues; consider polymorphism

L467

isinstance-overuse

if not isinstance(text, str) or not text:

Avoid unless necessary; Python's garbage collector typically handles object deletion

L481

unnecessary-del

del self._buffers[key]

Ensure functions have docstrings for documentation

L483

missing-docstring

def finalize(self) -> None:

Ensure functions have docstrings for documentation

L487

missing-docstring

def fail(self, err: BaseException) -> None: # noqa: ARG002

Ensure functions have docstrings for documentation

L673

missing-docstring

def before_model(

Avoid unnecessary list conversions; use generators where possible

L698

unnecessary-list

new_messages = list(messages)

Overuse may indicate design issues; consider polymorphism

L707

isinstance-overuse

if isinstance(messages[i], HumanMessage):

Overuse may indicate design issues; consider polymorphism

L732

isinstance-overuse

if isinstance(messages[i], AIMessage):

Overuse may indicate design issues; consider polymorphism

L740

isinstance-overuse

if isinstance(msg, ToolMessage):

Ensure functions have docstrings for documentation

L768

missing-docstring

async def abefore_model(

Ensure functions have docstrings for documentation

L789

missing-docstring

def after_model(

Overuse may indicate design issues; consider polymorphism

L819

isinstance-overuse

if isinstance(msg, AIMessage):

Avoid unnecessary list conversions; use generators where possible

L843

unnecessary-list

new_messages = list(messages)

Ensure functions have docstrings for documentation

L848

missing-docstring

async def aafter_model(

Code quality findings 42

Get this view in your editor