libs/core/langchain_core/_security/_policy.py PYTHON 307 lines View on github.com → Search inside
"""SSRF protection policy with IP validation and DNS-aware URL checking."""

import asyncio
import dataclasses
import ipaddress
import os
import socket
import urllib.parse

from langchain_core._security._exceptions import SSRFBlockedError

# ---------------------------------------------------------------------------
# Blocklist constants
# ---------------------------------------------------------------------------

_BLOCKED_IPV4_NETWORKS: tuple[ipaddress.IPv4Network, ...] = tuple(
    ipaddress.IPv4Network(n)
    for n in (
        "10.0.0.0/8",  # RFC 1918 - private class A
        "172.16.0.0/12",  # RFC 1918 - private class B
        "192.168.0.0/16",  # RFC 1918 - private class C
        "127.0.0.0/8",  # RFC 1122 - loopback
        "169.254.0.0/16",  # RFC 3927 - link-local
        "0.0.0.0/8",  # RFC 1122 - "this network"
        "100.64.0.0/10",  # RFC 6598 - shared/CGN address space
        "192.0.0.0/24",  # RFC 6890 - IETF protocol assignments
        "192.0.2.0/24",  # RFC 5737 - TEST-NET-1 (documentation)
        "198.18.0.0/15",  # RFC 2544 - benchmarking
        "198.51.100.0/24",  # RFC 5737 - TEST-NET-2 (documentation)
        "203.0.113.0/24",  # RFC 5737 - TEST-NET-3 (documentation)
        "224.0.0.0/4",  # RFC 5771 - multicast
        "240.0.0.0/4",  # RFC 1112 - reserved for future use
        "255.255.255.255/32",  # RFC 919  - limited broadcast
    )
)

_BLOCKED_IPV6_NETWORKS: tuple[ipaddress.IPv6Network, ...] = tuple(
    ipaddress.IPv6Network(n)
    for n in (
        "::1/128",  # RFC 4291 - loopback
        "fc00::/7",  # RFC 4193 - unique local addresses (ULA)
        "fe80::/10",  # RFC 4291 - link-local
        "ff00::/8",  # RFC 4291 - multicast
        "::ffff:0:0/96",  # RFC 4291 - IPv4-mapped IPv6 addresses
        "::0.0.0.0/96",  # RFC 4291 - IPv4-compatible IPv6 (deprecated)
        "64:ff9b::/96",  # RFC 6052 - NAT64 well-known prefix
        "64:ff9b:1::/48",  # RFC 8215 - local-use NAT64 translation prefix
    )
)

_CLOUD_METADATA_IPS: frozenset[str] = frozenset(
    {
        "169.254.169.254",  # AWS, GCP, Azure, DigitalOcean, Oracle Cloud
        "169.254.170.2",  # AWS ECS task metadata
        "169.254.170.23",  # AWS EKS Pod Identity Agent
        "100.100.100.200",  # Alibaba Cloud metadata
        "fd00:ec2::254",  # AWS EC2 IMDSv2 over IPv6 (Nitro instances)
        "fd00:ec2::23",  # AWS EKS Pod Identity Agent (IPv6)
        "fe80::a9fe:a9fe",  # OpenStack Nova metadata (IPv6 link-local)
    }
)

# Network ranges that are always blocked when block_cloud_metadata=True,
# independent of block_private_ips.  The entire link-local range is used by
# cloud metadata services across providers.
_CLOUD_METADATA_NETWORKS: tuple[
    ipaddress.IPv4Network | ipaddress.IPv6Network, ...
] = (ipaddress.IPv4Network("169.254.0.0/16"),)

_CLOUD_METADATA_HOSTNAMES: frozenset[str] = frozenset(
    {
        "metadata.google.internal",
        "metadata.amazonaws.com",
        "metadata",
        "instance-data",
    }
)

_LOCALHOST_NAMES: frozenset[str] = frozenset(
    {
        "localhost",
        "localhost.localdomain",
        "host.docker.internal",
    }
)

_K8S_SUFFIX = ".svc.cluster.local"

_LOOPBACK_IPV4 = ipaddress.IPv4Network("127.0.0.0/8")
_LOOPBACK_IPV6 = ipaddress.IPv6Address("::1")

# "This network" range; treated as a localhost alias by the loopback check.
# Built once here instead of on every call.
_THIS_NETWORK_IPV4 = ipaddress.IPv4Network("0.0.0.0/8")

# NAT64 prefixes (RFC 6052 well-known, RFC 8215 local-use)
_NAT64_PREFIX = ipaddress.IPv6Network("64:ff9b::/96")
_NAT64_DISCOVERY_PREFIX = ipaddress.IPv6Network("64:ff9b:1::/48")


# ---------------------------------------------------------------------------
# SSRFPolicy
# ---------------------------------------------------------------------------


@dataclasses.dataclass(frozen=True)
class SSRFPolicy:
    """Immutable policy controlling which URLs/IPs are considered safe.

    Attributes:
        allowed_schemes: URL schemes accepted by `validate_url_sync`.
        block_private_ips: Block addresses inside the built-in private and
            reserved IPv4/IPv6 ranges.
        block_localhost: Block loopback addresses and well-known localhost
            names, even when private IPs are allowed.
        block_cloud_metadata: Block known metadata IPs, the IPv4 link-local
            range and known metadata hostnames, even when private IPs are
            allowed.
        block_k8s_internal: Block hostnames ending in `.svc.cluster.local`.
        allowed_hosts: Hostnames (compared case-insensitively) that skip every
            check after scheme validation.
        additional_blocked_cidrs: Extra networks that are always blocked.
    """

    allowed_schemes: frozenset[str] = frozenset({"http", "https"})
    block_private_ips: bool = True
    block_localhost: bool = True
    block_cloud_metadata: bool = True
    block_k8s_internal: bool = True
    allowed_hosts: frozenset[str] = frozenset()
    additional_blocked_cidrs: tuple[
        ipaddress.IPv4Network | ipaddress.IPv6Network, ...
    ] = ()


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _strip_scope_id(
    addr: ipaddress.IPv4Address | ipaddress.IPv6Address,
) -> ipaddress.IPv4Address | ipaddress.IPv6Address:
    """Return *addr* without an IPv6 zone (scope) identifier.

    `IPv6Address` equality and `str()` both include the zone, so a scoped
    literal such as `::1%lo` compares unequal to `::1` and never matches a
    plain-string entry.  Rebuilding the address from its integer value drops
    the zone while keeping the 128 address bits.
    """
    if isinstance(addr, ipaddress.IPv6Address) and addr.scope_id is not None:
        return ipaddress.IPv6Address(int(addr))
    return addr


def _extract_embedded_ipv4(
    addr: ipaddress.IPv6Address,
) -> ipaddress.IPv4Address | None:
    """Extract an embedded IPv4 from IPv4-mapped or NAT64 IPv6 addresses.

    Args:
        addr: The IPv6 address to inspect.

    Returns:
        The embedded IPv4 address, or `None` when *addr* is neither
        IPv4-mapped nor inside one of the NAT64 prefixes.
    """
    # IPv4-mapped form (::ffff:x.x.x.x) is handled by the stdlib directly.
    mapped = addr.ipv4_mapped
    if mapped is not None:
        return mapped

    # NAT64 prefixes - the embedded IPv4 is read from the last 4 bytes.
    if addr in _NAT64_PREFIX or addr in _NAT64_DISCOVERY_PREFIX:
        return ipaddress.IPv4Address(addr.packed[-4:])

    return None


def _ip_in_blocked_networks(
    addr: ipaddress.IPv4Address | ipaddress.IPv6Address,
    policy: SSRFPolicy,
) -> str | None:
    """Return a reason string if *addr* falls in a blocked range, else None.

    Checks run in a fixed order: built-in private ranges, caller-supplied
    CIDRs, loopback, then cloud metadata.  The first match wins.

    Args:
        addr: The address to classify.
        policy: The policy whose flags and extra CIDRs are applied.

    Returns:
        A short human-readable reason, or `None` when the address is allowed.
    """
    # NOTE: if profiling shows this is a hot path, consider memoising with
    # @functools.lru_cache (key on (addr, id(policy))).

    # Drop any IPv6 zone id first; otherwise the equality and string
    # comparisons below can be sidestepped with e.g. "::1%lo".
    addr = _strip_scope_id(addr)
    is_v4 = isinstance(addr, ipaddress.IPv4Address)

    if policy.block_private_ips:
        builtin = _BLOCKED_IPV4_NETWORKS if is_v4 else _BLOCKED_IPV6_NETWORKS
        if any(addr in net for net in builtin):
            return "private IP range"

    # Only networks of the matching IP version are consulted.
    net_type = ipaddress.IPv4Network if is_v4 else ipaddress.IPv6Network
    if any(
        isinstance(net, net_type) and addr in net
        for net in policy.additional_blocked_cidrs
    ):
        return "blocked CIDR"

    # Loopback check - independent of block_private_ips so that
    # block_localhost=True still catches 127.x.x.x / ::1 even when
    # private IPs are allowed.
    if policy.block_localhost:
        if is_v4:
            if addr in _LOOPBACK_IPV4 or addr in _THIS_NETWORK_IPV4:
                return "localhost address"
        elif addr == _LOOPBACK_IPV6:
            return "localhost address"

    # Cloud metadata check - IP set *and* network ranges (e.g. 169.254.0.0/16).
    # Independent of block_private_ips so that allowing private IPs still
    # blocks cloud metadata endpoints.
    if policy.block_cloud_metadata:
        if str(addr) in _CLOUD_METADATA_IPS:
            return "cloud metadata endpoint"
        if any(addr in net for net in _CLOUD_METADATA_NETWORKS):
            return "cloud metadata endpoint"

    return None


# ---------------------------------------------------------------------------
# Public validation functions
# ---------------------------------------------------------------------------


def validate_resolved_ip(ip_str: str, policy: SSRFPolicy) -> None:
    """Validate a resolved IP address against the SSRF policy.

    IPv4-mapped and NAT64 IPv6 addresses are unwrapped so that the embedded
    IPv4 address is what gets checked.

    Args:
        ip_str: Textual IPv4 or IPv6 address.
        policy: The policy to enforce.

    Raises:
        SSRFBlockedError: If *ip_str* is not a valid IP address or the
            address is blocked by *policy*.
    """
    try:
        addr = ipaddress.ip_address(ip_str)
    except ValueError as exc:
        raise SSRFBlockedError("invalid IP address") from exc

    if isinstance(addr, ipaddress.IPv6Address):
        inner = _extract_embedded_ipv4(addr)
        if inner is not None:
            addr = inner

    reason = _ip_in_blocked_networks(addr, policy)
    if reason is not None:
        raise SSRFBlockedError(reason)


def validate_hostname(hostname: str, policy: SSRFPolicy) -> None:
    """Validate a hostname against the SSRF policy.

    Args:
        hostname: Hostname to check (not an IP literal).
        policy: The policy to enforce.

    Raises:
        SSRFBlockedError: If the hostname is a blocked localhost name, cloud
            metadata name, or Kubernetes-internal name.
    """
    # A trailing dot marks an absolute DNS name and names the same host, so
    # strip it; otherwise "localhost." or "x.svc.cluster.local." would miss
    # the exact and suffix matches below.
    lower = hostname.lower().rstrip(".")

    if policy.block_localhost and lower in _LOCALHOST_NAMES:
        raise SSRFBlockedError("localhost address")

    if policy.block_cloud_metadata and lower in _CLOUD_METADATA_HOSTNAMES:
        raise SSRFBlockedError("cloud metadata endpoint")

    if policy.block_k8s_internal and lower.endswith(_K8S_SUFFIX):
        raise SSRFBlockedError("Kubernetes internal DNS")


def _effective_allowed_hosts(policy: SSRFPolicy) -> frozenset[str]:
    """Return allowed_hosts, augmented for local environments.

    When the `LANGCHAIN_ENV` environment variable starts with `"local"`,
    `localhost` and `testserver` are added to the policy's allow-list.
    """
    if os.environ.get("LANGCHAIN_ENV", "").startswith("local"):
        return policy.allowed_hosts | frozenset({"localhost", "testserver"})
    return policy.allowed_hosts


async def validate_url(url: str, policy: SSRFPolicy = SSRFPolicy()) -> None:
    """Validate a URL against the SSRF policy, including DNS resolution.

    This is the primary entry-point for async code paths. It delegates
    scheme/hostname/allowed-hosts checks to `validate_url_sync`, then
    resolves DNS and validates every resolved IP.

    Args:
        url: The URL to validate.
        policy: The policy to enforce.

    Raises:
        SSRFBlockedError: If the URL violates the policy or its hostname
            cannot be resolved.
    """
    parsed = urllib.parse.urlparse(url)
    hostname = parsed.hostname or ""

    validate_url_sync(url, policy)

    # Explicitly allowed hosts skip DNS resolution and IP checks entirely.
    allowed = {h.lower() for h in _effective_allowed_hosts(policy)}
    if hostname.lower() in allowed:
        return

    scheme = (parsed.scheme or "").lower()
    port = parsed.port or (443 if scheme == "https" else 80)
    try:
        # getaddrinfo blocks, so run it off the event loop.
        addrinfo = await asyncio.to_thread(
            socket.getaddrinfo, hostname, port, type=socket.SOCK_STREAM
        )
    except socket.gaierror as exc:
        msg = "DNS resolution failed"
        raise SSRFBlockedError(msg) from exc

    # Every resolved address must pass; one blocked record rejects the URL.
    for _family, _type, _proto, _canonname, sockaddr in addrinfo:
        validate_resolved_ip(str(sockaddr[0]), policy)


def validate_url_sync(url: str, policy: SSRFPolicy = SSRFPolicy()) -> None:
    """Synchronous URL validation (no DNS resolution).

    Suitable for Pydantic validators and other sync contexts. Checks scheme
    and hostname patterns only - use `validate_url` for full DNS-aware checking.

    Args:
        url: The URL to validate.
        policy: The policy to enforce.

    Raises:
        SSRFBlockedError: If the URL violates the policy.
    """
    parsed = urllib.parse.urlparse(url)

    scheme = (parsed.scheme or "").lower()
    if scheme not in policy.allowed_schemes:
        msg = f"scheme '{scheme}' not allowed"
        raise SSRFBlockedError(msg)

    hostname = parsed.hostname
    if not hostname:
        msg = "missing hostname"
        raise SSRFBlockedError(msg)

    allowed = _effective_allowed_hosts(policy)
    if hostname.lower() in {h.lower() for h in allowed}:
        return

    # IP literals are checked as addresses; anything else as a hostname.
    try:
        ipaddress.ip_address(hostname)
    except ValueError:
        is_ip_literal = False
    else:
        is_ip_literal = True

    if is_ip_literal:
        validate_resolved_ip(hostname, policy)
        return

    validate_hostname(hostname, policy)

Code quality findings 6

Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(addr, ipaddress.IPv4Address):
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(net, ipaddress.IPv4Network) and addr in net:
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(net, ipaddress.IPv6Network) and addr in net:
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(addr, ipaddress.IPv4Address) and (
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(addr, ipaddress.IPv6Address) and addr == _LOOPBACK_IPV6:
Overuse may indicate design issues; consider polymorphism
isinstance-overuse
if isinstance(addr, ipaddress.IPv6Address):

Get this view in your editor

Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.