# .github/scripts/check_diff.py · langchain-ai/langchain

"""Analyze git diffs to determine which directories need to be tested.

Intelligently determines which LangChain packages and directories need to be tested,
linted, or built based on the changes. Handles dependency relationships between
packages, maps file changes to appropriate CI job configurations, and outputs JSON
configurations for GitHub Actions.

- Maps changed files to affected package directories (libs/core, libs/partners/*, etc.)
- Builds dependency graph to include dependent packages when a package changes
- Generates test matrix configurations with appropriate Python versions
- Handles special cases for Pydantic version testing and performance benchmarks

Used as part of the check_diffs workflow.
"""

import glob
import json
import os
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Set

import tomllib
from get_min_versions import get_min_version_from_toml
from packaging.requirements import Requirement

LANGCHAIN_DIRS = [
    "libs/core",
    "libs/text-splitters",
    "libs/langchain",
    "libs/langchain_v1",
    "libs/model-profiles",
]

# Packages with VCR cassette-backed integration tests.
# These get a playback-only CI check to catch stale cassettes.
VCR_PACKAGES = {
    "libs/partners/openai",
}

# When set to True, we are ignoring core dependents
# in order to be able to get CI to pass for each individual
# package that depends on core
# e.g. if you touch core, we don't then add textsplitters/etc to CI
IGNORE_CORE_DEPENDENTS = False

# Ignored partners are removed from dependents but still run if directly edited
IGNORED_PARTNERS = [
    # remove huggingface from dependents because of CI instability
    # specifically in huggingface jobs
    "huggingface",
]


def all_package_dirs() -> Set[str]:
    """Return every package directory under ``libs/`` that has a ``pyproject.toml``.

    Paths are relative to the current working directory (e.g. ``libs/core``).
    Anything under ``libs/standard-tests`` is excluded.
    """
    return {
        # removeprefix drops exactly the leading "./"; lstrip("./") would strip
        # any run of "." and "/" characters instead.
        "/".join(path.split("/")[:-1]).removeprefix("./")
        for path in glob.glob("./libs/**/pyproject.toml", recursive=True)
        if "libs/standard-tests" not in path
    }


def dependents_graph() -> dict:
    """Construct a mapping of package -> dependents.

    Done such that we can run tests on all dependents of a package when a change
    is made.

    Keys are requirement names as written in the dependency lists (e.g.
    ``langchain-core``); values are sets of package directories (e.g.
    ``libs/partners/openai``) that depend on that requirement. Directories of
    partners listed in ``IGNORED_PARTNERS`` are removed from every value.

    Returns:
        A ``defaultdict(set)`` mapping requirement name to dependent directories.

    Raises:
        ValueError: If an ``extended_testing_deps.txt`` editable install points
            anywhere other than ``../partners/``.
    """
    dependents = defaultdict(set)

    for path in glob.glob("./libs/**/pyproject.toml", recursive=True):
        if "template" in path:
            continue

        # load regular and test deps from pyproject.toml
        with open(path, "rb") as f:
            pyproject = tomllib.load(f)

        pkg_dir = "libs" + "/".join(path.split("libs")[1].split("/")[:-1])
        # A package without a "test" dependency group simply contributes no
        # test-time edges instead of aborting the whole graph build.
        test_group = pyproject.get("dependency-groups", {}).get("test", [])
        for dep in [*pyproject["project"].get("dependencies", []), *test_group]:
            if not isinstance(dep, str):
                # Dependency groups may contain tables such as
                # {include-group = "..."}; those are not requirement strings.
                continue
            if "langchain" in dep:
                dependents[Requirement(dep).name].add(pkg_dir)

        # load extended deps from extended_testing_deps.txt
        extended_requirement_path = Path(path).parent / "extended_testing_deps.txt"
        if not extended_requirement_path.exists():
            continue
        with open(extended_requirement_path, "r") as f:
            extended_deps = f.read().splitlines()
        for depline in extended_deps:
            if depline.startswith("-e "):
                # editable dependency
                if not depline.startswith("-e ../partners/"):
                    msg = (
                        "Extended test deps should only editable install "
                        "partner packages"
                    )
                    raise ValueError(msg)
                partner = depline.split("partners/")[1]
                dep = f"langchain-{partner}"
            else:
                dep = depline.split("==")[0]

            if "langchain" in dep:
                dependents[dep].add(pkg_dir)

    for pkg_dependents in dependents.values():
        for partner in IGNORED_PARTNERS:
            pkg_dependents.discard(f"libs/partners/{partner}")
    return dependents


def add_dependents(dirs_to_eval: Set[str], dependents: dict) -> List[str]:
    """Expand a set of directories with the directories that depend on them.

    Args:
        dirs_to_eval: Package directories that were directly affected.
        dependents: Mapping of requirement name to dependent directories, as
            produced by ``dependents_graph``.

    Returns:
        Sorted list containing every input directory plus its dependents.
        Directories whose path contains ``core`` are included without their
        dependents.
    """
    updated = set()
    for dir_ in dirs_to_eval:
        updated.add(dir_)
        # handle core manually because it has so many dependents
        if "core" in dir_:
            continue
        pkg = "langchain-" + dir_.split("/")[-1]
        # .get avoids inserting empty entries into a defaultdict and also works
        # when a plain dict is passed.
        updated.update(dependents.get(pkg, ()))
    # Sorted so the generated CI matrix is identical from run to run.
    return sorted(updated)


def _get_configs_for_single_dir(job: str, dir_: str) -> List[Dict[str, str]]:
    """Build the matrix entries for one job in one package directory.

    Args:
        job: Job name. ``test-pydantic`` and ``codspeed`` get dedicated
            configurations; every other job gets one entry per Python version.
        dir_: Package directory, e.g. ``libs/core``.

    Returns:
        List of matrix entries, each containing at least ``working-directory``
        and ``python-version``.
    """
    if job == "test-pydantic":
        return _get_pydantic_test_configs(dir_)

    if job == "codspeed":
        # CPU simulation (<1% variance, Valgrind-based) is the default.
        # Partners with heavy SDK inits use walltime instead to keep CI fast.
        CODSPEED_WALLTIME_DIRS = {
            "libs/core",
            "libs/partners/fireworks",  # ~328s under simulation
            "libs/partners/openai",  # 6 benchmarks, ~6 min under simulation
        }
        mode = "walltime" if dir_ in CODSPEED_WALLTIME_DIRS else "simulation"
        return [
            {
                "working-directory": dir_,
                "python-version": "3.13",
                "codspeed-mode": mode,
            }
        ]

    if dir_ == "libs/core":
        py_versions = ["3.10", "3.11", "3.12", "3.13", "3.14"]
    else:
        py_versions = ["3.10", "3.14"]

    return [{"working-directory": dir_, "python-version": py_v} for py_v in py_versions]


def _locked_pydantic_minor(lock_path: str) -> int:
    """Return the minor version of the ``pydantic`` entry in a ``uv.lock`` file.

    Raises:
        ValueError: If the lock file has no ``pydantic`` package entry.
    """
    with open(lock_path, "rb") as f:
        lock_data = tomllib.load(f)
    for package in lock_data["package"]:
        if package["name"] == "pydantic":
            return int(package["version"].split(".")[1])
    msg = f"No pydantic entry found in {lock_path}"
    raise ValueError(msg)


def _minor_of(version: str) -> int:
    """Return the minor component of a dotted version string, or 0 if absent."""
    return int(version.split(".")[1]) if "." in version else 0


def _get_pydantic_test_configs(
    dir_: str, *, python_version: str = "3.12"
) -> List[Dict[str, str]]:
    """Build one matrix entry per pydantic 2.x minor supported by core and ``dir_``.

    The upper bound is the lower of the pydantic minors locked in
    ``libs/core/uv.lock`` and ``<dir_>/uv.lock``. The lower bound is the higher
    of the minimum pydantic minors reported by ``get_min_version_from_toml`` for
    the two ``pyproject.toml`` files (a package that does not declare pydantic
    contributes ``0``).

    Args:
        dir_: Package directory, e.g. ``libs/partners/openai``.
        python_version: Python version used for every generated entry.

    Returns:
        Matrix entries with ``working-directory``, ``pydantic-version`` (as
        ``2.<minor>.0``) and ``python-version``.

    Raises:
        ValueError: If either lock file has no ``pydantic`` entry.
    """
    core_max_pydantic_minor = _locked_pydantic_minor("./libs/core/uv.lock")
    dir_max_pydantic_minor = _locked_pydantic_minor(f"./{dir_}/uv.lock")

    core_min_pydantic_version = get_min_version_from_toml(
        "./libs/core/pyproject.toml", "release", python_version, include=["pydantic"]
    )["pydantic"]
    dir_min_pydantic_version = get_min_version_from_toml(
        f"./{dir_}/pyproject.toml", "release", python_version, include=["pydantic"]
    ).get("pydantic", "0.0.0")

    max_pydantic_minor = min(dir_max_pydantic_minor, core_max_pydantic_minor)
    min_pydantic_minor = max(
        _minor_of(dir_min_pydantic_version),
        _minor_of(core_min_pydantic_version),
    )

    return [
        {
            "working-directory": dir_,
            "pydantic-version": f"2.{v}.0",
            "python-version": python_version,
        }
        for v in range(min_pydantic_minor, max_pydantic_minor + 1)
    ]


def _get_configs_for_multi_dirs(
    job: str, dirs_to_run: Dict[str, Set[str]], dependents: dict
) -> List[Dict[str, str]]:
    """Build the full matrix for ``job`` across all affected directories.

    Args:
        job: One of ``lint``, ``test``, ``compile-integration-tests``,
            ``dependencies``, ``test-pydantic``, ``extended-tests``,
            ``codspeed`` or ``vcr-tests``.
        dirs_to_run: Directory sets keyed by ``lint``, ``test``,
            ``extended-test`` and ``codspeed``.
        dependents: Mapping produced by ``dependents_graph``.

    Returns:
        Concatenated matrix entries for every selected directory.

    Raises:
        ValueError: If ``job`` is not one of the known job names.
    """
    if job == "lint":
        dirs = add_dependents(
            dirs_to_run["lint"] | dirs_to_run["test"] | dirs_to_run["extended-test"],
            dependents,
        )
    elif job in ["test", "compile-integration-tests", "dependencies", "test-pydantic"]:
        dirs = add_dependents(
            dirs_to_run["test"] | dirs_to_run["extended-test"], dependents
        )
    elif job == "extended-tests":
        dirs = sorted(dirs_to_run["extended-test"])
    elif job == "codspeed":
        dirs = sorted(dirs_to_run["codspeed"])
    elif job == "vcr-tests":
        # Only run VCR tests for packages that have cassettes and are affected
        all_affected = set(
            add_dependents(
                dirs_to_run["test"] | dirs_to_run["extended-test"], dependents
            )
        )
        dirs = sorted(d for d in VCR_PACKAGES if d in all_affected)
    else:
        raise ValueError(f"Unknown job: {job}")

    return [
        config for dir_ in dirs for config in _get_configs_for_single_dir(job, dir_)
    ]


def _get_changed_files(args: list[str]) -> list[str]:
    """Parse changed files from command-line arguments.

    Args:
        args: Either a legacy list of filename arguments or a single JSON array
            produced by `Ana06/get-changed-files` with `format: json`.

    Returns:
        List of changed files.

    Raises:
        ValueError: If a single argument looks like JSON but is not a string array.
    """
    if len(args) != 1:
        return args

    value = args[0].strip()
    if not value.startswith("[") or not value.endswith("]"):
        return args

    msg = "Expected changed files JSON to be a list of strings."
    try:
        parsed = json.loads(value)
    except json.JSONDecodeError as e:
        raise ValueError(msg) from e

    if not isinstance(parsed, list) or not all(
        isinstance(file, str) for file in parsed
    ):
        raise ValueError(msg)
    return parsed


if __name__ == "__main__":
    files = _get_changed_files(sys.argv[1:])

    dirs_to_run: Dict[str, set] = {
        "lint": set(),
        "test": set(),
        "extended-test": set(),
        "codspeed": set(),
    }
    # NOTE(review): set below for root uv files but never read in this file.
    docs_edited = False

    if len(files) >= 300:
        # max diff length is 300 files - there are likely files missing
        dirs_to_run["lint"] = all_package_dirs()
        dirs_to_run["test"] = all_package_dirs()
        dirs_to_run["extended-test"] = set(LANGCHAIN_DIRS)

    for file in files:
        if file.startswith(
            (
                ".github/workflows",
                ".github/tools",
                ".github/actions",
                ".github/scripts/check_diff.py",
            )
        ):
            # Infrastructure changes (workflows, actions, CI scripts) trigger tests on
            # all core packages as a safety measure. This ensures that changes to CI/CD
            # infrastructure don't inadvertently break package testing, even if the change
            # appears unrelated (e.g., documentation build workflows). This is intentionally
            # conservative to catch unexpected side effects from workflow modifications.
            #
            # Example: A PR modifying .github/workflows/api_doc_build.yml adds every
            # directory in LANGCHAIN_DIRS to the extended-test set (which also feeds
            # the lint and test jobs), even though the workflow may only affect
            # documentation.
            dirs_to_run["extended-test"].update(LANGCHAIN_DIRS)

        if file.startswith("libs/core"):
            dirs_to_run["codspeed"].add("libs/core")
        if file.startswith("libs/langchain_v1"):
            dirs_to_run["codspeed"].add("libs/langchain_v1")
        if any(file.startswith(dir_) for dir_ in LANGCHAIN_DIRS):
            # add that dir and all dirs after in LANGCHAIN_DIRS
            # for extended testing

            found = False
            for dir_ in LANGCHAIN_DIRS:
                if dir_ == "libs/core" and IGNORE_CORE_DEPENDENTS:
                    dirs_to_run["extended-test"].add(dir_)
                    continue
                if file.startswith(dir_):
                    found = True
                if found:
                    dirs_to_run["extended-test"].add(dir_)
        elif file.startswith("libs/standard-tests"):
            # TODO: update to include all packages that rely on standard-tests (all partner packages)
            # Note: won't run on external repo partners
            dirs_to_run["lint"].add("libs/standard-tests")
            dirs_to_run["test"].add("libs/standard-tests")
            dirs_to_run["test"].add("libs/partners/mistralai")
            dirs_to_run["test"].add("libs/partners/openai")
            dirs_to_run["test"].add("libs/partners/anthropic")
            dirs_to_run["test"].add("libs/partners/fireworks")
            dirs_to_run["test"].add("libs/partners/groq")

        elif file.startswith("libs/partners"):
            partner_dir = file.split("/")[2]
            if os.path.isdir(f"libs/partners/{partner_dir}") and [
                filename
                for filename in os.listdir(f"libs/partners/{partner_dir}")
                if not filename.startswith(".")
            ] != ["README.md"]:
                dirs_to_run["test"].add(f"libs/partners/{partner_dir}")
                # Only add to codspeed if the partner has benchmarks and is not ignored
                if (
                    partner_dir not in IGNORED_PARTNERS
                    and os.path.isdir(
                        f"libs/partners/{partner_dir}/tests/benchmarks"
                    )
                ):
                    dirs_to_run["codspeed"].add(f"libs/partners/{partner_dir}")
            # Skip if the directory was deleted or is just a tombstone readme
        elif file.startswith("libs/"):
            # Check if this is a root-level file in libs/ (e.g., libs/README.md)
            file_parts = file.split("/")
            if len(file_parts) == 2:
                # Root-level file in libs/, skip it (no tests needed)
                continue
            raise ValueError(
                f"Unknown lib: {file}. check_diff.py likely needs "
                "an update for this new library!"
            )
        elif file in [
            "pyproject.toml",
            "uv.lock",
        ]:  # root uv files
            docs_edited = True

    dependents = dependents_graph()

    # we now have dirs_by_job
    # todo: clean this up
    map_job_to_configs = {
        job: _get_configs_for_multi_dirs(job, dirs_to_run, dependents)
        for job in [
            "lint",
            "test",
            "extended-tests",
            "compile-integration-tests",
            "dependencies",
            "test-pydantic",
            "codspeed",
            "vcr-tests",
        ]
    }

    for key, value in map_job_to_configs.items():
        json_output = json.dumps(value)
        print(f"{key}={json_output}")