.github/tools/git-restore-mtime 757 lines View on github.com → Search inside
#!/usr/bin/env python3
#
# git-restore-mtime - Change mtime of files based on commit date of last change
#
#    Copyright (C) 2012 Rodrigo Silva (MestreLion) <linux@rodrigosilva.com>
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program. See <http://www.gnu.org/licenses/gpl.html>
#
# Source: https://github.com/MestreLion/git-tools
# Version: July 13, 2023 (commit hash 5f832e72453e035fccae9d63a5056918d64476a2)
"""
Change the modification time (mtime) of files in work tree, based on the
date of the most recent commit that modified the file, including renames.

Ignores untracked files and uncommitted deletions, additions and renames, and
by default modifications too.
---
Useful prior to generating release tarballs, so each file is archived with a
date that is similar to the date when the file was actually last modified,
assuming the actual modification date and its commit date are close.
"""

# TODO:
# - Add -z on git whatchanged/ls-files, so we don't deal with filename decoding
# - When Python is bumped to 3.7, use text instead of universal_newlines on subprocess
# - Update "Statistics for some large projects" with modern hardware and repositories.
# - Create a README.md for git-restore-mtime alone. It deserves extensive documentation
#   - Move Statistics there
# - See git-extras as a good example on project structure and documentation

# FIXME:
# - When current dir is outside the worktree, e.g. using --work-tree, `git ls-files`
#   assume any relative pathspecs are to worktree root, not the current dir. As such,
#   relative pathspecs may not work.
# - Renames are tricky:
#   - R100 should not change mtime, but original name is not on filelist. Should
#     track renames until a valid (A, M) mtime found and then set on current name.
#   - Should set mtime for both current and original directories.
#   - Check mode changes with unchanged blobs?
# - Check file (A, D) for the directory mtime is not sufficient:
#   - Renames also change dir mtime, unless rename was on a parent dir
#   - If most recent change of all files in a dir was a Modification (M),
#     dir might not be touched at all.
#   - Dirs containing only subdirectories but no direct files will also
#     not be touched. They're files' [grand]parent dir, but never their dirname().
#   - Some solutions:
#     - After files done, perform some dir processing for missing dirs, finding latest
#       file (A, D, R)
#     - Simple approach: dir mtime is the most recent child (dir or file) mtime
#     - Use a virtual concept of "created at most at" to fill missing info, bubble up
#       to parents and grandparents
#   - When handling [grand]parent dirs, stay inside <pathspec>
# - Better handling of merge commits. `-m` is plain *wrong*. `-c/--cc` is perfect, but
#   painfully slow. First pass without merge commits is not accurate. Maybe add a new
#   `--accurate` mode for `--cc`?

if __name__ != "__main__":
    raise ImportError("{} should not be used as a module.".format(__name__))

import argparse
import datetime
import logging
import os.path
import shlex
import signal
import subprocess
import sys
import time

__version__ = "2022.12+dev"

# Update symlinks only if the platform supports not following them
UPDATE_SYMLINKS = bool(os.utime in getattr(os, "supports_follow_symlinks", []))

# Call os.path.normpath() only if not in a POSIX platform (Windows)
NORMALIZE_PATHS = os.path.sep != "/"

# How many files to process in each batch when re-trying merge commits
STEPMISSING = 100

# (Extra) keywords for the os.utime() call performed by touch()
UTIME_KWS = {} if not UPDATE_SYMLINKS else {"follow_symlinks": False}


# Command-line interface ######################################################


def parse_args():
    """Build the command-line parser, parse sys.argv and return the namespace.

    Besides the declared options, the returned namespace carries:
    - loglevel: adjusted according to --verbose, when given
    - debug: True when loglevel is logging.DEBUG or lower

    Relies on setup_logging() having been called first, as it reads logging.TRACE.
    """
    parser = argparse.ArgumentParser(description=__doc__.split("\n---")[0])

    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--quiet",
        "-q",
        dest="loglevel",
        action="store_const",
        const=logging.WARNING,
        default=logging.INFO,
        help="Suppress informative messages and summary statistics.",
    )
    group.add_argument(
        "--verbose",
        "-v",
        action="count",
        help="""
        Print additional information for each processed file.
        Specify twice to further increase verbosity.
        """,
    )

    parser.add_argument(
        "--cwd",
        "-C",
        metavar="DIRECTORY",
        help="""
        Run as if %(prog)s was started in directory %(metavar)s.
        This affects how --work-tree, --git-dir and PATHSPEC arguments are handled.
        See 'man 1 git' or 'git --help' for more information.
        """,
    )

    parser.add_argument(
        "--git-dir",
        dest="gitdir",
        metavar="GITDIR",
        help="""
        Path to the git repository, by default auto-discovered by searching
        the current directory and its parents for a .git/ subdirectory.
        """,
    )

    parser.add_argument(
        "--work-tree",
        dest="workdir",
        metavar="WORKTREE",
        help="""
        Path to the work tree root, by default the parent of GITDIR if it's
        automatically discovered, or the current directory if GITDIR is set.
        """,
    )

    parser.add_argument(
        "--force",
        "-f",
        default=False,
        action="store_true",
        help="""
        Force updating files with uncommitted modifications.
        Untracked files and uncommitted deletions, renames and additions are
        always ignored.
        """,
    )

    parser.add_argument(
        "--merge",
        "-m",
        default=False,
        action="store_true",
        help="""
        Include merge commits.
        Leads to more recent times and more files per commit, thus with the same
        time, which may or may not be what you want.
        Including merge commits may lead to fewer commits being evaluated as files
        are found sooner, which can improve performance, sometimes substantially.
        But as merge commits are usually huge, processing them may also take longer.
        By default, merge commits are only used for files missing from regular commits.
        """,
    )

    parser.add_argument(
        "--first-parent",
        default=False,
        action="store_true",
        help="""
        Consider only the first parent, the "main branch", when evaluating merge commits.
        Only effective when merge commits are processed, either when --merge is
        used or when finding missing files after the first regular log search.
        See --skip-missing.
        """,
    )

    parser.add_argument(
        "--skip-missing",
        "-s",
        dest="missing",
        default=True,
        action="store_false",
        help="""
        Do not try to find missing files.
        If merge commits were not evaluated with --merge and some files were
        not found in regular commits, by default %(prog)s searches for these
        files again in the merge commits.
        This option disables this retry, so files found only in merge commits
        will not have their timestamp updated.
        """,
    )

    parser.add_argument(
        "--no-directories",
        "-D",
        dest="dirs",
        default=True,
        action="store_false",
        help="""
        Do not update directory timestamps.
        By default, use the time of its most recently created, renamed or deleted file.
        Note that just modifying a file will NOT update its directory time.
        """,
    )

    parser.add_argument(
        "--test",
        "-t",
        default=False,
        action="store_true",
        help="Test run: do not actually update any file timestamp.",
    )

    parser.add_argument(
        "--commit-time",
        "-c",
        dest="commit_time",
        default=False,
        action="store_true",
        help="Use commit time instead of author time.",
    )

    parser.add_argument(
        "--oldest-time",
        "-o",
        dest="reverse_order",
        default=False,
        action="store_true",
        help="""
        Update times based on the oldest, instead of the most recent commit of a file.
        This reverses the order in which the git log is processed to emulate a
        file "creation" date. Note this will be inaccurate for files deleted and
        re-created at later dates.
        """,
    )

    parser.add_argument(
        "--skip-older-than",
        metavar="SECONDS",
        type=int,
        help="""
        Ignore files that are currently older than %(metavar)s.
        Useful in workflows that assume such files already have a correct timestamp,
        as it may improve performance by processing fewer files.
        """,
    )

    parser.add_argument(
        "--skip-older-than-commit",
        "-N",
        default=False,
        action="store_true",
        help="""
        Ignore files older than the timestamp it would be updated to.
        Such files may be considered "original", likely in the author's repository.
        """,
    )

    parser.add_argument(
        "--unique-times",
        default=False,
        action="store_true",
        help="""
        Set the microseconds to a unique value per commit.
        Allows telling apart changes that would otherwise have identical timestamps,
        as git's time accuracy is in seconds.
        """,
    )

    parser.add_argument(
        "pathspec",
        nargs="*",
        metavar="PATHSPEC",
        help="""
        Only modify paths matching %(metavar)s, relative to current directory.
        By default, update all but untracked files and submodules.
        """,
    )

    parser.add_argument(
        "--version",
        "-V",
        action="version",
        version="%(prog)s version {version}".format(version=get_version()),
    )

    args_ = parser.parse_args()
    if args_.verbose:
        # -v => DEBUG, -vv (or more) => TRACE
        args_.loglevel = max(logging.TRACE, logging.DEBUG // args_.verbose)
    args_.debug = args_.loglevel <= logging.DEBUG
    return args_


def get_version(version=__version__):
    """Return the version string shown by --version.

    A version not ending in "+dev" is returned unchanged. Otherwise the output
    of `git describe --tags`, run in this script's directory, is used (leading
    "v" characters stripped). If that fails, either because git reports an
    error or because the git executable cannot be run at all,
    "<version>-unknown" is returned.
    """
    if not version.endswith("+dev"):
        return version
    try:
        cwd = os.path.dirname(os.path.realpath(__file__))
        return Git(cwd=cwd, errors=False).describe().lstrip("v")
    except (Git.Error, OSError):
        # OSError: git executable missing or not runnable. This function runs
        # while the parser is being built, so it must not break --help.
        return "-".join((version, "unknown"))


# Helper functions ############################################################


def setup_logging():
    """Add TRACE logging level and corresponding method, return the root logger"""
    logging.TRACE = TRACE = logging.DEBUG // 2
    logging.Logger.trace = lambda _, m, *a, **k: _.log(TRACE, m, *a, **k)
    return logging.getLogger()


def normalize(path):
    r"""Normalize paths from git, handling non-ASCII characters.

    Git stores paths as UTF-8 normalization form C.
    If path contains non-ASCII or non-printable characters, git outputs the UTF-8
    in octal-escaped notation, escaping double-quotes and backslashes, and then
    double-quoting the whole path.
    https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath

    This function reverts this encoding, so:
    normalize(r'"Back\\slash_double\"quote_a\303\247a\303\255"') =>
        r'Back\slash_double"quote_açaí')

    Paths with invalid UTF-8 encoding, such as single 0x80-0xFF bytes (e.g, from
    Latin1/Windows-1251 encoding) are decoded using surrogate escape, the same
    method used by Python for filesystem paths. So 0xE6 ("æ" in Latin1, r'\\346'
    from Git) is decoded as "\udce6". See https://peps.python.org/pep-0383/ and
    https://vstinner.github.io/painful-history-python-filesystem-encoding.html

    Also see notes on `windows/non-ascii-paths.txt` about path encodings on
    non-UTF-8 platforms and filesystems.
    """
    if path and path[0] == '"':
        # Python 2: path = path[1:-1].decode("string-escape")
        # Python 3: https://stackoverflow.com/a/46650050/624066
        path = (
            path[1:-1]  # Remove enclosing double quotes
            .encode("latin1")  # Convert to bytes, required by 'unicode-escape'
            .decode("unicode-escape")  # Perform the actual octal-escaping decode
            .encode("latin1")  # 1:1 mapping to bytes, UTF-8 encoded
            .decode("utf8", "surrogateescape")
        )  # Decode from UTF-8
    if NORMALIZE_PATHS:
        # Make sure the slash matches the OS; for Windows we need a backslash
        path = os.path.normpath(path)
    return path


def dummy(*_args, **_kwargs):
    """No-op function used in dry-run tests"""


def touch(path, mtime):
    """The actual mtime update"""
    os.utime(path, (mtime, mtime), **UTIME_KWS)


def touch_ns(path, mtime_ns):
    """The actual mtime update, using nanoseconds for unique timestamps"""
    os.utime(path, None, ns=(mtime_ns, mtime_ns), **UTIME_KWS)


def isodate(secs: int):
    """Format a timestamp in seconds as local 'YYYY-MM-DD HH:MM:SS'"""
    # time.localtime() accepts floats, but discards fractional part
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(secs))


def isodate_ns(ns: int):
    """Format a timestamp in nanoseconds as a local ISO date, space-separated"""
    # for integers fromtimestamp() is equivalent and ~16% slower than isodate()
    return datetime.datetime.fromtimestamp(ns / 1000000000).isoformat(sep=" ")


def get_mtime_ns(secs: int, idx: int):
    """Return a timestamp in nanoseconds: secs plus (idx modulo 10**6) microseconds"""
    # Time resolution for filesystems and functions:
    # ext-4 and other POSIX filesystems: 1 nanosecond
    # NTFS (Windows default): 100 nanoseconds
    # datetime.datetime() (due to 64-bit float epoch): 1 microsecond
    us = idx % 1000000  # 10**6
    return 1000 * (1000000 * secs + us)


def get_mtime_path(path, ns=False):
    """Return the current modification time of path.

    The time is read from the same object touch() updates: when symlinks are
    updated without being followed (UPDATE_SYMLINKS), the link itself is
    inspected, so broken symlinks do not raise and a link is not mistaken for
    its target.

    :param path: filesystem path to inspect
    :param ns: if true return integer nanoseconds, otherwise float seconds
    :raises OSError: if path cannot be stat'ed
    """
    st = os.lstat(path) if UPDATE_SYMLINKS else os.stat(path)
    return st.st_mtime_ns if ns else st.st_mtime


# Git class and parse_log(), the heart of the script ##########################


class Git:
    """Thin wrapper around the git executable for a given repository.

    After construction, `workdir` and `gitdir` hold the normalized paths reported
    by `git rev-parse --show-toplevel --absolute-git-dir`.
    """

    def __init__(self, workdir=None, gitdir=None, cwd=None, errors=True):
        """Build the base git command line and discover the repository dirs.

        :param workdir: passed to git as --work-tree, if set
        :param gitdir: passed to git as --git-dir, if set
        :param cwd: passed to git as -C, if set
        :param errors: if false, git's stderr is discarded
        :raises Git.Error: if `git rev-parse` exits with a non-zero status
        """
        self.gitcmd = ["git"]
        self.errors = errors
        # Most recent streaming subprocess, so terminate() can stop it
        self._proc = None
        if workdir:
            self.gitcmd.extend(("--work-tree", workdir))
        if gitdir:
            self.gitcmd.extend(("--git-dir", gitdir))
        if cwd:
            self.gitcmd.extend(("-C", cwd))
        self.workdir, self.gitdir = self._get_repo_dirs()

    def ls_files(self, paths: list = None):
        """Yield normalized paths from `git ls-files --full-name [-- paths]`"""
        return (normalize(_) for _ in self._run("ls-files --full-name", paths))

    def ls_dirty(self, force=False):
        """Yield normalized paths of dirty entries from `git status --porcelain`.

        Untracked entries ("??") are never included. For renames, the text after
        " -> " is used. If force is true, only entries whose first status column
        is "R" or "A", or whose second status column is "D", are included.
        """
        return (
            normalize(_[3:].split(" -> ", 1)[-1])
            for _ in self._run("status --porcelain")
            if _[:2] != "??" and (not force or (_[0] in ("R", "A") or _[1] == "D"))
        )

    def log(
        self,
        merge=False,
        first_parent=False,
        commit_time=False,
        reverse_order=False,
        paths: list = None,
    ):
        """Yield the lines of `git whatchanged`, one timestamp line per commit.

        :param merge: add -m to the command
        :param first_parent: add --first-parent to the command
        :param commit_time: use %ct as the pretty format instead of %at
        :param reverse_order: add --reverse to the command
        :param paths: optional pathspecs, appended after "--"
        """
        # NOTE(review): recent Git releases are reported to deprecate
        # `whatchanged`; confirm against the Git versions this must support.
        cmd = "whatchanged --pretty={}".format("%ct" if commit_time else "%at")
        if merge:
            cmd += " -m"
        if first_parent:
            cmd += " --first-parent"
        if reverse_order:
            cmd += " --reverse"
        return self._run(cmd, paths)

    def describe(self):
        """Return the first output line of `git describe --tags`.

        :raises Git.Error: if git exits with a non-zero status
        """
        return self._run("describe --tags", check=True)[0]

    def terminate(self):
        """Terminate the most recent streaming git subprocess, if any"""
        if self._proc is None:
            return
        try:
            self._proc.terminate()
        except OSError:
            # Avoid errors on OpenBSD
            pass

    def _get_repo_dirs(self):
        """Return an iterable with the normalized worktree root and git dir"""
        return (
            os.path.normpath(_)
            for _ in self._run(
                "rev-parse --show-toplevel --absolute-git-dir", check=True
            )
        )

    def _run(self, cmdstr: str, paths: list = None, output=True, check=False):
        """Run a git sub-command.

        :param cmdstr: sub-command and its options, split with shlex
        :param paths: optional pathspecs, appended after "--"
        :param output: if false, return only the exit status of the command
        :param check: if true, wait for completion and return a list of output
            lines, raising Git.Error on a non-zero exit status
        :return: exit status, list of lines, or (the default) a generator of
            right-stripped lines read from the running process
        """
        cmdlist = self.gitcmd + shlex.split(cmdstr)
        if paths:
            cmdlist.append("--")
            cmdlist.extend(paths)
        popen_args = dict(universal_newlines=True, encoding="utf8")
        if not self.errors:
            popen_args["stderr"] = subprocess.DEVNULL
        log.trace("Executing: %s", " ".join(cmdlist))
        if not output:
            return subprocess.call(cmdlist, **popen_args)
        if check:
            try:
                stdout: str = subprocess.check_output(cmdlist, **popen_args)
                return stdout.splitlines()
            except subprocess.CalledProcessError as e:
                raise self.Error(e.returncode, e.cmd, e.output, e.stderr) from e
        self._proc = subprocess.Popen(cmdlist, stdout=subprocess.PIPE, **popen_args)
        return (_.rstrip() for _ in self._proc.stdout)

    def __del__(self):
        self.terminate()

    class Error(subprocess.CalledProcessError):
        """Error from git executable"""


def parse_log(filelist, dirlist, stats, git, merge=False, filterlist=None):
    """Walk the git log and update the mtime of each file and directory found.

    Entries are removed from filelist and dirlist as they are processed, so
    whatever remains afterwards was not found in the log. The log subprocess is
    terminated as soon as stats["files"] reaches zero.

    :param filelist: set of paths, relative to the worktree root, still pending
    :param dirlist: set of directory paths still pending
    :param stats: dict of counters, updated in place
    :param git: Git instance used to read the log and locate the worktree
    :param merge: passed to Git.log() to include merge commits
    :param filterlist: optional pathspecs passed to Git.log()
    """

    def do_file(file, mtime, datestr):
        """Update a single file, honoring --skip-older-than-commit"""
        # The log lists paths relative to the worktree root, which is not
        # necessarily the current directory: always use the full path.
        fullpath = os.path.join(git.workdir, file)
        if args.skip_older_than_commit:
            try:
                # With --unique-times mtime is in nanoseconds, so the current
                # time must be read in the same unit to be comparable.
                current = get_mtime_path(fullpath, ns=args.unique_times)
            except OSError as e:
                log.error("ERROR: %s: %s", e, file)
                stats["errors"] += 1
                return
            if current <= mtime:
                stats["skip"] += 1
                return
        if args.debug:
            log.debug(
                "%d\t%d\t%d\t%s\t%s",
                stats["loglines"],
                stats["commits"],
                stats["files"],
                datestr,
                file,
            )
        try:
            touch(fullpath, mtime)
            stats["touches"] += 1
        except Exception as e:
            log.error("ERROR: %s: %s", e, file)
            stats["errors"] += 1

    def do_dir(dirname, mtime, datestr):
        """Update a single directory"""
        if args.debug:
            log.debug(
                "%d\t%d\t-\t%s\t%s",
                stats["loglines"],
                stats["commits"],
                datestr,
                "{}/".format(dirname or "."),
            )
        try:
            touch(os.path.join(git.workdir, dirname), mtime)
            stats["dirtouches"] += 1
        except Exception as e:
            log.error("ERROR: %s: %s", e, dirname)
            stats["direrrors"] += 1

    mtime = 0
    datestr = isodate(0)
    for line in git.log(
        merge, args.first_parent, args.commit_time, args.reverse_order, filterlist
    ):
        stats["loglines"] += 1

        # Blank line between Date and list of files
        if not line:
            continue

        # Date line
        if line[0] != ":":  # Faster than `not line.startswith(':')`
            stats["commits"] += 1
            mtime = int(line)
            if args.unique_times:
                mtime = get_mtime_ns(mtime, stats["commits"])
            if args.debug:
                datestr = isodate(mtime)
            continue

        # File line: three tokens if it describes a renaming, otherwise two
        tokens = line.split("\t")

        # Possible statuses:
        # M: Modified (content changed)
        # A: Added (created)
        # D: Deleted
        # T: Type changed: to/from regular file, symlinks, submodules
        # R099: Renamed (moved), with % of unchanged content. 100 = pure rename
        # Not possible in log: C=Copied, U=Unmerged, X=Unknown, B=pairing Broken
        status = tokens[0].split(" ")[-1]

        # Handles non-ASCII chars and OS path separator
        file = normalize(tokens[-1])

        if file in filelist:
            stats["files"] -= 1
            filelist.remove(file)
            do_file(file, mtime, datestr)

        if args.dirs and status in ("A", "D"):
            dirname = os.path.dirname(file)
            if dirname in dirlist:
                dirlist.remove(dirname)
                do_dir(dirname, mtime, datestr)

        # All files done?
        if not stats["files"]:
            git.terminate()
            return


# Main Logic ##################################################################


def main():
    """Run the tool according to the global `args`.

    :return: None on success or when there is nothing to do; the errno of a
        failed --cwd change; or git's exit status when the repository could not
        be opened.
    """
    start = time.time()  # yes, Wall time. CPU time is not realistic for users.
    stats = dict.fromkeys(
        (
            "loglines",
            "commits",
            "touches",
            "skip",
            "errors",
            "dirtouches",
            "direrrors",
        ),
        0,
    )

    logging.basicConfig(level=args.loglevel, format="%(message)s")
    log.trace("Arguments: %s", args)

    # First things first: Where and Who are we?
    if args.cwd:
        log.debug("Changing directory: %s", args.cwd)
        try:
            os.chdir(args.cwd)
        except OSError as e:
            log.critical(e)
            return e.errno
    # Using both os.chdir() and `git -C` is redundant, but might prevent side effects
    # `git -C` alone could be enough if we make sure that:
    # - all paths, including args.pathspec, are processed by git: ls-files, rev-parse
    # - touch() / os.utime() path argument is always prepended with git.workdir
    try:
        git = Git(workdir=args.workdir, gitdir=args.gitdir, cwd=args.cwd)
    except Git.Error as e:
        # Not in a git repository, and git already informed user on stderr. So we just...
        return e.returncode

    # Get the files managed by git and build file list to be processed
    if UPDATE_SYMLINKS and not args.skip_older_than:
        filelist = set(git.ls_files(args.pathspec))
    else:
        filelist = set()
        for path in git.ls_files(args.pathspec):
            fullpath = os.path.join(git.workdir, path)

            # Symlink (to file, to dir or broken - git handles the same way)
            if not UPDATE_SYMLINKS and os.path.islink(fullpath):
                log.warning(
                    "WARNING: Skipping symlink, no OS support for updates: %s", path
                )
                continue

            # skip files which are older than given threshold
            if args.skip_older_than:
                try:
                    age = start - get_mtime_path(fullpath)
                except OSError:
                    # Tracked but unreadable, e.g. an uncommitted deletion.
                    # Keep it listed: dirty entries are filtered out below.
                    age = None
                if age is not None and age > args.skip_older_than:
                    continue

            # Always add files relative to worktree root
            filelist.add(path)

    # If --force, silently ignore uncommitted deletions (not in the filesystem)
    # and renames / additions (will not be found in log anyway)
    if args.force:
        filelist -= set(git.ls_dirty(force=True))
    # Otherwise, ignore any dirty files
    else:
        dirty = set(git.ls_dirty())
        if dirty:
            log.warning(
                "WARNING: Modified files in the working directory were ignored."
                "\nTo include such files, commit your changes or use --force."
            )
            filelist -= dirty

    # Build dir list to be processed
    dirlist = {os.path.dirname(_) for _ in filelist} if args.dirs else set()

    stats["totalfiles"] = stats["files"] = len(filelist)
    log.info("{0:,} files to be processed in work dir".format(stats["totalfiles"]))

    if not filelist:
        # Nothing to do. Exit silently and without errors, just like git does
        return

    # Process the log until all files are 'touched'
    log.debug("Line #\tLog #\tF.Left\tModification Time\tFile Name")
    parse_log(filelist, dirlist, stats, git, args.merge, args.pathspec)

    # Missing files
    if filelist:
        # Try to find them in merge logs, if not done already
        # (usually HUGE, thus MUCH slower!)
        if args.missing and not args.merge:
            filterlist = list(filelist)
            missing = len(filterlist)
            log.info(
                "{0:,} files not found in log, trying merge commits".format(missing)
            )
            for i in range(0, missing, STEPMISSING):
                parse_log(
                    filelist,
                    dirlist,
                    stats,
                    git,
                    merge=True,
                    filterlist=filterlist[i : i + STEPMISSING],
                )

        # Still missing some?
        for file in filelist:
            log.warning("WARNING: not found in the log: %s", file)

    # Final statistics
    # Suggestion: use git-log --before=mtime to brag about skipped log entries
    def log_info(msg, *a, width=13):
        """Log msg at INFO level, rendering %d and %f with thousand separators"""
        ifmt = "{:%d,}" % (width,)  # not using 'n' for consistency with ffmt
        ffmt = "{:%d,.2f}" % (width,)
        # %-formatting lacks a thousand separator, must pre-render with .format()
        log.info(msg.replace("%d", ifmt).replace("%f", ffmt).format(*a))

    log_info(
        "Statistics:\n%f seconds\n%d log lines processed\n%d commits evaluated",
        time.time() - start,
        stats["loglines"],
        stats["commits"],
    )

    if args.dirs:
        if stats["direrrors"]:
            log_info("%d directory update errors", stats["direrrors"])
        log_info("%d directories updated", stats["dirtouches"])

    if stats["touches"] != stats["totalfiles"]:
        log_info("%d files", stats["totalfiles"])
    if stats["skip"]:
        log_info("%d files skipped", stats["skip"])
    if stats["files"]:
        log_info("%d files missing", stats["files"])
    if stats["errors"]:
        log_info("%d file update errors", stats["errors"])

    log_info("%d files updated", stats["touches"])

    if args.test:
        log.info("TEST RUN - No files modified!")


# Keep only essential, global assignments here. Any other logic must be in main()
log = setup_logging()
args = parse_args()

# Set the actual touch() and other functions based on command-line arguments
if args.unique_times:
    touch = touch_ns
    isodate = isodate_ns

# Make sure this is always set last to ensure --test behaves as intended
if args.test:
    touch = dummy

# UI done, it's showtime!
try:
    sys.exit(main())
except KeyboardInterrupt:
    log.info("\nAborting")
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    os.kill(os.getpid(), signal.SIGINT)

Findings

✓ No findings reported for this file.

Get this view in your editor

Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.