#!/usr/bin/env python3
#
# git-restore-mtime - Change mtime of files based on commit date of last change
#
# Copyright (C) 2012 Rodrigo Silva (MestreLion) <linux@rodrigosilva.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. See <http://www.gnu.org/licenses/gpl.html>
#
# Source: https://github.com/MestreLion/git-tools
# Version: July 13, 2023 (commit hash 5f832e72453e035fccae9d63a5056918d64476a2)
"""
Change the modification time (mtime) of files in work tree, based on the
date of the most recent commit that modified the file, including renames.

Ignores untracked files and uncommitted deletions, additions and renames, and
by default modifications too.
---
Useful prior to generating release tarballs, so each file is archived with a
date that is similar to the date when the file was actually last modified,
assuming the actual modification date and its commit date are close.
"""

# TODO:
# - Add -z on git whatchanged/ls-files, so we don't deal with filename decoding
# - When Python is bumped to 3.7, use text instead of universal_newlines on subprocess
# - Update "Statistics for some large projects" with modern hardware and repositories.
# - Create a README.md for git-restore-mtime alone. It deserves extensive documentation
#   - Move Statistics there
# - See git-extras as a good example on project structure and documentation

# FIXME:
# - When current dir is outside the worktree, e.g. using --work-tree, `git ls-files`
#   assume any relative pathspecs are to worktree root, not the current dir. As such,
#   relative pathspecs may not work.
# - Renames are tricky:
#   - R100 should not change mtime, but original name is not on filelist. Should
#     track renames until a valid (A, M) mtime found and then set on current name.
#   - Should set mtime for both current and original directories.
#   - Check mode changes with unchanged blobs?
# - Check file (A, D) for the directory mtime is not sufficient:
#   - Renames also change dir mtime, unless rename was on a parent dir
#   - If most recent change of all files in a dir was a Modification (M),
#     dir might not be touched at all.
#   - Dirs containing only subdirectories but no direct files will also
#     not be touched. They're files' [grand]parent dir, but never their dirname().
#   - Some solutions:
#     - After files done, perform some dir processing for missing dirs, finding latest
#       file (A, D, R)
#     - Simple approach: dir mtime is the most recent child (dir or file) mtime
#     - Use a virtual concept of "created at most at" to fill missing info, bubble up
#       to parents and grandparents
#     - When handling [grand]parent dirs, stay inside <pathspec>
# - Better handling of merge commits. `-m` is plain *wrong*. `-c/--cc` is perfect, but
#   painfully slow. First pass without merge commits is not accurate. Maybe add a new
#   `--accurate` mode for `--cc`?

if __name__ != "__main__":
    raise ImportError("{} should not be used as a module.".format(__name__))

import argparse
import datetime
import logging
import os.path
import shlex
import signal
import subprocess
import sys
import time

__version__ = "2022.12+dev"

# Update symlinks only if the platform supports not following them
UPDATE_SYMLINKS = bool(os.utime in getattr(os, "supports_follow_symlinks", []))

# Call os.path.normpath() only if not in a POSIX platform (Windows)
NORMALIZE_PATHS = os.path.sep != "/"

# How many files to process in each batch when re-trying merge commits
STEPMISSING = 100

# (Extra) keywords for the os.utime() call performed by touch()
UTIME_KWS = {} if not UPDATE_SYMLINKS else {"follow_symlinks": False}


# Command-line interface ######################################################


def parse_args():
    """Parse the command line and return the resulting namespace.

    Besides the declared options, the returned namespace carries:
    - `loglevel`: lowered below INFO according to the number of `-v` given
    - `debug`: True when `loglevel` is DEBUG or lower

    Requires setup_logging() to have run, as it reads `logging.TRACE`.
    """
    parser = argparse.ArgumentParser(description=__doc__.split("\n---")[0])

    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--quiet",
        "-q",
        dest="loglevel",
        action="store_const",
        const=logging.WARNING,
        default=logging.INFO,
        help="Suppress informative messages and summary statistics.",
    )
    group.add_argument(
        "--verbose",
        "-v",
        action="count",
        help="""
        Print additional information for each processed file.
        Specify twice to further increase verbosity.
        """,
    )

    parser.add_argument(
        "--cwd",
        "-C",
        metavar="DIRECTORY",
        help="""
        Run as if %(prog)s was started in directory %(metavar)s.
        This affects how --work-tree, --git-dir and PATHSPEC arguments are handled.
        See 'man 1 git' or 'git --help' for more information.
        """,
    )

    parser.add_argument(
        "--git-dir",
        dest="gitdir",
        metavar="GITDIR",
        help="""
        Path to the git repository, by default auto-discovered by searching
        the current directory and its parents for a .git/ subdirectory.
        """,
    )

    parser.add_argument(
        "--work-tree",
        dest="workdir",
        metavar="WORKTREE",
        help="""
        Path to the work tree root, by default the parent of GITDIR if it's
        automatically discovered, or the current directory if GITDIR is set.
        """,
    )

    parser.add_argument(
        "--force",
        "-f",
        default=False,
        action="store_true",
        help="""
        Force updating files with uncommitted modifications.
        Untracked files and uncommitted deletions, renames and additions are
        always ignored.
        """,
    )

    parser.add_argument(
        "--merge",
        "-m",
        default=False,
        action="store_true",
        help="""
        Include merge commits.
        Leads to more recent times and more files per commit, thus with the same
        time, which may or may not be what you want.
        Including merge commits may lead to fewer commits being evaluated as files
        are found sooner, which can improve performance, sometimes substantially.
        But as merge commits are usually huge, processing them may also take longer.
        By default, merge commits are only used for files missing from regular commits.
        """,
    )

    parser.add_argument(
        "--first-parent",
        default=False,
        action="store_true",
        help="""
        Consider only the first parent, the "main branch", when evaluating merge commits.
        Only effective when merge commits are processed, either when --merge is
        used or when finding missing files after the first regular log search.
        See --skip-missing.
        """,
    )

    parser.add_argument(
        "--skip-missing",
        "-s",
        dest="missing",
        default=True,
        action="store_false",
        help="""
        Do not try to find missing files.
        If merge commits were not evaluated with --merge and some files were
        not found in regular commits, by default %(prog)s searches for these
        files again in the merge commits.
        This option disables this retry, so files found only in merge commits
        will not have their timestamp updated.
        """,
    )

    parser.add_argument(
        "--no-directories",
        "-D",
        dest="dirs",
        default=True,
        action="store_false",
        help="""
        Do not update directory timestamps.
        By default, use the time of its most recently created, renamed or deleted file.
        Note that just modifying a file will NOT update its directory time.
        """,
    )

    parser.add_argument(
        "--test",
        "-t",
        default=False,
        action="store_true",
        help="Test run: do not actually update any file timestamp.",
    )

    parser.add_argument(
        "--commit-time",
        "-c",
        dest="commit_time",
        default=False,
        action="store_true",
        help="Use commit time instead of author time.",
    )

    parser.add_argument(
        "--oldest-time",
        "-o",
        dest="reverse_order",
        default=False,
        action="store_true",
        help="""
        Update times based on the oldest, instead of the most recent commit of a file.
        This reverses the order in which the git log is processed to emulate a
        file "creation" date. Note this will be inaccurate for files deleted and
        re-created at later dates.
        """,
    )

    parser.add_argument(
        "--skip-older-than",
        metavar="SECONDS",
        type=int,
        help="""
        Ignore files that are currently older than %(metavar)s.
        Useful in workflows that assume such files already have a correct timestamp,
        as it may improve performance by processing fewer files.
        """,
    )

    parser.add_argument(
        "--skip-older-than-commit",
        "-N",
        default=False,
        action="store_true",
        help="""
        Ignore files older than the timestamp it would be updated to.
        Such files may be considered "original", likely in the author's repository.
        """,
    )

    parser.add_argument(
        "--unique-times",
        default=False,
        action="store_true",
        help="""
        Set the microseconds to a unique value per commit.
        Allows telling apart changes that would otherwise have identical timestamps,
        as git's time accuracy is in seconds.
        """,
    )

    parser.add_argument(
        "pathspec",
        nargs="*",
        metavar="PATHSPEC",
        help="""
        Only modify paths matching %(metavar)s, relative to current directory.
        By default, update all but untracked files and submodules.
        """,
    )

    parser.add_argument(
        "--version",
        "-V",
        action="version",
        version="%(prog)s version {version}".format(version=get_version()),
    )

    args_ = parser.parse_args()
    if args_.verbose:
        # -v => DEBUG, -vv (or more) => TRACE
        args_.loglevel = max(logging.TRACE, logging.DEBUG // args_.verbose)
    args_.debug = args_.loglevel <= logging.DEBUG
    return args_


def get_version(version=__version__):
    """Return the version string shown by --version.

    Release versions are returned unchanged. For "+dev" versions, the output
    of `git describe --tags` on the script's own directory is used instead,
    falling back to "<version>-unknown" when that is not available.
    """
    if not version.endswith("+dev"):
        return version
    try:
        cwd = os.path.dirname(os.path.realpath(__file__))
        return Git(cwd=cwd, errors=False).describe().lstrip("v")
    except (Git.Error, OSError):
        # Git.Error: script is not in a repository, or it has no tags.
        # OSError: the git executable itself could not be run. This must not
        # crash argument parsing (and thus --help) with a traceback.
        return "-".join((version, "unknown"))


# Helper functions ############################################################


def setup_logging():
    """Add TRACE logging level and corresponding method, return the root logger"""
    logging.TRACE = TRACE = logging.DEBUG // 2
    logging.Logger.trace = lambda _, m, *a, **k: _.log(TRACE, m, *a, **k)
    return logging.getLogger()


def normalize(path):
    r"""Normalize paths from git, handling non-ASCII characters.

    Git stores paths as UTF-8 normalization form C.
    If path contains non-ASCII or non-printable characters, git outputs the UTF-8
    in octal-escaped notation, escaping double-quotes and backslashes, and then
    double-quoting the whole path.
    https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath

    This function reverts this encoding, so:
    normalize(r'"Back\\slash_double\"quote_a\303\247a\303\255"') =>
        r'Back\slash_double"quote_açaí')

    Paths with invalid UTF-8 encoding, such as single 0x80-0xFF bytes (e.g, from
    Latin1/Windows-1251 encoding) are decoded using surrogate escape, the same
    method used by Python for filesystem paths. So 0xE6 ("æ" in Latin1, r'\\346'
    from Git) is decoded as "\udce6". See https://peps.python.org/pep-0383/ and
    https://vstinner.github.io/painful-history-python-filesystem-encoding.html

    Also see notes on `windows/non-ascii-paths.txt` about path encodings on
    non-UTF-8 platforms and filesystems.
    """
    if path and path[0] == '"':
        # Python 2: path = path[1:-1].decode("string-escape")
        # Python 3: https://stackoverflow.com/a/46650050/624066
        path = (
            path[1:-1]  # Remove enclosing double quotes
            .encode("latin1")  # Convert to bytes, required by 'unicode-escape'
            .decode("unicode-escape")  # Perform the actual octal-escaping decode
            .encode("latin1")  # 1:1 mapping to bytes, UTF-8 encoded
            .decode("utf8", "surrogateescape")  # Decode from UTF-8
        )
    if NORMALIZE_PATHS:
        # Make sure the slash matches the OS; for Windows we need a backslash
        path = os.path.normpath(path)
    return path


def dummy(*_args, **_kwargs):
    """No-op function used in dry-run tests"""


def touch(path, mtime):
    """The actual mtime update"""
    os.utime(path, (mtime, mtime), **UTIME_KWS)


def touch_ns(path, mtime_ns):
    """The actual mtime update, using nanoseconds for unique timestamps"""
    os.utime(path, None, ns=(mtime_ns, mtime_ns), **UTIME_KWS)


def isodate(secs: int):
    """Format a timestamp in seconds as local time 'YYYY-MM-DD HH:MM:SS'"""
    # time.localtime() accepts floats, but discards fractional part
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(secs))


def isodate_ns(ns: int):
    """Format a timestamp in nanoseconds as local time, ISO format with a space"""
    # for integers fromtimestamp() is equivalent and ~16% slower than isodate()
    return datetime.datetime.fromtimestamp(ns / 1000000000).isoformat(sep=" ")


def get_mtime_ns(secs: int, idx: int):
    """Return `secs` as nanoseconds, with `idx` (mod 10**6) as its microseconds"""
    # Time resolution for filesystems and functions:
    # ext-4 and other POSIX filesystems: 1 nanosecond
    # NTFS (Windows default): 100 nanoseconds
    # datetime.datetime() (due to 64-bit float epoch): 1 microsecond
    us = idx % 1000000  # 10**6
    return 1000 * (1000000 * secs + us)


def get_mtime_path(path):
    """Return the current mtime of `path`, in (float) seconds.

    Symlinks are not followed: touch() updates the link itself whenever the
    platform allows it, and links are never processed when it does not, so
    the relevant timestamp is always the one of the path itself.
    """
    return os.lstat(path).st_mtime


def get_mtime_path_ns(path):
    """Return the current mtime of `path`, in (integer) nanoseconds.

    Counterpart of get_mtime_path() for timestamps created by get_mtime_ns().
    """
    return os.lstat(path).st_mtime_ns


# Git class and parse_log(), the heart of the script ##########################


class Git:
    """Thin wrapper around the git executable.

    On creation it runs `git rev-parse` to set `workdir` and `gitdir`, raising
    Git.Error if that command fails.
    """

    def __init__(self, workdir=None, gitdir=None, cwd=None, errors=True):
        """Build the base git command line and discover the repository dirs.

        workdir, gitdir, cwd: passed to git as --work-tree, --git-dir and -C
        errors: if False, git's stderr is discarded
        """
        self.gitcmd = ["git"]
        self.errors = errors
        self._proc = None
        if workdir:
            self.gitcmd.extend(("--work-tree", workdir))
        if gitdir:
            self.gitcmd.extend(("--git-dir", gitdir))
        if cwd:
            self.gitcmd.extend(("-C", cwd))
        self.workdir, self.gitdir = self._get_repo_dirs()

    def ls_files(self, paths: list = None):
        """Yield the normalized paths printed by `git ls-files --full-name`"""
        return (normalize(_) for _ in self._run("ls-files --full-name", paths))

    def ls_dirty(self, force=False):
        """Yield normalized paths from `git status --porcelain`.

        Untracked entries ("??") are never included. With `force`, only
        entries whose first status column is R or A, or whose second column
        is D, are included. For "old -> new" entries the new path is used.
        """
        return (
            normalize(_[3:].split(" -> ", 1)[-1])
            for _ in self._run("status --porcelain")
            if _[:2] != "??" and (not force or (_[0] in ("R", "A") or _[1] == "D"))
        )

    def log(
        self,
        merge=False,
        first_parent=False,
        commit_time=False,
        reverse_order=False,
        paths: list = None,
    ):
        """Yield the lines of `git whatchanged`, one timestamp line per commit.

        The timestamp is the commit time if `commit_time`, else the author
        time. The other flags add -m, --first-parent and --reverse.
        """
        cmd = "whatchanged --pretty={}".format("%ct" if commit_time else "%at")
        if merge:
            cmd += " -m"
        if first_parent:
            cmd += " --first-parent"
        if reverse_order:
            cmd += " --reverse"
        return self._run(cmd, paths)

    def describe(self):
        """Return the first output line of `git describe --tags`"""
        return self._run("describe --tags", check=True)[0]

    def terminate(self):
        """Terminate the process started by the last streaming _run(), if any"""
        if self._proc is None:
            return
        try:
            self._proc.terminate()
        except OSError:
            # Avoid errors on OpenBSD
            pass

    def _get_repo_dirs(self):
        """Yield the normalized work tree root and absolute git dir, in order"""
        return (
            os.path.normpath(_)
            for _ in self._run(
                "rev-parse --show-toplevel --absolute-git-dir", check=True
            )
        )

    def _run(self, cmdstr: str, paths: list = None, output=True, check=False):
        """Run a git subcommand.

        cmdstr: subcommand and its options, split with shlex
        paths: optional pathspecs, appended after a "--" separator
        output: if False, return only the exit status of the command
        check: if True, wait for the command and return its output as a list
            of lines, raising Git.Error on a non-zero exit status

        With the defaults, the command is started in the background and a
        generator of its right-stripped output lines is returned.
        """
        cmdlist = self.gitcmd + shlex.split(cmdstr)
        if paths:
            cmdlist.append("--")
            cmdlist.extend(paths)
        popen_args = dict(universal_newlines=True, encoding="utf8")
        if not self.errors:
            popen_args["stderr"] = subprocess.DEVNULL
        log.trace("Executing: %s", " ".join(cmdlist))
        if not output:
            return subprocess.call(cmdlist, **popen_args)
        if check:
            try:
                stdout: str = subprocess.check_output(cmdlist, **popen_args)
                return stdout.splitlines()
            except subprocess.CalledProcessError as e:
                # Chain the original error so its context is not lost
                raise self.Error(e.returncode, e.cmd, e.output, e.stderr) from e
        self._proc = subprocess.Popen(cmdlist, stdout=subprocess.PIPE, **popen_args)
        return (_.rstrip() for _ in self._proc.stdout)

    def __del__(self):
        self.terminate()

    class Error(subprocess.CalledProcessError):
        """Error from git executable"""


def parse_log(filelist, dirlist, stats, git, merge=False, filterlist=None):
    """Read the git log and update the mtime of listed files and directories.

    filelist: set of worktree-relative file paths still to be processed.
        Each file is removed, and touched, the first time it shows up in the log.
    dirlist: set of directory paths still to be processed. A directory is
        removed, and touched, the first time a file is Added to or Deleted
        from it, unless --no-directories was given.
    stats: dict of counters, updated in place
    git: the Git instance used to read the log
    merge: whether to include merge commits (-m)
    filterlist: optional list of paths to restrict the log to

    Stops reading, and terminates git, as soon as stats["files"] drops to zero.
    """
    mtime = 0
    datestr = isodate(0)

    # `mtime` is in nanoseconds when --unique-times is used, seconds otherwise.
    # The current file time must be read in the same unit to be comparable.
    current_mtime = get_mtime_path_ns if args.unique_times else get_mtime_path

    for line in git.log(
        merge, args.first_parent, args.commit_time, args.reverse_order, filterlist
    ):
        stats["loglines"] += 1

        # Blank line between Date and list of files
        if not line:
            continue

        # Date line
        if line[0] != ":":  # Faster than `not line.startswith(':')`
            stats["commits"] += 1
            mtime = int(line)
            if args.unique_times:
                mtime = get_mtime_ns(mtime, stats["commits"])
            if args.debug:
                datestr = isodate(mtime)
            continue

        # File line: three tokens if it describes a renaming, otherwise two
        tokens = line.split("\t")

        # Possible statuses:
        # M: Modified (content changed)
        # A: Added (created)
        # D: Deleted
        # T: Type changed: to/from regular file, symlinks, submodules
        # R099: Renamed (moved), with % of unchanged content. 100 = pure rename
        # Not possible in log: C=Copied, U=Unmerged, X=Unknown, B=pairing Broken
        status = tokens[0].split(" ")[-1]
        file = tokens[-1]

        # Handles non-ASCII chars and OS path separator
        file = normalize(file)

        def do_file():
            # Paths from git are relative to the worktree root, not to the
            # current directory, so always operate on the joined path.
            fullpath = os.path.join(git.workdir, file)
            if args.skip_older_than_commit:
                try:
                    if current_mtime(fullpath) <= mtime:
                        stats["skip"] += 1
                        return
                except OSError as e:
                    log.error("ERROR: %s: %s", e, file)
                    stats["errors"] += 1
                    return
            if args.debug:
                log.debug(
                    "%d\t%d\t%d\t%s\t%s",
                    stats["loglines"],
                    stats["commits"],
                    stats["files"],
                    datestr,
                    file,
                )
            try:
                touch(fullpath, mtime)
                stats["touches"] += 1
            except Exception as e:
                log.error("ERROR: %s: %s", e, file)
                stats["errors"] += 1

        def do_dir():
            if args.debug:
                log.debug(
                    "%d\t%d\t-\t%s\t%s",
                    stats["loglines"],
                    stats["commits"],
                    datestr,
                    "{}/".format(dirname or "."),
                )
            try:
                touch(os.path.join(git.workdir, dirname), mtime)
                stats["dirtouches"] += 1
            except Exception as e:
                log.error("ERROR: %s: %s", e, dirname)
                stats["direrrors"] += 1

        if file in filelist:
            stats["files"] -= 1
            filelist.remove(file)
            do_file()

        if args.dirs and status in ("A", "D"):
            dirname = os.path.dirname(file)
            if dirname in dirlist:
                dirlist.remove(dirname)
                do_dir()

        # All files done?
        if not stats["files"]:
            git.terminate()
            return


# Main Logic ##################################################################


def main():
    """Run the script; the return value is meant to be passed to sys.exit().

    Returns the errno of a failed --cwd change, the exit status of git if
    the repository could not be found, or None otherwise.
    """
    start = time.time()  # yes, Wall time. CPU time is not realistic for users.
    stats = {
        _: 0
        for _ in (
            "loglines",
            "commits",
            "touches",
            "skip",
            "errors",
            "dirtouches",
            "direrrors",
        )
    }

    logging.basicConfig(level=args.loglevel, format="%(message)s")
    log.trace("Arguments: %s", args)

    # First things first: Where and Who are we?
    if args.cwd:
        log.debug("Changing directory: %s", args.cwd)
        try:
            os.chdir(args.cwd)
        except OSError as e:
            log.critical(e)
            return e.errno
    # Using both os.chdir() and `git -C` is redundant, but might prevent side effects
    # `git -C` alone could be enough if we make sure that:
    # - all paths, including args.pathspec, are processed by git: ls-files, rev-parse
    # - touch() / os.utime() path argument is always prepended with git.workdir
    try:
        git = Git(workdir=args.workdir, gitdir=args.gitdir, cwd=args.cwd)
    except Git.Error as e:
        # Not in a git repository, and git already informed user on stderr. So we just...
        return e.returncode

    # Get the files managed by git and build file list to be processed
    if UPDATE_SYMLINKS and not args.skip_older_than:
        filelist = set(git.ls_files(args.pathspec))
    else:
        filelist = set()
        for path in git.ls_files(args.pathspec):
            fullpath = os.path.join(git.workdir, path)

            # Symlink (to file, to dir or broken - git handles the same way)
            if not UPDATE_SYMLINKS and os.path.islink(fullpath):
                log.warning(
                    "WARNING: Skipping symlink, no OS support for updates: %s", path
                )
                continue

            # skip files which are older than given threshold
            if args.skip_older_than:
                try:
                    age = start - get_mtime_path(fullpath)
                except OSError:
                    # Tracked but not readable from the filesystem, such as an
                    # uncommitted deletion. Keep it listed: deletions are
                    # removed from the list right below, and anything else is
                    # reported when its update fails.
                    pass
                else:
                    if age > args.skip_older_than:
                        continue

            # Always add files relative to worktree root
            filelist.add(path)

    # If --force, silently ignore uncommitted deletions (not in the filesystem)
    # and renames / additions (will not be found in log anyway)
    if args.force:
        filelist -= set(git.ls_dirty(force=True))
    # Otherwise, ignore any dirty files
    else:
        dirty = set(git.ls_dirty())
        if dirty:
            log.warning(
                "WARNING: Modified files in the working directory were ignored."
                "\nTo include such files, commit your changes or use --force."
            )
            filelist -= dirty

    # Build dir list to be processed
    dirlist = set(os.path.dirname(_) for _ in filelist) if args.dirs else set()

    stats["totalfiles"] = stats["files"] = len(filelist)
    log.info("{0:,} files to be processed in work dir".format(stats["totalfiles"]))

    if not filelist:
        # Nothing to do. Exit silently and without errors, just like git does
        return

    # Process the log until all files are 'touched'
    log.debug("Line #\tLog #\tF.Left\tModification Time\tFile Name")
    parse_log(filelist, dirlist, stats, git, args.merge, args.pathspec)

    # Missing files
    if filelist:
        # Try to find them in merge logs, if not done already
        # (usually HUGE, thus MUCH slower!)
        if args.missing and not args.merge:
            filterlist = list(filelist)
            missing = len(filterlist)
            log.info(
                "{0:,} files not found in log, trying merge commits".format(missing)
            )
            for i in range(0, missing, STEPMISSING):
                parse_log(
                    filelist,
                    dirlist,
                    stats,
                    git,
                    merge=True,
                    filterlist=filterlist[i : i + STEPMISSING],
                )

        # Still missing some?
        for file in filelist:
            log.warning("WARNING: not found in the log: %s", file)

    # Final statistics
    # Suggestion: use git-log --before=mtime to brag about skipped log entries
    def log_info(msg, *a, width=13):
        ifmt = "{:%d,}" % (width,)  # not using 'n' for consistency with ffmt
        ffmt = "{:%d,.2f}" % (width,)
        # %-formatting lacks a thousand separator, must pre-render with .format()
        log.info(msg.replace("%d", ifmt).replace("%f", ffmt).format(*a))

    log_info(
        "Statistics:\n%f seconds\n%d log lines processed\n%d commits evaluated",
        time.time() - start,
        stats["loglines"],
        stats["commits"],
    )

    if args.dirs:
        if stats["direrrors"]:
            log_info("%d directory update errors", stats["direrrors"])
        log_info("%d directories updated", stats["dirtouches"])

    if stats["touches"] != stats["totalfiles"]:
        log_info("%d files", stats["totalfiles"])
    if stats["skip"]:
        log_info("%d files skipped", stats["skip"])
    if stats["files"]:
        log_info("%d files missing", stats["files"])
    if stats["errors"]:
        log_info("%d file update errors", stats["errors"])

    log_info("%d files updated", stats["touches"])

    if args.test:
        log.info("TEST RUN - No files modified!")


# Keep only essential, global assignments here. Any other logic must be in main()
log = setup_logging()
args = parse_args()

# Set the actual touch() and other functions based on command-line arguments
if args.unique_times:
    touch = touch_ns
    isodate = isodate_ns

# Make sure this is always set last to ensure --test behaves as intended
if args.test:
    touch = dummy

# UI done, it's showtime!
try:
    sys.exit(main())
except KeyboardInterrupt:
    log.info("\nAborting")
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    os.kill(os.getpid(), signal.SIGINT)
Findings
✓ No findings reported for this file.