#!/usr/bin/env python3
#
# Copyright 2020, Data61, CSIRO (ABN 41 687 119 230)
#
# SPDX-License-Identifier: GPL-2.0-only
#

'''
A tool for determining provenance.

Occasionally one encounters a directory of source code that was derived from an upstream repository
with history either squashed or discarded. To pull in upstream changes, it is desirable to know what
commit the source code originated from. This script helps you determine that by looking for the
upstream commit with the smallest diff to the downstream files. Sample usage:

    whence.py -u https://github.com/torvalds/linux --upstream-subdir scripts/kconfig \
              -d https://github.com/seL4/seL4_tools --downstream-subdir kbuild-tool/kconfig
'''

import argparse
import os
import shutil
import subprocess
import sys
import tempfile


class GitRepo(object):
    '''
    A Git repository cloned into a fresh temporary directory.

    The clone lives in `self.tmp` and is removed when the object is destroyed.
    '''

    def __init__(self, url):
        '''
        Clone `url` into a new temporary directory.

        Raises subprocess.CalledProcessError if `git clone` fails.
        '''
        self.tmp = tempfile.mkdtemp()
        subprocess.check_call(['git', 'clone', url, self.tmp])

    def checkout(self, commit):
        '''
        Check out `commit` in the clone.

        Raises subprocess.CalledProcessError if `git checkout` fails.
        '''
        subprocess.check_call(['git', 'checkout', commit], cwd=self.tmp)

    def log(self, subdir):
        '''
        Return an iterator over the hashes (as text) of the commits that touched `subdir`, oldest
        first.

        Raises subprocess.CalledProcessError if `git log` fails.
        '''
        # Ask for text output: check_output returns bytes by default on Python 3, which would leak
        # b'...' into messages that format the hash.
        output = subprocess.check_output(['git', 'log', '--pretty=tformat:%H', '.'],
                                         cwd=os.path.join(self.tmp, subdir),
                                         universal_newlines=True)
        # Reverse the commit list to test them chronologically, just for consistency.
        return reversed(output.split())

    def __del__(self):
        # `tmp` is missing if mkdtemp itself failed, and the directory may already be gone; neither
        # case should raise from a finaliser.
        tmp = getattr(self, 'tmp', None)
        if tmp is not None:
            shutil.rmtree(tmp, ignore_errors=True)


def main(argv):
    '''
    Find the upstream commit whose files differ least from the downstream files.

    `argv` is the full argument vector, including the program name at index 0. Progress and the
    result are written to stderr. Returns 0 on success and -1 if a Git operation fails, the
    downstream subdirectory does not exist, or no candidate commit could be compared.
    '''
    parser = argparse.ArgumentParser(description='locate a Git commit in an upstream project from '
                                     'which downstream source was derived')
    parser.add_argument('--upstream', '-u', required=True, help='URL of upstream repository to '
                        'search')
    parser.add_argument('--upstream-subdir', default='', help='subdirectory within upstream '
                        'repository to consider (root by default)')
    parser.add_argument('--downstream', '-d', required=True, help='URL of downstream repository '
                        'to analyse')
    parser.add_argument('--downstream-subdir', default='', help='subdirectory within downstream '
                        'repository to analyse (root by default)')
    parser.add_argument('--downstream-commit', help='commit in downstream repository to consider '
                        '(HEAD of master by default)')
    opts = parser.parse_args(argv[1:])

    sys.stderr.write('Cloning %s into a temporary directory...\n' % opts.upstream)
    try:
        upstream = GitRepo(opts.upstream)
    except subprocess.CalledProcessError:
        return -1

    sys.stderr.write('Cloning %s into a temporary directory...\n' % opts.downstream)
    try:
        downstream = GitRepo(opts.downstream)
    except subprocess.CalledProcessError:
        return -1

    if opts.downstream_commit is not None:
        sys.stderr.write('Updating downstream to %s...\n' % opts.downstream_commit)
        try:
            downstream.checkout(opts.downstream_commit)
        except subprocess.CalledProcessError:
            return -1

    # These paths do not change between candidate commits, so compute them once.
    src = os.path.join(upstream.tmp, opts.upstream_subdir)
    dst = os.path.join(downstream.tmp, opts.downstream_subdir)

    # Without the downstream directory every diff fails with empty output, which would make every
    # commit look like a perfect match.
    if not os.path.isdir(dst):
        sys.stderr.write('Downstream subdirectory %s does not exist\n' % opts.downstream_subdir)
        return -1

    sys.stderr.write('Retrieving candidate commit list...\n')
    try:
        commits = list(upstream.log(opts.upstream_subdir))
    except subprocess.CalledProcessError:
        return -1
    sys.stderr.write('%d commits to consider\n' % len(commits))

    # We now have everything we need. Examine each commit, tracking the smallest diff we've seen.

    min_diff = None
    min_commit = None

    for index, commit in enumerate(commits, 1):
        sys.stderr.write('Considering %s (%d of %d)...\n' % (commit, index, len(commits)))
        try:
            upstream.checkout(commit)
        except subprocess.CalledProcessError:
            return -1

        # A commit that removed the subdirectory shows up in the log but leaves nothing to compare;
        # diff would fail with empty output and score it as the best match.
        if not os.path.isdir(src):
            sys.stderr.write('Skipping this commit: upstream subdirectory is absent\n')
            continue

        p = subprocess.Popen(['diff', src, dst], stdout=subprocess.PIPE)
        stdout, _ = p.communicate()
        # Count lines on the raw bytes: the compared sources need not be valid UTF-8.
        diff = stdout.count(b'\n') + 1
        sys.stderr.write('This commit has a difference metric of %d\n' % diff)

        if min_diff is None or min_diff > diff:
            min_diff = diff
            min_commit = commit

    if min_commit is None:
        sys.stderr.write('No candidate commits could be compared\n')
        return -1

    sys.stderr.write('The most likely commit is %s\n' % min_commit)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))