1#!/usr/bin/env python3
2#
3# Copyright 2020, Data61, CSIRO (ABN 41 687 119 230)
4#
5# SPDX-License-Identifier: GPL-2.0-only
6#
7
8'''
9A tool for determining provenance.
10
11Occasionally one encounters a directory of source code that was derived from an upstream repository
12with history either squashed or discarded. To pull in upstream changes, it is desirable to know what
13commit the source code originated from. This script helps you determine that by looking for the
14upstream commit with the smallest diff to the downstream files. Sample usage:
15
16    whence.py -u https://github.com/torvalds/linux --upstream-subdir scripts/kconfig \
17              -d https://github.com/seL4/seL4_tools --downstream-subdir kbuild-tool/kconfig
18'''
19
20import argparse
21import os
22import shutil
23import subprocess
24import sys
25import tempfile
26
27
class GitRepo(object):
    '''
    A throwaway clone of a Git repository that lives in a temporary directory.

    The clone is made on construction (into `self.tmp`) and the temporary directory is removed
    again when the object is finalised.
    '''

    def __init__(self, url):
        '''
        Clone `url` into a freshly created temporary directory.

        Raises subprocess.CalledProcessError if `git clone` exits with a non-zero status.
        '''
        self.tmp = tempfile.mkdtemp()
        subprocess.check_call(['git', 'clone', url, self.tmp])

    def checkout(self, commit):
        '''
        Run `git checkout <commit>` inside the clone.

        Raises subprocess.CalledProcessError if `git checkout` exits with a non-zero status.
        '''
        subprocess.check_call(['git', 'checkout', commit], cwd=self.tmp)

    def log(self, subdir):
        '''
        Return an iterator over the commit hashes `git log` reports for `subdir`, oldest first.

        `subdir` is interpreted relative to the root of the clone. The hashes are returned as
        text strings. Raises subprocess.CalledProcessError if `git log` exits with a non-zero
        status.
        '''
        # Ask for text output; without this check_output hands back bytes on Python 3 and every
        # hash would end up as a bytes object.
        output = subprocess.check_output(['git', 'log', '--pretty=tformat:%H', '.'],
                                         cwd=os.path.join(self.tmp, subdir),
                                         universal_newlines=True)
        # Reverse the commit list to test them chronologically, just for consistency.
        return reversed(output.split())

    def __del__(self):
        # The attribute is missing if mkdtemp itself failed, and the directory may already be
        # gone; a finaliser should stay quiet in both situations.
        tmp = getattr(self, 'tmp', None)
        if tmp is not None:
            shutil.rmtree(tmp, ignore_errors=True)
43
44
def _diff_metric(src, dst):
    '''
    Return the number of lines printed by `diff src dst`, or None if diff itself failed.
    '''
    proc = subprocess.Popen(['diff', src, dst], stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    # diff exits with 0 (no differences) or 1 (differences found) when it worked. Any other
    # status means trouble, e.g. a directory that does not exist in this commit; its empty output
    # must not be mistaken for a perfect match.
    if proc.returncode not in (0, 1):
        return None
    # The output is bytes. Count lines without decoding so that files in arbitrary encodings
    # cannot trip us up.
    return len(stdout.splitlines())


def main(argv):
    '''
    Find the upstream commit whose files differ least from the downstream files.

    `argv` is the full argument vector including the program name. Progress and the result are
    written to stderr. Returns 0 on success and -1 if a Git command fails or no upstream commit
    could be compared. Invalid command line arguments cause argparse to raise SystemExit.
    '''
    parser = argparse.ArgumentParser(description='locate a Git commit in an upstream project from '
                                     'which downstream source was derived')
    parser.add_argument('--upstream', '-u', required=True, help='URL of upstream repository to '
                        'search')
    parser.add_argument('--upstream-subdir', default='', help='subdirectory within upstream '
                        'repository to consider (root by default)')
    parser.add_argument('--downstream', '-d', required=True, help='URL of downstream repository '
                        'to analyse')
    parser.add_argument('--downstream-subdir', default='', help='subdirectory within downstream '
                        'repository to analyse (root by default)')
    parser.add_argument('--downstream-commit', help='commit in downstream repository to consider '
                        '(HEAD of master by default)')
    opts = parser.parse_args(argv[1:])

    sys.stderr.write('Cloning %s into a temporary directory...\n' % opts.upstream)
    try:
        upstream = GitRepo(opts.upstream)
    except subprocess.CalledProcessError:
        return -1

    sys.stderr.write('Cloning %s into a temporary directory...\n' % opts.downstream)
    try:
        downstream = GitRepo(opts.downstream)
    except subprocess.CalledProcessError:
        return -1

    if opts.downstream_commit is not None:
        sys.stderr.write('Updating downstream to %s...\n' % opts.downstream_commit)
        try:
            downstream.checkout(opts.downstream_commit)
        except subprocess.CalledProcessError:
            return -1

    sys.stderr.write('Retrieving candidate commit list...\n')
    try:
        # Normalise hashes to text so the messages below never show a bytes repr, whichever form
        # the log call hands back.
        commits = [c.decode('ascii') if isinstance(c, bytes) else c
                   for c in upstream.log(opts.upstream_subdir)]
    except subprocess.CalledProcessError:
        return -1
    sys.stderr.write('%d commits to consider\n' % len(commits))

    if not commits:
        sys.stderr.write('No candidate commits found\n')
        return -1

    # We now have everything we need. Examine each commit, tracking the smallest diff we've seen.

    # The directories being compared do not change between iterations; only their contents do.
    src = os.path.join(upstream.tmp, opts.upstream_subdir)
    dst = os.path.join(downstream.tmp, opts.downstream_subdir)

    min_diff = None
    min_commit = None

    for position, commit in enumerate(commits, 1):
        sys.stderr.write('Considering %s (%d of %d)...\n' % (commit, position, len(commits)))
        try:
            upstream.checkout(commit)
        except subprocess.CalledProcessError:
            return -1

        diff = _diff_metric(src, dst)
        if diff is None:
            sys.stderr.write('Skipping this commit because diff failed\n')
            continue
        sys.stderr.write('This commit has a difference metric of %d\n' % diff)

        # Strict comparison: on a tie the earliest commit is kept.
        if min_diff is None or min_diff > diff:
            min_diff = diff
            min_commit = commit

    if min_commit is None:
        sys.stderr.write('No commit could be compared against the downstream files\n')
        return -1

    sys.stderr.write('The most likely commit is %s\n' % min_commit)

    return 0
112
113
if __name__ == '__main__':
    # Run as a script: hand the full argument vector to main and use its result as exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)
116