1256809Sian# -*- coding: utf-8 -*-
2256809Sian# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3256809Sian# See https://llvm.org/LICENSE.txt for license information.
4256809Sian# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
""" This module is responsible for capturing the compiler invocations of any
build process. The result of that should be a compilation database.
7256809Sian
8256809SianThis implementation is using the LD_PRELOAD or DYLD_INSERT_LIBRARIES
9256809Sianmechanisms provided by the dynamic linker. The related library is implemented
10256809Sianin C language and can be found under 'libear' directory.
11256809Sian
12256809SianThe 'libear' library is capturing all child process creation and logging the
13256809Sianrelevant information about it into separate files in a specified directory.
14256809SianThe parameter of this process is the output directory name, where the report
15256809Sianfiles shall be placed. This parameter is passed as an environment variable.
16256809Sian
17256809SianThe module also implements compiler wrappers to intercept the compiler calls.
18256809Sian
The module implements the build command execution and the post-processing of
the output files, which are condensed into a compilation database. """
21256809Sian
22256809Sianimport sys
23256809Sianimport os
24256809Sianimport os.path
25256809Sianimport re
26256809Sianimport itertools
27256809Sianimport json
28256809Sianimport glob
29256809Sianimport logging
30256809Sianfrom libear import build_libear, TemporaryDirectory
31257393Sianfrom libscanbuild import command_entry_point, compiler_wrapper, \
32256809Sian    wrapper_environment, run_command, run_build
33256809Sianfrom libscanbuild import duplicate_check
34256809Sianfrom libscanbuild.compilation import split_command
35256809Sianfrom libscanbuild.arguments import parse_args_for_intercept_build
36256809Sianfrom libscanbuild.shell import encode, decode
37256809Sian
# Names exported by a star-import of this module: the interception logic
# itself and the two command line entry points.
__all__ = ['capture', 'intercept_build', 'intercept_compiler_wrapper']
39256809Sian
# Separator characters of the execution report format. These are the ASCII
# group, record and unit separator control characters: a report is closed
# by GS, its fields are divided by RS, and each command argument is closed
# by US.
GS = '\x1d'
RS = '\x1e'
US = '\x1f'

# Names of the compiler wrapper executables exported as CC and CXX.
COMPILER_WRAPPER_CC = 'intercept-cc'
COMPILER_WRAPPER_CXX = 'intercept-c++'
# File name extension of the execution reports (same as in ear.c).
TRACE_FILE_EXTENSION = '.cmd'
# Platforms where only the compiler wrappers can be used (no preload).
WRAPPER_ONLY_PLATFORMS = frozenset(['win32', 'cygwin'])
48256809Sian
49256809Sian
@command_entry_point
def intercept_build():
    """ Command line entry point of 'intercept-build'.

    Parses the command line arguments and hands them over to `capture`.

    :return: whatever `capture` returns for the parsed arguments. """

    parsed_args = parse_args_for_intercept_build()
    result = capture(parsed_args)
    return result
56256809Sian
57256809Sian
def capture(args):
    """ The entry point of build command interception.

    Runs the build command with the interception environment, collects the
    execution reports written into a temporary directory and writes the
    compilation database made from them into `args.cdb`.

    :param args:    the parsed command line arguments; `build` and `cdb`
                    are read, `append` is honoured when present,
    :return:        the value returned by `run_build` for the build. """

    def post_processing(commands):
        """ To make a compilation database, it needs to filter out commands
        which are not compiler calls. Needs to find the source file name
        from the arguments. And do shell escaping on the command.

        To support incremental builds, it is desired to read elements from
        an existing compilation database from a previous run. These elements
        shall be merged with the new elements.

        :param commands:    iterable of parsed execution reports,
        :return:            generator of unique compilation database
                            entries whose source file exists. """

        # create entries from the current run (one execution report might
        # produce zero or more entries)
        current = itertools.chain.from_iterable(
            format_entry(command) for command in commands)
        # read entries from previous run; this is loaded eagerly, before
        # the caller opens the very same file for writing
        if 'append' in args and args.append and os.path.isfile(args.cdb):
            with open(args.cdb) as handle:
                previous = iter(json.load(handle))
        else:
            previous = iter([])
        # filter out duplicate entries from both
        duplicate = duplicate_check(entry_hash)
        return (entry
                for entry in itertools.chain(previous, current)
                if os.path.exists(entry['file']) and not duplicate(entry))

    with TemporaryDirectory(prefix='intercept-') as tmp_dir:
        # run the build command
        environment = setup_environment(args, tmp_dir)
        exit_code = run_build(args.build, env=environment)
        # read the intercepted exec calls; the glob results already contain
        # the directory, so they must not be joined with it again
        trace_pattern = os.path.join(tmp_dir, '*' + TRACE_FILE_EXTENSION)
        trace_files = sorted(glob.iglob(trace_pattern))
        exec_traces = itertools.chain.from_iterable(
            parse_exec_trace(trace_file) for trace_file in trace_files)
        # do post processing
        entries = post_processing(exec_traces)
        # dump the compilation database
        with open(args.cdb, 'w+') as handle:
            json.dump(list(entries), handle, sort_keys=True, indent=4)
        return exit_code
100257393Sian
101256809Sian
def setup_environment(args, destination):
    """ Sets up the environment for the build command.

    It only prepares the environment variables; running the build command
    is left to the caller. The exec calls will be logged by the 'libear'
    preloaded library or by the 'wrapper' programs.

    :param args:        the parsed command line arguments;
                        `override_compiler` is read, `cc` is used when
                        present (defaults to 'cc'),
    :param destination: directory where the execution reports shall be
                        written (also passed to `build_libear`),
    :return:            a copy of the current process environment extended
                        with the variables of the interception. """

    c_compiler = args.cc if 'cc' in args else 'cc'

    # the preload library is built only when it is going to be used
    use_wrappers = args.override_compiler or is_preload_disabled(sys.platform)
    libear_path = None if use_wrappers else build_libear(c_compiler,
                                                         destination)

    environment = dict(os.environ)
    environment.update({'INTERCEPT_BUILD_TARGET_DIR': destination})

    if not libear_path:
        logging.debug('intercept gonna use compiler wrappers')
        environment.update(wrapper_environment(args))
        environment.update({
            'CC': COMPILER_WRAPPER_CC,
            'CXX': COMPILER_WRAPPER_CXX
        })
    elif sys.platform == 'darwin':
        logging.debug('intercept gonna preload libear on OSX')
        environment.update({
            'DYLD_INSERT_LIBRARIES': libear_path,
            'DYLD_FORCE_FLAT_NAMESPACE': '1'
        })
    else:
        logging.debug('intercept gonna preload libear on UNIX')
        environment.update({'LD_PRELOAD': libear_path})

    return environment
136
137
@command_entry_point
def intercept_compiler_wrapper():
    """ Command line entry point of `intercept-cc` and `intercept-c++`.

    Delegates to `compiler_wrapper` with the interception specific
    implementation as callback.

    :return: whatever `compiler_wrapper` returns. """

    result = compiler_wrapper(intercept_compiler_wrapper_impl)
    return result
143
144
def intercept_compiler_wrapper_impl(_, execution):
    """ Write the execution report of a wrapped compiler call.

    The report goes into a file named after the current process id, placed
    in the directory given by the INTERCEPT_BUILD_TARGET_DIR environment
    variable. A missing directory setting or an IO error is only logged
    as a warning.

    :param _:           unused,
    :param execution:   the execution object handed to `write_exec_trace`,
    :return:            None. """

    warning_format = 'execution report might be incomplete: %s'

    report_dir = os.environ.get('INTERCEPT_BUILD_TARGET_DIR')
    if report_dir:
        # one report file per process, the same file is appended on reuse
        try:
            report_name = str(os.getpid()) + TRACE_FILE_EXTENSION
            report_path = os.path.join(report_dir, report_name)
            logging.debug('writing execution report to: %s', report_path)
            write_exec_trace(report_path, execution)
        except IOError:
            logging.warning(warning_format, 'io problem')
    else:
        logging.warning(warning_format, 'missing target directory')
165
166
def write_exec_trace(filename, entry):
    """ Append one execution report to the given file.

    This method shall be in sync with the execution report writer of the
    interception library. A report is a sequence of fields separated by RS
    and closed by GS. The fields are: the process id (written twice, the
    second one takes the place of the parent process id), the 'wrapper'
    literal, the working directory and the command. Every argument of the
    command is closed by US. The report is written UTF-8 encoded.

    :param filename:    path to the output execution trace file,
    :param entry:       the Execution object to append to that file
                        (its `pid`, `cwd` and `cmd` attributes are used). """

    with open(filename, 'ab') as trace_file:
        process_id = str(entry.pid)
        arguments = US.join(entry.cmd) + US
        fields = [process_id, process_id, 'wrapper', entry.cwd, arguments]
        report = RS.join(fields) + GS
        trace_file.write(report.encode('utf-8'))
181
182
def parse_exec_trace(filename):
    """ Parse the file generated by the 'libear' preloaded library.

    Given filename points to a file which contains the basic report
    generated by the interception library or wrapper command. A single
    report file _might_ contain multiple process creation info.

    Every report is closed by GS, its fields are separated by RS and each
    argument of the command is closed by US.

    The content is decoded as UTF-8 (that is what `write_exec_trace`
    emits) and not with the locale dependent default encoding. The file
    is read and closed before the first entry is produced, so no file
    handle is kept open while the generator is suspended. Reports with
    fewer than five fields (eg.: a truncated file) are skipped with a
    warning.

    :param filename:    path to the execution report file,
    :return:            generator of dictionaries with the 'pid', 'ppid',
                        'function', 'directory' and 'command' keys, where
                        'command' is the list of the arguments. """

    logging.debug('parse exec trace file: %s', filename)
    with open(filename, 'rb') as handler:
        content = handler.read().decode('utf-8')

    for group in content.split(GS):
        if not group:
            # the closing GS leaves an empty string behind
            continue
        records = group.split(RS)
        if len(records) < 5:
            logging.warning('skip malformed execution report in: %s',
                            filename)
            continue
        yield {
            'pid': records[0],
            'ppid': records[1],
            'function': records[2],
            'directory': records[3],
            # every argument is closed by US, drop the empty last element
            'command': records[4].split(US)[:-1]
        }
202
203
def format_entry(exec_trace):
    """ Turn a parsed execution report into compilation database entries.

    Nothing is generated when `split_command` does not recognise the
    command; otherwise one entry is generated for each source file of it.

    :param exec_trace:  dictionary with the 'command' (argument list) and
                        'directory' keys,
    :return:            generator of dictionaries with the 'directory',
                        'command' and 'file' keys. """

    def normalized(working_dir, name):
        """ Create normalized absolute path from input filename. """
        if os.path.isabs(name):
            return os.path.normpath(name)
        return os.path.normpath(os.path.join(working_dir, name))

    arguments = exec_trace['command']
    logging.debug('format this command: %s', arguments)
    compilation = split_command(arguments)
    if not compilation:
        return

    for source in compilation.files:
        # only the language is kept, not the name of the real compiler
        if compilation.compiler == 'c++':
            compiler = 'c++'
        else:
            compiler = 'cc'
        command = [compiler, '-c'] + compilation.flags + [source]
        logging.debug('formated as: %s', command)
        working_dir = exec_trace['directory']
        yield {
            'directory': working_dir,
            'command': encode(command),
            'file': normalized(working_dir, source)
        }
224
225
def is_preload_disabled(platform):
    """ Library-based interposition will fail silently if SIP is enabled,
    so this should be detected. You can detect whether SIP is enabled on
    Darwin by checking whether (1) there is a binary called 'csrutil' in
    the path and, if so, (2) whether the output of executing 'csrutil status'
    contains 'System Integrity Protection status: enabled'.

    Platforms listed in WRAPPER_ONLY_PLATFORMS are always reported as
    preload disabled. Any other platform than Darwin is reported as
    preload enabled.

    :param platform: name of the platform (returned by sys.platform),
    :return: True if library preload will fail by the dynamic linker. """

    if platform in WRAPPER_ONLY_PLATFORMS:
        return True
    if platform != 'darwin':
        return False

    command = ['csrutil', 'status']
    pattern = re.compile(r'System Integrity Protection status:\s+enabled')
    try:
        return any(pattern.match(line) for line in run_command(command))
    except Exception:
        # Best effort: when the status can't be queried (eg.: 'csrutil'
        # could not be executed) the preload is assumed to work. Only
        # regular errors are swallowed, KeyboardInterrupt and SystemExit
        # are let through.
        return False
247
248
def entry_hash(entry):
    """ Compute the key which identifies a compilation database entry.

    Two entries with the same source file, directory and command (not
    counting the first word of the command) get the same key.

    :param entry:   dictionary with the 'file', 'directory' and 'command'
                    keys,
    :return:        the key as a string. """

    # The file name and the directory are reversed for faster lookup in
    # set.
    reversed_file = entry['file'][::-1]
    reversed_directory = entry['directory'][::-1]
    # On OS X the 'cc' and 'c++' compilers are wrappers for 'clang',
    # therefore both calls would be logged. To avoid this the key does
    # not contain the first word of the command.
    arguments = decode(entry['command'])[1:]

    parts = (reversed_file, reversed_directory, ' '.join(arguments))
    return '<>'.join(parts)
263