1/*
2 * Copyright 2017, Data61
3 * Commonwealth Scientific and Industrial Research Organisation (CSIRO)
4 * ABN 41 687 119 230.
5 *
6 * This software may be distributed and modified according to the terms of
7 * the BSD 2-Clause license. Note that NO WARRANTY is provided.
8 * See "LICENSE_BSD2.txt" for details.
9 *
10 * @TAG(DATA61_BSD)
11 */
12#include <assert.h>
13#include "cfg.h"
14#include <clang-c/Index.h> /* -lclang */
15#include <errno.h>
16#include <getopt.h>
17#include "set.h"
18#include <stdbool.h>
19#include <stdio.h>
20#include <stdlib.h>
21#include <string.h>
22
23//#define DEBUG 1
24
25/* We want to implement a rewriter (or source-to-source translator). The
26 * standard way of doing this would be to sub-class RecursiveASTVisitor and go
27 * from there. After several attempts I gave up on getting either of the Clang
28 * C++ or Python bindings to compile and link and resorted to the below in C.
29 */
30
31/* Determine whether a given cursor is in our list of entities to never emit. */
32static bool is_blacklisted(set_t *blacklist, CXCursor cursor) {
33    CXString s = clang_getCursorSpelling(cursor);
34    const char *name = clang_getCString(s);
35    bool blacklisted = set_contains(blacklist, name);
36    clang_disposeString(s);
37    return blacklisted;
38}
39
40/* Dump a given cursor to the passed stream. */
41static void emit(FILE* stream, CXTranslationUnit tu, CXCursor cursor, set_t *attribs) {
42    /* Transform the cursor into a list of text tokens. */
43    CXSourceRange range = clang_getCursorExtent(cursor);
44    CXToken *tokens;
45    unsigned int tokens_sz;
46    clang_tokenize(tu, range, &tokens, &tokens_sz);
47
48    /* Bail out early if possible to reduce complexity in the follow on logic.
49     */
50    if (tokens_sz == 0) {
51        clang_disposeTokens(tu, tokens, tokens_sz);
52        return;
53    }
54
55    /* Now time to deal with Clang's quirks. */
56
57    /* Grab the last token. */
58    CXString cxlast = clang_getTokenSpelling(tu, tokens[tokens_sz - 1]);
59    const char *last = clang_getCString(cxlast);
60
61    /* Grab the first token. */
62    CXString cxfirst = clang_getTokenSpelling(tu, tokens[0]);
63    const char *first = clang_getCString(cxfirst);
64
65    enum CXCursorKind kind = clang_getCursorKind(cursor);
66
67#ifdef DEBUG
68    CXString cxname = clang_getCursorSpelling(cursor);
69    const char *name = clang_getCString(cxname);
70    CXString cxkind = clang_getTypeKindSpelling(kind);
71    const char *k = clang_getCString(cxkind);
72    fprintf(stderr, "Cursor %s of kind %s\n", name, k);
73    clang_disposeString(cxkind);
74    clang_disposeString(cxname);
75#endif
76
77    switch (kind) {
78
79        /* XXX: If the cursor is an empty `;`, its extent covers the
80         * (unrelated) following token as well. Libclang bug?
81         * We could potentially just return here, but for now we will emit
82         * the ';' anyway.  Decreasing the tokens_sz prevents us from also
83         * emitting the unrelated following token.
84         */
85        case CXCursor_UnexposedDecl:
86            if (!strcmp(first, ";")) {
87                tokens_sz--;
88            }
89            break;
90
91        /* XXX: If the cursor is a function definition, its extent covers the
92         * (unrelated) following token as well. Libclang bug? An exception is
93         * that a function appearing at the end of the translation unit will not
94         * have an extra appended token. To cope with this, assume we never want to strip
95         * closing braces.
96         */
97        case CXCursor_FunctionDecl:
98            if (clang_isCursorDefinition(cursor) && strcmp(last, "}"))
99                tokens_sz--;
100            break;
101
102        /* XXX: In code like 'typedef struct foo {...} foo_t', Clang considers
103         * foo and foo_t siblings. We end up visiting foo, then foo_t. In my
104         * mind, foo is a child of foo_t, but maybe Clang's outlook makes more
105         * sense from a scoping point of view. Either way, it helpfully covers
106         * an extra token in the foo cursor, so we can play the same trick as
107         * above to elide the (excess) struct definition.
108         */
109        case CXCursor_StructDecl:
110        case CXCursor_UnionDecl:
111        case CXCursor_EnumDecl:
112            if (strcmp(last, ";") && strcmp(last, "}")) {
113                clang_disposeString(cxlast);
114                clang_disposeString(cxfirst);
115                clang_disposeTokens(tu, tokens, tokens_sz);
116                return;
117            }
118            break;
119
120        /* XXX: When multiple variables are declared in a single statement:
121         *  int x, y;
122         * the declaration appears once per variable. Only the final instance
123         * is terminated with a semi-colon and only it is valid.
124         */
125        case CXCursor_TypedefDecl:
126        case CXCursor_VarDecl:
127            /* To add insult to injury, typedefed structs with trailing
128             * __attribute__s come out with nothing following the
129             * __attribute__. Why? Who knows. In this case we actually *do*
130             * want to emit them. Probably. Of course this logic breaks down in
131             * the case of code like:
132             *  typedef struct f { int x; } y, z __attribute__(...);
133             * but I don't easily see how to resolve situations like this.
134             */
135            if (!strcmp(last, "__attribute__"))
136                break;
137            if (strcmp(last, ";")) {
138                clang_disposeString(cxlast);
139                clang_disposeString(cxfirst);
140                clang_disposeTokens(tu, tokens, tokens_sz);
141                return;
142            }
143
144        default: /* shut -Wswitch warnings up */
145            break;
146    }
147
148    clang_disposeString(cxlast);
149    clang_disposeString(cxfirst);
150
151    /* Dump all the tokens, not trying to preserve white space. */
152    for (unsigned int i = 0; i < tokens_sz; i++) {
153        CXString s = clang_getTokenSpelling(tu, tokens[i]);
154        const char *token = clang_getCString(s);
155        if (i == tokens_sz - 1 && attribs != NULL) {
156            /* Precede the last token with any extra attributes. */
157            void print_attribute(const char *attrib) {
158                fprintf(stream, "__attribute__((%s))\n", attrib);
159            }
160            set_foreach(attribs, (void(*)(void*))print_attribute);
161        }
162        if (i == tokens_sz - 1 &&
163                (kind == CXCursor_TypedefDecl || kind == CXCursor_VarDecl) &&
164                !strcmp(token, "__attribute__"))
165            /* XXX: Yet more hackery. Libclang misparses a trailing attribute
166             * on a typedef. This should appear in the AST as an UnexposedDecl,
167             * but for whatever reason it doesn't. No idea from whence this
168             * behaviour stems as Clang itself just removes the attribute from
169             * the AST altogether entirely.
170             */
171            fprintf(stream, "; ");
172        else
173            fprintf(stream, "%s\n", token);
174        clang_disposeString(s);
175    }
176    clang_disposeTokens(tu, tokens, tokens_sz);
177}
178
179/* State data that we'll pass around while visiting the AST. */
180typedef struct {
181    set_t *keep;
182    set_t *blacklist;
183    dict_t *extra_attributes;
184    CXTranslationUnit *tu;
185    FILE *out;
186} state_t;
187
188/* Visit a node in the AST. */
189enum CXChildVisitResult visitor(CXCursor cursor, CXCursor _, state_t *state) {
190
191    CXString s = clang_getCursorSpelling(cursor);
192    const char *name = clang_getCString(s);
193
194    bool retain = true;
195
196    if (clang_getCursorKind(cursor) == CXCursor_FunctionDecl) {
197        /* Determine whether the function was one of those the user requested
198         * to keep. */
199        retain = set_contains(state->keep, name);
200    }
201
202    /* Get any extra attributes we need to apply to this symbol. */
203    set_t *attribs = dict_get(state->extra_attributes, name);
204
205    /* Don't need the symbol name any more. */
206    clang_disposeString(s);
207
208    if (!retain)
209        return CXChildVisit_Continue;
210
211    if (is_blacklisted(state->blacklist, cursor))
212        return CXChildVisit_Continue;
213
214    /* If we reached here, the current cursor is one we do want in the output.
215     */
216    emit(state->out, *state->tu, cursor, attribs);
217
218    /* Never recurse (in the AST). We're only visiting top-level nodes. */
219    return CXChildVisit_Continue;
220}
221
222typedef struct {
223    const char *input;
224    const char *output;
225    set_t *keep;
226    set_t *blacklist;
227    dict_t *extra_attributes;
228} options_t;
229
230static options_t *parse_args(int argc, char **argv) {
231    const struct option opts[] = {
232        {"add-attribute", required_argument, NULL, 'a'},
233        {"blacklist", required_argument, NULL, 'b'},
234        {"help", no_argument, NULL, '?'},
235        {"keep", required_argument, NULL, 'k'},
236        {"output", required_argument, NULL, 'o'},
237        {NULL, 0, NULL, 0},
238    };
239
240    options_t *o = calloc(1, sizeof(*o));
241    if (o == NULL)
242        goto fail1;
243
244    /* defaults */
245    o->output = "/dev/stdout";
246    o->keep = set();
247    if (o->keep == NULL)
248        goto fail2;
249
250    o->blacklist = set();
251    if (o->blacklist == NULL)
252        goto fail3;
253
254    o->extra_attributes = dict((void(*)(void*))set_destroy);
255    if (o->extra_attributes == NULL)
256        goto fail4;
257
258    while (true) {
259        int index = 0;
260        int c = getopt_long(argc, argv, "a:b:k:o:?", opts, &index);
261
262        if (c == -1)
263            /* end of defined options */
264            break;
265
266        switch (c) {
267            case 'a':; /* --add-attribute */
268                char *attrib = strstr(optarg, ":");
269                if (attrib == NULL) {
270                    fprintf(stderr, "illegal argument %s to --add-attribute\n", optarg);
271                    goto fail4;
272                }
273                *attrib = '\0';/* NUL-terminate the symbol name */
274                attrib++; /* move on to the attribute */
275                set_t *s = dict_get(o->extra_attributes, optarg);
276                if (s == NULL) {
277                    s = set();
278                    if (s == NULL)
279                        goto fail4;
280                    dict_set(o->extra_attributes, optarg, s);
281                }
282                set_insert(s, attrib);
283                break;
284
285            case 'b': /* --blacklist */
286                set_insert(o->blacklist, optarg);
287                break;
288
289            case 'k': /* --keep */
290                set_insert(o->keep, optarg);
291                break;
292
293            case 'o': /* --output */
294                o->output = optarg;
295                break;
296
297            case '?': /* --help */
298                printf("Usage: %s options... input_file\n"
299                       "Trims a C file by discarding unwanted functions.\n"
300                       "\n"
301                       " Options:\n"
302                       "  --add-attribute symbol:attrib\n"
303                       "  -a symbol:attrib                Annotate a symbol with a GCC attribute.\n"
304                       "  --blacklist symbol | -b symbol  Drop a given typedef or variable.\n"
305                       "  --help | -?                     Print this information.\n"
306                       "  --keep symbol | -k symbol       Retain a particular function.\n"
307                       "  --output file | -o file         Write output to file, rather than stdout.\n",
308                    argv[0]);
309                goto fail5;
310
311            default:
312                goto fail5;
313        }
314    }
315
316    /* Hopefully we still have an input file remaining. */
317    if (optind == argc - 1)
318        o->input = argv[optind];
319    else if (optind < argc) {
320        fprintf(stderr, "multiple input files are not supported\n");
321        goto fail5;
322    }
323
324    return o;
325
326fail5: dict_destroy(o->extra_attributes);
327fail4: set_destroy(o->blacklist);
328fail3: set_destroy(o->keep);
329fail2: free(o);
330fail1: exit(EXIT_FAILURE);
331}
332
333/* Use the passed CFG to recursively enumerate callees of the passed "to-keep"
334 * symbols and accumulate these. Returns non-zero on failure.
335 */
336static int merge_callees(set_t *keeps, cfg_t *graph) {
337
338    /* A set for tracking the callees. We need to use a separate set and then
339     * post-merge this into the keeps set because we cannot insert into the
340     * keeps set while iterating through it.
341     */
342    set_t *callees = set();
343    if (callees == NULL)
344        return -1;
345
346    /* Visitor for appending each callee to the "keeps" set. */
347    enum CXChildVisitResult visitor(const char *callee, const char *caller,
348            void *_) {
349
350        /* The CFG callee visitation calls us once per undefined function with
351         * NULL as the callee. This is useful for warning the user when the
352         * input file is incomplete and we may be pruning it too agressively.
353         */
354        if (callee == NULL) {
355            fprintf(stderr, "Warning: no definition for called function %s\n", caller);
356            return CXChildVisit_Continue;
357        }
358
359        set_insert(callees, callee);
360
361        return CXChildVisit_Recurse;
362    }
363
364    set_iter_t i;
365    set_iter(keeps, &i);
366
367    while (true) {
368        const char *caller = set_iter_next(&i);
369        if (caller == NULL)
370            break;
371
372        if (cfg_visit_callees(graph, caller, visitor, NULL) == 1) {
373            /* Traversal of this particular caller's callees failed. */
374            set_destroy(callees);
375            return -1;
376        }
377    }
378    set_union(keeps, callees);
379    return 0;
380}
381
382int main(int argc, char **argv) {
383    options_t *opts = parse_args(argc, argv);
384
385    if (opts == NULL) {
386        perror("failed to parse arguments");
387        return EXIT_FAILURE;
388    } else if (opts->input == NULL) {
389        fprintf(stderr, "no input file provided\n");
390        return EXIT_FAILURE;
391    }
392
393    /* Test whether we can read from the file. */
394    FILE *check = fopen(opts->input, "r");
395    if (check == NULL) {
396        fprintf(stderr, "input file does not exist or is unreadable\n");
397        return EXIT_FAILURE;
398    }
399    fclose(check);
400
401    /* Flags to tell Clang that input is C. */
402    const char *const args[] = {
403        "-x",
404        "c",
405    };
406
407    /* Parse the source file into a translation unit */
408    CXIndex index = clang_createIndex(0, 0);
409    CXTranslationUnit tu = clang_parseTranslationUnit(index, opts->input, args,
410        sizeof(args) / sizeof(args[0]), NULL, 0, CXTranslationUnit_None);
411    if (tu == NULL) {
412        fprintf(stderr, "failed to parse source file\n");
413        return EXIT_FAILURE;
414    }
415
416    FILE *f = fopen(opts->output, "w");
417    if (f == NULL) {
418        perror("failed to open output");
419        return EXIT_FAILURE;
420    }
421
422    /* Derive the Control Flow Graph of the TU. We then use this CFG to expand
423     * the kept symbols set to include callees of the kept symbols.
424     */
425    cfg_t *graph = cfg(tu);
426    if (graph == NULL) {
427        fprintf(stderr, "failed to form CFG");
428        if (errno != 0) {
429            perror(": ");
430        } else {
431            fprintf(stderr, "\n");
432        }
433        return EXIT_FAILURE;
434    }
435    if (merge_callees(opts->keep, graph) != 0) {
436        fprintf(stderr, "Failed to traverse CFG\n");
437        return EXIT_FAILURE;
438    }
439    cfg_destroy(graph); /* no longer needed */
440
441    state_t st = {
442        .keep = opts->keep,
443        .blacklist = opts->blacklist,
444        .extra_attributes = opts->extra_attributes,
445        .tu = &tu,
446        .out = f,
447    };
448
449    /* Now traverse the AST */
450    CXCursor cursor = clang_getTranslationUnitCursor(tu);
451    clang_visitChildren(cursor, (CXCursorVisitor)visitor, &st);
452
453    clang_disposeTranslationUnit(tu);
454    clang_disposeIndex(index);
455
456    return 0;
457}
458