1/* 2 * Copyright 2017, Data61 3 * Commonwealth Scientific and Industrial Research Organisation (CSIRO) 4 * ABN 41 687 119 230. 5 * 6 * This software may be distributed and modified according to the terms of 7 * the BSD 2-Clause license. Note that NO WARRANTY is provided. 8 * See "LICENSE_BSD2.txt" for details. 9 * 10 * @TAG(DATA61_BSD) 11 */ 12#include <assert.h> 13#include "cfg.h" 14#include <clang-c/Index.h> /* -lclang */ 15#include <errno.h> 16#include <getopt.h> 17#include "set.h" 18#include <stdbool.h> 19#include <stdio.h> 20#include <stdlib.h> 21#include <string.h> 22 23//#define DEBUG 1 24 25/* We want to implement a rewriter (or source-to-source translator). The 26 * standard way of doing this would be to sub-class RecursiveASTVisitor and go 27 * from there. After several attempts I gave up on getting either of the Clang 28 * C++ or Python bindings to compile and link and resorted to the below in C. 29 */ 30 31/* Determine whether a given cursor is in our list of entities to never emit. */ 32static bool is_blacklisted(set_t *blacklist, CXCursor cursor) { 33 CXString s = clang_getCursorSpelling(cursor); 34 const char *name = clang_getCString(s); 35 bool blacklisted = set_contains(blacklist, name); 36 clang_disposeString(s); 37 return blacklisted; 38} 39 40/* Dump a given cursor to the passed stream. */ 41static void emit(FILE* stream, CXTranslationUnit tu, CXCursor cursor, set_t *attribs) { 42 /* Transform the cursor into a list of text tokens. */ 43 CXSourceRange range = clang_getCursorExtent(cursor); 44 CXToken *tokens; 45 unsigned int tokens_sz; 46 clang_tokenize(tu, range, &tokens, &tokens_sz); 47 48 /* Bail out early if possible to reduce complexity in the follow on logic. 49 */ 50 if (tokens_sz == 0) { 51 clang_disposeTokens(tu, tokens, tokens_sz); 52 return; 53 } 54 55 /* Now time to deal with Clang's quirks. */ 56 57 /* Grab the last token. */ 58 CXString cxlast = clang_getTokenSpelling(tu, tokens[tokens_sz - 1]); 59 const char *last = clang_getCString(cxlast); 60 61 /* Grab the first token. */ 62 CXString cxfirst = clang_getTokenSpelling(tu, tokens[0]); 63 const char *first = clang_getCString(cxfirst); 64 65 enum CXCursorKind kind = clang_getCursorKind(cursor); 66 67#ifdef DEBUG 68 CXString cxname = clang_getCursorSpelling(cursor); 69 const char *name = clang_getCString(cxname); 70 CXString cxkind = clang_getTypeKindSpelling(kind); 71 const char *k = clang_getCString(cxkind); 72 fprintf(stderr, "Cursor %s of kind %s\n", name, k); 73 clang_disposeString(cxkind); 74 clang_disposeString(cxname); 75#endif 76 77 switch (kind) { 78 79 /* XXX: If the cursor is an empty `;`, its extent covers the 80 * (unrelated) following token as well. Libclang bug? 81 * We could potentially just return here, but for now we will emit 82 * the ';' anyway. Decreasing the tokens_sz prevents us from also 83 * emitting the unrelated following token. 84 */ 85 case CXCursor_UnexposedDecl: 86 if (!strcmp(first, ";")) { 87 tokens_sz--; 88 } 89 break; 90 91 /* XXX: If the cursor is a function definition, its extent covers the 92 * (unrelated) following token as well. Libclang bug? An exception is 93 * that a function appearing at the end of the translation unit will not 94 * have an extra appended token. To cope with this, assume we never want to strip 95 * closing braces. 96 */ 97 case CXCursor_FunctionDecl: 98 if (clang_isCursorDefinition(cursor) && strcmp(last, "}")) 99 tokens_sz--; 100 break; 101 102 /* XXX: In code like 'typedef struct foo {...} foo_t', Clang considers 103 * foo and foo_t siblings. We end up visiting foo, then foo_t. In my 104 * mind, foo is a child of foo_t, but maybe Clang's outlook makes more 105 * sense from a scoping point of view. Either way, it helpfully covers 106 * an extra token in the foo cursor, so we can play the same trick as 107 * above to elide the (excess) struct definition. 108 */ 109 case CXCursor_StructDecl: 110 case CXCursor_UnionDecl: 111 case CXCursor_EnumDecl: 112 if (strcmp(last, ";") && strcmp(last, "}")) { 113 clang_disposeString(cxlast); 114 clang_disposeString(cxfirst); 115 clang_disposeTokens(tu, tokens, tokens_sz); 116 return; 117 } 118 break; 119 120 /* XXX: When multiple variables are declared in a single statement: 121 * int x, y; 122 * the declaration appears once per variable. Only the final instance 123 * is terminated with a semi-colon and only it is valid. 124 */ 125 case CXCursor_TypedefDecl: 126 case CXCursor_VarDecl: 127 /* To add insult to injury, typedefed structs with trailing 128 * __attribute__s come out with nothing following the 129 * __attribute__. Why? Who knows. In this case we actually *do* 130 * want to emit them. Probably. Of course this logic breaks down in 131 * the case of code like: 132 * typedef struct f { int x; } y, z __attribute__(...); 133 * but I don't easily see how to resolve situations like this. 134 */ 135 if (!strcmp(last, "__attribute__")) 136 break; 137 if (strcmp(last, ";")) { 138 clang_disposeString(cxlast); 139 clang_disposeString(cxfirst); 140 clang_disposeTokens(tu, tokens, tokens_sz); 141 return; 142 } 143 144 default: /* shut -Wswitch warnings up */ 145 break; 146 } 147 148 clang_disposeString(cxlast); 149 clang_disposeString(cxfirst); 150 151 /* Dump all the tokens, not trying to preserve white space. */ 152 for (unsigned int i = 0; i < tokens_sz; i++) { 153 CXString s = clang_getTokenSpelling(tu, tokens[i]); 154 const char *token = clang_getCString(s); 155 if (i == tokens_sz - 1 && attribs != NULL) { 156 /* Precede the last token with any extra attributes. */ 157 void print_attribute(const char *attrib) { 158 fprintf(stream, "__attribute__((%s))\n", attrib); 159 } 160 set_foreach(attribs, (void(*)(void*))print_attribute); 161 } 162 if (i == tokens_sz - 1 && 163 (kind == CXCursor_TypedefDecl || kind == CXCursor_VarDecl) && 164 !strcmp(token, "__attribute__")) 165 /* XXX: Yet more hackery. Libclang misparses a trailing attribute 166 * on a typedef. This should appear in the AST as an UnexposedDecl, 167 * but for whatever reason it doesn't. No idea from whence this 168 * behaviour stems as Clang itself just removes the attribute from 169 * the AST altogether entirely. 170 */ 171 fprintf(stream, "; "); 172 else 173 fprintf(stream, "%s\n", token); 174 clang_disposeString(s); 175 } 176 clang_disposeTokens(tu, tokens, tokens_sz); 177} 178 179/* State data that we'll pass around while visiting the AST. */ 180typedef struct { 181 set_t *keep; 182 set_t *blacklist; 183 dict_t *extra_attributes; 184 CXTranslationUnit *tu; 185 FILE *out; 186} state_t; 187 188/* Visit a node in the AST. */ 189enum CXChildVisitResult visitor(CXCursor cursor, CXCursor _, state_t *state) { 190 191 CXString s = clang_getCursorSpelling(cursor); 192 const char *name = clang_getCString(s); 193 194 bool retain = true; 195 196 if (clang_getCursorKind(cursor) == CXCursor_FunctionDecl) { 197 /* Determine whether the function was one of those the user requested 198 * to keep. */ 199 retain = set_contains(state->keep, name); 200 } 201 202 /* Get any extra attributes we need to apply to this symbol. */ 203 set_t *attribs = dict_get(state->extra_attributes, name); 204 205 /* Don't need the symbol name any more. */ 206 clang_disposeString(s); 207 208 if (!retain) 209 return CXChildVisit_Continue; 210 211 if (is_blacklisted(state->blacklist, cursor)) 212 return CXChildVisit_Continue; 213 214 /* If we reached here, the current cursor is one we do want in the output. 215 */ 216 emit(state->out, *state->tu, cursor, attribs); 217 218 /* Never recurse (in the AST). We're only visiting top-level nodes. */ 219 return CXChildVisit_Continue; 220} 221 222typedef struct { 223 const char *input; 224 const char *output; 225 set_t *keep; 226 set_t *blacklist; 227 dict_t *extra_attributes; 228} options_t; 229 230static options_t *parse_args(int argc, char **argv) { 231 const struct option opts[] = { 232 {"add-attribute", required_argument, NULL, 'a'}, 233 {"blacklist", required_argument, NULL, 'b'}, 234 {"help", no_argument, NULL, '?'}, 235 {"keep", required_argument, NULL, 'k'}, 236 {"output", required_argument, NULL, 'o'}, 237 {NULL, 0, NULL, 0}, 238 }; 239 240 options_t *o = calloc(1, sizeof(*o)); 241 if (o == NULL) 242 goto fail1; 243 244 /* defaults */ 245 o->output = "/dev/stdout"; 246 o->keep = set(); 247 if (o->keep == NULL) 248 goto fail2; 249 250 o->blacklist = set(); 251 if (o->blacklist == NULL) 252 goto fail3; 253 254 o->extra_attributes = dict((void(*)(void*))set_destroy); 255 if (o->extra_attributes == NULL) 256 goto fail4; 257 258 while (true) { 259 int index = 0; 260 int c = getopt_long(argc, argv, "a:b:k:o:?", opts, &index); 261 262 if (c == -1) 263 /* end of defined options */ 264 break; 265 266 switch (c) { 267 case 'a':; /* --add-attribute */ 268 char *attrib = strstr(optarg, ":"); 269 if (attrib == NULL) { 270 fprintf(stderr, "illegal argument %s to --add-attribute\n", optarg); 271 goto fail4; 272 } 273 *attrib = '\0';/* NUL-terminate the symbol name */ 274 attrib++; /* move on to the attribute */ 275 set_t *s = dict_get(o->extra_attributes, optarg); 276 if (s == NULL) { 277 s = set(); 278 if (s == NULL) 279 goto fail4; 280 dict_set(o->extra_attributes, optarg, s); 281 } 282 set_insert(s, attrib); 283 break; 284 285 case 'b': /* --blacklist */ 286 set_insert(o->blacklist, optarg); 287 break; 288 289 case 'k': /* --keep */ 290 set_insert(o->keep, optarg); 291 break; 292 293 case 'o': /* --output */ 294 o->output = optarg; 295 break; 296 297 case '?': /* --help */ 298 printf("Usage: %s options... input_file\n" 299 "Trims a C file by discarding unwanted functions.\n" 300 "\n" 301 " Options:\n" 302 " --add-attribute symbol:attrib\n" 303 " -a symbol:attrib Annotate a symbol with a GCC attribute.\n" 304 " --blacklist symbol | -b symbol Drop a given typedef or variable.\n" 305 " --help | -? Print this information.\n" 306 " --keep symbol | -k symbol Retain a particular function.\n" 307 " --output file | -o file Write output to file, rather than stdout.\n", 308 argv[0]); 309 goto fail5; 310 311 default: 312 goto fail5; 313 } 314 } 315 316 /* Hopefully we still have an input file remaining. */ 317 if (optind == argc - 1) 318 o->input = argv[optind]; 319 else if (optind < argc) { 320 fprintf(stderr, "multiple input files are not supported\n"); 321 goto fail5; 322 } 323 324 return o; 325 326fail5: dict_destroy(o->extra_attributes); 327fail4: set_destroy(o->blacklist); 328fail3: set_destroy(o->keep); 329fail2: free(o); 330fail1: exit(EXIT_FAILURE); 331} 332 333/* Use the passed CFG to recursively enumerate callees of the passed "to-keep" 334 * symbols and accumulate these. Returns non-zero on failure. 335 */ 336static int merge_callees(set_t *keeps, cfg_t *graph) { 337 338 /* A set for tracking the callees. We need to use a separate set and then 339 * post-merge this into the keeps set because we cannot insert into the 340 * keeps set while iterating through it. 341 */ 342 set_t *callees = set(); 343 if (callees == NULL) 344 return -1; 345 346 /* Visitor for appending each callee to the "keeps" set. */ 347 enum CXChildVisitResult visitor(const char *callee, const char *caller, 348 void *_) { 349 350 /* The CFG callee visitation calls us once per undefined function with 351 * NULL as the callee. This is useful for warning the user when the 352 * input file is incomplete and we may be pruning it too agressively. 353 */ 354 if (callee == NULL) { 355 fprintf(stderr, "Warning: no definition for called function %s\n", caller); 356 return CXChildVisit_Continue; 357 } 358 359 set_insert(callees, callee); 360 361 return CXChildVisit_Recurse; 362 } 363 364 set_iter_t i; 365 set_iter(keeps, &i); 366 367 while (true) { 368 const char *caller = set_iter_next(&i); 369 if (caller == NULL) 370 break; 371 372 if (cfg_visit_callees(graph, caller, visitor, NULL) == 1) { 373 /* Traversal of this particular caller's callees failed. */ 374 set_destroy(callees); 375 return -1; 376 } 377 } 378 set_union(keeps, callees); 379 return 0; 380} 381 382int main(int argc, char **argv) { 383 options_t *opts = parse_args(argc, argv); 384 385 if (opts == NULL) { 386 perror("failed to parse arguments"); 387 return EXIT_FAILURE; 388 } else if (opts->input == NULL) { 389 fprintf(stderr, "no input file provided\n"); 390 return EXIT_FAILURE; 391 } 392 393 /* Test whether we can read from the file. */ 394 FILE *check = fopen(opts->input, "r"); 395 if (check == NULL) { 396 fprintf(stderr, "input file does not exist or is unreadable\n"); 397 return EXIT_FAILURE; 398 } 399 fclose(check); 400 401 /* Flags to tell Clang that input is C. */ 402 const char *const args[] = { 403 "-x", 404 "c", 405 }; 406 407 /* Parse the source file into a translation unit */ 408 CXIndex index = clang_createIndex(0, 0); 409 CXTranslationUnit tu = clang_parseTranslationUnit(index, opts->input, args, 410 sizeof(args) / sizeof(args[0]), NULL, 0, CXTranslationUnit_None); 411 if (tu == NULL) { 412 fprintf(stderr, "failed to parse source file\n"); 413 return EXIT_FAILURE; 414 } 415 416 FILE *f = fopen(opts->output, "w"); 417 if (f == NULL) { 418 perror("failed to open output"); 419 return EXIT_FAILURE; 420 } 421 422 /* Derive the Control Flow Graph of the TU. We then use this CFG to expand 423 * the kept symbols set to include callees of the kept symbols. 424 */ 425 cfg_t *graph = cfg(tu); 426 if (graph == NULL) { 427 fprintf(stderr, "failed to form CFG"); 428 if (errno != 0) { 429 perror(": "); 430 } else { 431 fprintf(stderr, "\n"); 432 } 433 return EXIT_FAILURE; 434 } 435 if (merge_callees(opts->keep, graph) != 0) { 436 fprintf(stderr, "Failed to traverse CFG\n"); 437 return EXIT_FAILURE; 438 } 439 cfg_destroy(graph); /* no longer needed */ 440 441 state_t st = { 442 .keep = opts->keep, 443 .blacklist = opts->blacklist, 444 .extra_attributes = opts->extra_attributes, 445 .tu = &tu, 446 .out = f, 447 }; 448 449 /* Now traverse the AST */ 450 CXCursor cursor = clang_getTranslationUnitCursor(tu); 451 clang_visitChildren(cursor, (CXCursorVisitor)visitor, &st); 452 453 clang_disposeTranslationUnit(tu); 454 clang_disposeIndex(index); 455 456 return 0; 457} 458