1//===-- sanitizer_symbolizer_libcdep.cpp ----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file is shared between AddressSanitizer and ThreadSanitizer
10// run-time libraries.
11//===----------------------------------------------------------------------===//
12
13#include "sanitizer_allocator_internal.h"
14#include "sanitizer_internal_defs.h"
15#include "sanitizer_platform.h"
16#include "sanitizer_symbolizer_internal.h"
17
18namespace __sanitizer {
19
20Symbolizer *Symbolizer::GetOrInit() {
21  SpinMutexLock l(&init_mu_);
22  if (symbolizer_)
23    return symbolizer_;
24  symbolizer_ = PlatformInit();
25  CHECK(symbolizer_);
26  return symbolizer_;
27}
28
29// See sanitizer_symbolizer_markup.cpp.
30#if !SANITIZER_SYMBOLIZER_MARKUP
31
32const char *ExtractToken(const char *str, const char *delims, char **result) {
33  uptr prefix_len = internal_strcspn(str, delims);
34  *result = (char*)InternalAlloc(prefix_len + 1);
35  internal_memcpy(*result, str, prefix_len);
36  (*result)[prefix_len] = '\0';
37  const char *prefix_end = str + prefix_len;
38  if (*prefix_end != '\0') prefix_end++;
39  return prefix_end;
40}
41
42const char *ExtractInt(const char *str, const char *delims, int *result) {
43  char *buff = nullptr;
44  const char *ret = ExtractToken(str, delims, &buff);
45  if (buff) {
46    *result = (int)internal_atoll(buff);
47  }
48  InternalFree(buff);
49  return ret;
50}
51
52const char *ExtractUptr(const char *str, const char *delims, uptr *result) {
53  char *buff = nullptr;
54  const char *ret = ExtractToken(str, delims, &buff);
55  if (buff) {
56    *result = (uptr)internal_atoll(buff);
57  }
58  InternalFree(buff);
59  return ret;
60}
61
62const char *ExtractSptr(const char *str, const char *delims, sptr *result) {
63  char *buff = nullptr;
64  const char *ret = ExtractToken(str, delims, &buff);
65  if (buff) {
66    *result = (sptr)internal_atoll(buff);
67  }
68  InternalFree(buff);
69  return ret;
70}
71
72const char *ExtractTokenUpToDelimiter(const char *str, const char *delimiter,
73                                      char **result) {
74  const char *found_delimiter = internal_strstr(str, delimiter);
75  uptr prefix_len =
76      found_delimiter ? found_delimiter - str : internal_strlen(str);
77  *result = (char *)InternalAlloc(prefix_len + 1);
78  internal_memcpy(*result, str, prefix_len);
79  (*result)[prefix_len] = '\0';
80  const char *prefix_end = str + prefix_len;
81  if (*prefix_end != '\0') prefix_end += internal_strlen(delimiter);
82  return prefix_end;
83}
84
85SymbolizedStack *Symbolizer::SymbolizePC(uptr addr) {
86  Lock l(&mu_);
87  SymbolizedStack *res = SymbolizedStack::New(addr);
88  auto *mod = FindModuleForAddress(addr);
89  if (!mod)
90    return res;
91  // Always fill data about module name and offset.
92  res->info.FillModuleInfo(*mod);
93  for (auto &tool : tools_) {
94    SymbolizerScope sym_scope(this);
95    if (tool.SymbolizePC(addr, res)) {
96      return res;
97    }
98  }
99  return res;
100}
101
102bool Symbolizer::SymbolizeData(uptr addr, DataInfo *info) {
103  Lock l(&mu_);
104  const char *module_name = nullptr;
105  uptr module_offset;
106  ModuleArch arch;
107  if (!FindModuleNameAndOffsetForAddress(addr, &module_name, &module_offset,
108                                         &arch))
109    return false;
110  info->Clear();
111  info->module = internal_strdup(module_name);
112  info->module_offset = module_offset;
113  info->module_arch = arch;
114  for (auto &tool : tools_) {
115    SymbolizerScope sym_scope(this);
116    if (tool.SymbolizeData(addr, info)) {
117      return true;
118    }
119  }
120  return false;
121}
122
123bool Symbolizer::SymbolizeFrame(uptr addr, FrameInfo *info) {
124  Lock l(&mu_);
125  const char *module_name = nullptr;
126  if (!FindModuleNameAndOffsetForAddress(
127          addr, &module_name, &info->module_offset, &info->module_arch))
128    return false;
129  info->module = internal_strdup(module_name);
130  for (auto &tool : tools_) {
131    SymbolizerScope sym_scope(this);
132    if (tool.SymbolizeFrame(addr, info)) {
133      return true;
134    }
135  }
136  return false;
137}
138
139bool Symbolizer::GetModuleNameAndOffsetForPC(uptr pc, const char **module_name,
140                                             uptr *module_address) {
141  Lock l(&mu_);
142  const char *internal_module_name = nullptr;
143  ModuleArch arch;
144  if (!FindModuleNameAndOffsetForAddress(pc, &internal_module_name,
145                                         module_address, &arch))
146    return false;
147
148  if (module_name)
149    *module_name = module_names_.GetOwnedCopy(internal_module_name);
150  return true;
151}
152
153void Symbolizer::Flush() {
154  Lock l(&mu_);
155  for (auto &tool : tools_) {
156    SymbolizerScope sym_scope(this);
157    tool.Flush();
158  }
159}
160
161const char *Symbolizer::Demangle(const char *name) {
162  CHECK(name);
163  Lock l(&mu_);
164  for (auto &tool : tools_) {
165    SymbolizerScope sym_scope(this);
166    if (const char *demangled = tool.Demangle(name))
167      return demangled;
168  }
169  if (const char *demangled = PlatformDemangle(name))
170    return demangled;
171  return name;
172}
173
174bool Symbolizer::FindModuleNameAndOffsetForAddress(uptr address,
175                                                   const char **module_name,
176                                                   uptr *module_offset,
177                                                   ModuleArch *module_arch) {
178  const LoadedModule *module = FindModuleForAddress(address);
179  if (!module)
180    return false;
181  *module_name = module->full_name();
182  *module_offset = address - module->base_address();
183  *module_arch = module->arch();
184  return true;
185}
186
// Re-initializes both the primary and the fallback module lists and marks the
// cached list as fresh. The primary list must not be empty afterwards.
void Symbolizer::RefreshModules() {
  modules_.init();
  fallback_modules_.fallbackInit();
  // A process with no modules at all indicates a broken module enumeration.
  RAW_CHECK(modules_.size() > 0);
  modules_fresh_ = true;
}
193
194const ListOfModules &Symbolizer::GetRefreshedListOfModules() {
195  if (!modules_fresh_)
196    RefreshModules();
197
198  return modules_;
199}
200
201static const LoadedModule *SearchForModule(const ListOfModules &modules,
202                                           uptr address) {
203  for (uptr i = 0; i < modules.size(); i++) {
204    if (modules[i].containsAddress(address)) {
205      return &modules[i];
206    }
207  }
208  return nullptr;
209}
210
211const LoadedModule *Symbolizer::FindModuleForAddress(uptr address) {
212  bool modules_were_reloaded = false;
213  if (!modules_fresh_) {
214    RefreshModules();
215    modules_were_reloaded = true;
216  }
217  const LoadedModule *module = SearchForModule(modules_, address);
218  if (module) return module;
219
220  // dlopen/dlclose interceptors invalidate the module list, but when
221  // interception is disabled, we need to retry if the lookup fails in
222  // case the module list changed.
223#if !SANITIZER_INTERCEPT_DLOPEN_DLCLOSE
224  if (!modules_were_reloaded) {
225    RefreshModules();
226    module = SearchForModule(modules_, address);
227    if (module) return module;
228  }
229#endif
230
231  if (fallback_modules_.size()) {
232    module = SearchForModule(fallback_modules_, address);
233  }
234  return module;
235}
236
237// For now we assume the following protocol:
238// For each request of the form
239//   <module_name> <module_offset>
240// passed to STDIN, external symbolizer prints to STDOUT response:
241//   <function_name>
242//   <file_name>:<line_number>:<column_number>
243//   <function_name>
244//   <file_name>:<line_number>:<column_number>
245//   ...
246//   <empty line>
247class LLVMSymbolizerProcess final : public SymbolizerProcess {
248 public:
249  explicit LLVMSymbolizerProcess(const char *path)
250      : SymbolizerProcess(path, /*use_posix_spawn=*/SANITIZER_APPLE) {}
251
252 private:
253  bool ReachedEndOfOutput(const char *buffer, uptr length) const override {
254    // Empty line marks the end of llvm-symbolizer output.
255    return length >= 2 && buffer[length - 1] == '\n' &&
256           buffer[length - 2] == '\n';
257  }
258
259  // When adding a new architecture, don't forget to also update
260  // script/asan_symbolize.py and sanitizer_common.h.
261  void GetArgV(const char *path_to_binary,
262               const char *(&argv)[kArgVMax]) const override {
263#if defined(__x86_64h__)
264    const char* const kSymbolizerArch = "--default-arch=x86_64h";
265#elif defined(__x86_64__)
266    const char* const kSymbolizerArch = "--default-arch=x86_64";
267#elif defined(__i386__)
268    const char* const kSymbolizerArch = "--default-arch=i386";
269#elif SANITIZER_LOONGARCH64
270    const char *const kSymbolizerArch = "--default-arch=loongarch64";
271#elif SANITIZER_RISCV64
272    const char *const kSymbolizerArch = "--default-arch=riscv64";
273#elif defined(__aarch64__)
274    const char* const kSymbolizerArch = "--default-arch=arm64";
275#elif defined(__arm__)
276    const char* const kSymbolizerArch = "--default-arch=arm";
277#elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
278    const char* const kSymbolizerArch = "--default-arch=powerpc64";
279#elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
280    const char* const kSymbolizerArch = "--default-arch=powerpc64le";
281#elif defined(__s390x__)
282    const char* const kSymbolizerArch = "--default-arch=s390x";
283#elif defined(__s390__)
284    const char* const kSymbolizerArch = "--default-arch=s390";
285#else
286    const char* const kSymbolizerArch = "--default-arch=unknown";
287#endif
288
289    const char *const demangle_flag =
290        common_flags()->demangle ? "--demangle" : "--no-demangle";
291    const char *const inline_flag =
292        common_flags()->symbolize_inline_frames ? "--inlines" : "--no-inlines";
293    int i = 0;
294    argv[i++] = path_to_binary;
295    argv[i++] = demangle_flag;
296    argv[i++] = inline_flag;
297    argv[i++] = kSymbolizerArch;
298    argv[i++] = nullptr;
299    CHECK_LE(i, kArgVMax);
300  }
301};
302
// Creates the LLVMSymbolizerProcess for the binary at |path|. The process
// object is constructed with placement new on |*allocator|.
LLVMSymbolizer::LLVMSymbolizer(const char *path, LowLevelAllocator *allocator)
    : symbolizer_process_(new(*allocator) LLVMSymbolizerProcess(path)) {}
305
306// Parse a <file>:<line>[:<column>] buffer. The file path may contain colons on
307// Windows, so extract tokens from the right hand side first. The column info is
308// also optional.
309static const char *ParseFileLineInfo(AddressInfo *info, const char *str) {
310  char *file_line_info = nullptr;
311  str = ExtractToken(str, "\n", &file_line_info);
312  CHECK(file_line_info);
313
314  if (uptr size = internal_strlen(file_line_info)) {
315    char *back = file_line_info + size - 1;
316    for (int i = 0; i < 2; ++i) {
317      while (back > file_line_info && IsDigit(*back)) --back;
318      if (*back != ':' || !IsDigit(back[1])) break;
319      info->column = info->line;
320      info->line = internal_atoll(back + 1);
321      // Truncate the string at the colon to keep only filename.
322      *back = '\0';
323      --back;
324    }
325    ExtractToken(file_line_info, "", &info->file);
326  }
327
328  InternalFree(file_line_info);
329  return str;
330}
331
332// Parses one or more two-line strings in the following format:
333//   <function_name>
334//   <file_name>:<line_number>[:<column_number>]
335// Used by LLVMSymbolizer, Addr2LinePool and InternalSymbolizer, since all of
336// them use the same output format.
337void ParseSymbolizePCOutput(const char *str, SymbolizedStack *res) {
338  bool top_frame = true;
339  SymbolizedStack *last = res;
340  while (true) {
341    char *function_name = nullptr;
342    str = ExtractToken(str, "\n", &function_name);
343    CHECK(function_name);
344    if (function_name[0] == '\0') {
345      // There are no more frames.
346      InternalFree(function_name);
347      break;
348    }
349    SymbolizedStack *cur;
350    if (top_frame) {
351      cur = res;
352      top_frame = false;
353    } else {
354      cur = SymbolizedStack::New(res->info.address);
355      cur->info.FillModuleInfo(res->info.module, res->info.module_offset,
356                               res->info.module_arch);
357      last->next = cur;
358      last = cur;
359    }
360
361    AddressInfo *info = &cur->info;
362    info->function = function_name;
363    str = ParseFileLineInfo(info, str);
364
365    // Functions and filenames can be "??", in which case we write 0
366    // to address info to mark that names are unknown.
367    if (0 == internal_strcmp(info->function, "??")) {
368      InternalFree(info->function);
369      info->function = 0;
370    }
371    if (info->file && 0 == internal_strcmp(info->file, "??")) {
372      InternalFree(info->file);
373      info->file = 0;
374    }
375  }
376}
377
378// Parses a two- or three-line string in the following format:
379//   <symbol_name>
380//   <start_address> <size>
//   <filename>:<line>
382// Used by LLVMSymbolizer and InternalSymbolizer. LLVMSymbolizer added support
383// for symbolizing the third line in D123538, but we support the older two-line
384// information as well.
385void ParseSymbolizeDataOutput(const char *str, DataInfo *info) {
386  str = ExtractToken(str, "\n", &info->name);
387  str = ExtractUptr(str, " ", &info->start);
388  str = ExtractUptr(str, "\n", &info->size);
389  // Note: If the third line isn't present, these calls will set info.{file,
390  // line} to empty strings.
391  str = ExtractToken(str, ":", &info->file);
392  str = ExtractUptr(str, "\n", &info->line);
393}
394
395void ParseSymbolizeFrameOutput(const char *str,
396                               InternalMmapVector<LocalInfo> *locals) {
397  if (internal_strncmp(str, "??", 2) == 0)
398    return;
399
400  while (*str) {
401    LocalInfo local;
402    str = ExtractToken(str, "\n", &local.function_name);
403    str = ExtractToken(str, "\n", &local.name);
404
405    AddressInfo addr;
406    str = ParseFileLineInfo(&addr, str);
407    local.decl_file = addr.file;
408    local.decl_line = addr.line;
409
410    local.has_frame_offset = internal_strncmp(str, "??", 2) != 0;
411    str = ExtractSptr(str, " ", &local.frame_offset);
412
413    local.has_size = internal_strncmp(str, "??", 2) != 0;
414    str = ExtractUptr(str, " ", &local.size);
415
416    local.has_tag_offset = internal_strncmp(str, "??", 2) != 0;
417    str = ExtractUptr(str, "\n", &local.tag_offset);
418
419    locals->push_back(local);
420  }
421}
422
423bool LLVMSymbolizer::SymbolizePC(uptr addr, SymbolizedStack *stack) {
424  AddressInfo *info = &stack->info;
425  const char *buf = FormatAndSendCommand(
426      "CODE", info->module, info->module_offset, info->module_arch);
427  if (!buf)
428    return false;
429  ParseSymbolizePCOutput(buf, stack);
430  return true;
431}
432
433bool LLVMSymbolizer::SymbolizeData(uptr addr, DataInfo *info) {
434  const char *buf = FormatAndSendCommand(
435      "DATA", info->module, info->module_offset, info->module_arch);
436  if (!buf)
437    return false;
438  ParseSymbolizeDataOutput(buf, info);
439  info->start += (addr - info->module_offset); // Add the base address.
440  return true;
441}
442
443bool LLVMSymbolizer::SymbolizeFrame(uptr addr, FrameInfo *info) {
444  const char *buf = FormatAndSendCommand(
445      "FRAME", info->module, info->module_offset, info->module_arch);
446  if (!buf)
447    return false;
448  ParseSymbolizeFrameOutput(buf, &info->locals);
449  return true;
450}
451
452const char *LLVMSymbolizer::FormatAndSendCommand(const char *command_prefix,
453                                                 const char *module_name,
454                                                 uptr module_offset,
455                                                 ModuleArch arch) {
456  CHECK(module_name);
457  int size_needed = 0;
458  if (arch == kModuleArchUnknown)
459    size_needed = internal_snprintf(buffer_, kBufferSize, "%s \"%s\" 0x%zx\n",
460                                    command_prefix, module_name, module_offset);
461  else
462    size_needed = internal_snprintf(buffer_, kBufferSize,
463                                    "%s \"%s:%s\" 0x%zx\n", command_prefix,
464                                    module_name, ModuleArchToString(arch),
465                                    module_offset);
466
467  if (size_needed >= static_cast<int>(kBufferSize)) {
468    Report("WARNING: Command buffer too small");
469    return nullptr;
470  }
471
472  return symbolizer_process_->SendCommand(buffer_);
473}
474
// Records the path of the symbolizer binary and whether posix_spawn should be
// used to launch it. Both descriptors start out as kInvalidFd and all
// restart/failure bookkeeping is zeroed. |path| must be a non-empty string.
SymbolizerProcess::SymbolizerProcess(const char *path, bool use_posix_spawn)
    : path_(path),
      input_fd_(kInvalidFd),
      output_fd_(kInvalidFd),
      times_restarted_(0),
      failed_to_start_(false),
      reported_invalid_path_(false),
      use_posix_spawn_(use_posix_spawn) {
  CHECK(path_);
  CHECK_NE(path_[0], '\0');
}
486
487static bool IsSameModule(const char* path) {
488  if (const char* ProcessName = GetProcessName()) {
489    if (const char* SymbolizerName = StripModuleName(path)) {
490      return !internal_strcmp(ProcessName, SymbolizerName);
491    }
492  }
493  return false;
494}
495
496const char *SymbolizerProcess::SendCommand(const char *command) {
497  if (failed_to_start_)
498    return nullptr;
499  if (IsSameModule(path_)) {
500    Report("WARNING: Symbolizer was blocked from starting itself!\n");
501    failed_to_start_ = true;
502    return nullptr;
503  }
504  for (; times_restarted_ < kMaxTimesRestarted; times_restarted_++) {
505    // Start or restart symbolizer if we failed to send command to it.
506    if (const char *res = SendCommandImpl(command))
507      return res;
508    Restart();
509  }
510  if (!failed_to_start_) {
511    Report("WARNING: Failed to use and restart external symbolizer!\n");
512    failed_to_start_ = true;
513  }
514  return nullptr;
515}
516
517const char *SymbolizerProcess::SendCommandImpl(const char *command) {
518  if (input_fd_ == kInvalidFd || output_fd_ == kInvalidFd)
519      return nullptr;
520  if (!WriteToSymbolizer(command, internal_strlen(command)))
521      return nullptr;
522  if (!ReadFromSymbolizer())
523    return nullptr;
524  return buffer_.data();
525}
526
527bool SymbolizerProcess::Restart() {
528  if (input_fd_ != kInvalidFd)
529    CloseFile(input_fd_);
530  if (output_fd_ != kInvalidFd)
531    CloseFile(output_fd_);
532  return StartSymbolizerSubprocess();
533}
534
535bool SymbolizerProcess::ReadFromSymbolizer() {
536  buffer_.clear();
537  constexpr uptr max_length = 1024;
538  bool ret = true;
539  do {
540    uptr just_read = 0;
541    uptr size_before = buffer_.size();
542    buffer_.resize(size_before + max_length);
543    buffer_.resize(buffer_.capacity());
544    bool ret = ReadFromFile(input_fd_, &buffer_[size_before],
545                            buffer_.size() - size_before, &just_read);
546
547    if (!ret)
548      just_read = 0;
549
550    buffer_.resize(size_before + just_read);
551
552    // We can't read 0 bytes, as we don't expect external symbolizer to close
553    // its stdout.
554    if (just_read == 0) {
555      Report("WARNING: Can't read from symbolizer at fd %d\n", input_fd_);
556      ret = false;
557      break;
558    }
559  } while (!ReachedEndOfOutput(buffer_.data(), buffer_.size()));
560  buffer_.push_back('\0');
561  return ret;
562}
563
564bool SymbolizerProcess::WriteToSymbolizer(const char *buffer, uptr length) {
565  if (length == 0)
566    return true;
567  uptr write_len = 0;
568  bool success = WriteToFile(output_fd_, buffer, length, &write_len);
569  if (!success || write_len != length) {
570    Report("WARNING: Can't write to symbolizer at fd %d\n", output_fd_);
571    return false;
572  }
573  return true;
574}
575
576#endif  // !SANITIZER_SYMBOLIZER_MARKUP
577
578}  // namespace __sanitizer
579