1//===- lib/DebugInfo/Symbolize/Markup.cpp ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file defines the log symbolizer markup data model and parser.
11///
12//===----------------------------------------------------------------------===//
13
14#include "llvm/DebugInfo/Symbolize/Markup.h"
15
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/StringExtras.h"
18
19namespace llvm {
20namespace symbolize {
21
// Regular expression source for the SGR control codes recognized inside plain
// text. It accepts exactly these sequences:
//   "\033[0m"
//   "\033[1m"
//   "\033[30m" through "\033[37m"
static constexpr char SGRSyntaxStr[] = "\033\\[([0-1]|3[0-7])m";
27
28MarkupParser::MarkupParser(StringSet<> MultilineTags)
29    : MultilineTags(std::move(MultilineTags)), SGRSyntax(SGRSyntaxStr) {}
30
31static StringRef takeTo(StringRef Str, StringRef::iterator Pos) {
32  return Str.take_front(Pos - Str.begin());
33}
34static void advanceTo(StringRef &Str, StringRef::iterator Pos) {
35  Str = Str.drop_front(Pos - Str.begin());
36}
37
38void MarkupParser::parseLine(StringRef Line) {
39  Buffer.clear();
40  NextIdx = 0;
41  FinishedMultiline.clear();
42  this->Line = Line;
43}
44
45std::optional<MarkupNode> MarkupParser::nextNode() {
46  // Pull something out of the buffer if possible.
47  if (!Buffer.empty()) {
48    if (NextIdx < Buffer.size())
49      return std::move(Buffer[NextIdx++]);
50    NextIdx = 0;
51    Buffer.clear();
52  }
53
54  // The buffer is empty, so parse the next bit of the line.
55
56  if (Line.empty())
57    return std::nullopt;
58
59  if (!InProgressMultiline.empty()) {
60    if (std::optional<StringRef> MultilineEnd = parseMultiLineEnd(Line)) {
61      llvm::append_range(InProgressMultiline, *MultilineEnd);
62      assert(FinishedMultiline.empty() &&
63             "At most one multi-line element can be finished at a time.");
64      FinishedMultiline.swap(InProgressMultiline);
65      // Parse the multi-line element as if it were contiguous.
66      advanceTo(Line, MultilineEnd->end());
67      return *parseElement(FinishedMultiline);
68    }
69
70    // The whole line is part of the multi-line element.
71    llvm::append_range(InProgressMultiline, Line);
72    Line = Line.drop_front(Line.size());
73    return std::nullopt;
74  }
75
76  // Find the first valid markup element, if any.
77  if (std::optional<MarkupNode> Element = parseElement(Line)) {
78    parseTextOutsideMarkup(takeTo(Line, Element->Text.begin()));
79    Buffer.push_back(std::move(*Element));
80    advanceTo(Line, Element->Text.end());
81    return nextNode();
82  }
83
84  // Since there were no valid elements remaining, see if the line opens a
85  // multi-line element.
86  if (std::optional<StringRef> MultilineBegin = parseMultiLineBegin(Line)) {
87    // Emit any text before the element.
88    parseTextOutsideMarkup(takeTo(Line, MultilineBegin->begin()));
89
90    // Begin recording the multi-line element.
91    llvm::append_range(InProgressMultiline, *MultilineBegin);
92    Line = Line.drop_front(Line.size());
93    return nextNode();
94  }
95
96  // The line doesn't contain any more markup elements, so emit it as text.
97  parseTextOutsideMarkup(Line);
98  Line = Line.drop_front(Line.size());
99  return nextNode();
100}
101
102void MarkupParser::flush() {
103  Buffer.clear();
104  NextIdx = 0;
105  Line = {};
106  if (InProgressMultiline.empty())
107    return;
108  FinishedMultiline.swap(InProgressMultiline);
109  parseTextOutsideMarkup(FinishedMultiline);
110}
111
112// Finds and returns the next valid markup element in the given line. Returns
113// std::nullopt if the line contains no valid elements.
114std::optional<MarkupNode> MarkupParser::parseElement(StringRef Line) {
115  while (true) {
116    // Find next element using begin and end markers.
117    size_t BeginPos = Line.find("{{{");
118    if (BeginPos == StringRef::npos)
119      return std::nullopt;
120    size_t EndPos = Line.find("}}}", BeginPos + 3);
121    if (EndPos == StringRef::npos)
122      return std::nullopt;
123    EndPos += 3;
124    MarkupNode Element;
125    Element.Text = Line.slice(BeginPos, EndPos);
126    Line = Line.substr(EndPos);
127
128    // Parse tag.
129    StringRef Content = Element.Text.drop_front(3).drop_back(3);
130    StringRef FieldsContent;
131    std::tie(Element.Tag, FieldsContent) = Content.split(':');
132    if (Element.Tag.empty())
133      continue;
134
135    // Parse fields.
136    if (!FieldsContent.empty())
137      FieldsContent.split(Element.Fields, ":");
138    else if (Content.back() == ':')
139      Element.Fields.push_back(FieldsContent);
140
141    return Element;
142  }
143}
144
145static MarkupNode textNode(StringRef Text) {
146  MarkupNode Node;
147  Node.Text = Text;
148  return Node;
149}
150
151// Parses a region of text known to be outside any markup elements. Such text
152// may still contain SGR control codes, so the region is further subdivided into
153// control codes and true text regions.
154void MarkupParser::parseTextOutsideMarkup(StringRef Text) {
155  if (Text.empty())
156    return;
157  SmallVector<StringRef> Matches;
158  while (SGRSyntax.match(Text, &Matches)) {
159    // Emit any text before the SGR element.
160    if (Matches.begin()->begin() != Text.begin())
161      Buffer.push_back(textNode(takeTo(Text, Matches.begin()->begin())));
162
163    Buffer.push_back(textNode(*Matches.begin()));
164    advanceTo(Text, Matches.begin()->end());
165  }
166  if (!Text.empty())
167    Buffer.push_back(textNode(Text));
168}
169
170// Given that a line doesn't contain any valid markup, see if it ends with the
171// start of a multi-line element. If so, returns the beginning.
172std::optional<StringRef> MarkupParser::parseMultiLineBegin(StringRef Line) {
173  // A multi-line begin marker must be the last one on the line.
174  size_t BeginPos = Line.rfind("{{{");
175  if (BeginPos == StringRef::npos)
176    return std::nullopt;
177  size_t BeginTagPos = BeginPos + 3;
178
179  // If there are any end markers afterwards, the begin marker cannot belong to
180  // a multi-line element.
181  size_t EndPos = Line.find("}}}", BeginTagPos);
182  if (EndPos != StringRef::npos)
183    return std::nullopt;
184
185  // Check whether the tag is registered multi-line.
186  size_t EndTagPos = Line.find(':', BeginTagPos);
187  if (EndTagPos == StringRef::npos)
188    return std::nullopt;
189  StringRef Tag = Line.slice(BeginTagPos, EndTagPos);
190  if (!MultilineTags.contains(Tag))
191    return std::nullopt;
192  return Line.substr(BeginPos);
193}
194
195// See if the line begins with the ending of an in-progress multi-line element.
196// If so, return the ending.
197std::optional<StringRef> MarkupParser::parseMultiLineEnd(StringRef Line) {
198  size_t EndPos = Line.find("}}}");
199  if (EndPos == StringRef::npos)
200    return std::nullopt;
201  return Line.take_front(EndPos + 3);
202}
203
204} // end namespace symbolize
205} // end namespace llvm
206