1//===--- Macros.h - Format C++ code -----------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains the main building blocks of macro support in
11/// clang-format.
12///
13/// In order to not violate the requirement that clang-format can format files
14/// in isolation, clang-format's macro support uses expansions users provide
15/// as part of clang-format's style configuration.
16///
17/// Macro definitions are of the form "MACRO(p1, p2)=p1 + p2", but only support
18/// one level of expansion (\see MacroExpander for a full description of what
19/// is supported).
20///
21/// As part of parsing, clang-format uses the MacroExpander to expand the
22/// spelled token streams into expanded token streams when it encounters a
23/// macro call. The UnwrappedLineParser continues to parse UnwrappedLines
24/// from the expanded token stream.
25/// After the expanded unwrapped lines are parsed, the MacroCallReconstructor
26/// matches the spelled token stream into unwrapped lines that best resemble the
27/// structure of the expanded unwrapped lines. These reconstructed unwrapped
28/// lines are aliasing the tokens in the expanded token stream, so that token
29/// annotations will be reused when formatting the spelled macro calls.
30///
31/// When formatting, clang-format annotates and formats the expanded unwrapped
32/// lines first, determining the token types. Next, it formats the spelled
33/// unwrapped lines, keeping the token types fixed, while allowing other
34/// formatting decisions to change.
35///
36//===----------------------------------------------------------------------===//
37
38#ifndef CLANG_LIB_FORMAT_MACROS_H
39#define CLANG_LIB_FORMAT_MACROS_H
40
41#include <list>
42#include <map>
43#include <string>
44#include <vector>
45
46#include "FormatToken.h"
47#include "llvm/ADT/ArrayRef.h"
48#include "llvm/ADT/DenseMap.h"
49#include "llvm/ADT/SmallVector.h"
50#include "llvm/ADT/StringRef.h"
51
52namespace clang {
53namespace format {
54
55struct UnwrappedLine;
56struct UnwrappedLineNode;
57
58/// Takes a set of macro definitions as strings and allows expanding calls to
59/// those macros.
60///
61/// For example:
62/// Definition: A(x, y)=x + y
63/// Call      : A(int a = 1, 2)
64/// Expansion : int a = 1 + 2
65///
66/// Expansion does not check arity of the definition.
67/// If fewer arguments than expected are provided, the remaining parameters
68/// are considered empty:
69/// Call     : A(a)
70/// Expansion: a +
71/// If more arguments than expected are provided, they will be discarded.
72///
73/// The expander does not support:
74/// - recursive expansion
75/// - stringification
76/// - concatenation
77/// - variadic macros
78///
79/// Furthermore, only a single expansion of each macro argument is supported,
80/// so that we cannot get conflicting formatting decisions from different
81/// expansions.
82/// Definition: A(x)=x+x
83/// Call      : A(id)
84/// Expansion : id+x
85///
86class MacroExpander {
87public:
88  using ArgsList = llvm::ArrayRef<llvm::SmallVector<FormatToken *, 8>>;
89
90  /// Construct a macro expander from a set of macro definitions.
91  /// Macro definitions must be encoded as UTF-8.
92  ///
93  /// Each entry in \p Macros must conform to the following simple
94  /// macro-definition language:
95  /// <definition> ::= <id> <expansion> | <id> "(" <params> ")" <expansion>
96  /// <params>     ::= <id-list> | ""
97  /// <id-list>    ::= <id> | <id> "," <params>
98  /// <expansion>  ::= "=" <tail> | <eof>
99  /// <tail>       ::= <tok> <tail> | <eof>
100  ///
101  /// Macros that cannot be parsed will be silently discarded.
102  ///
103  MacroExpander(const std::vector<std::string> &Macros,
104                clang::SourceManager &SourceMgr, const FormatStyle &Style,
105                llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
106                IdentifierTable &IdentTable);
107  ~MacroExpander();
108
109  /// Returns whether any macro \p Name is defined, regardless of overloads.
110  bool defined(llvm::StringRef Name) const;
111
112  /// Returns whetherh there is an object-like overload, i.e. where the macro
113  /// has no arguments and should not consume subsequent parentheses.
114  bool objectLike(llvm::StringRef Name) const;
115
116  /// Returns whether macro \p Name provides an overload with the given arity.
117  bool hasArity(llvm::StringRef Name, unsigned Arity) const;
118
119  /// Returns the expanded stream of format tokens for \p ID, where
120  /// each element in \p Args is a positional argument to the macro call.
121  /// If \p Args is not set, the object-like overload is used.
122  /// If \p Args is set, the overload with the arity equal to \c Args.size() is
123  /// used.
124  llvm::SmallVector<FormatToken *, 8>
125  expand(FormatToken *ID, std::optional<ArgsList> OptionalArgs) const;
126
127private:
128  struct Definition;
129  class DefinitionParser;
130
131  void parseDefinition(const std::string &Macro);
132
133  clang::SourceManager &SourceMgr;
134  const FormatStyle &Style;
135  llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator;
136  IdentifierTable &IdentTable;
137  SmallVector<std::unique_ptr<llvm::MemoryBuffer>> Buffers;
138  llvm::StringMap<llvm::DenseMap<int, Definition>> FunctionLike;
139  llvm::StringMap<Definition> ObjectLike;
140};
141
142/// Converts a sequence of UnwrappedLines containing expanded macros into a
143/// single UnwrappedLine containing the macro calls.  This UnwrappedLine may be
144/// broken into child lines, in a way that best conveys the structure of the
145/// expanded code.
146///
147/// In the simplest case, a spelled UnwrappedLine contains one macro, and after
148/// expanding it we have one expanded UnwrappedLine.  In general, macro
149/// expansions can span UnwrappedLines, and multiple macros can contribute
150/// tokens to the same line.  We keep consuming expanded lines until:
151/// *   all expansions that started have finished (we're not chopping any macros
152///     in half)
153/// *   *and* we've reached the end of a *spelled* unwrapped line.
154///
155/// A single UnwrappedLine represents this chunk of code.
156///
157/// After this point, the state of the spelled/expanded stream is "in sync"
158/// (both at the start of an UnwrappedLine, with no macros open), so the
159/// Reconstructor can be thrown away and parsing can continue.
160///
161/// Given a mapping from the macro name identifier token in the macro call
162/// to the tokens of the macro call, for example:
163/// CLASSA -> CLASSA({public: void x();})
164///
165/// When getting the formatted lines of the expansion via the \c addLine method
166/// (each '->' specifies a call to \c addLine ):
167/// -> class A {
168/// -> public:
169/// ->   void x();
170/// -> };
171///
172/// Creates the tree of unwrapped lines containing the macro call tokens so that
173/// the macro call tokens fit the semantic structure of the expanded formatted
174/// lines:
175/// -> CLASSA({
176/// -> public:
177/// ->   void x();
178/// -> })
179class MacroCallReconstructor {
180public:
181  /// Create an Reconstructor whose resulting \p UnwrappedLine will start at
182  /// \p Level, using the map from name identifier token to the corresponding
183  /// tokens of the spelled macro call.
184  MacroCallReconstructor(
185      unsigned Level,
186      const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>>
187          &ActiveExpansions);
188
189  /// For the given \p Line, match all occurences of tokens expanded from a
190  /// macro to unwrapped lines in the spelled macro call so that the resulting
191  /// tree of unwrapped lines best resembles the structure of unwrapped lines
192  /// passed in via \c addLine.
193  void addLine(const UnwrappedLine &Line);
194
195  /// Check whether at the current state there is no open macro expansion
196  /// that needs to be processed to finish an macro call.
197  /// Only when \c finished() is true, \c takeResult() can be called to retrieve
198  /// the resulting \c UnwrappedLine.
199  /// If there are multiple subsequent macro calls within an unwrapped line in
200  /// the spelled token stream, the calling code may also continue to call
201  /// \c addLine() when \c finished() is true.
202  bool finished() const { return ActiveExpansions.empty(); }
203
204  /// Retrieve the formatted \c UnwrappedLine containing the orginal
205  /// macro calls, formatted according to the expanded token stream received
206  /// via \c addLine().
207  /// Generally, this line tries to have the same structure as the expanded,
208  /// formatted unwrapped lines handed in via \c addLine(), with the exception
209  /// that for multiple top-level lines, each subsequent line will be the
210  /// child of the last token in its predecessor. This representation is chosen
211  /// because it is a precondition to the formatter that we get what looks like
212  /// a single statement in a single \c UnwrappedLine (i.e. matching parens).
213  ///
214  /// If a token in a macro argument is a child of a token in the expansion,
215  /// the parent will be the corresponding token in the macro call.
216  /// For example:
217  ///   #define C(a, b) class C { a b
218  ///   C(int x;, int y;)
219  /// would expand to
220  ///   class C { int x; int y;
221  /// where in a formatted line "int x;" and "int y;" would both be new separate
222  /// lines.
223  ///
224  /// In the result, "int x;" will be a child of the opening parenthesis in "C("
225  /// and "int y;" will be a child of the "," token:
226  ///   C (
227  ///     \- int x;
228  ///     ,
229  ///     \- int y;
230  ///     )
231  UnwrappedLine takeResult() &&;
232
233private:
234  void add(FormatToken *Token, FormatToken *ExpandedParent, bool First);
235  void prepareParent(FormatToken *ExpandedParent, bool First);
236  FormatToken *getParentInResult(FormatToken *Parent);
237  void reconstruct(FormatToken *Token);
238  void startReconstruction(FormatToken *Token);
239  bool reconstructActiveCallUntil(FormatToken *Token);
240  void endReconstruction(FormatToken *Token);
241  bool processNextReconstructed();
242  void finalize();
243
244  struct ReconstructedLine;
245
246  void appendToken(FormatToken *Token, ReconstructedLine *L = nullptr);
247  UnwrappedLine createUnwrappedLine(const ReconstructedLine &Line, int Level);
248  void debug(const ReconstructedLine &Line, int Level);
249  ReconstructedLine &parentLine();
250  ReconstructedLine *currentLine();
251  void debugParentMap() const;
252
253#ifndef NDEBUG
254  enum ReconstructorState {
255    Start,      // No macro expansion was found in the input yet.
256    InProgress, // During a macro reconstruction.
257    Finalized,  // Past macro reconstruction, the result is finalized.
258  };
259  ReconstructorState State = Start;
260#endif
261
262  // Node in which we build up the resulting unwrapped line; this type is
263  // analogous to UnwrappedLineNode.
264  struct LineNode {
265    LineNode() = default;
266    LineNode(FormatToken *Tok) : Tok(Tok) {}
267    FormatToken *Tok = nullptr;
268    llvm::SmallVector<std::unique_ptr<ReconstructedLine>> Children;
269  };
270
271  // Line in which we build up the resulting unwrapped line.
272  // FIXME: Investigate changing UnwrappedLine to a pointer type and using it
273  // instead of rolling our own type.
274  struct ReconstructedLine {
275    llvm::SmallVector<std::unique_ptr<LineNode>> Tokens;
276  };
277
278  // The line in which we collect the resulting reconstructed output.
279  // To reduce special cases in the algorithm, the first level of the line
280  // contains a single null token that has the reconstructed incoming
281  // lines as children.
282  // In the end, we stich the lines together so that each subsequent line
283  // is a child of the last token of the previous line. This is necessary
284  // in order to format the overall expression as a single logical line -
285  // if we created separate lines, we'd format them with their own top-level
286  // indent depending on the semantic structure, which is not desired.
287  ReconstructedLine Result;
288
289  // Stack of currently "open" lines, where each line's predecessor's last
290  // token is the parent token for that line.
291  llvm::SmallVector<ReconstructedLine *> ActiveReconstructedLines;
292
293  // Maps from the expanded token to the token that takes its place in the
294  // reconstructed token stream in terms of parent-child relationships.
295  // Note that it might take multiple steps to arrive at the correct
296  // parent in the output.
297  // Given: #define C(a, b) []() { a; b; }
298  // And a call: C(f(), g())
299  // The structure in the incoming formatted unwrapped line will be:
300  // []() {
301  //      |- f();
302  //      \- g();
303  // }
304  // with f and g being children of the opening brace.
305  // In the reconstructed call:
306  // C(f(), g())
307  //  \- f()
308  //      \- g()
309  // We want f to be a child of the opening parenthesis and g to be a child
310  // of the comma token in the macro call.
311  // Thus, we map
312  // { -> (
313  // and add
314  // ( -> ,
315  // once we're past the comma in the reconstruction.
316  llvm::DenseMap<FormatToken *, FormatToken *>
317      SpelledParentToReconstructedParent;
318
319  // Keeps track of a single expansion while we're reconstructing tokens it
320  // generated.
321  struct Expansion {
322    // The identifier token of the macro call.
323    FormatToken *ID;
324    // Our current position in the reconstruction.
325    std::list<UnwrappedLineNode>::iterator SpelledI;
326    // The end of the reconstructed token sequence.
327    std::list<UnwrappedLineNode>::iterator SpelledE;
328  };
329
330  // Stack of macro calls for which we're in the middle of an expansion.
331  llvm::SmallVector<Expansion> ActiveExpansions;
332
333  struct MacroCallState {
334    MacroCallState(ReconstructedLine *Line, FormatToken *ParentLastToken,
335                   FormatToken *MacroCallLParen);
336
337    ReconstructedLine *Line;
338
339    // The last token in the parent line or expansion, or nullptr if the macro
340    // expansion is on a top-level line.
341    //
342    // For example, in the macro call:
343    //   auto f = []() { ID(1); };
344    // The MacroCallState for ID will have '{' as ParentLastToken.
345    //
346    // In the macro call:
347    //   ID(ID(void f()));
348    // The MacroCallState of the outer ID will have nullptr as ParentLastToken,
349    // while the MacroCallState for the inner ID will have the '(' of the outer
350    // ID as ParentLastToken.
351    //
352    // In the macro call:
353    //   ID2(a, ID(b));
354    // The MacroCallState of ID will have ',' as ParentLastToken.
355    FormatToken *ParentLastToken;
356
357    // The l_paren of this MacroCallState's macro call.
358    FormatToken *MacroCallLParen;
359  };
360
361  // Keeps track of the lines into which the opening brace/parenthesis &
362  // argument separating commas for each level in the macro call go in order to
363  // put the corresponding closing brace/parenthesis into the same line in the
364  // output and keep track of which parents in the expanded token stream map to
365  // which tokens in the reconstructed stream.
366  // When an opening brace/parenthesis has children, we want the structure of
367  // the output line to be:
368  // |- MACRO
369  // |- (
370  // |  \- <argument>
371  // |- ,
372  // |  \- <argument>
373  // \- )
374  llvm::SmallVector<MacroCallState> MacroCallStructure;
375
376  // Level the generated UnwrappedLine will be at.
377  const unsigned Level;
378
379  // Maps from identifier of the macro call to an unwrapped line containing
380  // all tokens of the macro call.
381  const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>>
382      &IdToReconstructed;
383};
384
385} // namespace format
386} // namespace clang
387
388#endif
389