lib/Format/MacroCallReconstructor.cpp

//===--- MacroCallReconstructor.cpp - Format C++ code -----------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains the implementation of MacroCallReconstructor, which fits
/// an reconstructed macro call to a parsed set of UnwrappedLines.
///
//===----------------------------------------------------------------------===//

#include "Macros.h"

#include "UnwrappedLineParser.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/Debug.h"
#include <cassert>

#define DEBUG_TYPE "format-reconstruct"

namespace clang {
namespace format {

// Call \p Call for each token in the unwrapped line given, passing
// the token, its parent and whether it is the first token in the line.
template <typename T>
void forEachToken(const UnwrappedLine &Line, const T &Call,
                  FormatToken *Parent = nullptr) {
  bool First = true;
  for (const auto &N : Line.Tokens) {
    Call(N.Tok, Parent, First);
    First = false;
    for (const auto &Child : N.Children)
      forEachToken(Child, Call, N.Tok);
  }
}

MacroCallReconstructor::MacroCallReconstructor(
    unsigned Level,
    const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>>
        &ActiveExpansions)
    : Level(Level), IdToReconstructed(ActiveExpansions) {
  Result.Tokens.push_back(std::make_unique<LineNode>());
  ActiveReconstructedLines.push_back(&Result);
}

void MacroCallReconstructor::addLine(const UnwrappedLine &Line) {
  assert(State != Finalized);
  LLVM_DEBUG(llvm::dbgs() << "MCR: new line...\n");
  forEachToken(Line, [&](FormatToken *Token, FormatToken *Parent, bool First) {
    add(Token, Parent, First);
  });
  assert(InProgress || finished());
}

UnwrappedLine MacroCallReconstructor::takeResult() && {
  finalize();
  assert(Result.Tokens.size() == 1 &&
         Result.Tokens.front()->Children.size() == 1);
  UnwrappedLine Final =
      createUnwrappedLine(*Result.Tokens.front()->Children.front(), Level);
  assert(!Final.Tokens.empty());
  return Final;
}

// Reconstruct the position of the next \p Token, given its parent \p
// ExpandedParent in the incoming unwrapped line. \p First specifies whether it
// is the first token in a given unwrapped line.
void MacroCallReconstructor::add(FormatToken *Token,
                                 FormatToken *ExpandedParent, bool First) {
  LLVM_DEBUG(
      llvm::dbgs() << "MCR: Token: " << Token->TokenText << ", Parent: "
                   << (ExpandedParent ? ExpandedParent->TokenText : "<null>")
                   << ", First: " << First << "\n");
  // In order to be able to find the correct parent in the reconstructed token
  // stream, we need to continue the last open reconstruction until we find the
  // given token if it is part of the reconstructed token stream.
  //
  // Note that hidden tokens can be part of the reconstructed stream in nested
  // macro calls.
  // For example, given
  //   #define C(x, y) x y
  //   #define B(x) {x}
  // And the call:
  //   C(a, B(b))
  // The outer macro call will be C(a, {b}), and the hidden token '}' can be
  // found in the reconstructed token stream of that expansion level.
  // In the expanded token stream
  //   a {b}
  // 'b' is a child of '{'. We need to continue the open expansion of the ','
  // in the call of 'C' in order to correctly set the ',' as the parent of '{',
  // so we later set the spelled token 'b' as a child of the ','.
  if (!ActiveExpansions.empty() && Token->MacroCtx &&
      (Token->MacroCtx->Role != MR_Hidden ||
       ActiveExpansions.size() != Token->MacroCtx->ExpandedFrom.size())) {
    if (/*PassedMacroComma = */ reconstructActiveCallUntil(Token))
      First = true;
  }

  prepareParent(ExpandedParent, First);

  if (Token->MacroCtx) {
    // If this token was generated by a macro call, add the reconstructed
    // equivalent of the token.
    reconstruct(Token);
  } else {
    // Otherwise, we add it to the current line.
    appendToken(Token);
  }
}

// Adjusts the stack of active reconstructed lines so we're ready to push
// tokens. The tokens to be pushed are children of ExpandedParent in the
// expanded code.
//
// This may entail:
// - creating a new line, if the parent is on the active line
// - popping active lines, if the parent is further up the stack
//
// Postcondition:
// ActiveReconstructedLines.back() is the line that has \p ExpandedParent or its
// reconstructed replacement token as a parent (when possible) - that is, the
// last token in \c ActiveReconstructedLines[ActiveReconstructedLines.size()-2]
// is the parent of ActiveReconstructedLines.back() in the reconstructed
// unwrapped line.
void MacroCallReconstructor::prepareParent(FormatToken *ExpandedParent,
                                           bool NewLine) {
  LLVM_DEBUG({
    llvm::dbgs() << "ParentMap:\n";
    debugParentMap();
  });
  // We want to find the parent in the new unwrapped line, where the expanded
  // parent might have been replaced during reconstruction.
  FormatToken *Parent = getParentInResult(ExpandedParent);
  LLVM_DEBUG(llvm::dbgs() << "MCR: New parent: "
                          << (Parent ? Parent->TokenText : "<null>") << "\n");

  FormatToken *OpenMacroParent = nullptr;
  if (!MacroCallStructure.empty()) {
    // Inside a macro expansion, it is possible to lose track of the correct
    // parent - either because it is already popped, for example because it was
    // in a different macro argument (e.g. M({, })), or when we work on invalid
    // code.
    // Thus, we use the innermost macro call's parent as the parent at which
    // we stop; this allows us to stay within the macro expansion and keeps
    // any problems confined to the extent of the macro call.
    OpenMacroParent =
        getParentInResult(MacroCallStructure.back().MacroCallLParen);
    LLVM_DEBUG(llvm::dbgs()
               << "MacroCallLParen: "
               << MacroCallStructure.back().MacroCallLParen->TokenText
               << ", OpenMacroParent: "
               << (OpenMacroParent ? OpenMacroParent->TokenText : "<null>")
               << "\n");
  }
  if (NewLine ||
      (!ActiveReconstructedLines.back()->Tokens.empty() &&
       Parent == ActiveReconstructedLines.back()->Tokens.back()->Tok)) {
    // If we are at the first token in a new line, we want to also
    // create a new line in the resulting reconstructed unwrapped line.
    while (ActiveReconstructedLines.back()->Tokens.empty() ||
           (Parent != ActiveReconstructedLines.back()->Tokens.back()->Tok &&
            ActiveReconstructedLines.back()->Tokens.back()->Tok !=
                OpenMacroParent)) {
      ActiveReconstructedLines.pop_back();
      assert(!ActiveReconstructedLines.empty());
    }
    assert(!ActiveReconstructedLines.empty());
    ActiveReconstructedLines.back()->Tokens.back()->Children.push_back(
        std::make_unique<ReconstructedLine>());
    ActiveReconstructedLines.push_back(
        &*ActiveReconstructedLines.back()->Tokens.back()->Children.back());
  } else if (parentLine().Tokens.back()->Tok != Parent) {
    // If we're not the first token in a new line, pop lines until we find
    // the child of \c Parent in the stack.
    while (Parent != parentLine().Tokens.back()->Tok &&
           parentLine().Tokens.back()->Tok &&
           parentLine().Tokens.back()->Tok != OpenMacroParent) {
      ActiveReconstructedLines.pop_back();
      assert(!ActiveReconstructedLines.empty());
    }
  }
  assert(!ActiveReconstructedLines.empty());
}

// For a given \p Parent in the incoming expanded token stream, find the
// corresponding parent in the output.
FormatToken *MacroCallReconstructor::getParentInResult(FormatToken *Parent) {
  FormatToken *Mapped = SpelledParentToReconstructedParent.lookup(Parent);
  if (!Mapped)
    return Parent;
  for (; Mapped; Mapped = SpelledParentToReconstructedParent.lookup(Parent))
    Parent = Mapped;
  // If we use a different token than the parent in the expanded token stream
  // as parent, mark it as a special parent, so the formatting code knows it
  // needs to have its children formatted.
  Parent->MacroParent = true;
  return Parent;
}

// Reconstruct a \p Token that was expanded from a macro call.
void MacroCallReconstructor::reconstruct(FormatToken *Token) {
  assert(Token->MacroCtx);
  // A single token can be the only result of a macro call:
  // Given: #define ID(x, y) ;
  // And the call: ID(<some>, <tokens>)
  // ';' in the expanded stream will reconstruct all of ID(<some>, <tokens>).
  if (Token->MacroCtx->StartOfExpansion) {
    startReconstruction(Token);
    // If the order of tokens in the expanded token stream is not the
    // same as the order of tokens in the reconstructed stream, we need
    // to reconstruct tokens that arrive later in the stream.
    if (Token->MacroCtx->Role != MR_Hidden)
      reconstructActiveCallUntil(Token);
  }
  assert(!ActiveExpansions.empty());
  if (ActiveExpansions.back().SpelledI != ActiveExpansions.back().SpelledE) {
    assert(ActiveExpansions.size() == Token->MacroCtx->ExpandedFrom.size());
    if (Token->MacroCtx->Role != MR_Hidden) {
      // The current token in the reconstructed token stream must be the token
      // we're looking for - we either arrive here after startReconstruction,
      // which initiates the stream to the first token, or after
      // continueReconstructionUntil skipped until the expected token in the
      // reconstructed stream at the start of add(...).
      assert(ActiveExpansions.back().SpelledI->Tok == Token);
      processNextReconstructed();
    } else if (!currentLine()->Tokens.empty()) {
      // Map all hidden tokens to the last visible token in the output.
      // If the hidden token is a parent, we'll use the last visible
      // token as the parent of the hidden token's children.
      SpelledParentToReconstructedParent[Token] =
          currentLine()->Tokens.back()->Tok;
    } else {
      for (auto I = ActiveReconstructedLines.rbegin(),
                E = ActiveReconstructedLines.rend();
           I != E; ++I) {
        if (!(*I)->Tokens.empty()) {
          SpelledParentToReconstructedParent[Token] = (*I)->Tokens.back()->Tok;
          break;
        }
      }
    }
  }
  if (Token->MacroCtx->EndOfExpansion)
    endReconstruction(Token);
}

// Given a \p Token that starts an expansion, reconstruct the beginning of the
// macro call.
// For example, given: #define ID(x) x
// And the call: ID(int a)
// Reconstructs: ID(
void MacroCallReconstructor::startReconstruction(FormatToken *Token) {
  assert(Token->MacroCtx);
  assert(!Token->MacroCtx->ExpandedFrom.empty());
  assert(ActiveExpansions.size() <= Token->MacroCtx->ExpandedFrom.size());
#ifndef NDEBUG
  // Check that the token's reconstruction stack matches our current
  // reconstruction stack.
  for (size_t I = 0; I < ActiveExpansions.size(); ++I) {
    assert(ActiveExpansions[I].ID ==
           Token->MacroCtx
               ->ExpandedFrom[Token->MacroCtx->ExpandedFrom.size() - 1 - I]);
  }
#endif
  // Start reconstruction for all calls for which this token is the first token
  // generated by the call.
  // Note that the token's expanded from stack is inside-to-outside, and the
  // expansions for which this token is not the first are the outermost ones.
  ArrayRef<FormatToken *> StartedMacros =
      ArrayRef(Token->MacroCtx->ExpandedFrom)
          .drop_back(ActiveExpansions.size());
  assert(StartedMacros.size() == Token->MacroCtx->StartOfExpansion);
  // We reconstruct macro calls outside-to-inside.
  for (FormatToken *ID : llvm::reverse(StartedMacros)) {
    // We found a macro call to be reconstructed; the next time our
    // reconstruction stack is empty we know we finished an reconstruction.
#ifndef NDEBUG
    State = InProgress;
#endif
    // Put the reconstructed macro call's token into our reconstruction stack.
    auto IU = IdToReconstructed.find(ID);
    assert(IU != IdToReconstructed.end());
    ActiveExpansions.push_back(
        {ID, IU->second->Tokens.begin(), IU->second->Tokens.end()});
    // Process the macro call's identifier.
    processNextReconstructed();
    if (ActiveExpansions.back().SpelledI == ActiveExpansions.back().SpelledE)
      continue;
    if (ActiveExpansions.back().SpelledI->Tok->is(tok::l_paren)) {
      // Process the optional opening parenthesis.
      processNextReconstructed();
    }
  }
}

// Add all tokens in the reconstruction stream to the output until we find the
// given \p Token.
bool MacroCallReconstructor::reconstructActiveCallUntil(FormatToken *Token) {
  assert(!ActiveExpansions.empty());
  bool PassedMacroComma = false;
  // FIXME: If Token was already expanded earlier, due to
  // a change in order, we will not find it, but need to
  // skip it.
  while (ActiveExpansions.back().SpelledI != ActiveExpansions.back().SpelledE &&
         ActiveExpansions.back().SpelledI->Tok != Token) {
    PassedMacroComma = processNextReconstructed() || PassedMacroComma;
  }
  return PassedMacroComma;
}

// End all reconstructions for which \p Token is the final token.
void MacroCallReconstructor::endReconstruction(FormatToken *Token) {
  assert(Token->MacroCtx &&
         (ActiveExpansions.size() >= Token->MacroCtx->EndOfExpansion));
  for (size_t I = 0; I < Token->MacroCtx->EndOfExpansion; ++I) {
#ifndef NDEBUG
    // Check all remaining tokens but the final closing parenthesis and optional
    // trailing comment were already reconstructed at an inner expansion level.
    for (auto T = ActiveExpansions.back().SpelledI;
         T != ActiveExpansions.back().SpelledE; ++T) {
      FormatToken *Token = T->Tok;
      bool ClosingParen = (std::next(T) == ActiveExpansions.back().SpelledE ||
                           std::next(T)->Tok->isTrailingComment()) &&
                          !Token->MacroCtx && Token->is(tok::r_paren);
      bool TrailingComment = Token->isTrailingComment();
      bool PreviousLevel =
          Token->MacroCtx &&
          (ActiveExpansions.size() < Token->MacroCtx->ExpandedFrom.size());
      if (!ClosingParen && !TrailingComment && !PreviousLevel)
        llvm::dbgs() << "At token: " << Token->TokenText << "\n";
      // In addition to the following cases, we can also run into this
      // when a macro call had more arguments than expected; in that case,
      // the comma and the remaining tokens in the macro call will potentially
      // end up in the line when we finish the expansion.
      // FIXME: Add the information which arguments are unused, and assert
      // one of the cases below plus reconstructed macro argument tokens.
      // assert(ClosingParen || TrailingComment || PreviousLevel);
    }
#endif
    // Handle the remaining open tokens:
    // - expand the closing parenthesis, if it exists, including an optional
    //   trailing comment
    // - handle tokens that were already reconstructed at an inner expansion
    //   level
    // - handle tokens when a macro call had more than the expected number of
    //   arguments, i.e. when #define M(x) is called as M(a, b, c) we'll end
    //   up with the sequence ", b, c)" being open at the end of the
    //   reconstruction; we want to gracefully handle that case
    //
    // FIXME: See the above debug-check for what we will need to do to be
    // able to assert this.
    for (auto T = ActiveExpansions.back().SpelledI;
         T != ActiveExpansions.back().SpelledE; ++T) {
      processNextReconstructed();
    }
    ActiveExpansions.pop_back();
  }
}

void MacroCallReconstructor::debugParentMap() const {
  llvm::DenseSet<FormatToken *> Values;
  for (const auto &P : SpelledParentToReconstructedParent)
    Values.insert(P.second);

  for (const auto &P : SpelledParentToReconstructedParent) {
    if (Values.contains(P.first))
      continue;
    llvm::dbgs() << (P.first ? P.first->TokenText : "<null>");
    for (auto I = SpelledParentToReconstructedParent.find(P.first),
              E = SpelledParentToReconstructedParent.end();
         I != E; I = SpelledParentToReconstructedParent.find(I->second)) {
      llvm::dbgs() << " -> " << (I->second ? I->second->TokenText : "<null>");
    }
    llvm::dbgs() << "\n";
  }
}

// If visible, add the next token of the reconstructed token sequence to the
// output. Returns whether reconstruction passed a comma that is part of a
// macro call.
bool MacroCallReconstructor::processNextReconstructed() {
  FormatToken *Token = ActiveExpansions.back().SpelledI->Tok;
  ++ActiveExpansions.back().SpelledI;
  if (Token->MacroCtx) {
    // Skip tokens that are not part of the macro call.
    if (Token->MacroCtx->Role == MR_Hidden)
      return false;
    // Skip tokens we already expanded during an inner reconstruction.
    // For example, given: #define ID(x) {x}
    // And the call: ID(ID(f))
    // We get two reconstructions:
    // ID(f) -> {f}
    // ID({f}) -> {{f}}
    // We reconstruct f during the first reconstruction, and skip it during the
    // second reconstruction.
    if (ActiveExpansions.size() < Token->MacroCtx->ExpandedFrom.size())
      return false;
  }
  // Tokens that do not have a macro context are tokens in that are part of the
  // macro call that have not taken part in expansion.
  if (!Token->MacroCtx) {
    // Put the parentheses and commas of a macro call into the same line;
    // if the arguments produce new unwrapped lines, they will become children
    // of the corresponding opening parenthesis or comma tokens in the
    // reconstructed call.
    if (Token->is(tok::l_paren)) {
      MacroCallStructure.push_back(MacroCallState(
          currentLine(), parentLine().Tokens.back()->Tok, Token));
      // All tokens that are children of the previous line's last token in the
      // reconstructed token stream will now be children of the l_paren token.
      // For example, for the line containing the macro calls:
      //   auto x = ID({ID(2)});
      // We will build up a map <null> -> ( -> ( with the first and second
      // l_paren of the macro call respectively. New lines that come in with a
      // <null> parent will then become children of the l_paren token of the
      // currently innermost macro call.
      SpelledParentToReconstructedParent[MacroCallStructure.back()
                                             .ParentLastToken] = Token;
      appendToken(Token);
      prepareParent(Token, /*NewLine=*/true);
      Token->MacroParent = true;
      return false;
    }
    if (!MacroCallStructure.empty()) {
      if (Token->is(tok::comma)) {
        // Make new lines inside the next argument children of the comma token.
        SpelledParentToReconstructedParent
            [MacroCallStructure.back().Line->Tokens.back()->Tok] = Token;
        Token->MacroParent = true;
        appendToken(Token, MacroCallStructure.back().Line);
        prepareParent(Token, /*NewLine=*/true);
        return true;
      }
      if (Token->is(tok::r_paren)) {
        appendToken(Token, MacroCallStructure.back().Line);
        SpelledParentToReconstructedParent.erase(
            MacroCallStructure.back().ParentLastToken);
        MacroCallStructure.pop_back();
        return false;
      }
    }
  }
  // Note that any tokens that are tagged with MR_None have been passed as
  // arguments to the macro that have not been expanded, for example:
  // Given: #define ID(X) x
  // When calling: ID(a, b)
  // 'b' will be part of the reconstructed token stream, but tagged MR_None.
  // Given that erroring out in this case would be disruptive, we continue
  // pushing the (unformatted) token.
  // FIXME: This can lead to unfortunate formatting decisions - give the user
  // a hint that their macro definition is broken.
  appendToken(Token);
  return false;
}

void MacroCallReconstructor::finalize() {
#ifndef NDEBUG
  assert(State != Finalized && finished());
  State = Finalized;
#endif

  // We created corresponding unwrapped lines for each incoming line as children
  // the the toplevel null token.
  assert(Result.Tokens.size() == 1 && !Result.Tokens.front()->Children.empty());
  LLVM_DEBUG({
    llvm::dbgs() << "Finalizing reconstructed lines:\n";
    debug(Result, 0);
  });

  // The first line becomes the top level line in the resulting unwrapped line.
  LineNode &Top = *Result.Tokens.front();
  auto *I = Top.Children.begin();
  // Every subsequent line will become a child of the last token in the previous
  // line, which is the token prior to the first token in the line.
  LineNode *Last = (*I)->Tokens.back().get();
  ++I;
  for (auto *E = Top.Children.end(); I != E; ++I) {
    assert(Last->Children.empty());
    Last->Children.push_back(std::move(*I));

    // Mark the previous line's last token as generated by a macro expansion
    // so the formatting algorithm can take that into account.
    Last->Tok->MacroParent = true;

    Last = Last->Children.back()->Tokens.back().get();
  }
  Top.Children.resize(1);
}

void MacroCallReconstructor::appendToken(FormatToken *Token,
                                         ReconstructedLine *L) {
  L = L ? L : currentLine();
  LLVM_DEBUG(llvm::dbgs() << "-> " << Token->TokenText << "\n");
  L->Tokens.push_back(std::make_unique<LineNode>(Token));
}

UnwrappedLine
MacroCallReconstructor::createUnwrappedLine(const ReconstructedLine &Line,
                                            int Level) {
  UnwrappedLine Result;
  Result.Level = Level;
  for (const auto &N : Line.Tokens) {
    Result.Tokens.push_back(N->Tok);
    UnwrappedLineNode &Current = Result.Tokens.back();
    for (const auto &Child : N->Children) {
      if (Child->Tokens.empty())
        continue;
      Current.Children.push_back(createUnwrappedLine(*Child, Level + 1));
    }
    if (Current.Children.size() == 1 &&
        Current.Tok->isOneOf(tok::l_paren, tok::comma)) {
      Result.Tokens.splice(Result.Tokens.end(),
                           Current.Children.front().Tokens);
      Current.Children.clear();
    }
  }
  return Result;
}

void MacroCallReconstructor::debug(const ReconstructedLine &Line, int Level) {
  for (int i = 0; i < Level; ++i)
    llvm::dbgs() << " ";
  for (const auto &N : Line.Tokens) {
    if (!N)
      continue;
    if (N->Tok)
      llvm::dbgs() << N->Tok->TokenText << " ";
    for (const auto &Child : N->Children) {
      llvm::dbgs() << "\n";
      debug(*Child, Level + 1);
      for (int i = 0; i < Level; ++i)
        llvm::dbgs() << " ";
    }
  }
  llvm::dbgs() << "\n";
}

MacroCallReconstructor::ReconstructedLine &
MacroCallReconstructor::parentLine() {
  return **std::prev(std::prev(ActiveReconstructedLines.end()));
}

MacroCallReconstructor::ReconstructedLine *
MacroCallReconstructor::currentLine() {
  return ActiveReconstructedLines.back();
}

MacroCallReconstructor::MacroCallState::MacroCallState(
    MacroCallReconstructor::ReconstructedLine *Line,
    FormatToken *ParentLastToken, FormatToken *MacroCallLParen)
    : Line(Line), ParentLastToken(ParentLastToken),
      MacroCallLParen(MacroCallLParen) {
  LLVM_DEBUG(
      llvm::dbgs() << "ParentLastToken: "
                   << (ParentLastToken ? ParentLastToken->TokenText : "<null>")
                   << "\n");

  assert(MacroCallLParen->is(tok::l_paren));
}

} // namespace format
} // namespace clang