1//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements FormatTokenLexer, which tokenizes a source file
11/// into a FormatToken stream suitable for ClangFormat.
12///
13//===----------------------------------------------------------------------===//
14
15#include "FormatTokenLexer.h"
16#include "FormatToken.h"
17#include "clang/Basic/SourceLocation.h"
18#include "clang/Basic/SourceManager.h"
19#include "clang/Format/Format.h"
20#include "llvm/Support/Regex.h"
21
22namespace clang {
23namespace format {
24
25FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID,
26                                   unsigned Column, const FormatStyle &Style,
27                                   encoding::Encoding Encoding)
28    : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
29      Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
30      Style(Style), IdentTable(getFormattingLangOpts(Style)),
31      Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
32      FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
33      MacroBlockEndRegex(Style.MacroBlockEnd) {
34  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
35                      getFormattingLangOpts(Style)));
36  Lex->SetKeepWhitespaceMode(true);
37
38  for (const std::string &ForEachMacro : Style.ForEachMacros)
39    Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro});
40  for (const std::string &StatementMacro : Style.StatementMacros)
41    Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro});
42  for (const std::string &TypenameMacro : Style.TypenameMacros)
43    Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro});
44  for (const std::string &NamespaceMacro : Style.NamespaceMacros)
45    Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro});
46}
47
48ArrayRef<FormatToken *> FormatTokenLexer::lex() {
49  assert(Tokens.empty());
50  assert(FirstInLineIndex == 0);
51  do {
52    Tokens.push_back(getNextToken());
53    if (Style.Language == FormatStyle::LK_JavaScript) {
54      tryParseJSRegexLiteral();
55      handleTemplateStrings();
56    }
57    if (Style.Language == FormatStyle::LK_TextProto)
58      tryParsePythonComment();
59    tryMergePreviousTokens();
60    if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
61      FirstInLineIndex = Tokens.size() - 1;
62  } while (Tokens.back()->Tok.isNot(tok::eof));
63  return Tokens;
64}
65
66void FormatTokenLexer::tryMergePreviousTokens() {
67  if (tryMerge_TMacro())
68    return;
69  if (tryMergeConflictMarkers())
70    return;
71  if (tryMergeLessLess())
72    return;
73
74  if (Style.isCSharp()) {
75    if (tryMergeCSharpKeywordVariables())
76      return;
77    if (tryMergeCSharpVerbatimStringLiteral())
78      return;
79    if (tryMergeCSharpDoubleQuestion())
80      return;
81    if (tryMergeCSharpNullConditionals())
82      return;
83    if (tryTransformCSharpForEach())
84      return;
85    static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
86    if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
87      return;
88  }
89
90  if (tryMergeNSStringLiteral())
91    return;
92
93  if (Style.Language == FormatStyle::LK_JavaScript) {
94    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
95    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
96                                                   tok::equal};
97    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
98                                                  tok::greaterequal};
99    static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
100    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
101    static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
102                                                           tok::starequal};
103    static const tok::TokenKind JSNullPropagatingOperator[] = {tok::question,
104                                                               tok::period};
105    static const tok::TokenKind JSNullishOperator[] = {tok::question,
106                                                       tok::question};
107
108    // FIXME: Investigate what token type gives the correct operator priority.
109    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
110      return;
111    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
112      return;
113    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
114      return;
115    if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
116      return;
117    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
118      return;
119    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
120      Tokens.back()->Tok.setKind(tok::starequal);
121      return;
122    }
123    if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator))
124      return;
125    if (tryMergeTokens(JSNullPropagatingOperator,
126                       TT_JsNullPropagatingOperator)) {
127      // Treat like a regular "." access.
128      Tokens.back()->Tok.setKind(tok::period);
129      return;
130    }
131    if (tryMergeJSPrivateIdentifier())
132      return;
133  }
134
135  if (Style.Language == FormatStyle::LK_Java) {
136    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
137        tok::greater, tok::greater, tok::greaterequal};
138    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
139      return;
140  }
141}
142
143bool FormatTokenLexer::tryMergeNSStringLiteral() {
144  if (Tokens.size() < 2)
145    return false;
146  auto &At = *(Tokens.end() - 2);
147  auto &String = *(Tokens.end() - 1);
148  if (!At->is(tok::at) || !String->is(tok::string_literal))
149    return false;
150  At->Tok.setKind(tok::string_literal);
151  At->TokenText = StringRef(At->TokenText.begin(),
152                            String->TokenText.end() - At->TokenText.begin());
153  At->ColumnWidth += String->ColumnWidth;
154  At->Type = TT_ObjCStringLiteral;
155  Tokens.erase(Tokens.end() - 1);
156  return true;
157}
158
159bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
160  // Merges #idenfier into a single identifier with the text #identifier
161  // but the token tok::identifier.
162  if (Tokens.size() < 2)
163    return false;
164  auto &Hash = *(Tokens.end() - 2);
165  auto &Identifier = *(Tokens.end() - 1);
166  if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier))
167    return false;
168  Hash->Tok.setKind(tok::identifier);
169  Hash->TokenText =
170      StringRef(Hash->TokenText.begin(),
171                Identifier->TokenText.end() - Hash->TokenText.begin());
172  Hash->ColumnWidth += Identifier->ColumnWidth;
173  Hash->Type = TT_JsPrivateIdentifier;
174  Tokens.erase(Tokens.end() - 1);
175  return true;
176}
177
178// Search for verbatim or interpolated string literals @"ABC" or
179// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
180// prevent splitting of @, $ and ".
181bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() {
182  if (Tokens.size() < 2)
183    return false;
184  auto &At = *(Tokens.end() - 2);
185  auto &String = *(Tokens.end() - 1);
186
187  // Look for $"aaaaaa" @"aaaaaa".
188  if (!(At->is(tok::at) || At->TokenText == "$") ||
189      !String->is(tok::string_literal))
190    return false;
191
192  if (Tokens.size() >= 2 && At->is(tok::at)) {
193    auto &Dollar = *(Tokens.end() - 3);
194    if (Dollar->TokenText == "$") {
195      // This looks like $@"aaaaa" so we need to combine all 3 tokens.
196      Dollar->Tok.setKind(tok::string_literal);
197      Dollar->TokenText =
198          StringRef(Dollar->TokenText.begin(),
199                    String->TokenText.end() - Dollar->TokenText.begin());
200      Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
201      Dollar->Type = TT_CSharpStringLiteral;
202      Tokens.erase(Tokens.end() - 2);
203      Tokens.erase(Tokens.end() - 1);
204      return true;
205    }
206  }
207
208  // Convert back into just a string_literal.
209  At->Tok.setKind(tok::string_literal);
210  At->TokenText = StringRef(At->TokenText.begin(),
211                            String->TokenText.end() - At->TokenText.begin());
212  At->ColumnWidth += String->ColumnWidth;
213  At->Type = TT_CSharpStringLiteral;
214  Tokens.erase(Tokens.end() - 1);
215  return true;
216}
217
218bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
219  if (Tokens.size() < 2)
220    return false;
221  auto &FirstQuestion = *(Tokens.end() - 2);
222  auto &SecondQuestion = *(Tokens.end() - 1);
223  if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question))
224    return false;
225  FirstQuestion->Tok.setKind(tok::question);
226  FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
227                                       SecondQuestion->TokenText.end() -
228                                           FirstQuestion->TokenText.begin());
229  FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
230  FirstQuestion->Type = TT_CSharpNullCoalescing;
231  Tokens.erase(Tokens.end() - 1);
232  return true;
233}
234
235bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
236  if (Tokens.size() < 2)
237    return false;
238  auto &At = *(Tokens.end() - 2);
239  auto &Keyword = *(Tokens.end() - 1);
240  if (!At->is(tok::at))
241    return false;
242  if (!Keywords.isCSharpKeyword(*Keyword))
243    return false;
244
245  At->Tok.setKind(tok::identifier);
246  At->TokenText = StringRef(At->TokenText.begin(),
247                            Keyword->TokenText.end() - At->TokenText.begin());
248  At->ColumnWidth += Keyword->ColumnWidth;
249  At->Type = Keyword->Type;
250  Tokens.erase(Tokens.end() - 1);
251  return true;
252}
253
254// In C# merge the Identifier and the ? together e.g. arg?.
255bool FormatTokenLexer::tryMergeCSharpNullConditionals() {
256  if (Tokens.size() < 2)
257    return false;
258  auto &Identifier = *(Tokens.end() - 2);
259  auto &Question = *(Tokens.end() - 1);
260  if (!Identifier->isOneOf(tok::r_square, tok::identifier) ||
261      !Question->is(tok::question))
262    return false;
263  Identifier->TokenText =
264      StringRef(Identifier->TokenText.begin(),
265                Question->TokenText.end() - Identifier->TokenText.begin());
266  Identifier->ColumnWidth += Question->ColumnWidth;
267  Tokens.erase(Tokens.end() - 1);
268  return true;
269}
270
271// In C# transform identifier foreach into kw_foreach
272bool FormatTokenLexer::tryTransformCSharpForEach() {
273  if (Tokens.size() < 1)
274    return false;
275  auto &Identifier = *(Tokens.end() - 1);
276  if (!Identifier->is(tok::identifier))
277    return false;
278  if (Identifier->TokenText != "foreach")
279    return false;
280
281  Identifier->Type = TT_ForEachMacro;
282  Identifier->Tok.setKind(tok::kw_for);
283  return true;
284}
285
286bool FormatTokenLexer::tryMergeLessLess() {
287  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
288  if (Tokens.size() < 3)
289    return false;
290
291  bool FourthTokenIsLess = false;
292  if (Tokens.size() > 3)
293    FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
294
295  auto First = Tokens.end() - 3;
296  if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
297      First[0]->isNot(tok::less) || FourthTokenIsLess)
298    return false;
299
300  // Only merge if there currently is no whitespace between the two "<".
301  if (First[1]->WhitespaceRange.getBegin() !=
302      First[1]->WhitespaceRange.getEnd())
303    return false;
304
305  First[0]->Tok.setKind(tok::lessless);
306  First[0]->TokenText = "<<";
307  First[0]->ColumnWidth += 1;
308  Tokens.erase(Tokens.end() - 2);
309  return true;
310}
311
312bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
313                                      TokenType NewType) {
314  if (Tokens.size() < Kinds.size())
315    return false;
316
317  SmallVectorImpl<FormatToken *>::const_iterator First =
318      Tokens.end() - Kinds.size();
319  if (!First[0]->is(Kinds[0]))
320    return false;
321  unsigned AddLength = 0;
322  for (unsigned i = 1; i < Kinds.size(); ++i) {
323    if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
324                                       First[i]->WhitespaceRange.getEnd())
325      return false;
326    AddLength += First[i]->TokenText.size();
327  }
328  Tokens.resize(Tokens.size() - Kinds.size() + 1);
329  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
330                                  First[0]->TokenText.size() + AddLength);
331  First[0]->ColumnWidth += AddLength;
332  First[0]->Type = NewType;
333  return true;
334}
335
336// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
337bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
338  // NB: This is not entirely correct, as an r_paren can introduce an operand
339  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
340  // corner case to not matter in practice, though.
341  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
342                      tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
343                      tok::colon, tok::question, tok::tilde) ||
344         Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
345                      tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
346                      tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
347         Tok->isBinaryOperator();
348}
349
350bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
351  if (!Prev)
352    return true;
353
354  // Regex literals can only follow after prefix unary operators, not after
355  // postfix unary operators. If the '++' is followed by a non-operand
356  // introducing token, the slash here is the operand and not the start of a
357  // regex.
358  // `!` is an unary prefix operator, but also a post-fix operator that casts
359  // away nullability, so the same check applies.
360  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
361    return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
362
363  // The previous token must introduce an operand location where regex
364  // literals can occur.
365  if (!precedesOperand(Prev))
366    return false;
367
368  return true;
369}
370
371// Tries to parse a JavaScript Regex literal starting at the current token,
372// if that begins with a slash and is in a location where JavaScript allows
373// regex literals. Changes the current token to a regex literal and updates
374// its text if successful.
375void FormatTokenLexer::tryParseJSRegexLiteral() {
376  FormatToken *RegexToken = Tokens.back();
377  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
378    return;
379
380  FormatToken *Prev = nullptr;
381  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
382    // NB: Because previous pointers are not initialized yet, this cannot use
383    // Token.getPreviousNonComment.
384    if ((*I)->isNot(tok::comment)) {
385      Prev = *I;
386      break;
387    }
388  }
389
390  if (!canPrecedeRegexLiteral(Prev))
391    return;
392
393  // 'Manually' lex ahead in the current file buffer.
394  const char *Offset = Lex->getBufferLocation();
395  const char *RegexBegin = Offset - RegexToken->TokenText.size();
396  StringRef Buffer = Lex->getBuffer();
397  bool InCharacterClass = false;
398  bool HaveClosingSlash = false;
399  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
400    // Regular expressions are terminated with a '/', which can only be
401    // escaped using '\' or a character class between '[' and ']'.
402    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
403    switch (*Offset) {
404    case '\\':
405      // Skip the escaped character.
406      ++Offset;
407      break;
408    case '[':
409      InCharacterClass = true;
410      break;
411    case ']':
412      InCharacterClass = false;
413      break;
414    case '/':
415      if (!InCharacterClass)
416        HaveClosingSlash = true;
417      break;
418    }
419  }
420
421  RegexToken->Type = TT_RegexLiteral;
422  // Treat regex literals like other string_literals.
423  RegexToken->Tok.setKind(tok::string_literal);
424  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
425  RegexToken->ColumnWidth = RegexToken->TokenText.size();
426
427  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
428}
429
430void FormatTokenLexer::handleTemplateStrings() {
431  FormatToken *BacktickToken = Tokens.back();
432
433  if (BacktickToken->is(tok::l_brace)) {
434    StateStack.push(LexerState::NORMAL);
435    return;
436  }
437  if (BacktickToken->is(tok::r_brace)) {
438    if (StateStack.size() == 1)
439      return;
440    StateStack.pop();
441    if (StateStack.top() != LexerState::TEMPLATE_STRING)
442      return;
443    // If back in TEMPLATE_STRING, fallthrough and continue parsing the
444  } else if (BacktickToken->is(tok::unknown) &&
445             BacktickToken->TokenText == "`") {
446    StateStack.push(LexerState::TEMPLATE_STRING);
447  } else {
448    return; // Not actually a template
449  }
450
451  // 'Manually' lex ahead in the current file buffer.
452  const char *Offset = Lex->getBufferLocation();
453  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
454  for (; Offset != Lex->getBuffer().end(); ++Offset) {
455    if (Offset[0] == '`') {
456      StateStack.pop();
457      break;
458    }
459    if (Offset[0] == '\\') {
460      ++Offset; // Skip the escaped character.
461    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
462               Offset[1] == '{') {
463      // '${' introduces an expression interpolation in the template string.
464      StateStack.push(LexerState::NORMAL);
465      ++Offset;
466      break;
467    }
468  }
469
470  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
471  BacktickToken->Type = TT_TemplateString;
472  BacktickToken->Tok.setKind(tok::string_literal);
473  BacktickToken->TokenText = LiteralText;
474
475  // Adjust width for potentially multiline string literals.
476  size_t FirstBreak = LiteralText.find('\n');
477  StringRef FirstLineText = FirstBreak == StringRef::npos
478                                ? LiteralText
479                                : LiteralText.substr(0, FirstBreak);
480  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
481      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
482  size_t LastBreak = LiteralText.rfind('\n');
483  if (LastBreak != StringRef::npos) {
484    BacktickToken->IsMultiline = true;
485    unsigned StartColumn = 0; // The template tail spans the entire line.
486    BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
487        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
488        Style.TabWidth, Encoding);
489  }
490
491  SourceLocation loc = Offset < Lex->getBuffer().end()
492                           ? Lex->getSourceLocation(Offset + 1)
493                           : SourceMgr.getLocForEndOfFile(ID);
494  resetLexer(SourceMgr.getFileOffset(loc));
495}
496
497void FormatTokenLexer::tryParsePythonComment() {
498  FormatToken *HashToken = Tokens.back();
499  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
500    return;
501  // Turn the remainder of this line into a comment.
502  const char *CommentBegin =
503      Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
504  size_t From = CommentBegin - Lex->getBuffer().begin();
505  size_t To = Lex->getBuffer().find_first_of('\n', From);
506  if (To == StringRef::npos)
507    To = Lex->getBuffer().size();
508  size_t Len = To - From;
509  HashToken->Type = TT_LineComment;
510  HashToken->Tok.setKind(tok::comment);
511  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
512  SourceLocation Loc = To < Lex->getBuffer().size()
513                           ? Lex->getSourceLocation(CommentBegin + Len)
514                           : SourceMgr.getLocForEndOfFile(ID);
515  resetLexer(SourceMgr.getFileOffset(Loc));
516}
517
518bool FormatTokenLexer::tryMerge_TMacro() {
519  if (Tokens.size() < 4)
520    return false;
521  FormatToken *Last = Tokens.back();
522  if (!Last->is(tok::r_paren))
523    return false;
524
525  FormatToken *String = Tokens[Tokens.size() - 2];
526  if (!String->is(tok::string_literal) || String->IsMultiline)
527    return false;
528
529  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
530    return false;
531
532  FormatToken *Macro = Tokens[Tokens.size() - 4];
533  if (Macro->TokenText != "_T")
534    return false;
535
536  const char *Start = Macro->TokenText.data();
537  const char *End = Last->TokenText.data() + Last->TokenText.size();
538  String->TokenText = StringRef(Start, End - Start);
539  String->IsFirst = Macro->IsFirst;
540  String->LastNewlineOffset = Macro->LastNewlineOffset;
541  String->WhitespaceRange = Macro->WhitespaceRange;
542  String->OriginalColumn = Macro->OriginalColumn;
543  String->ColumnWidth = encoding::columnWidthWithTabs(
544      String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
545  String->NewlinesBefore = Macro->NewlinesBefore;
546  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
547
548  Tokens.pop_back();
549  Tokens.pop_back();
550  Tokens.pop_back();
551  Tokens.back() = String;
552  return true;
553}
554
555bool FormatTokenLexer::tryMergeConflictMarkers() {
556  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
557    return false;
558
559  // Conflict lines look like:
560  // <marker> <text from the vcs>
561  // For example:
562  // >>>>>>> /file/in/file/system at revision 1234
563  //
564  // We merge all tokens in a line that starts with a conflict marker
565  // into a single token with a special token type that the unwrapped line
566  // parser will use to correctly rebuild the underlying code.
567
568  FileID ID;
569  // Get the position of the first token in the line.
570  unsigned FirstInLineOffset;
571  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
572      Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
573  StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
574  // Calculate the offset of the start of the current line.
575  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
576  if (LineOffset == StringRef::npos) {
577    LineOffset = 0;
578  } else {
579    ++LineOffset;
580  }
581
582  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
583  StringRef LineStart;
584  if (FirstSpace == StringRef::npos) {
585    LineStart = Buffer.substr(LineOffset);
586  } else {
587    LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
588  }
589
590  TokenType Type = TT_Unknown;
591  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
592    Type = TT_ConflictStart;
593  } else if (LineStart == "|||||||" || LineStart == "=======" ||
594             LineStart == "====") {
595    Type = TT_ConflictAlternative;
596  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
597    Type = TT_ConflictEnd;
598  }
599
600  if (Type != TT_Unknown) {
601    FormatToken *Next = Tokens.back();
602
603    Tokens.resize(FirstInLineIndex + 1);
604    // We do not need to build a complete token here, as we will skip it
605    // during parsing anyway (as we must not touch whitespace around conflict
606    // markers).
607    Tokens.back()->Type = Type;
608    Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
609
610    Tokens.push_back(Next);
611    return true;
612  }
613
614  return false;
615}
616
617FormatToken *FormatTokenLexer::getStashedToken() {
618  // Create a synthesized second '>' or '<' token.
619  Token Tok = FormatTok->Tok;
620  StringRef TokenText = FormatTok->TokenText;
621
622  unsigned OriginalColumn = FormatTok->OriginalColumn;
623  FormatTok = new (Allocator.Allocate()) FormatToken;
624  FormatTok->Tok = Tok;
625  SourceLocation TokLocation =
626      FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
627  FormatTok->Tok.setLocation(TokLocation);
628  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
629  FormatTok->TokenText = TokenText;
630  FormatTok->ColumnWidth = 1;
631  FormatTok->OriginalColumn = OriginalColumn + 1;
632
633  return FormatTok;
634}
635
636FormatToken *FormatTokenLexer::getNextToken() {
637  if (StateStack.top() == LexerState::TOKEN_STASHED) {
638    StateStack.pop();
639    return getStashedToken();
640  }
641
642  FormatTok = new (Allocator.Allocate()) FormatToken;
643  readRawToken(*FormatTok);
644  SourceLocation WhitespaceStart =
645      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
646  FormatTok->IsFirst = IsFirstToken;
647  IsFirstToken = false;
648
649  // Consume and record whitespace until we find a significant token.
650  unsigned WhitespaceLength = TrailingWhitespace;
651  while (FormatTok->Tok.is(tok::unknown)) {
652    StringRef Text = FormatTok->TokenText;
653    auto EscapesNewline = [&](int pos) {
654      // A '\r' here is just part of '\r\n'. Skip it.
655      if (pos >= 0 && Text[pos] == '\r')
656        --pos;
657      // See whether there is an odd number of '\' before this.
658      // FIXME: This is wrong. A '\' followed by a newline is always removed,
659      // regardless of whether there is another '\' before it.
660      // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
661      unsigned count = 0;
662      for (; pos >= 0; --pos, ++count)
663        if (Text[pos] != '\\')
664          break;
665      return count & 1;
666    };
667    // FIXME: This miscounts tok:unknown tokens that are not just
668    // whitespace, e.g. a '`' character.
669    for (int i = 0, e = Text.size(); i != e; ++i) {
670      switch (Text[i]) {
671      case '\n':
672        ++FormatTok->NewlinesBefore;
673        FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
674        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
675        Column = 0;
676        break;
677      case '\r':
678        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
679        Column = 0;
680        break;
681      case '\f':
682      case '\v':
683        Column = 0;
684        break;
685      case ' ':
686        ++Column;
687        break;
688      case '\t':
689        Column +=
690            Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
691        break;
692      case '\\':
693        if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
694          FormatTok->Type = TT_ImplicitStringLiteral;
695        break;
696      default:
697        FormatTok->Type = TT_ImplicitStringLiteral;
698        break;
699      }
700      if (FormatTok->Type == TT_ImplicitStringLiteral)
701        break;
702    }
703
704    if (FormatTok->is(TT_ImplicitStringLiteral))
705      break;
706    WhitespaceLength += FormatTok->Tok.getLength();
707
708    readRawToken(*FormatTok);
709  }
710
711  // JavaScript and Java do not allow to escape the end of the line with a
712  // backslash. Backslashes are syntax errors in plain source, but can occur in
713  // comments. When a single line comment ends with a \, it'll cause the next
714  // line of code to be lexed as a comment, breaking formatting. The code below
715  // finds comments that contain a backslash followed by a line break, truncates
716  // the comment token at the backslash, and resets the lexer to restart behind
717  // the backslash.
718  if ((Style.Language == FormatStyle::LK_JavaScript ||
719       Style.Language == FormatStyle::LK_Java) &&
720      FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
721    size_t BackslashPos = FormatTok->TokenText.find('\\');
722    while (BackslashPos != StringRef::npos) {
723      if (BackslashPos + 1 < FormatTok->TokenText.size() &&
724          FormatTok->TokenText[BackslashPos + 1] == '\n') {
725        const char *Offset = Lex->getBufferLocation();
726        Offset -= FormatTok->TokenText.size();
727        Offset += BackslashPos + 1;
728        resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
729        FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
730        FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
731            FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
732            Encoding);
733        break;
734      }
735      BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
736    }
737  }
738
739  // In case the token starts with escaped newlines, we want to
740  // take them into account as whitespace - this pattern is quite frequent
741  // in macro definitions.
742  // FIXME: Add a more explicit test.
743  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
744    unsigned SkippedWhitespace = 0;
745    if (FormatTok->TokenText.size() > 2 &&
746        (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n'))
747      SkippedWhitespace = 3;
748    else if (FormatTok->TokenText[1] == '\n')
749      SkippedWhitespace = 2;
750    else
751      break;
752
753    ++FormatTok->NewlinesBefore;
754    WhitespaceLength += SkippedWhitespace;
755    FormatTok->LastNewlineOffset = SkippedWhitespace;
756    Column = 0;
757    FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
758  }
759
760  FormatTok->WhitespaceRange = SourceRange(
761      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
762
763  FormatTok->OriginalColumn = Column;
764
765  TrailingWhitespace = 0;
766  if (FormatTok->Tok.is(tok::comment)) {
767    // FIXME: Add the trimmed whitespace to Column.
768    StringRef UntrimmedText = FormatTok->TokenText;
769    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
770    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
771  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
772    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
773    FormatTok->Tok.setIdentifierInfo(&Info);
774    FormatTok->Tok.setKind(Info.getTokenID());
775    if (Style.Language == FormatStyle::LK_Java &&
776        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
777                           tok::kw_operator)) {
778      FormatTok->Tok.setKind(tok::identifier);
779      FormatTok->Tok.setIdentifierInfo(nullptr);
780    } else if (Style.Language == FormatStyle::LK_JavaScript &&
781               FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
782                                  tok::kw_operator)) {
783      FormatTok->Tok.setKind(tok::identifier);
784      FormatTok->Tok.setIdentifierInfo(nullptr);
785    }
786  } else if (FormatTok->Tok.is(tok::greatergreater)) {
787    FormatTok->Tok.setKind(tok::greater);
788    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
789    ++Column;
790    StateStack.push(LexerState::TOKEN_STASHED);
791  } else if (FormatTok->Tok.is(tok::lessless)) {
792    FormatTok->Tok.setKind(tok::less);
793    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
794    ++Column;
795    StateStack.push(LexerState::TOKEN_STASHED);
796  }
797
798  // Now FormatTok is the next non-whitespace token.
799
800  StringRef Text = FormatTok->TokenText;
801  size_t FirstNewlinePos = Text.find('\n');
802  if (FirstNewlinePos == StringRef::npos) {
803    // FIXME: ColumnWidth actually depends on the start column, we need to
804    // take this into account when the token is moved.
805    FormatTok->ColumnWidth =
806        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
807    Column += FormatTok->ColumnWidth;
808  } else {
809    FormatTok->IsMultiline = true;
810    // FIXME: ColumnWidth actually depends on the start column, we need to
811    // take this into account when the token is moved.
812    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
813        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
814
815    // The last line of the token always starts in column 0.
816    // Thus, the length can be precomputed even in the presence of tabs.
817    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
818        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
819    Column = FormatTok->LastLineColumnWidth;
820  }
821
822  if (Style.isCpp()) {
823    auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
824    if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
825          Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
826              tok::pp_define) &&
827        it != Macros.end()) {
828      FormatTok->Type = it->second;
829    } else if (FormatTok->is(tok::identifier)) {
830      if (MacroBlockBeginRegex.match(Text)) {
831        FormatTok->Type = TT_MacroBlockBegin;
832      } else if (MacroBlockEndRegex.match(Text)) {
833        FormatTok->Type = TT_MacroBlockEnd;
834      }
835    }
836  }
837
838  return FormatTok;
839}
840
841void FormatTokenLexer::readRawToken(FormatToken &Tok) {
842  Lex->LexFromRawLexer(Tok.Tok);
843  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
844                            Tok.Tok.getLength());
845  // For formatting, treat unterminated string literals like normal string
846  // literals.
847  if (Tok.is(tok::unknown)) {
848    if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
849      Tok.Tok.setKind(tok::string_literal);
850      Tok.IsUnterminatedLiteral = true;
851    } else if (Style.Language == FormatStyle::LK_JavaScript &&
852               Tok.TokenText == "''") {
853      Tok.Tok.setKind(tok::string_literal);
854    }
855  }
856
857  if ((Style.Language == FormatStyle::LK_JavaScript ||
858       Style.Language == FormatStyle::LK_Proto ||
859       Style.Language == FormatStyle::LK_TextProto) &&
860      Tok.is(tok::char_constant)) {
861    Tok.Tok.setKind(tok::string_literal);
862  }
863
864  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
865                               Tok.TokenText == "/* clang-format on */")) {
866    FormattingDisabled = false;
867  }
868
869  Tok.Finalized = FormattingDisabled;
870
871  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
872                               Tok.TokenText == "/* clang-format off */")) {
873    FormattingDisabled = true;
874  }
875}
876
877void FormatTokenLexer::resetLexer(unsigned Offset) {
878  StringRef Buffer = SourceMgr.getBufferData(ID);
879  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
880                      getFormattingLangOpts(Style), Buffer.begin(),
881                      Buffer.begin() + Offset, Buffer.end()));
882  Lex->SetKeepWhitespaceMode(true);
883  TrailingWhitespace = 0;
884}
885
886} // namespace format
887} // namespace clang
888