1//===--- CommentLexer.cpp -------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "clang/AST/CommentLexer.h"
10#include "clang/AST/CommentCommandTraits.h"
11#include "clang/AST/CommentDiagnostic.h"
12#include "clang/Basic/CharInfo.h"
13#include "llvm/ADT/StringExtras.h"
14#include "llvm/ADT/StringSwitch.h"
15#include "llvm/Support/ConvertUTF.h"
16#include "llvm/Support/ErrorHandling.h"
17
18namespace clang {
19namespace comments {
20
21void Token::dump(const Lexer &L, const SourceManager &SM) const {
22  llvm::errs() << "comments::Token Kind=" << Kind << " ";
23  Loc.print(llvm::errs(), SM);
24  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25}
26
27static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28  return isLetter(C);
29}
30
31static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32  return isDigit(C);
33}
34
35static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36  return isHexDigit(C);
37}
38
39static inline StringRef convertCodePointToUTF8(
40                                      llvm::BumpPtrAllocator &Allocator,
41                                      unsigned CodePoint) {
42  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43  char *ResolvedPtr = Resolved;
44  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
45    return StringRef(Resolved, ResolvedPtr - Resolved);
46  else
47    return StringRef();
48}
49
50namespace {
51
52#include "clang/AST/CommentHTMLTags.inc"
53#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54
55} // end anonymous namespace
56
57StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58  // Fast path, first check a few most widely used named character references.
59  return llvm::StringSwitch<StringRef>(Name)
60      .Case("amp", "&")
61      .Case("lt", "<")
62      .Case("gt", ">")
63      .Case("quot", "\"")
64      .Case("apos", "\'")
65      // Slow path.
66      .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
67}
68
69StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70  unsigned CodePoint = 0;
71  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
72    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
73    CodePoint *= 10;
74    CodePoint += Name[i] - '0';
75  }
76  return convertCodePointToUTF8(Allocator, CodePoint);
77}
78
79StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80  unsigned CodePoint = 0;
81  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
82    CodePoint *= 16;
83    const char C = Name[i];
84    assert(isHTMLHexCharacterReferenceCharacter(C));
85    CodePoint += llvm::hexDigitValue(C);
86  }
87  return convertCodePointToUTF8(Allocator, CodePoint);
88}
89
90void Lexer::skipLineStartingDecorations() {
91  // This function should be called only for C comments
92  assert(CommentState == LCS_InsideCComment);
93
94  if (BufferPtr == CommentEnd)
95    return;
96
97  switch (*BufferPtr) {
98  case ' ':
99  case '\t':
100  case '\f':
101  case '\v': {
102    const char *NewBufferPtr = BufferPtr;
103    NewBufferPtr++;
104    if (NewBufferPtr == CommentEnd)
105      return;
106
107    char C = *NewBufferPtr;
108    while (isHorizontalWhitespace(C)) {
109      NewBufferPtr++;
110      if (NewBufferPtr == CommentEnd)
111        return;
112      C = *NewBufferPtr;
113    }
114    if (C == '*')
115      BufferPtr = NewBufferPtr + 1;
116    break;
117  }
118  case '*':
119    BufferPtr++;
120    break;
121  }
122}
123
124namespace {
125/// Returns pointer to the first newline character in the string.
126const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
127  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
128    if (isVerticalWhitespace(*BufferPtr))
129      return BufferPtr;
130  }
131  return BufferEnd;
132}
133
/// Skips one newline sequence ("\n", "\r" or "\r\n") starting at BufferPtr.
///
/// \returns the pointer just past the newline; BufferPtr itself when the
/// range is empty.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr++;
  if (First == '\n')
    return BufferPtr;

  // Anything else must be a carriage return, optionally followed by '\n'.
  assert(First == '\r');
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
148
149const char *skipNamedCharacterReference(const char *BufferPtr,
150                                        const char *BufferEnd) {
151  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
152    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
153      return BufferPtr;
154  }
155  return BufferEnd;
156}
157
158const char *skipDecimalCharacterReference(const char *BufferPtr,
159                                          const char *BufferEnd) {
160  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
161    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
162      return BufferPtr;
163  }
164  return BufferEnd;
165}
166
167const char *skipHexCharacterReference(const char *BufferPtr,
168                                      const char *BufferEnd) {
169  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
170    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
171      return BufferPtr;
172  }
173  return BufferEnd;
174}
175
176bool isHTMLIdentifierStartingCharacter(char C) {
177  return isLetter(C);
178}
179
180bool isHTMLIdentifierCharacter(char C) {
181  return isAlphanumeric(C);
182}
183
184const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
185  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
186    if (!isHTMLIdentifierCharacter(*BufferPtr))
187      return BufferPtr;
188  }
189  return BufferEnd;
190}
191
/// Scans an HTML string delimited by single or double quotes.  BufferPtr must
/// point at the opening quote.  A quote character immediately preceded by a
/// backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or BufferEnd if the string is not
/// terminated within the range.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char *const Open = BufferPtr;
  const char Delim = *Open;
  assert(Delim == '\"' || Delim == '\'');

  const char *Cur = Open + 1;
  while (Cur != BufferEnd) {
    const bool Escaped = Cur[-1] == '\\';
    if (*Cur == Delim && !Escaped)
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
209
210const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
211  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
212    if (!isWhitespace(*BufferPtr))
213      return BufferPtr;
214  }
215  return BufferEnd;
216}
217
218bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
219  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
220}
221
222bool isCommandNameStartCharacter(char C) {
223  return isLetter(C);
224}
225
226bool isCommandNameCharacter(char C) {
227  return isAlphanumeric(C);
228}
229
230const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
231  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
232    if (!isCommandNameCharacter(*BufferPtr))
233      return BufferPtr;
234  }
235  return BufferEnd;
236}
237
238/// Return the one past end pointer for BCPL comments.
239/// Handles newlines escaped with backslash or trigraph for backslahs.
240const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
241  const char *CurPtr = BufferPtr;
242  while (CurPtr != BufferEnd) {
243    while (!isVerticalWhitespace(*CurPtr)) {
244      CurPtr++;
245      if (CurPtr == BufferEnd)
246        return BufferEnd;
247    }
248    // We found a newline, check if it is escaped.
249    const char *EscapePtr = CurPtr - 1;
250    while(isHorizontalWhitespace(*EscapePtr))
251      EscapePtr--;
252
253    if (*EscapePtr == '\\' ||
254        (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
255         EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
256      // We found an escaped newline.
257      CurPtr = skipNewline(CurPtr, BufferEnd);
258    } else
259      return CurPtr; // Not an escaped newline.
260  }
261  return BufferEnd;
262}
263
264/// Return the one past end pointer for C comments.
265/// Very dumb, does not handle escaped newlines or trigraphs.
266const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
267  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
268    if (*BufferPtr == '*') {
269      assert(BufferPtr + 1 != BufferEnd);
270      if (*(BufferPtr + 1) == '/')
271        return BufferPtr;
272    }
273  }
274  llvm_unreachable("buffer end hit before '*/' was seen");
275}
276
277} // end anonymous namespace
278
279void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
280                               tok::TokenKind Kind) {
281  const unsigned TokLen = TokEnd - BufferPtr;
282  Result.setLocation(getSourceLocation(BufferPtr));
283  Result.setKind(Kind);
284  Result.setLength(TokLen);
285#ifndef NDEBUG
286  Result.TextPtr = "<UNSET>";
287  Result.IntVal = 7;
288#endif
289  BufferPtr = TokEnd;
290}
291
/// Lexes one token from the body of the current comment.
///
/// Must be called while inside a BCPL or C comment with BufferPtr strictly
/// before CommentEnd.  When ParseCommands is false only text and newline
/// tokens are produced.  Otherwise the work is dispatched on the lexer State:
/// the verbatim and HTML states are handled by their dedicated functions, and
/// in the normal state the first character selects between a command
/// ('\\' or '@'), an HTML character reference ('&'), an HTML tag ('<') and
/// plain text.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Handles lexing non-command text, i.e. text and newline.
  auto HandleNonCommandToken = [&]() -> void {
    assert(State == LS_Normal);

    const char *TokenPtr = BufferPtr;
    assert(TokenPtr < CommentEnd);
    switch (*TokenPtr) {
      case '\n':
      case '\r':
          TokenPtr = skipNewline(TokenPtr, CommentEnd);
          formTokenWithChars(T, TokenPtr, tok::newline);

          // In a C comment the next line may begin with a decoration such as
          // " * "; drop it so it does not end up in the text.
          if (CommentState == LCS_InsideCComment)
            skipLineStartingDecorations();
          return;

      default: {
          // A text token extends up to the next character that could start a
          // different kind of token.  Without command parsing only newlines
          // end the text.
          StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
          size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
                           .find_first_of(TokStartSymbols);
          if (End != StringRef::npos)
            TokenPtr += End;
          else
            TokenPtr = CommentEnd;
          formTextToken(T, TokenPtr);
          return;
      }
    }
  };

  if (!ParseCommands)
    return HandleNonCommandToken();

  // Any state other than LS_Normal has its own lexing routine.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);
  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  switch(*TokenPtr) {
    case '\\':
    case '@': {
      // Commands that start with a backslash and commands that start with
      // 'at' have equivalent semantics.  But we keep information about the
      // exact syntax in AST for comments.
      tok::TokenKind CommandKind =
          (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
      TokenPtr++;
      // A lone marker at the very end of the comment is just text.
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      char C = *TokenPtr;
      switch (C) {
      default:
        break;

      case '\\': case '@': case '&': case '$':
      case '#':  case '<': case '>': case '%':
      case '\"': case '.': case ':':
        // This is one of \\ \@ \& \$ etc escape sequences.
        TokenPtr++;
        if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
          // This is the \:: escape sequence.
          TokenPtr++;
        }
        // The token covers the marker too, but its text is only the escaped
        // character(s) after the marker.
        StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
        formTokenWithChars(T, TokenPtr, tok::text);
        T.setText(UnescapedText);
        return;
      }

      // Don't make zero-length commands.
      if (!isCommandNameStartCharacter(*TokenPtr)) {
        formTextToken(T, TokenPtr);
        return;
      }

      TokenPtr = skipCommandName(TokenPtr, CommentEnd);
      // Length of the command name, not counting the leading marker.
      unsigned Length = TokenPtr - (BufferPtr + 1);

      // Hardcoded support for lexing LaTeX formula commands
      // \f$ \f[ \f] \f{ \f} as a single command.
      if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
        C = *TokenPtr;
        if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
          TokenPtr++;
          Length++;
        }
      }

      StringRef CommandName(BufferPtr + 1, Length);

      const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
      if (!Info) {
        if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
          // A typo correction was found: warn with a fix-it that replaces the
          // name (but not the marker), then continue lexing as if the
          // corrected command had been written.
          StringRef CorrectedName = Info->Name;
          SourceLocation Loc = getSourceLocation(BufferPtr);
          SourceLocation EndLoc = getSourceLocation(TokenPtr);
          SourceRange FullRange = SourceRange(Loc, EndLoc);
          SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
          Diag(Loc, diag::warn_correct_comment_command_name)
            << FullRange << CommandName << CorrectedName
            << FixItHint::CreateReplacement(CommandRange, CorrectedName);
        } else {
          // No known command and no plausible correction: emit an
          // unknown_command token carrying the spelled name and warn.
          formTokenWithChars(T, TokenPtr, tok::unknown_command);
          T.setUnknownCommandName(CommandName);
          Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
              << SourceRange(T.getLocation(), T.getEndLocation());
          return;
        }
      }
      // Verbatim commands switch the lexer into a verbatim state; *BufferPtr
      // is still the marker character here because no token was formed yet.
      if (Info->IsVerbatimBlockCommand) {
        setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
        return;
      }
      if (Info->IsVerbatimLineCommand) {
        setupAndLexVerbatimLine(T, TokenPtr, Info);
        return;
      }
      formTokenWithChars(T, TokenPtr, CommandKind);
      T.setCommandID(Info->getID());
      return;
    }

    case '&':
      lexHTMLCharacterReference(T);
      return;

    case '<': {
      TokenPtr++;
      // A lone '<' at the very end of the comment is just text.
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      // "<x" may open a start tag and "</" an end tag; anything else makes
      // the '<' plain text.
      const char C = *TokenPtr;
      if (isHTMLIdentifierStartingCharacter(C))
        setupAndLexHTMLStartTag(T);
      else if (C == '/')
        setupAndLexHTMLEndTag(T);
      else
        formTextToken(T, TokenPtr);
      return;
    }

    default:
      return HandleNonCommandToken();
  }
}
462
463void Lexer::setupAndLexVerbatimBlock(Token &T,
464                                     const char *TextBegin,
465                                     char Marker, const CommandInfo *Info) {
466  assert(Info->IsVerbatimBlockCommand);
467
468  VerbatimBlockEndCommandName.clear();
469  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
470  VerbatimBlockEndCommandName.append(Info->EndCommandName);
471
472  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
473  T.setVerbatimBlockID(Info->getID());
474
475  // If there is a newline following the verbatim opening command, skip the
476  // newline so that we don't create an tok::verbatim_block_line with empty
477  // text content.
478  if (BufferPtr != CommentEnd &&
479      isVerticalWhitespace(*BufferPtr)) {
480    BufferPtr = skipNewline(BufferPtr, CommentEnd);
481    State = LS_VerbatimBlockBody;
482    return;
483  }
484
485  State = LS_VerbatimBlockFirstLine;
486}
487
488void Lexer::lexVerbatimBlockFirstLine(Token &T) {
489again:
490  assert(BufferPtr < CommentEnd);
491
492  // FIXME: It would be better to scan the text once, finding either the block
493  // end command or newline.
494  //
495  // Extract current line.
496  const char *Newline = findNewline(BufferPtr, CommentEnd);
497  StringRef Line(BufferPtr, Newline - BufferPtr);
498
499  // Look for end command in current line.
500  size_t Pos = Line.find(VerbatimBlockEndCommandName);
501  const char *TextEnd;
502  const char *NextLine;
503  if (Pos == StringRef::npos) {
504    // Current line is completely verbatim.
505    TextEnd = Newline;
506    NextLine = skipNewline(Newline, CommentEnd);
507  } else if (Pos == 0) {
508    // Current line contains just an end command.
509    const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
510    StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
511    formTokenWithChars(T, End, tok::verbatim_block_end);
512    T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
513    State = LS_Normal;
514    return;
515  } else {
516    // There is some text, followed by end command.  Extract text first.
517    TextEnd = BufferPtr + Pos;
518    NextLine = TextEnd;
519    // If there is only whitespace before end command, skip whitespace.
520    if (isWhitespace(BufferPtr, TextEnd)) {
521      BufferPtr = TextEnd;
522      goto again;
523    }
524  }
525
526  StringRef Text(BufferPtr, TextEnd - BufferPtr);
527  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
528  T.setVerbatimBlockText(Text);
529
530  State = LS_VerbatimBlockBody;
531}
532
533void Lexer::lexVerbatimBlockBody(Token &T) {
534  assert(State == LS_VerbatimBlockBody);
535
536  if (CommentState == LCS_InsideCComment)
537    skipLineStartingDecorations();
538
539  if (BufferPtr == CommentEnd) {
540    formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
541    T.setVerbatimBlockText("");
542    return;
543  }
544
545  lexVerbatimBlockFirstLine(T);
546}
547
548void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
549                                    const CommandInfo *Info) {
550  assert(Info->IsVerbatimLineCommand);
551  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
552  T.setVerbatimLineID(Info->getID());
553
554  State = LS_VerbatimLineText;
555}
556
557void Lexer::lexVerbatimLineText(Token &T) {
558  assert(State == LS_VerbatimLineText);
559
560  // Extract current line.
561  const char *Newline = findNewline(BufferPtr, CommentEnd);
562  StringRef Text(BufferPtr, Newline - BufferPtr);
563  formTokenWithChars(T, Newline, tok::verbatim_line_text);
564  T.setVerbatimLineText(Text);
565
566  State = LS_Normal;
567}
568
569void Lexer::lexHTMLCharacterReference(Token &T) {
570  const char *TokenPtr = BufferPtr;
571  assert(*TokenPtr == '&');
572  TokenPtr++;
573  if (TokenPtr == CommentEnd) {
574    formTextToken(T, TokenPtr);
575    return;
576  }
577  const char *NamePtr;
578  bool isNamed = false;
579  bool isDecimal = false;
580  char C = *TokenPtr;
581  if (isHTMLNamedCharacterReferenceCharacter(C)) {
582    NamePtr = TokenPtr;
583    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
584    isNamed = true;
585  } else if (C == '#') {
586    TokenPtr++;
587    if (TokenPtr == CommentEnd) {
588      formTextToken(T, TokenPtr);
589      return;
590    }
591    C = *TokenPtr;
592    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
593      NamePtr = TokenPtr;
594      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
595      isDecimal = true;
596    } else if (C == 'x' || C == 'X') {
597      TokenPtr++;
598      NamePtr = TokenPtr;
599      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
600    } else {
601      formTextToken(T, TokenPtr);
602      return;
603    }
604  } else {
605    formTextToken(T, TokenPtr);
606    return;
607  }
608  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
609      *TokenPtr != ';') {
610    formTextToken(T, TokenPtr);
611    return;
612  }
613  StringRef Name(NamePtr, TokenPtr - NamePtr);
614  TokenPtr++; // Skip semicolon.
615  StringRef Resolved;
616  if (isNamed)
617    Resolved = resolveHTMLNamedCharacterReference(Name);
618  else if (isDecimal)
619    Resolved = resolveHTMLDecimalCharacterReference(Name);
620  else
621    Resolved = resolveHTMLHexCharacterReference(Name);
622
623  if (Resolved.empty()) {
624    formTextToken(T, TokenPtr);
625    return;
626  }
627  formTokenWithChars(T, TokenPtr, tok::text);
628  T.setText(Resolved);
629}
630
631void Lexer::setupAndLexHTMLStartTag(Token &T) {
632  assert(BufferPtr[0] == '<' &&
633         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
634  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
635  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
636  if (!isHTMLTagName(Name)) {
637    formTextToken(T, TagNameEnd);
638    return;
639  }
640
641  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
642  T.setHTMLTagStartName(Name);
643
644  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
645
646  const char C = *BufferPtr;
647  if (BufferPtr != CommentEnd &&
648      (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
649    State = LS_HTMLStartTag;
650}
651
652void Lexer::lexHTMLStartTag(Token &T) {
653  assert(State == LS_HTMLStartTag);
654
655  const char *TokenPtr = BufferPtr;
656  char C = *TokenPtr;
657  if (isHTMLIdentifierCharacter(C)) {
658    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
659    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
660    formTokenWithChars(T, TokenPtr, tok::html_ident);
661    T.setHTMLIdent(Ident);
662  } else {
663    switch (C) {
664    case '=':
665      TokenPtr++;
666      formTokenWithChars(T, TokenPtr, tok::html_equals);
667      break;
668    case '\"':
669    case '\'': {
670      const char *OpenQuote = TokenPtr;
671      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
672      const char *ClosingQuote = TokenPtr;
673      if (TokenPtr != CommentEnd) // Skip closing quote.
674        TokenPtr++;
675      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
676      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
677                                      ClosingQuote - (OpenQuote + 1)));
678      break;
679    }
680    case '>':
681      TokenPtr++;
682      formTokenWithChars(T, TokenPtr, tok::html_greater);
683      State = LS_Normal;
684      return;
685    case '/':
686      TokenPtr++;
687      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
688        TokenPtr++;
689        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
690      } else
691        formTextToken(T, TokenPtr);
692
693      State = LS_Normal;
694      return;
695    }
696  }
697
698  // Now look ahead and return to normal state if we don't see any HTML tokens
699  // ahead.
700  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
701  if (BufferPtr == CommentEnd) {
702    State = LS_Normal;
703    return;
704  }
705
706  C = *BufferPtr;
707  if (!isHTMLIdentifierStartingCharacter(C) &&
708      C != '=' && C != '\"' && C != '\'' && C != '>') {
709    State = LS_Normal;
710    return;
711  }
712}
713
714void Lexer::setupAndLexHTMLEndTag(Token &T) {
715  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
716
717  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
718  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
719  StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
720  if (!isHTMLTagName(Name)) {
721    formTextToken(T, TagNameEnd);
722    return;
723  }
724
725  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
726
727  formTokenWithChars(T, End, tok::html_end_tag);
728  T.setHTMLTagEndName(Name);
729
730  if (BufferPtr != CommentEnd && *BufferPtr == '>')
731    State = LS_HTMLEndTag;
732}
733
734void Lexer::lexHTMLEndTag(Token &T) {
735  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
736
737  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
738  State = LS_Normal;
739}
740
/// Creates a lexer over the characters in [BufferStart, BufferEnd).
///
/// Lexing starts at BufferStart in the LCS_BeforeComment comment state and
/// the LS_Normal lexer state.
///
/// \param Allocator used for text produced while resolving numeric HTML
/// character references.
/// \param Diags, Traits, FileLoc stored for later use by the lexer.
/// \param ParseCommands when false, lexCommentText produces only text and
/// newline tokens.
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
             const CommandTraits &Traits, SourceLocation FileLoc,
             const char *BufferStart, const char *BufferEnd,
             bool ParseCommands)
    : Allocator(Allocator), Diags(Diags), Traits(Traits),
      BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
      BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
      ParseCommands(ParseCommands) {}
749
/// Lexes the next token into \p T.
///
/// This is a state machine over CommentState:
///  - LCS_BeforeComment: produce eof at the end of the buffer; otherwise
///    consume the comment opener ("//" or "/*") together with an optional
///    Doxygen marker and an optional '<', compute CommentEnd and continue
///    inside the comment.
///  - LCS_InsideBCPLComment / LCS_InsideCComment: lex comment text until
///    CommentEnd is reached, then move on to LCS_BetweenComments (after a C
///    comment a newline token is synthesized first).
///  - LCS_BetweenComments: turn everything up to the next '/' into a single
///    newline token.
void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      // A verbatim block state is kept across the start of a BCPL comment;
      // any other state is reset to normal.
      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
        State = LS_Normal;
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.
      // A '*' that is immediately followed by '/' is the comment terminator
      // and must not be consumed here.
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them.  So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline.  We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // Synthesize a newline just after the C comment, regardless of
        // whether there is actually a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}
851
852StringRef Lexer::getSpelling(const Token &Tok,
853                             const SourceManager &SourceMgr) const {
854  SourceLocation Loc = Tok.getLocation();
855  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
856
857  bool InvalidTemp = false;
858  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
859  if (InvalidTemp)
860    return StringRef();
861
862  const char *Begin = File.data() + LocInfo.second;
863  return StringRef(Begin, Tok.getLength());
864}
865
866} // end namespace comments
867} // end namespace clang
868