CommentLexer.cpp revision 251662
1239313Sdim#include "clang/AST/CommentLexer.h"
2239313Sdim#include "clang/AST/CommentCommandTraits.h"
3251662Sdim#include "clang/AST/CommentDiagnostic.h"
4249423Sdim#include "clang/Basic/CharInfo.h"
5249423Sdim#include "llvm/ADT/StringExtras.h"
6239313Sdim#include "llvm/ADT/StringSwitch.h"
7249423Sdim#include "llvm/Support/ConvertUTF.h"
8239313Sdim#include "llvm/Support/ErrorHandling.h"
9239313Sdim
10239313Sdimnamespace clang {
11239313Sdimnamespace comments {
12239313Sdim
13239313Sdimvoid Token::dump(const Lexer &L, const SourceManager &SM) const {
14239313Sdim  llvm::errs() << "comments::Token Kind=" << Kind << " ";
15239313Sdim  Loc.dump(SM);
16239313Sdim  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
17239313Sdim}
18239313Sdim
19249423Sdimstatic inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
20249423Sdim  return isLetter(C);
21239313Sdim}
22239313Sdim
23249423Sdimstatic inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
24249423Sdim  return isDigit(C);
25239313Sdim}
26239313Sdim
27249423Sdimstatic inline bool isHTMLHexCharacterReferenceCharacter(char C) {
28249423Sdim  return isHexDigit(C);
29239313Sdim}
30243830Sdim
31249423Sdimstatic inline StringRef convertCodePointToUTF8(
32249423Sdim                                      llvm::BumpPtrAllocator &Allocator,
33249423Sdim                                      unsigned CodePoint) {
34249423Sdim  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
35249423Sdim  char *ResolvedPtr = Resolved;
36249423Sdim  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
37249423Sdim    return StringRef(Resolved, ResolvedPtr - Resolved);
38249423Sdim  else
39249423Sdim    return StringRef();
40249423Sdim}
41249423Sdim
42249423Sdimnamespace {
43249423Sdim
44243830Sdim#include "clang/AST/CommentHTMLTags.inc"
45249423Sdim#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
46243830Sdim
47239313Sdim} // unnamed namespace
48239313Sdim
49239313SdimStringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
50249423Sdim  // Fast path, first check a few most widely used named character references.
51239313Sdim  return llvm::StringSwitch<StringRef>(Name)
52239313Sdim      .Case("amp", "&")
53239313Sdim      .Case("lt", "<")
54239313Sdim      .Case("gt", ">")
55239313Sdim      .Case("quot", "\"")
56239313Sdim      .Case("apos", "\'")
57249423Sdim      // Slow path.
58249423Sdim      .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
59239313Sdim}
60239313Sdim
61239313SdimStringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
62239313Sdim  unsigned CodePoint = 0;
63239313Sdim  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
64239313Sdim    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
65239313Sdim    CodePoint *= 10;
66239313Sdim    CodePoint += Name[i] - '0';
67239313Sdim  }
68249423Sdim  return convertCodePointToUTF8(Allocator, CodePoint);
69239313Sdim}
70239313Sdim
71239313SdimStringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
72239313Sdim  unsigned CodePoint = 0;
73239313Sdim  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
74239313Sdim    CodePoint *= 16;
75239313Sdim    const char C = Name[i];
76239313Sdim    assert(isHTMLHexCharacterReferenceCharacter(C));
77249423Sdim    CodePoint += llvm::hexDigitValue(C);
78239313Sdim  }
79249423Sdim  return convertCodePointToUTF8(Allocator, CodePoint);
80239313Sdim}
81239313Sdim
82239313Sdimvoid Lexer::skipLineStartingDecorations() {
83239313Sdim  // This function should be called only for C comments
84239313Sdim  assert(CommentState == LCS_InsideCComment);
85239313Sdim
86239313Sdim  if (BufferPtr == CommentEnd)
87239313Sdim    return;
88239313Sdim
89239313Sdim  switch (*BufferPtr) {
90239313Sdim  case ' ':
91239313Sdim  case '\t':
92239313Sdim  case '\f':
93239313Sdim  case '\v': {
94239313Sdim    const char *NewBufferPtr = BufferPtr;
95239313Sdim    NewBufferPtr++;
96239313Sdim    if (NewBufferPtr == CommentEnd)
97239313Sdim      return;
98239313Sdim
99239313Sdim    char C = *NewBufferPtr;
100249423Sdim    while (isHorizontalWhitespace(C)) {
101239313Sdim      NewBufferPtr++;
102239313Sdim      if (NewBufferPtr == CommentEnd)
103239313Sdim        return;
104239313Sdim      C = *NewBufferPtr;
105239313Sdim    }
106239313Sdim    if (C == '*')
107239313Sdim      BufferPtr = NewBufferPtr + 1;
108239313Sdim    break;
109239313Sdim  }
110239313Sdim  case '*':
111239313Sdim    BufferPtr++;
112239313Sdim    break;
113239313Sdim  }
114239313Sdim}
115239313Sdim
116239313Sdimnamespace {
117239313Sdim/// Returns pointer to the first newline character in the string.
118239313Sdimconst char *findNewline(const char *BufferPtr, const char *BufferEnd) {
119239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
120249423Sdim    if (isVerticalWhitespace(*BufferPtr))
121239313Sdim      return BufferPtr;
122239313Sdim  }
123239313Sdim  return BufferEnd;
124239313Sdim}
125239313Sdim
/// Skips one newline sequence ("\n", "\r" or "\r\n") at \p BufferPtr.
///
/// \p BufferPtr must either equal \p BufferEnd, in which case it is returned
/// unchanged, or point at '\n' or '\r'.
///
/// \returns pointer to the character following the newline sequence.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    BufferPtr++;
  else {
    assert(*BufferPtr == '\r');
    BufferPtr++;
    // Treat "\r\n" as a single newline.
    if (BufferPtr != BufferEnd && *BufferPtr == '\n')
      BufferPtr++;
  }
  return BufferPtr;
}
140239313Sdim
141239313Sdimconst char *skipNamedCharacterReference(const char *BufferPtr,
142239313Sdim                                        const char *BufferEnd) {
143239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
144239313Sdim    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
145239313Sdim      return BufferPtr;
146239313Sdim  }
147239313Sdim  return BufferEnd;
148239313Sdim}
149239313Sdim
150239313Sdimconst char *skipDecimalCharacterReference(const char *BufferPtr,
151239313Sdim                                          const char *BufferEnd) {
152239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153239313Sdim    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
154239313Sdim      return BufferPtr;
155239313Sdim  }
156239313Sdim  return BufferEnd;
157239313Sdim}
158239313Sdim
159239313Sdimconst char *skipHexCharacterReference(const char *BufferPtr,
160239313Sdim                                          const char *BufferEnd) {
161239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162239313Sdim    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
163239313Sdim      return BufferPtr;
164239313Sdim  }
165239313Sdim  return BufferEnd;
166239313Sdim}
167239313Sdim
168239313Sdimbool isHTMLIdentifierStartingCharacter(char C) {
169249423Sdim  return isLetter(C);
170239313Sdim}
171239313Sdim
172239313Sdimbool isHTMLIdentifierCharacter(char C) {
173249423Sdim  return isAlphanumeric(C);
174239313Sdim}
175239313Sdim
176239313Sdimconst char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
177239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
178239313Sdim    if (!isHTMLIdentifierCharacter(*BufferPtr))
179239313Sdim      return BufferPtr;
180239313Sdim  }
181239313Sdim  return BufferEnd;
182239313Sdim}
183239313Sdim
/// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
/// string allowed.
///
/// \p BufferPtr must point at the opening quote.  A quote character that is
/// immediately preceded by a backslash is not treated as the closing quote.
///
/// Returns pointer to closing quote, or \p BufferEnd if the string is not
/// terminated within the range.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  BufferPtr++;
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    // BufferPtr[-1] is always in range here: at worst it is the opening
    // quote.
    if (C == Quote && BufferPtr[-1] != '\\')
      return BufferPtr;
  }
  return BufferEnd;
}
201239313Sdim
202239313Sdimconst char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
203239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
204239313Sdim    if (!isWhitespace(*BufferPtr))
205239313Sdim      return BufferPtr;
206239313Sdim  }
207239313Sdim  return BufferEnd;
208239313Sdim}
209239313Sdim
210239313Sdimbool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
211239313Sdim  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
212239313Sdim}
213239313Sdim
214243830Sdimbool isCommandNameStartCharacter(char C) {
215249423Sdim  return isLetter(C);
216243830Sdim}
217243830Sdim
218239313Sdimbool isCommandNameCharacter(char C) {
219249423Sdim  return isAlphanumeric(C);
220239313Sdim}
221239313Sdim
222239313Sdimconst char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
223239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
224239313Sdim    if (!isCommandNameCharacter(*BufferPtr))
225239313Sdim      return BufferPtr;
226239313Sdim  }
227239313Sdim  return BufferEnd;
228239313Sdim}
229239313Sdim
230239313Sdim/// Return the one past end pointer for BCPL comments.
231239313Sdim/// Handles newlines escaped with backslash or trigraph for backslahs.
232239313Sdimconst char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
233239313Sdim  const char *CurPtr = BufferPtr;
234239313Sdim  while (CurPtr != BufferEnd) {
235249423Sdim    while (!isVerticalWhitespace(*CurPtr)) {
236239313Sdim      CurPtr++;
237239313Sdim      if (CurPtr == BufferEnd)
238239313Sdim        return BufferEnd;
239239313Sdim    }
240239313Sdim    // We found a newline, check if it is escaped.
241239313Sdim    const char *EscapePtr = CurPtr - 1;
242239313Sdim    while(isHorizontalWhitespace(*EscapePtr))
243239313Sdim      EscapePtr--;
244239313Sdim
245239313Sdim    if (*EscapePtr == '\\' ||
246239313Sdim        (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
247239313Sdim         EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
248239313Sdim      // We found an escaped newline.
249239313Sdim      CurPtr = skipNewline(CurPtr, BufferEnd);
250239313Sdim    } else
251239313Sdim      return CurPtr; // Not an escaped newline.
252239313Sdim  }
253239313Sdim  return BufferEnd;
254239313Sdim}
255239313Sdim
256239313Sdim/// Return the one past end pointer for C comments.
257239313Sdim/// Very dumb, does not handle escaped newlines or trigraphs.
258239313Sdimconst char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
259239313Sdim  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
260239313Sdim    if (*BufferPtr == '*') {
261239313Sdim      assert(BufferPtr + 1 != BufferEnd);
262239313Sdim      if (*(BufferPtr + 1) == '/')
263239313Sdim        return BufferPtr;
264239313Sdim    }
265239313Sdim  }
266239313Sdim  llvm_unreachable("buffer end hit before '*/' was seen");
267239313Sdim}
268239313Sdim} // unnamed namespace
269239313Sdim
270239313Sdimvoid Lexer::lexCommentText(Token &T) {
271239313Sdim  assert(CommentState == LCS_InsideBCPLComment ||
272239313Sdim         CommentState == LCS_InsideCComment);
273239313Sdim
274239313Sdim  switch (State) {
275239313Sdim  case LS_Normal:
276239313Sdim    break;
277239313Sdim  case LS_VerbatimBlockFirstLine:
278239313Sdim    lexVerbatimBlockFirstLine(T);
279239313Sdim    return;
280239313Sdim  case LS_VerbatimBlockBody:
281239313Sdim    lexVerbatimBlockBody(T);
282239313Sdim    return;
283239313Sdim  case LS_VerbatimLineText:
284239313Sdim    lexVerbatimLineText(T);
285239313Sdim    return;
286239313Sdim  case LS_HTMLStartTag:
287239313Sdim    lexHTMLStartTag(T);
288239313Sdim    return;
289239313Sdim  case LS_HTMLEndTag:
290239313Sdim    lexHTMLEndTag(T);
291239313Sdim    return;
292239313Sdim  }
293239313Sdim
294239313Sdim  assert(State == LS_Normal);
295239313Sdim
296239313Sdim  const char *TokenPtr = BufferPtr;
297239313Sdim  assert(TokenPtr < CommentEnd);
298239313Sdim  while (TokenPtr != CommentEnd) {
299239313Sdim    switch(*TokenPtr) {
300239313Sdim      case '\\':
301239313Sdim      case '@': {
302249423Sdim        // Commands that start with a backslash and commands that start with
303249423Sdim        // 'at' have equivalent semantics.  But we keep information about the
304249423Sdim        // exact syntax in AST for comments.
305249423Sdim        tok::TokenKind CommandKind =
306249423Sdim            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
307239313Sdim        TokenPtr++;
308239313Sdim        if (TokenPtr == CommentEnd) {
309239313Sdim          formTextToken(T, TokenPtr);
310239313Sdim          return;
311239313Sdim        }
312239313Sdim        char C = *TokenPtr;
313239313Sdim        switch (C) {
314239313Sdim        default:
315239313Sdim          break;
316239313Sdim
317239313Sdim        case '\\': case '@': case '&': case '$':
318239313Sdim        case '#':  case '<': case '>': case '%':
319239313Sdim        case '\"': case '.': case ':':
320239313Sdim          // This is one of \\ \@ \& \$ etc escape sequences.
321239313Sdim          TokenPtr++;
322239313Sdim          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
323239313Sdim            // This is the \:: escape sequence.
324239313Sdim            TokenPtr++;
325239313Sdim          }
326239313Sdim          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
327239313Sdim          formTokenWithChars(T, TokenPtr, tok::text);
328239313Sdim          T.setText(UnescapedText);
329239313Sdim          return;
330239313Sdim        }
331239313Sdim
332239313Sdim        // Don't make zero-length commands.
333243830Sdim        if (!isCommandNameStartCharacter(*TokenPtr)) {
334239313Sdim          formTextToken(T, TokenPtr);
335239313Sdim          return;
336239313Sdim        }
337239313Sdim
338239313Sdim        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
339239313Sdim        unsigned Length = TokenPtr - (BufferPtr + 1);
340239313Sdim
341239313Sdim        // Hardcoded support for lexing LaTeX formula commands
342239313Sdim        // \f$ \f[ \f] \f{ \f} as a single command.
343239313Sdim        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
344239313Sdim          C = *TokenPtr;
345239313Sdim          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
346239313Sdim            TokenPtr++;
347239313Sdim            Length++;
348239313Sdim          }
349239313Sdim        }
350239313Sdim
351239313Sdim        const StringRef CommandName(BufferPtr + 1, Length);
352239313Sdim
353243830Sdim        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
354243830Sdim        if (!Info) {
355243830Sdim          formTokenWithChars(T, TokenPtr, tok::unknown_command);
356243830Sdim          T.setUnknownCommandName(CommandName);
357251662Sdim          Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
358239313Sdim          return;
359239313Sdim        }
360243830Sdim        if (Info->IsVerbatimBlockCommand) {
361243830Sdim          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
362239313Sdim          return;
363239313Sdim        }
364243830Sdim        if (Info->IsVerbatimLineCommand) {
365243830Sdim          setupAndLexVerbatimLine(T, TokenPtr, Info);
366243830Sdim          return;
367243830Sdim        }
368249423Sdim        formTokenWithChars(T, TokenPtr, CommandKind);
369243830Sdim        T.setCommandID(Info->getID());
370239313Sdim        return;
371239313Sdim      }
372239313Sdim
373239313Sdim      case '&':
374239313Sdim        lexHTMLCharacterReference(T);
375239313Sdim        return;
376239313Sdim
377239313Sdim      case '<': {
378239313Sdim        TokenPtr++;
379239313Sdim        if (TokenPtr == CommentEnd) {
380239313Sdim          formTextToken(T, TokenPtr);
381239313Sdim          return;
382239313Sdim        }
383239313Sdim        const char C = *TokenPtr;
384239313Sdim        if (isHTMLIdentifierStartingCharacter(C))
385239313Sdim          setupAndLexHTMLStartTag(T);
386239313Sdim        else if (C == '/')
387239313Sdim          setupAndLexHTMLEndTag(T);
388239313Sdim        else
389239313Sdim          formTextToken(T, TokenPtr);
390239313Sdim
391239313Sdim        return;
392239313Sdim      }
393239313Sdim
394239313Sdim      case '\n':
395239313Sdim      case '\r':
396239313Sdim        TokenPtr = skipNewline(TokenPtr, CommentEnd);
397239313Sdim        formTokenWithChars(T, TokenPtr, tok::newline);
398239313Sdim
399239313Sdim        if (CommentState == LCS_InsideCComment)
400239313Sdim          skipLineStartingDecorations();
401239313Sdim        return;
402239313Sdim
403239313Sdim      default: {
404249423Sdim        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
405249423Sdim                         find_first_of("\n\r\\@&<");
406249423Sdim        if (End != StringRef::npos)
407249423Sdim          TokenPtr += End;
408249423Sdim        else
409249423Sdim          TokenPtr = CommentEnd;
410239313Sdim        formTextToken(T, TokenPtr);
411239313Sdim        return;
412239313Sdim      }
413239313Sdim    }
414239313Sdim  }
415239313Sdim}
416239313Sdim
417239313Sdimvoid Lexer::setupAndLexVerbatimBlock(Token &T,
418239313Sdim                                     const char *TextBegin,
419243830Sdim                                     char Marker, const CommandInfo *Info) {
420243830Sdim  assert(Info->IsVerbatimBlockCommand);
421243830Sdim
422239313Sdim  VerbatimBlockEndCommandName.clear();
423239313Sdim  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
424243830Sdim  VerbatimBlockEndCommandName.append(Info->EndCommandName);
425239313Sdim
426239313Sdim  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
427243830Sdim  T.setVerbatimBlockID(Info->getID());
428239313Sdim
429239313Sdim  // If there is a newline following the verbatim opening command, skip the
430239313Sdim  // newline so that we don't create an tok::verbatim_block_line with empty
431239313Sdim  // text content.
432249423Sdim  if (BufferPtr != CommentEnd &&
433249423Sdim      isVerticalWhitespace(*BufferPtr)) {
434249423Sdim    BufferPtr = skipNewline(BufferPtr, CommentEnd);
435249423Sdim    State = LS_VerbatimBlockBody;
436249423Sdim    return;
437239313Sdim  }
438239313Sdim
439239313Sdim  State = LS_VerbatimBlockFirstLine;
440239313Sdim}
441239313Sdim
442239313Sdimvoid Lexer::lexVerbatimBlockFirstLine(Token &T) {
443239313Sdimagain:
444239313Sdim  assert(BufferPtr < CommentEnd);
445239313Sdim
446239313Sdim  // FIXME: It would be better to scan the text once, finding either the block
447239313Sdim  // end command or newline.
448239313Sdim  //
449239313Sdim  // Extract current line.
450239313Sdim  const char *Newline = findNewline(BufferPtr, CommentEnd);
451239313Sdim  StringRef Line(BufferPtr, Newline - BufferPtr);
452239313Sdim
453239313Sdim  // Look for end command in current line.
454239313Sdim  size_t Pos = Line.find(VerbatimBlockEndCommandName);
455239313Sdim  const char *TextEnd;
456239313Sdim  const char *NextLine;
457239313Sdim  if (Pos == StringRef::npos) {
458239313Sdim    // Current line is completely verbatim.
459239313Sdim    TextEnd = Newline;
460239313Sdim    NextLine = skipNewline(Newline, CommentEnd);
461239313Sdim  } else if (Pos == 0) {
462239313Sdim    // Current line contains just an end command.
463239313Sdim    const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
464239313Sdim    StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
465239313Sdim    formTokenWithChars(T, End, tok::verbatim_block_end);
466243830Sdim    T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
467239313Sdim    State = LS_Normal;
468239313Sdim    return;
469239313Sdim  } else {
470239313Sdim    // There is some text, followed by end command.  Extract text first.
471239313Sdim    TextEnd = BufferPtr + Pos;
472239313Sdim    NextLine = TextEnd;
473239313Sdim    // If there is only whitespace before end command, skip whitespace.
474239313Sdim    if (isWhitespace(BufferPtr, TextEnd)) {
475239313Sdim      BufferPtr = TextEnd;
476239313Sdim      goto again;
477239313Sdim    }
478239313Sdim  }
479239313Sdim
480239313Sdim  StringRef Text(BufferPtr, TextEnd - BufferPtr);
481239313Sdim  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
482239313Sdim  T.setVerbatimBlockText(Text);
483239313Sdim
484239313Sdim  State = LS_VerbatimBlockBody;
485239313Sdim}
486239313Sdim
487239313Sdimvoid Lexer::lexVerbatimBlockBody(Token &T) {
488239313Sdim  assert(State == LS_VerbatimBlockBody);
489239313Sdim
490239313Sdim  if (CommentState == LCS_InsideCComment)
491239313Sdim    skipLineStartingDecorations();
492239313Sdim
493239313Sdim  lexVerbatimBlockFirstLine(T);
494239313Sdim}
495239313Sdim
496243830Sdimvoid Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
497243830Sdim                                    const CommandInfo *Info) {
498243830Sdim  assert(Info->IsVerbatimLineCommand);
499239313Sdim  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
500243830Sdim  T.setVerbatimLineID(Info->getID());
501239313Sdim
502239313Sdim  State = LS_VerbatimLineText;
503239313Sdim}
504239313Sdim
505239313Sdimvoid Lexer::lexVerbatimLineText(Token &T) {
506239313Sdim  assert(State == LS_VerbatimLineText);
507239313Sdim
508239313Sdim  // Extract current line.
509239313Sdim  const char *Newline = findNewline(BufferPtr, CommentEnd);
510239313Sdim  const StringRef Text(BufferPtr, Newline - BufferPtr);
511239313Sdim  formTokenWithChars(T, Newline, tok::verbatim_line_text);
512239313Sdim  T.setVerbatimLineText(Text);
513239313Sdim
514239313Sdim  State = LS_Normal;
515239313Sdim}
516239313Sdim
517239313Sdimvoid Lexer::lexHTMLCharacterReference(Token &T) {
518239313Sdim  const char *TokenPtr = BufferPtr;
519239313Sdim  assert(*TokenPtr == '&');
520239313Sdim  TokenPtr++;
521239313Sdim  if (TokenPtr == CommentEnd) {
522239313Sdim    formTextToken(T, TokenPtr);
523239313Sdim    return;
524239313Sdim  }
525239313Sdim  const char *NamePtr;
526239313Sdim  bool isNamed = false;
527239313Sdim  bool isDecimal = false;
528239313Sdim  char C = *TokenPtr;
529239313Sdim  if (isHTMLNamedCharacterReferenceCharacter(C)) {
530239313Sdim    NamePtr = TokenPtr;
531239313Sdim    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
532239313Sdim    isNamed = true;
533239313Sdim  } else if (C == '#') {
534239313Sdim    TokenPtr++;
535239313Sdim    if (TokenPtr == CommentEnd) {
536239313Sdim      formTextToken(T, TokenPtr);
537239313Sdim      return;
538239313Sdim    }
539239313Sdim    C = *TokenPtr;
540239313Sdim    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
541239313Sdim      NamePtr = TokenPtr;
542239313Sdim      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
543239313Sdim      isDecimal = true;
544239313Sdim    } else if (C == 'x' || C == 'X') {
545239313Sdim      TokenPtr++;
546239313Sdim      NamePtr = TokenPtr;
547239313Sdim      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
548239313Sdim    } else {
549239313Sdim      formTextToken(T, TokenPtr);
550239313Sdim      return;
551239313Sdim    }
552239313Sdim  } else {
553239313Sdim    formTextToken(T, TokenPtr);
554239313Sdim    return;
555239313Sdim  }
556239313Sdim  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
557239313Sdim      *TokenPtr != ';') {
558239313Sdim    formTextToken(T, TokenPtr);
559239313Sdim    return;
560239313Sdim  }
561239313Sdim  StringRef Name(NamePtr, TokenPtr - NamePtr);
562239313Sdim  TokenPtr++; // Skip semicolon.
563239313Sdim  StringRef Resolved;
564239313Sdim  if (isNamed)
565239313Sdim    Resolved = resolveHTMLNamedCharacterReference(Name);
566239313Sdim  else if (isDecimal)
567239313Sdim    Resolved = resolveHTMLDecimalCharacterReference(Name);
568239313Sdim  else
569239313Sdim    Resolved = resolveHTMLHexCharacterReference(Name);
570239313Sdim
571239313Sdim  if (Resolved.empty()) {
572239313Sdim    formTextToken(T, TokenPtr);
573239313Sdim    return;
574239313Sdim  }
575239313Sdim  formTokenWithChars(T, TokenPtr, tok::text);
576239313Sdim  T.setText(Resolved);
577239313Sdim  return;
578239313Sdim}
579239313Sdim
580239313Sdimvoid Lexer::setupAndLexHTMLStartTag(Token &T) {
581239313Sdim  assert(BufferPtr[0] == '<' &&
582239313Sdim         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
583239313Sdim  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
584243830Sdim  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
585243830Sdim  if (!isHTMLTagName(Name)) {
586243830Sdim    formTextToken(T, TagNameEnd);
587243830Sdim    return;
588243830Sdim  }
589239313Sdim
590239313Sdim  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
591239313Sdim  T.setHTMLTagStartName(Name);
592239313Sdim
593239313Sdim  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
594239313Sdim
595239313Sdim  const char C = *BufferPtr;
596239313Sdim  if (BufferPtr != CommentEnd &&
597239313Sdim      (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
598239313Sdim    State = LS_HTMLStartTag;
599239313Sdim}
600239313Sdim
601239313Sdimvoid Lexer::lexHTMLStartTag(Token &T) {
602239313Sdim  assert(State == LS_HTMLStartTag);
603239313Sdim
604239313Sdim  const char *TokenPtr = BufferPtr;
605239313Sdim  char C = *TokenPtr;
606239313Sdim  if (isHTMLIdentifierCharacter(C)) {
607239313Sdim    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
608239313Sdim    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
609239313Sdim    formTokenWithChars(T, TokenPtr, tok::html_ident);
610239313Sdim    T.setHTMLIdent(Ident);
611239313Sdim  } else {
612239313Sdim    switch (C) {
613239313Sdim    case '=':
614239313Sdim      TokenPtr++;
615239313Sdim      formTokenWithChars(T, TokenPtr, tok::html_equals);
616239313Sdim      break;
617239313Sdim    case '\"':
618239313Sdim    case '\'': {
619239313Sdim      const char *OpenQuote = TokenPtr;
620239313Sdim      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
621239313Sdim      const char *ClosingQuote = TokenPtr;
622239313Sdim      if (TokenPtr != CommentEnd) // Skip closing quote.
623239313Sdim        TokenPtr++;
624239313Sdim      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
625239313Sdim      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
626239313Sdim                                      ClosingQuote - (OpenQuote + 1)));
627239313Sdim      break;
628239313Sdim    }
629239313Sdim    case '>':
630239313Sdim      TokenPtr++;
631239313Sdim      formTokenWithChars(T, TokenPtr, tok::html_greater);
632239313Sdim      State = LS_Normal;
633239313Sdim      return;
634239313Sdim    case '/':
635239313Sdim      TokenPtr++;
636239313Sdim      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
637239313Sdim        TokenPtr++;
638239313Sdim        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
639239313Sdim      } else
640239313Sdim        formTextToken(T, TokenPtr);
641239313Sdim
642239313Sdim      State = LS_Normal;
643239313Sdim      return;
644239313Sdim    }
645239313Sdim  }
646239313Sdim
647239313Sdim  // Now look ahead and return to normal state if we don't see any HTML tokens
648239313Sdim  // ahead.
649239313Sdim  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
650239313Sdim  if (BufferPtr == CommentEnd) {
651239313Sdim    State = LS_Normal;
652239313Sdim    return;
653239313Sdim  }
654239313Sdim
655239313Sdim  C = *BufferPtr;
656239313Sdim  if (!isHTMLIdentifierStartingCharacter(C) &&
657239313Sdim      C != '=' && C != '\"' && C != '\'' && C != '>') {
658239313Sdim    State = LS_Normal;
659239313Sdim    return;
660239313Sdim  }
661239313Sdim}
662239313Sdim
663239313Sdimvoid Lexer::setupAndLexHTMLEndTag(Token &T) {
664239313Sdim  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
665239313Sdim
666239313Sdim  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
667239313Sdim  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
668243830Sdim  StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
669243830Sdim  if (!isHTMLTagName(Name)) {
670243830Sdim    formTextToken(T, TagNameEnd);
671243830Sdim    return;
672243830Sdim  }
673239313Sdim
674239313Sdim  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
675239313Sdim
676239313Sdim  formTokenWithChars(T, End, tok::html_end_tag);
677243830Sdim  T.setHTMLTagEndName(Name);
678239313Sdim
679239313Sdim  if (BufferPtr != CommentEnd && *BufferPtr == '>')
680239313Sdim    State = LS_HTMLEndTag;
681239313Sdim}
682239313Sdim
683239313Sdimvoid Lexer::lexHTMLEndTag(Token &T) {
684239313Sdim  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
685239313Sdim
686239313Sdim  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
687239313Sdim  State = LS_Normal;
688239313Sdim}
689239313Sdim
690251662SdimLexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
691251662Sdim             const CommandTraits &Traits,
692243830Sdim             SourceLocation FileLoc,
693239313Sdim             const char *BufferStart, const char *BufferEnd):
694251662Sdim    Allocator(Allocator), Diags(Diags), Traits(Traits),
695239313Sdim    BufferStart(BufferStart), BufferEnd(BufferEnd),
696243830Sdim    FileLoc(FileLoc), BufferPtr(BufferStart),
697239313Sdim    CommentState(LCS_BeforeComment), State(LS_Normal) {
698239313Sdim}
699239313Sdim
700239313Sdimvoid Lexer::lex(Token &T) {
701239313Sdimagain:
702239313Sdim  switch (CommentState) {
703239313Sdim  case LCS_BeforeComment:
704239313Sdim    if (BufferPtr == BufferEnd) {
705239313Sdim      formTokenWithChars(T, BufferPtr, tok::eof);
706239313Sdim      return;
707239313Sdim    }
708239313Sdim
709239313Sdim    assert(*BufferPtr == '/');
710239313Sdim    BufferPtr++; // Skip first slash.
711239313Sdim    switch(*BufferPtr) {
712239313Sdim    case '/': { // BCPL comment.
713239313Sdim      BufferPtr++; // Skip second slash.
714239313Sdim
715239313Sdim      if (BufferPtr != BufferEnd) {
716239313Sdim        // Skip Doxygen magic marker, if it is present.
717239313Sdim        // It might be missing because of a typo //< or /*<, or because we
718239313Sdim        // merged this non-Doxygen comment into a bunch of Doxygen comments
719239313Sdim        // around it: /** ... */ /* ... */ /** ... */
720239313Sdim        const char C = *BufferPtr;
721239313Sdim        if (C == '/' || C == '!')
722239313Sdim          BufferPtr++;
723239313Sdim      }
724239313Sdim
725239313Sdim      // Skip less-than symbol that marks trailing comments.
726239313Sdim      // Skip it even if the comment is not a Doxygen one, because //< and /*<
727239313Sdim      // are frequent typos.
728239313Sdim      if (BufferPtr != BufferEnd && *BufferPtr == '<')
729239313Sdim        BufferPtr++;
730239313Sdim
731239313Sdim      CommentState = LCS_InsideBCPLComment;
732239313Sdim      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
733239313Sdim        State = LS_Normal;
734239313Sdim      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
735239313Sdim      goto again;
736239313Sdim    }
737239313Sdim    case '*': { // C comment.
738239313Sdim      BufferPtr++; // Skip star.
739239313Sdim
740239313Sdim      // Skip Doxygen magic marker.
741239313Sdim      const char C = *BufferPtr;
742239313Sdim      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
743239313Sdim        BufferPtr++;
744239313Sdim
745239313Sdim      // Skip less-than symbol that marks trailing comments.
746239313Sdim      if (BufferPtr != BufferEnd && *BufferPtr == '<')
747239313Sdim        BufferPtr++;
748239313Sdim
749239313Sdim      CommentState = LCS_InsideCComment;
750239313Sdim      State = LS_Normal;
751239313Sdim      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
752239313Sdim      goto again;
753239313Sdim    }
754239313Sdim    default:
755239313Sdim      llvm_unreachable("second character of comment should be '/' or '*'");
756239313Sdim    }
757239313Sdim
758239313Sdim  case LCS_BetweenComments: {
759239313Sdim    // Consecutive comments are extracted only if there is only whitespace
760239313Sdim    // between them.  So we can search for the start of the next comment.
761239313Sdim    const char *EndWhitespace = BufferPtr;
762239313Sdim    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
763239313Sdim      EndWhitespace++;
764239313Sdim
765239313Sdim    // Turn any whitespace between comments (and there is only whitespace
766239313Sdim    // between them -- guaranteed by comment extraction) into a newline.  We
767239313Sdim    // have two newlines between C comments in total (first one was synthesized
768239313Sdim    // after a comment).
769239313Sdim    formTokenWithChars(T, EndWhitespace, tok::newline);
770239313Sdim
771239313Sdim    CommentState = LCS_BeforeComment;
772239313Sdim    break;
773239313Sdim  }
774239313Sdim
775239313Sdim  case LCS_InsideBCPLComment:
776239313Sdim  case LCS_InsideCComment:
777239313Sdim    if (BufferPtr != CommentEnd) {
778239313Sdim      lexCommentText(T);
779239313Sdim      break;
780239313Sdim    } else {
781239313Sdim      // Skip C comment closing sequence.
782239313Sdim      if (CommentState == LCS_InsideCComment) {
783239313Sdim        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
784239313Sdim        BufferPtr += 2;
785239313Sdim        assert(BufferPtr <= BufferEnd);
786239313Sdim
787239313Sdim        // Synthenize newline just after the C comment, regardless if there is
788239313Sdim        // actually a newline.
789239313Sdim        formTokenWithChars(T, BufferPtr, tok::newline);
790239313Sdim
791239313Sdim        CommentState = LCS_BetweenComments;
792239313Sdim        break;
793239313Sdim      } else {
794239313Sdim        // Don't synthesized a newline after BCPL comment.
795239313Sdim        CommentState = LCS_BetweenComments;
796239313Sdim        goto again;
797239313Sdim      }
798239313Sdim    }
799239313Sdim  }
800239313Sdim}
801239313Sdim
802239313SdimStringRef Lexer::getSpelling(const Token &Tok,
803239313Sdim                             const SourceManager &SourceMgr,
804239313Sdim                             bool *Invalid) const {
805239313Sdim  SourceLocation Loc = Tok.getLocation();
806239313Sdim  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
807239313Sdim
808239313Sdim  bool InvalidTemp = false;
809239313Sdim  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
810239313Sdim  if (InvalidTemp) {
811239313Sdim    *Invalid = true;
812239313Sdim    return StringRef();
813239313Sdim  }
814239313Sdim
815239313Sdim  const char *Begin = File.data() + LocInfo.second;
816239313Sdim  return StringRef(Begin, Tok.getLength());
817239313Sdim}
818239313Sdim
819239313Sdim} // end namespace comments
820239313Sdim} // end namespace clang
821239313Sdim
822