1239313Sdim//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2239313Sdim//
3239313Sdim//                     The LLVM Compiler Infrastructure
4239313Sdim//
5239313Sdim// This file is distributed under the University of Illinois Open Source
6239313Sdim// License. See LICENSE.TXT for details.
7239313Sdim//
8239313Sdim//===----------------------------------------------------------------------===//
9239313Sdim//
//  This file defines the lexer for structured comments and the supporting
//  token class.
11239313Sdim//
12239313Sdim//===----------------------------------------------------------------------===//
13239313Sdim
14239313Sdim#ifndef LLVM_CLANG_AST_COMMENT_LEXER_H
15239313Sdim#define LLVM_CLANG_AST_COMMENT_LEXER_H
16239313Sdim
17239313Sdim#include "clang/Basic/SourceManager.h"
18251662Sdim#include "clang/Basic/Diagnostic.h"
19239313Sdim#include "llvm/ADT/SmallString.h"
20239313Sdim#include "llvm/ADT/SmallVector.h"
21249423Sdim#include "llvm/ADT/StringRef.h"
22239313Sdim#include "llvm/Support/Allocator.h"
23239313Sdim#include "llvm/Support/raw_ostream.h"
24239313Sdim
25239313Sdimnamespace clang {
26239313Sdimnamespace comments {
27239313Sdim
28239313Sdimclass Lexer;
29239313Sdimclass TextTokenRetokenizer;
30243830Sdimstruct CommandInfo;
31239313Sdimclass CommandTraits;
32239313Sdim
namespace tok {
/// Kinds of tokens that the comment lexer produces.
///
/// The enumerator order is kept stable; the numeric values follow from the
/// declaration order.
enum TokenKind {
  eof,
  newline,

  /// Token with a text payload (see Token::getText()).
  text,

  /// Command that does not have an ID.
  unknown_command,

  /// Command with an ID, that used backslash marker.
  backslash_command,

  /// Command with an ID, that used 'at' marker.
  at_command,

  /// Carries a verbatim block ID (see Token::getVerbatimBlockID()).
  verbatim_block_begin,

  /// Carries the text of a verbatim block line
  /// (see Token::getVerbatimBlockText()).
  verbatim_block_line,

  /// Carries a verbatim block ID (see Token::getVerbatimBlockID()).
  verbatim_block_end,

  /// Carries a verbatim line ID (see Token::getVerbatimLineID()).
  verbatim_line_name,

  /// Carries the text of a verbatim line (see Token::getVerbatimLineText()).
  verbatim_line_text,

  /// <tag
  html_start_tag,

  /// attr
  html_ident,

  /// =
  html_equals,

  /// "blah\"blah" or 'blah\'blah'
  html_quoted_string,

  /// >
  html_greater,

  /// />
  html_slash_greater,

  /// </tag
  html_end_tag
};
} // end namespace tok
55239313Sdim
56239313Sdim/// \brief Comment token.
57239313Sdimclass Token {
58239313Sdim  friend class Lexer;
59239313Sdim  friend class TextTokenRetokenizer;
60239313Sdim
61239313Sdim  /// The location of the token.
62239313Sdim  SourceLocation Loc;
63239313Sdim
64239313Sdim  /// The actual kind of the token.
65239313Sdim  tok::TokenKind Kind;
66239313Sdim
67239313Sdim  /// Length of the token spelling in comment.  Can be 0 for synthenized
68239313Sdim  /// tokens.
69239313Sdim  unsigned Length;
70239313Sdim
71239313Sdim  /// Contains text value associated with a token.
72243830Sdim  const char *TextPtr;
73239313Sdim
74243830Sdim  /// Integer value associated with a token.
75243830Sdim  ///
76243830Sdim  /// If the token is a konwn command, contains command ID and TextPtr is
77243830Sdim  /// unused (command spelling can be found with CommandTraits).  Otherwise,
78243830Sdim  /// contains the length of the string that starts at TextPtr.
79243830Sdim  unsigned IntVal;
80249423Sdim
81239313Sdimpublic:
82239313Sdim  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
83239313Sdim  void setLocation(SourceLocation SL) { Loc = SL; }
84239313Sdim
85239313Sdim  SourceLocation getEndLocation() const LLVM_READONLY {
86239313Sdim    if (Length == 0 || Length == 1)
87239313Sdim      return Loc;
88239313Sdim    return Loc.getLocWithOffset(Length - 1);
89239313Sdim  }
90239313Sdim
91239313Sdim  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
92239313Sdim  void setKind(tok::TokenKind K) { Kind = K; }
93239313Sdim
94239313Sdim  bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
95239313Sdim  bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
96239313Sdim
97239313Sdim  unsigned getLength() const LLVM_READONLY { return Length; }
98239313Sdim  void setLength(unsigned L) { Length = L; }
99239313Sdim
100239313Sdim  StringRef getText() const LLVM_READONLY {
101239313Sdim    assert(is(tok::text));
102243830Sdim    return StringRef(TextPtr, IntVal);
103239313Sdim  }
104239313Sdim
105239313Sdim  void setText(StringRef Text) {
106239313Sdim    assert(is(tok::text));
107243830Sdim    TextPtr = Text.data();
108243830Sdim    IntVal = Text.size();
109239313Sdim  }
110239313Sdim
111243830Sdim  StringRef getUnknownCommandName() const LLVM_READONLY {
112243830Sdim    assert(is(tok::unknown_command));
113243830Sdim    return StringRef(TextPtr, IntVal);
114243830Sdim  }
115243830Sdim
116243830Sdim  void setUnknownCommandName(StringRef Name) {
117243830Sdim    assert(is(tok::unknown_command));
118243830Sdim    TextPtr = Name.data();
119243830Sdim    IntVal = Name.size();
120243830Sdim  }
121243830Sdim
122243830Sdim  unsigned getCommandID() const LLVM_READONLY {
123249423Sdim    assert(is(tok::backslash_command) || is(tok::at_command));
124243830Sdim    return IntVal;
125239313Sdim  }
126239313Sdim
127243830Sdim  void setCommandID(unsigned ID) {
128249423Sdim    assert(is(tok::backslash_command) || is(tok::at_command));
129243830Sdim    IntVal = ID;
130239313Sdim  }
131239313Sdim
132243830Sdim  unsigned getVerbatimBlockID() const LLVM_READONLY {
133239313Sdim    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
134243830Sdim    return IntVal;
135239313Sdim  }
136239313Sdim
137243830Sdim  void setVerbatimBlockID(unsigned ID) {
138239313Sdim    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
139243830Sdim    IntVal = ID;
140239313Sdim  }
141239313Sdim
142239313Sdim  StringRef getVerbatimBlockText() const LLVM_READONLY {
143239313Sdim    assert(is(tok::verbatim_block_line));
144243830Sdim    return StringRef(TextPtr, IntVal);
145239313Sdim  }
146239313Sdim
147239313Sdim  void setVerbatimBlockText(StringRef Text) {
148239313Sdim    assert(is(tok::verbatim_block_line));
149243830Sdim    TextPtr = Text.data();
150243830Sdim    IntVal = Text.size();
151239313Sdim  }
152239313Sdim
153243830Sdim  unsigned getVerbatimLineID() const LLVM_READONLY {
154239313Sdim    assert(is(tok::verbatim_line_name));
155243830Sdim    return IntVal;
156239313Sdim  }
157239313Sdim
158243830Sdim  void setVerbatimLineID(unsigned ID) {
159239313Sdim    assert(is(tok::verbatim_line_name));
160243830Sdim    IntVal = ID;
161239313Sdim  }
162239313Sdim
163239313Sdim  StringRef getVerbatimLineText() const LLVM_READONLY {
164239313Sdim    assert(is(tok::verbatim_line_text));
165243830Sdim    return StringRef(TextPtr, IntVal);
166239313Sdim  }
167239313Sdim
168239313Sdim  void setVerbatimLineText(StringRef Text) {
169239313Sdim    assert(is(tok::verbatim_line_text));
170243830Sdim    TextPtr = Text.data();
171243830Sdim    IntVal = Text.size();
172239313Sdim  }
173239313Sdim
174239313Sdim  StringRef getHTMLTagStartName() const LLVM_READONLY {
175239313Sdim    assert(is(tok::html_start_tag));
176243830Sdim    return StringRef(TextPtr, IntVal);
177239313Sdim  }
178239313Sdim
179239313Sdim  void setHTMLTagStartName(StringRef Name) {
180239313Sdim    assert(is(tok::html_start_tag));
181243830Sdim    TextPtr = Name.data();
182243830Sdim    IntVal = Name.size();
183239313Sdim  }
184239313Sdim
185239313Sdim  StringRef getHTMLIdent() const LLVM_READONLY {
186239313Sdim    assert(is(tok::html_ident));
187243830Sdim    return StringRef(TextPtr, IntVal);
188239313Sdim  }
189239313Sdim
190239313Sdim  void setHTMLIdent(StringRef Name) {
191239313Sdim    assert(is(tok::html_ident));
192243830Sdim    TextPtr = Name.data();
193243830Sdim    IntVal = Name.size();
194239313Sdim  }
195239313Sdim
196239313Sdim  StringRef getHTMLQuotedString() const LLVM_READONLY {
197239313Sdim    assert(is(tok::html_quoted_string));
198243830Sdim    return StringRef(TextPtr, IntVal);
199239313Sdim  }
200239313Sdim
201239313Sdim  void setHTMLQuotedString(StringRef Str) {
202239313Sdim    assert(is(tok::html_quoted_string));
203243830Sdim    TextPtr = Str.data();
204243830Sdim    IntVal = Str.size();
205239313Sdim  }
206239313Sdim
207239313Sdim  StringRef getHTMLTagEndName() const LLVM_READONLY {
208239313Sdim    assert(is(tok::html_end_tag));
209243830Sdim    return StringRef(TextPtr, IntVal);
210239313Sdim  }
211239313Sdim
212239313Sdim  void setHTMLTagEndName(StringRef Name) {
213239313Sdim    assert(is(tok::html_end_tag));
214243830Sdim    TextPtr = Name.data();
215243830Sdim    IntVal = Name.size();
216239313Sdim  }
217239313Sdim
218239313Sdim  void dump(const Lexer &L, const SourceManager &SM) const;
219239313Sdim};
220239313Sdim
221239313Sdim/// \brief Comment lexer.
222239313Sdimclass Lexer {
223239313Sdimprivate:
224243830Sdim  Lexer(const Lexer &) LLVM_DELETED_FUNCTION;
225243830Sdim  void operator=(const Lexer &) LLVM_DELETED_FUNCTION;
226239313Sdim
227239313Sdim  /// Allocator for strings that are semantic values of tokens and have to be
228239313Sdim  /// computed (for example, resolved decimal character references).
229239313Sdim  llvm::BumpPtrAllocator &Allocator;
230239313Sdim
231251662Sdim  DiagnosticsEngine &Diags;
232251662Sdim
233239313Sdim  const CommandTraits &Traits;
234239313Sdim
235239313Sdim  const char *const BufferStart;
236239313Sdim  const char *const BufferEnd;
237239313Sdim  SourceLocation FileLoc;
238239313Sdim
239239313Sdim  const char *BufferPtr;
240239313Sdim
241239313Sdim  /// One past end pointer for the current comment.  For BCPL comments points
242239313Sdim  /// to newline or BufferEnd, for C comments points to star in '*/'.
243239313Sdim  const char *CommentEnd;
244239313Sdim
245239313Sdim  enum LexerCommentState {
246239313Sdim    LCS_BeforeComment,
247239313Sdim    LCS_InsideBCPLComment,
248239313Sdim    LCS_InsideCComment,
249239313Sdim    LCS_BetweenComments
250239313Sdim  };
251239313Sdim
252239313Sdim  /// Low-level lexer state, track if we are inside or outside of comment.
253239313Sdim  LexerCommentState CommentState;
254239313Sdim
255239313Sdim  enum LexerState {
256239313Sdim    /// Lexing normal comment text
257239313Sdim    LS_Normal,
258239313Sdim
259239313Sdim    /// Finished lexing verbatim block beginning command, will lex first body
260239313Sdim    /// line.
261239313Sdim    LS_VerbatimBlockFirstLine,
262239313Sdim
263239313Sdim    /// Lexing verbatim block body line-by-line, skipping line-starting
264239313Sdim    /// decorations.
265239313Sdim    LS_VerbatimBlockBody,
266239313Sdim
267239313Sdim    /// Finished lexing verbatim line beginning command, will lex text (one
268239313Sdim    /// line).
269239313Sdim    LS_VerbatimLineText,
270239313Sdim
271239313Sdim    /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
272239313Sdim    LS_HTMLStartTag,
273239313Sdim
274239313Sdim    /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
275239313Sdim    LS_HTMLEndTag
276239313Sdim  };
277239313Sdim
278239313Sdim  /// Current lexing mode.
279239313Sdim  LexerState State;
280239313Sdim
281239313Sdim  /// If State is LS_VerbatimBlock, contains the name of verbatim end
282239313Sdim  /// command, including command marker.
283239313Sdim  SmallString<16> VerbatimBlockEndCommandName;
284239313Sdim
285239313Sdim  /// Given a character reference name (e.g., "lt"), return the character that
286239313Sdim  /// it stands for (e.g., "<").
287239313Sdim  StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
288239313Sdim
289239313Sdim  /// Given a Unicode codepoint as base-10 integer, return the character.
290239313Sdim  StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
291239313Sdim
292239313Sdim  /// Given a Unicode codepoint as base-16 integer, return the character.
293239313Sdim  StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
294239313Sdim
295239313Sdim  void formTokenWithChars(Token &Result, const char *TokEnd,
296239313Sdim                          tok::TokenKind Kind) {
297239313Sdim    const unsigned TokLen = TokEnd - BufferPtr;
298239313Sdim    Result.setLocation(getSourceLocation(BufferPtr));
299239313Sdim    Result.setKind(Kind);
300239313Sdim    Result.setLength(TokLen);
301239313Sdim#ifndef NDEBUG
302243830Sdim    Result.TextPtr = "<UNSET>";
303243830Sdim    Result.IntVal = 7;
304239313Sdim#endif
305239313Sdim    BufferPtr = TokEnd;
306239313Sdim  }
307239313Sdim
308239313Sdim  void formTextToken(Token &Result, const char *TokEnd) {
309239313Sdim    StringRef Text(BufferPtr, TokEnd - BufferPtr);
310239313Sdim    formTokenWithChars(Result, TokEnd, tok::text);
311239313Sdim    Result.setText(Text);
312239313Sdim  }
313239313Sdim
314239313Sdim  SourceLocation getSourceLocation(const char *Loc) const {
315239313Sdim    assert(Loc >= BufferStart && Loc <= BufferEnd &&
316239313Sdim           "Location out of range for this buffer!");
317239313Sdim
318239313Sdim    const unsigned CharNo = Loc - BufferStart;
319239313Sdim    return FileLoc.getLocWithOffset(CharNo);
320239313Sdim  }
321239313Sdim
322251662Sdim  DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
323251662Sdim    return Diags.Report(Loc, DiagID);
324251662Sdim  }
325251662Sdim
326239313Sdim  /// Eat string matching regexp \code \s*\* \endcode.
327239313Sdim  void skipLineStartingDecorations();
328239313Sdim
329239313Sdim  /// Lex stuff inside comments.  CommentEnd should be set correctly.
330239313Sdim  void lexCommentText(Token &T);
331239313Sdim
332239313Sdim  void setupAndLexVerbatimBlock(Token &T,
333239313Sdim                                const char *TextBegin,
334243830Sdim                                char Marker, const CommandInfo *Info);
335239313Sdim
336239313Sdim  void lexVerbatimBlockFirstLine(Token &T);
337239313Sdim
338239313Sdim  void lexVerbatimBlockBody(Token &T);
339239313Sdim
340243830Sdim  void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
341243830Sdim                               const CommandInfo *Info);
342239313Sdim
343239313Sdim  void lexVerbatimLineText(Token &T);
344239313Sdim
345239313Sdim  void lexHTMLCharacterReference(Token &T);
346239313Sdim
347239313Sdim  void setupAndLexHTMLStartTag(Token &T);
348239313Sdim
349239313Sdim  void lexHTMLStartTag(Token &T);
350239313Sdim
351239313Sdim  void setupAndLexHTMLEndTag(Token &T);
352239313Sdim
353239313Sdim  void lexHTMLEndTag(Token &T);
354239313Sdim
355239313Sdimpublic:
356251662Sdim  Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
357251662Sdim        const CommandTraits &Traits,
358243830Sdim        SourceLocation FileLoc,
359239313Sdim        const char *BufferStart, const char *BufferEnd);
360239313Sdim
361239313Sdim  void lex(Token &T);
362239313Sdim
363239313Sdim  StringRef getSpelling(const Token &Tok,
364239313Sdim                        const SourceManager &SourceMgr,
365239313Sdim                        bool *Invalid = NULL) const;
366239313Sdim};
367239313Sdim
368239313Sdim} // end namespace comments
369239313Sdim} // end namespace clang
370239313Sdim
371239313Sdim#endif
372239313Sdim
373