ScriptLexer.cpp revision 363496
1//===- ScriptLexer.cpp ----------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a lexer for the linker script.
10//
// The linker script's grammar is not complex, but it is ambiguous because
// the language has no formal specification. What we are trying to do in
// this and other files in LLD is to make a "reasonable" linker script
// processor.
15//
16// Among simplicity, compatibility and efficiency, we put the most
17// emphasis on simplicity when we wrote this lexer. Compatibility with the
18// GNU linkers is important, but we did not try to clone every tiny corner
19// case of their lexers, as even ld.bfd and ld.gold are subtly different
20// in various corner cases. We do not care much about efficiency because
21// the time spent in parsing linker scripts is usually negligible.
22//
23// Our grammar of the linker script is LL(2), meaning that it needs at
24// most two-token lookahead to parse. The only place we need two-token
25// lookahead is labels in version scripts, where we need to parse "local :"
26// as if "local:".
27//
28// Overall, this lexer works fine for most linker scripts. There might
29// be room for improving compatibility, but that's probably not at the
30// top of our todo list.
31//
32//===----------------------------------------------------------------------===//
33
34#include "ScriptLexer.h"
35#include "lld/Common/ErrorHandler.h"
36#include "llvm/ADT/Twine.h"
37
38using namespace llvm;
39
40namespace lld {
41namespace elf {
42// Returns a whole line containing the current token.
43StringRef ScriptLexer::getLine() {
44  StringRef s = getCurrentMB().getBuffer();
45  StringRef tok = tokens[pos - 1];
46
47  size_t pos = s.rfind('\n', tok.data() - s.data());
48  if (pos != StringRef::npos)
49    s = s.substr(pos + 1);
50  return s.substr(0, s.find_first_of("\r\n"));
51}
52
53// Returns 1-based line number of the current token.
54size_t ScriptLexer::getLineNumber() {
55  if (pos == 0)
56    return 1;
57  StringRef s = getCurrentMB().getBuffer();
58  StringRef tok = tokens[pos - 1];
59  return s.substr(0, tok.data() - s.data()).count('\n') + 1;
60}
61
62// Returns 0-based column number of the current token.
63size_t ScriptLexer::getColumnNumber() {
64  StringRef tok = tokens[pos - 1];
65  return tok.data() - getLine().data();
66}
67
68std::string ScriptLexer::getCurrentLocation() {
69  std::string filename = getCurrentMB().getBufferIdentifier();
70  return (filename + ":" + Twine(getLineNumber())).str();
71}
72
73ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); }
74
75// We don't want to record cascading errors. Keep only the first one.
76void ScriptLexer::setError(const Twine &msg) {
77  if (errorCount())
78    return;
79
80  std::string s = (getCurrentLocation() + ": " + msg).str();
81  if (pos)
82    s += "\n>>> " + getLine().str() + "\n>>> " +
83         std::string(getColumnNumber(), ' ') + "^";
84  error(s);
85}
86
87// Split S into linker script tokens.
88void ScriptLexer::tokenize(MemoryBufferRef mb) {
89  std::vector<StringRef> vec;
90  mbs.push_back(mb);
91  StringRef s = mb.getBuffer();
92  StringRef begin = s;
93
94  for (;;) {
95    s = skipSpace(s);
96    if (s.empty())
97      break;
98
99    // Quoted token. Note that double-quote characters are parts of a token
100    // because, in a glob match context, only unquoted tokens are interpreted
101    // as glob patterns. Double-quoted tokens are literal patterns in that
102    // context.
103    if (s.startswith("\"")) {
104      size_t e = s.find("\"", 1);
105      if (e == StringRef::npos) {
106        StringRef filename = mb.getBufferIdentifier();
107        size_t lineno = begin.substr(0, s.data() - begin.data()).count('\n');
108        error(filename + ":" + Twine(lineno + 1) + ": unclosed quote");
109        return;
110      }
111
112      vec.push_back(s.take_front(e + 1));
113      s = s.substr(e + 1);
114      continue;
115    }
116
117    // ">foo" is parsed to ">" and "foo", but ">>" is parsed to ">>".
118    // "|", "||", "&" and "&&" are different operators.
119    if (s.startswith("<<") || s.startswith("<=") || s.startswith(">>") ||
120        s.startswith(">=") || s.startswith("||") || s.startswith("&&")) {
121      vec.push_back(s.substr(0, 2));
122      s = s.substr(2);
123      continue;
124    }
125
126    // Unquoted token. This is more relaxed than tokens in C-like language,
127    // so that you can write "file-name.cpp" as one bare token, for example.
128    size_t pos = s.find_first_not_of(
129        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
130        "0123456789_.$/\\~=+[]*?-!^:");
131
132    // A character that cannot start a word (which is usually a
133    // punctuation) forms a single character token.
134    if (pos == 0)
135      pos = 1;
136    vec.push_back(s.substr(0, pos));
137    s = s.substr(pos);
138  }
139
140  tokens.insert(tokens.begin() + pos, vec.begin(), vec.end());
141}
142
143// Skip leading whitespace characters or comments.
144StringRef ScriptLexer::skipSpace(StringRef s) {
145  for (;;) {
146    if (s.startswith("/*")) {
147      size_t e = s.find("*/", 2);
148      if (e == StringRef::npos) {
149        error("unclosed comment in a linker script");
150        return "";
151      }
152      s = s.substr(e + 2);
153      continue;
154    }
155    if (s.startswith("#")) {
156      size_t e = s.find('\n', 1);
157      if (e == StringRef::npos)
158        e = s.size() - 1;
159      s = s.substr(e + 1);
160      continue;
161    }
162    size_t size = s.size();
163    s = s.ltrim();
164    if (s.size() == size)
165      return s;
166  }
167}
168
169// An erroneous token is handled as if it were the last token before EOF.
170bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; }
171
172// Split a given string as an expression.
173// This function returns "3", "*" and "5" for "3*5" for example.
174static std::vector<StringRef> tokenizeExpr(StringRef s) {
175  StringRef ops = "+-*/:!~=<>"; // List of operators
176
177  // Quoted strings are literal strings, so we don't want to split it.
178  if (s.startswith("\""))
179    return {s};
180
181  // Split S with operators as separators.
182  std::vector<StringRef> ret;
183  while (!s.empty()) {
184    size_t e = s.find_first_of(ops);
185
186    // No need to split if there is no operator.
187    if (e == StringRef::npos) {
188      ret.push_back(s);
189      break;
190    }
191
192    // Get a token before the opreator.
193    if (e != 0)
194      ret.push_back(s.substr(0, e));
195
196    // Get the operator as a token.
197    // Keep !=, ==, >=, <=, << and >> operators as a single tokens.
198    if (s.substr(e).startswith("!=") || s.substr(e).startswith("==") ||
199        s.substr(e).startswith(">=") || s.substr(e).startswith("<=") ||
200        s.substr(e).startswith("<<") || s.substr(e).startswith(">>")) {
201      ret.push_back(s.substr(e, 2));
202      s = s.substr(e + 2);
203    } else {
204      ret.push_back(s.substr(e, 1));
205      s = s.substr(e + 1);
206    }
207  }
208  return ret;
209}
210
211// In contexts where expressions are expected, the lexer should apply
212// different tokenization rules than the default one. By default,
213// arithmetic operator characters are regular characters, but in the
214// expression context, they should be independent tokens.
215//
216// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
217// in the expression context.
218//
219// This function may split the current token into multiple tokens.
220void ScriptLexer::maybeSplitExpr() {
221  if (!inExpr || errorCount() || atEOF())
222    return;
223
224  std::vector<StringRef> v = tokenizeExpr(tokens[pos]);
225  if (v.size() == 1)
226    return;
227  tokens.erase(tokens.begin() + pos);
228  tokens.insert(tokens.begin() + pos, v.begin(), v.end());
229}
230
231StringRef ScriptLexer::next() {
232  maybeSplitExpr();
233
234  if (errorCount())
235    return "";
236  if (atEOF()) {
237    setError("unexpected EOF");
238    return "";
239  }
240  return tokens[pos++];
241}
242
243StringRef ScriptLexer::peek() {
244  StringRef tok = next();
245  if (errorCount())
246    return "";
247  pos = pos - 1;
248  return tok;
249}
250
251StringRef ScriptLexer::peek2() {
252  skip();
253  StringRef tok = next();
254  if (errorCount())
255    return "";
256  pos = pos - 2;
257  return tok;
258}
259
260bool ScriptLexer::consume(StringRef tok) {
261  if (peek() == tok) {
262    skip();
263    return true;
264  }
265  return false;
266}
267
268// Consumes Tok followed by ":". Space is allowed between Tok and ":".
269bool ScriptLexer::consumeLabel(StringRef tok) {
270  if (consume((tok + ":").str()))
271    return true;
272  if (tokens.size() >= pos + 2 && tokens[pos] == tok &&
273      tokens[pos + 1] == ":") {
274    pos += 2;
275    return true;
276  }
277  return false;
278}
279
280void ScriptLexer::skip() { (void)next(); }
281
282void ScriptLexer::expect(StringRef expect) {
283  if (errorCount())
284    return;
285  StringRef tok = next();
286  if (tok != expect)
287    setError(expect + " expected, but got " + tok);
288}
289
290// Returns true if S encloses T.
291static bool encloses(StringRef s, StringRef t) {
292  return s.bytes_begin() <= t.bytes_begin() && t.bytes_end() <= s.bytes_end();
293}
294
295MemoryBufferRef ScriptLexer::getCurrentMB() {
296  // Find input buffer containing the current token.
297  assert(!mbs.empty());
298  if (pos == 0)
299    return mbs.back();
300  for (MemoryBufferRef mb : mbs)
301    if (encloses(mb.getBuffer(), tokens[pos - 1]))
302      return mb;
303  llvm_unreachable("getCurrentMB: failed to find a token");
304}
305
306} // namespace elf
307} // namespace lld
308