1//===- DFAEmitter.cpp - Finite state automaton emitter --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This class can produce a generic deterministic finite state automaton (DFA),
10// given a set of possible states and transitions.
11//
12// The input transitions can be nondeterministic - this class will produce the
13// deterministic equivalent state machine.
14//
15// The generated code can run the DFA and produce an accepted / not accepted
16// state and also produce, given a sequence of transitions that results in an
17// accepted state, the sequence of intermediate states. This is useful if the
18// initial automaton was nondeterministic - it allows mapping back from the DFA
19// to the NFA.
20//
21//===----------------------------------------------------------------------===//
22#define DEBUG_TYPE "dfa-emitter"
23
#include "DFAEmitter.h"
#include "CodeGenTarget.h"
#include "SequenceToOffsetTable.h"
#include "TableGenBackends.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <deque>
#include <map>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
41
42using namespace llvm;
43
44//===----------------------------------------------------------------------===//
45// DfaEmitter implementation. This is independent of the GenAutomaton backend.
46//===----------------------------------------------------------------------===//
47
48void DfaEmitter::addTransition(state_type From, state_type To, action_type A) {
49  Actions.insert(A);
50  NfaStates.insert(From);
51  NfaStates.insert(To);
52  NfaTransitions[{From, A}].push_back(To);
53  ++NumNfaTransitions;
54}
55
56void DfaEmitter::visitDfaState(const DfaState &DS) {
57  // For every possible action...
58  auto FromId = DfaStates.idFor(DS);
59  for (action_type A : Actions) {
60    DfaState NewStates;
61    DfaTransitionInfo TI;
62    // For every represented state, word pair in the original NFA...
63    for (state_type FromState : DS) {
64      // If this action is possible from this state add the transitioned-to
65      // states to NewStates.
66      auto I = NfaTransitions.find({FromState, A});
67      if (I == NfaTransitions.end())
68        continue;
69      for (state_type &ToState : I->second) {
70        NewStates.push_back(ToState);
71        TI.emplace_back(FromState, ToState);
72      }
73    }
74    if (NewStates.empty())
75      continue;
76    // Sort and unique.
77    sort(NewStates);
78    NewStates.erase(std::unique(NewStates.begin(), NewStates.end()),
79                    NewStates.end());
80    sort(TI);
81    TI.erase(std::unique(TI.begin(), TI.end()), TI.end());
82    unsigned ToId = DfaStates.insert(NewStates);
83    DfaTransitions.emplace(std::make_pair(FromId, A), std::make_pair(ToId, TI));
84  }
85}
86
87void DfaEmitter::constructDfa() {
88  DfaState Initial(1, /*NFA initial state=*/0);
89  DfaStates.insert(Initial);
90
91  // Note that UniqueVector starts indices at 1, not zero.
92  unsigned DfaStateId = 1;
93  while (DfaStateId <= DfaStates.size()) {
94    DfaState S = DfaStates[DfaStateId];
95    visitDfaState(S);
96    DfaStateId++;
97  }
98}
99
100void DfaEmitter::emit(StringRef Name, raw_ostream &OS) {
101  constructDfa();
102
103  OS << "// Input NFA has " << NfaStates.size() << " states with "
104     << NumNfaTransitions << " transitions.\n";
105  OS << "// Generated DFA has " << DfaStates.size() << " states with "
106     << DfaTransitions.size() << " transitions.\n\n";
107
108  // Implementation note: We don't bake a simple std::pair<> here as it requires
109  // significantly more effort to parse. A simple test with a large array of
110  // struct-pairs (N=100000) took clang-10 6s to parse. The same array of
111  // std::pair<uint64_t, uint64_t> took 242s. Instead we allow the user to
112  // define the pair type.
113  //
114  // FIXME: It may make sense to emit these as ULEB sequences instead of
115  // pairs of uint64_t.
116  OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n";
117  OS << "// transition implies a set of NFA transitions. These are referred\n";
118  OS << "// to by index in " << Name << "Transitions[].\n";
119
120  SequenceToOffsetTable<DfaTransitionInfo> Table;
121  std::map<DfaTransitionInfo, unsigned> EmittedIndices;
122  for (auto &T : DfaTransitions)
123    Table.add(T.second.second);
124  Table.layout();
125  OS << "std::array<NfaStatePair, " << Table.size() << "> " << Name
126     << "TransitionInfo = {{\n";
127  Table.emit(
128      OS,
129      [](raw_ostream &OS, std::pair<uint64_t, uint64_t> P) {
130        OS << "{" << P.first << ", " << P.second << "}";
131      },
132      "{0ULL, 0ULL}");
133
134  OS << "}};\n\n";
135
136  OS << "// A transition in the generated " << Name << " DFA.\n";
137  OS << "struct " << Name << "Transition {\n";
138  OS << "  unsigned FromDfaState; // The transitioned-from DFA state.\n";
139  OS << "  ";
140  printActionType(OS);
141  OS << " Action;       // The input symbol that causes this transition.\n";
142  OS << "  unsigned ToDfaState;   // The transitioned-to DFA state.\n";
143  OS << "  unsigned InfoIdx;      // Start index into " << Name
144     << "TransitionInfo.\n";
145  OS << "};\n\n";
146
147  OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n";
148  OS << "// The initial state is 1, not zero.\n";
149  OS << "std::array<" << Name << "Transition, " << DfaTransitions.size() << "> "
150     << Name << "Transitions = {{\n";
151  for (auto &KV : DfaTransitions) {
152    dfa_state_type From = KV.first.first;
153    dfa_state_type To = KV.second.first;
154    action_type A = KV.first.second;
155    unsigned InfoIdx = Table.get(KV.second.second);
156    OS << "  {" << From << ", ";
157    printActionValue(A, OS);
158    OS << ", " << To << ", " << InfoIdx << "},\n";
159  }
160  OS << "\n}};\n\n";
161}
162
163void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; }
164
165void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; }
166
167//===----------------------------------------------------------------------===//
168// AutomatonEmitter implementation
169//===----------------------------------------------------------------------===//
170
171namespace {
172// FIXME: This entire discriminated union could be removed with c++17:
173//   using Action = std::variant<Record *, unsigned, std::string>;
174struct Action {
175  Record *R = nullptr;
176  unsigned I = 0;
177  std::string S = nullptr;
178
179  Action() = default;
180  Action(Record *R, unsigned I, std::string S) : R(R), I(I), S(S) {}
181
182  void print(raw_ostream &OS) const {
183    if (R)
184      OS << R->getName();
185    else if (!S.empty())
186      OS << '"' << S << '"';
187    else
188      OS << I;
189  }
190  bool operator<(const Action &Other) const {
191    return std::make_tuple(R, I, S) <
192           std::make_tuple(Other.R, Other.I, Other.S);
193  }
194};
195
196using ActionTuple = std::vector<Action>;
197class Automaton;
198
199class Transition {
200  uint64_t NewState;
201  // The tuple of actions that causes this transition.
202  ActionTuple Actions;
203  // The types of the actions; this is the same across all transitions.
204  SmallVector<std::string, 4> Types;
205
206public:
207  Transition(Record *R, Automaton *Parent);
208  const ActionTuple &getActions() { return Actions; }
209  SmallVector<std::string, 4> getTypes() { return Types; }
210
211  bool canTransitionFrom(uint64_t State);
212  uint64_t transitionFrom(uint64_t State);
213};
214
215class Automaton {
216  RecordKeeper &Records;
217  Record *R;
218  std::vector<Transition> Transitions;
219  /// All possible action tuples, uniqued.
220  UniqueVector<ActionTuple> Actions;
221  /// The fields within each Transition object to find the action symbols.
222  std::vector<StringRef> ActionSymbolFields;
223
224public:
225  Automaton(RecordKeeper &Records, Record *R);
226  void emit(raw_ostream &OS);
227
228  ArrayRef<StringRef> getActionSymbolFields() { return ActionSymbolFields; }
229  /// If the type of action A has been overridden (there exists a field
230  /// "TypeOf_A") return that, otherwise return the empty string.
231  StringRef getActionSymbolType(StringRef A);
232};
233
234class AutomatonEmitter {
235  RecordKeeper &Records;
236
237public:
238  AutomatonEmitter(RecordKeeper &R) : Records(R) {}
239  void run(raw_ostream &OS);
240};
241
242/// A DfaEmitter implementation that can print our variant action type.
243class CustomDfaEmitter : public DfaEmitter {
244  const UniqueVector<ActionTuple> &Actions;
245  std::string TypeName;
246
247public:
248  CustomDfaEmitter(const UniqueVector<ActionTuple> &Actions, StringRef TypeName)
249      : Actions(Actions), TypeName(TypeName) {}
250
251  void printActionType(raw_ostream &OS) override;
252  void printActionValue(action_type A, raw_ostream &OS) override;
253};
254} // namespace
255
256void AutomatonEmitter::run(raw_ostream &OS) {
257  for (Record *R : Records.getAllDerivedDefinitions("GenericAutomaton")) {
258    Automaton A(Records, R);
259    OS << "#ifdef GET_" << R->getName() << "_DECL\n";
260    A.emit(OS);
261    OS << "#endif  // GET_" << R->getName() << "_DECL\n";
262  }
263}
264
265Automaton::Automaton(RecordKeeper &Records, Record *R)
266    : Records(Records), R(R) {
267  LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n");
268  ActionSymbolFields = R->getValueAsListOfStrings("SymbolFields");
269}
270
271void Automaton::emit(raw_ostream &OS) {
272  StringRef TransitionClass = R->getValueAsString("TransitionClass");
273  for (Record *T : Records.getAllDerivedDefinitions(TransitionClass)) {
274    assert(T->isSubClassOf("Transition"));
275    Transitions.emplace_back(T, this);
276    Actions.insert(Transitions.back().getActions());
277  }
278
279  LLVM_DEBUG(dbgs() << "  Action alphabet cardinality: " << Actions.size()
280                    << "\n");
281  LLVM_DEBUG(dbgs() << "  Each state has " << Transitions.size()
282                    << " potential transitions.\n");
283
284  StringRef Name = R->getName();
285
286  CustomDfaEmitter Emitter(Actions, std::string(Name) + "Action");
287  // Starting from the initial state, build up a list of possible states and
288  // transitions.
289  std::deque<uint64_t> Worklist(1, 0);
290  std::set<uint64_t> SeenStates;
291  unsigned NumTransitions = 0;
292  SeenStates.insert(Worklist.front());
293  while (!Worklist.empty()) {
294    uint64_t State = Worklist.front();
295    Worklist.pop_front();
296    for (Transition &T : Transitions) {
297      if (!T.canTransitionFrom(State))
298        continue;
299      uint64_t NewState = T.transitionFrom(State);
300      if (SeenStates.emplace(NewState).second)
301        Worklist.emplace_back(NewState);
302      ++NumTransitions;
303      Emitter.addTransition(State, NewState, Actions.idFor(T.getActions()));
304    }
305  }
306  LLVM_DEBUG(dbgs() << "  NFA automaton has " << SeenStates.size()
307                    << " states with " << NumTransitions << " transitions.\n");
308
309  const auto &ActionTypes = Transitions.back().getTypes();
310  OS << "// The type of an action in the " << Name << " automaton.\n";
311  if (ActionTypes.size() == 1) {
312    OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n";
313  } else {
314    OS << "using " << Name << "Action = std::tuple<" << join(ActionTypes, ", ")
315       << ">;\n";
316  }
317  OS << "\n";
318
319  Emitter.emit(Name, OS);
320}
321
322StringRef Automaton::getActionSymbolType(StringRef A) {
323  Twine Ty = "TypeOf_" + A;
324  if (!R->getValue(Ty.str()))
325    return "";
326  return R->getValueAsString(Ty.str());
327}
328
329Transition::Transition(Record *R, Automaton *Parent) {
330  BitsInit *NewStateInit = R->getValueAsBitsInit("NewState");
331  NewState = 0;
332  assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 &&
333         "State cannot be represented in 64 bits!");
334  for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) {
335    if (auto *Bit = dyn_cast<BitInit>(NewStateInit->getBit(I))) {
336      if (Bit->getValue())
337        NewState |= 1ULL << I;
338    }
339  }
340
341  for (StringRef A : Parent->getActionSymbolFields()) {
342    RecordVal *SymbolV = R->getValue(A);
343    if (auto *Ty = dyn_cast<RecordRecTy>(SymbolV->getType())) {
344      Actions.emplace_back(R->getValueAsDef(A), 0, "");
345      Types.emplace_back(Ty->getAsString());
346    } else if (isa<IntRecTy>(SymbolV->getType())) {
347      Actions.emplace_back(nullptr, R->getValueAsInt(A), "");
348      Types.emplace_back("unsigned");
349    } else if (isa<StringRecTy>(SymbolV->getType()) ||
350               isa<CodeRecTy>(SymbolV->getType())) {
351      Actions.emplace_back(nullptr, 0, R->getValueAsString(A));
352      Types.emplace_back("std::string");
353    } else {
354      report_fatal_error("Unhandled symbol type!");
355    }
356
357    StringRef TypeOverride = Parent->getActionSymbolType(A);
358    if (!TypeOverride.empty())
359      Types.back() = TypeOverride;
360  }
361}
362
363bool Transition::canTransitionFrom(uint64_t State) {
364  if ((State & NewState) == 0)
365    // The bits we want to set are not set;
366    return true;
367  return false;
368}
369
370uint64_t Transition::transitionFrom(uint64_t State) {
371  return State | NewState;
372}
373
374void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; }
375
376void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) {
377  const ActionTuple &AT = Actions[A];
378  if (AT.size() > 1)
379    OS << "std::make_tuple(";
380  bool First = true;
381  for (const auto &SingleAction : AT) {
382    if (!First)
383      OS << ", ";
384    First = false;
385    SingleAction.print(OS);
386  }
387  if (AT.size() > 1)
388    OS << ")";
389}
390
391namespace llvm {
392
393void EmitAutomata(RecordKeeper &RK, raw_ostream &OS) {
394  AutomatonEmitter(RK).run(OS);
395}
396
397} // namespace llvm
398