1//===- DFAEmitter.cpp - Finite state automaton emitter --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This class can produce a generic deterministic finite state automaton (DFA),
10// given a set of possible states and transitions.
11//
12// The input transitions can be nondeterministic - this class will produce the
13// deterministic equivalent state machine.
14//
15// The generated code can run the DFA and produce an accepted / not accepted
16// state and also produce, given a sequence of transitions that results in an
17// accepted state, the sequence of intermediate states. This is useful if the
18// initial automaton was nondeterministic - it allows mapping back from the DFA
19// to the NFA.
20//
21//===----------------------------------------------------------------------===//
22
23#include "DFAEmitter.h"
24#include "SequenceToOffsetTable.h"
25#include "TableGenBackends.h"
26#include "llvm/ADT/SmallVector.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/UniqueVector.h"
29#include "llvm/Support/Debug.h"
30#include "llvm/Support/raw_ostream.h"
31#include "llvm/TableGen/Record.h"
32#include <cassert>
33#include <cstdint>
34#include <deque>
35#include <map>
36#include <set>
37#include <string>
38#include <variant>
39#include <vector>
40
41#define DEBUG_TYPE "dfa-emitter"
42
43using namespace llvm;
44
45//===----------------------------------------------------------------------===//
46// DfaEmitter implementation. This is independent of the GenAutomaton backend.
47//===----------------------------------------------------------------------===//
48
49void DfaEmitter::addTransition(state_type From, state_type To, action_type A) {
50  Actions.insert(A);
51  NfaStates.insert(From);
52  NfaStates.insert(To);
53  NfaTransitions[{From, A}].push_back(To);
54  ++NumNfaTransitions;
55}
56
57void DfaEmitter::visitDfaState(const DfaState &DS) {
58  // For every possible action...
59  auto FromId = DfaStates.idFor(DS);
60  for (action_type A : Actions) {
61    DfaState NewStates;
62    DfaTransitionInfo TI;
63    // For every represented state, word pair in the original NFA...
64    for (state_type FromState : DS) {
65      // If this action is possible from this state add the transitioned-to
66      // states to NewStates.
67      auto I = NfaTransitions.find({FromState, A});
68      if (I == NfaTransitions.end())
69        continue;
70      for (state_type &ToState : I->second) {
71        NewStates.push_back(ToState);
72        TI.emplace_back(FromState, ToState);
73      }
74    }
75    if (NewStates.empty())
76      continue;
77    // Sort and unique.
78    sort(NewStates);
79    NewStates.erase(std::unique(NewStates.begin(), NewStates.end()),
80                    NewStates.end());
81    sort(TI);
82    TI.erase(std::unique(TI.begin(), TI.end()), TI.end());
83    unsigned ToId = DfaStates.insert(NewStates);
84    DfaTransitions.emplace(std::make_pair(FromId, A), std::make_pair(ToId, TI));
85  }
86}
87
88void DfaEmitter::constructDfa() {
89  DfaState Initial(1, /*NFA initial state=*/0);
90  DfaStates.insert(Initial);
91
92  // Note that UniqueVector starts indices at 1, not zero.
93  unsigned DfaStateId = 1;
94  while (DfaStateId <= DfaStates.size()) {
95    DfaState S = DfaStates[DfaStateId];
96    visitDfaState(S);
97    DfaStateId++;
98  }
99}
100
101void DfaEmitter::emit(StringRef Name, raw_ostream &OS) {
102  constructDfa();
103
104  OS << "// Input NFA has " << NfaStates.size() << " states with "
105     << NumNfaTransitions << " transitions.\n";
106  OS << "// Generated DFA has " << DfaStates.size() << " states with "
107     << DfaTransitions.size() << " transitions.\n\n";
108
109  // Implementation note: We don't bake a simple std::pair<> here as it requires
110  // significantly more effort to parse. A simple test with a large array of
111  // struct-pairs (N=100000) took clang-10 6s to parse. The same array of
112  // std::pair<uint64_t, uint64_t> took 242s. Instead we allow the user to
113  // define the pair type.
114  //
115  // FIXME: It may make sense to emit these as ULEB sequences instead of
116  // pairs of uint64_t.
117  OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n";
118  OS << "// transition implies a set of NFA transitions. These are referred\n";
119  OS << "// to by index in " << Name << "Transitions[].\n";
120
121  SequenceToOffsetTable<DfaTransitionInfo> Table;
122  std::map<DfaTransitionInfo, unsigned> EmittedIndices;
123  for (auto &T : DfaTransitions)
124    Table.add(T.second.second);
125  Table.layout();
126  OS << "const std::array<NfaStatePair, " << Table.size() << "> " << Name
127     << "TransitionInfo = {{\n";
128  Table.emit(
129      OS,
130      [](raw_ostream &OS, std::pair<uint64_t, uint64_t> P) {
131        OS << "{" << P.first << ", " << P.second << "}";
132      },
133      "{0ULL, 0ULL}");
134
135  OS << "}};\n\n";
136
137  OS << "// A transition in the generated " << Name << " DFA.\n";
138  OS << "struct " << Name << "Transition {\n";
139  OS << "  unsigned FromDfaState; // The transitioned-from DFA state.\n";
140  OS << "  ";
141  printActionType(OS);
142  OS << " Action;       // The input symbol that causes this transition.\n";
143  OS << "  unsigned ToDfaState;   // The transitioned-to DFA state.\n";
144  OS << "  unsigned InfoIdx;      // Start index into " << Name
145     << "TransitionInfo.\n";
146  OS << "};\n\n";
147
148  OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n";
149  OS << "// The initial state is 1, not zero.\n";
150  OS << "const std::array<" << Name << "Transition, "
151     << DfaTransitions.size() << "> " << Name << "Transitions = {{\n";
152  for (auto &KV : DfaTransitions) {
153    dfa_state_type From = KV.first.first;
154    dfa_state_type To = KV.second.first;
155    action_type A = KV.first.second;
156    unsigned InfoIdx = Table.get(KV.second.second);
157    OS << "  {" << From << ", ";
158    printActionValue(A, OS);
159    OS << ", " << To << ", " << InfoIdx << "},\n";
160  }
161  OS << "\n}};\n\n";
162}
163
164void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; }
165
166void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; }
167
168//===----------------------------------------------------------------------===//
169// AutomatonEmitter implementation
170//===----------------------------------------------------------------------===//
171
172namespace {
173
174using Action = std::variant<Record *, unsigned, std::string>;
175using ActionTuple = std::vector<Action>;
176class Automaton;
177
178class Transition {
179  uint64_t NewState;
180  // The tuple of actions that causes this transition.
181  ActionTuple Actions;
182  // The types of the actions; this is the same across all transitions.
183  SmallVector<std::string, 4> Types;
184
185public:
186  Transition(Record *R, Automaton *Parent);
187  const ActionTuple &getActions() { return Actions; }
188  SmallVector<std::string, 4> getTypes() { return Types; }
189
190  bool canTransitionFrom(uint64_t State);
191  uint64_t transitionFrom(uint64_t State);
192};
193
194class Automaton {
195  RecordKeeper &Records;
196  Record *R;
197  std::vector<Transition> Transitions;
198  /// All possible action tuples, uniqued.
199  UniqueVector<ActionTuple> Actions;
200  /// The fields within each Transition object to find the action symbols.
201  std::vector<StringRef> ActionSymbolFields;
202
203public:
204  Automaton(RecordKeeper &Records, Record *R);
205  void emit(raw_ostream &OS);
206
207  ArrayRef<StringRef> getActionSymbolFields() { return ActionSymbolFields; }
208  /// If the type of action A has been overridden (there exists a field
209  /// "TypeOf_A") return that, otherwise return the empty string.
210  StringRef getActionSymbolType(StringRef A);
211};
212
213class AutomatonEmitter {
214  RecordKeeper &Records;
215
216public:
217  AutomatonEmitter(RecordKeeper &R) : Records(R) {}
218  void run(raw_ostream &OS);
219};
220
221/// A DfaEmitter implementation that can print our variant action type.
222class CustomDfaEmitter : public DfaEmitter {
223  const UniqueVector<ActionTuple> &Actions;
224  std::string TypeName;
225
226public:
227  CustomDfaEmitter(const UniqueVector<ActionTuple> &Actions, StringRef TypeName)
228      : Actions(Actions), TypeName(TypeName) {}
229
230  void printActionType(raw_ostream &OS) override;
231  void printActionValue(action_type A, raw_ostream &OS) override;
232};
233} // namespace
234
235void AutomatonEmitter::run(raw_ostream &OS) {
236  for (Record *R : Records.getAllDerivedDefinitions("GenericAutomaton")) {
237    Automaton A(Records, R);
238    OS << "#ifdef GET_" << R->getName() << "_DECL\n";
239    A.emit(OS);
240    OS << "#endif  // GET_" << R->getName() << "_DECL\n";
241  }
242}
243
244Automaton::Automaton(RecordKeeper &Records, Record *R)
245    : Records(Records), R(R) {
246  LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n");
247  ActionSymbolFields = R->getValueAsListOfStrings("SymbolFields");
248}
249
250void Automaton::emit(raw_ostream &OS) {
251  StringRef TransitionClass = R->getValueAsString("TransitionClass");
252  for (Record *T : Records.getAllDerivedDefinitions(TransitionClass)) {
253    assert(T->isSubClassOf("Transition"));
254    Transitions.emplace_back(T, this);
255    Actions.insert(Transitions.back().getActions());
256  }
257
258  LLVM_DEBUG(dbgs() << "  Action alphabet cardinality: " << Actions.size()
259                    << "\n");
260  LLVM_DEBUG(dbgs() << "  Each state has " << Transitions.size()
261                    << " potential transitions.\n");
262
263  StringRef Name = R->getName();
264
265  CustomDfaEmitter Emitter(Actions, std::string(Name) + "Action");
266  // Starting from the initial state, build up a list of possible states and
267  // transitions.
268  std::deque<uint64_t> Worklist(1, 0);
269  std::set<uint64_t> SeenStates;
270  unsigned NumTransitions = 0;
271  SeenStates.insert(Worklist.front());
272  while (!Worklist.empty()) {
273    uint64_t State = Worklist.front();
274    Worklist.pop_front();
275    for (Transition &T : Transitions) {
276      if (!T.canTransitionFrom(State))
277        continue;
278      uint64_t NewState = T.transitionFrom(State);
279      if (SeenStates.emplace(NewState).second)
280        Worklist.emplace_back(NewState);
281      ++NumTransitions;
282      Emitter.addTransition(State, NewState, Actions.idFor(T.getActions()));
283    }
284  }
285  LLVM_DEBUG(dbgs() << "  NFA automaton has " << SeenStates.size()
286                    << " states with " << NumTransitions << " transitions.\n");
287  (void) NumTransitions;
288
289  const auto &ActionTypes = Transitions.back().getTypes();
290  OS << "// The type of an action in the " << Name << " automaton.\n";
291  if (ActionTypes.size() == 1) {
292    OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n";
293  } else {
294    OS << "using " << Name << "Action = std::tuple<" << join(ActionTypes, ", ")
295       << ">;\n";
296  }
297  OS << "\n";
298
299  Emitter.emit(Name, OS);
300}
301
302StringRef Automaton::getActionSymbolType(StringRef A) {
303  Twine Ty = "TypeOf_" + A;
304  if (!R->getValue(Ty.str()))
305    return "";
306  return R->getValueAsString(Ty.str());
307}
308
309Transition::Transition(Record *R, Automaton *Parent) {
310  BitsInit *NewStateInit = R->getValueAsBitsInit("NewState");
311  NewState = 0;
312  assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 &&
313         "State cannot be represented in 64 bits!");
314  for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) {
315    if (auto *Bit = dyn_cast<BitInit>(NewStateInit->getBit(I))) {
316      if (Bit->getValue())
317        NewState |= 1ULL << I;
318    }
319  }
320
321  for (StringRef A : Parent->getActionSymbolFields()) {
322    RecordVal *SymbolV = R->getValue(A);
323    if (auto *Ty = dyn_cast<RecordRecTy>(SymbolV->getType())) {
324      Actions.emplace_back(R->getValueAsDef(A));
325      Types.emplace_back(Ty->getAsString());
326    } else if (isa<IntRecTy>(SymbolV->getType())) {
327      Actions.emplace_back(static_cast<unsigned>(R->getValueAsInt(A)));
328      Types.emplace_back("unsigned");
329    } else if (isa<StringRecTy>(SymbolV->getType())) {
330      Actions.emplace_back(std::string(R->getValueAsString(A)));
331      Types.emplace_back("std::string");
332    } else {
333      report_fatal_error("Unhandled symbol type!");
334    }
335
336    StringRef TypeOverride = Parent->getActionSymbolType(A);
337    if (!TypeOverride.empty())
338      Types.back() = std::string(TypeOverride);
339  }
340}
341
342bool Transition::canTransitionFrom(uint64_t State) {
343  if ((State & NewState) == 0)
344    // The bits we want to set are not set;
345    return true;
346  return false;
347}
348
349uint64_t Transition::transitionFrom(uint64_t State) {
350  return State | NewState;
351}
352
353void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; }
354
355void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) {
356  const ActionTuple &AT = Actions[A];
357  if (AT.size() > 1)
358    OS << "std::make_tuple(";
359  ListSeparator LS;
360  for (const auto &SingleAction : AT) {
361    OS << LS;
362    if (const auto *R = std::get_if<Record *>(&SingleAction))
363      OS << (*R)->getName();
364    else if (const auto *S = std::get_if<std::string>(&SingleAction))
365      OS << '"' << *S << '"';
366    else
367      OS << std::get<unsigned>(SingleAction);
368  }
369  if (AT.size() > 1)
370    OS << ")";
371}
372
373namespace llvm {
374
375void EmitAutomata(RecordKeeper &RK, raw_ostream &OS) {
376  AutomatonEmitter(RK).run(OS);
377}
378
379} // namespace llvm
380