xref: /openbsd-src/gnu/llvm/llvm/utils/TableGen/DFAEmitter.cpp (revision d415bd752c734aee168c4ee86ff32e8cc249eb16)
109467b48Spatrick //===- DFAEmitter.cpp - Finite state automaton emitter --------------------===//
209467b48Spatrick //
309467b48Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
409467b48Spatrick // See https://llvm.org/LICENSE.txt for license information.
509467b48Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
609467b48Spatrick //
709467b48Spatrick //===----------------------------------------------------------------------===//
809467b48Spatrick //
909467b48Spatrick // This class can produce a generic deterministic finite state automaton (DFA),
1009467b48Spatrick // given a set of possible states and transitions.
1109467b48Spatrick //
1209467b48Spatrick // The input transitions can be nondeterministic - this class will produce the
1309467b48Spatrick // deterministic equivalent state machine.
1409467b48Spatrick //
1509467b48Spatrick // The generated code can run the DFA and produce an accepted / not accepted
1609467b48Spatrick // state and also produce, given a sequence of transitions that results in an
1709467b48Spatrick // accepted state, the sequence of intermediate states. This is useful if the
1809467b48Spatrick // initial automaton was nondeterministic - it allows mapping back from the DFA
1909467b48Spatrick // to the NFA.
2009467b48Spatrick //
2109467b48Spatrick //===----------------------------------------------------------------------===//
2209467b48Spatrick 
2309467b48Spatrick #include "DFAEmitter.h"
2409467b48Spatrick #include "SequenceToOffsetTable.h"
2509467b48Spatrick #include "TableGenBackends.h"
2609467b48Spatrick #include "llvm/ADT/SmallVector.h"
2709467b48Spatrick #include "llvm/ADT/StringExtras.h"
2809467b48Spatrick #include "llvm/ADT/UniqueVector.h"
2909467b48Spatrick #include "llvm/Support/Debug.h"
3009467b48Spatrick #include "llvm/Support/raw_ostream.h"
3109467b48Spatrick #include "llvm/TableGen/Record.h"
3209467b48Spatrick #include <cassert>
3309467b48Spatrick #include <cstdint>
34*d415bd75Srobert #include <deque>
3509467b48Spatrick #include <map>
3609467b48Spatrick #include <set>
3709467b48Spatrick #include <string>
38*d415bd75Srobert #include <variant>
3909467b48Spatrick #include <vector>
4009467b48Spatrick 
4173471bf0Spatrick #define DEBUG_TYPE "dfa-emitter"
4273471bf0Spatrick 
4309467b48Spatrick using namespace llvm;
4409467b48Spatrick 
4509467b48Spatrick //===----------------------------------------------------------------------===//
4609467b48Spatrick // DfaEmitter implementation. This is independent of the GenAutomaton backend.
4709467b48Spatrick //===----------------------------------------------------------------------===//
4809467b48Spatrick 
addTransition(state_type From,state_type To,action_type A)4909467b48Spatrick void DfaEmitter::addTransition(state_type From, state_type To, action_type A) {
5009467b48Spatrick   Actions.insert(A);
5109467b48Spatrick   NfaStates.insert(From);
5209467b48Spatrick   NfaStates.insert(To);
5309467b48Spatrick   NfaTransitions[{From, A}].push_back(To);
5409467b48Spatrick   ++NumNfaTransitions;
5509467b48Spatrick }
5609467b48Spatrick 
visitDfaState(const DfaState & DS)5709467b48Spatrick void DfaEmitter::visitDfaState(const DfaState &DS) {
5809467b48Spatrick   // For every possible action...
5909467b48Spatrick   auto FromId = DfaStates.idFor(DS);
6009467b48Spatrick   for (action_type A : Actions) {
6109467b48Spatrick     DfaState NewStates;
6209467b48Spatrick     DfaTransitionInfo TI;
6309467b48Spatrick     // For every represented state, word pair in the original NFA...
6409467b48Spatrick     for (state_type FromState : DS) {
6509467b48Spatrick       // If this action is possible from this state add the transitioned-to
6609467b48Spatrick       // states to NewStates.
6709467b48Spatrick       auto I = NfaTransitions.find({FromState, A});
6809467b48Spatrick       if (I == NfaTransitions.end())
6909467b48Spatrick         continue;
7009467b48Spatrick       for (state_type &ToState : I->second) {
7109467b48Spatrick         NewStates.push_back(ToState);
7209467b48Spatrick         TI.emplace_back(FromState, ToState);
7309467b48Spatrick       }
7409467b48Spatrick     }
7509467b48Spatrick     if (NewStates.empty())
7609467b48Spatrick       continue;
7709467b48Spatrick     // Sort and unique.
7809467b48Spatrick     sort(NewStates);
7909467b48Spatrick     NewStates.erase(std::unique(NewStates.begin(), NewStates.end()),
8009467b48Spatrick                     NewStates.end());
8109467b48Spatrick     sort(TI);
8209467b48Spatrick     TI.erase(std::unique(TI.begin(), TI.end()), TI.end());
8309467b48Spatrick     unsigned ToId = DfaStates.insert(NewStates);
8409467b48Spatrick     DfaTransitions.emplace(std::make_pair(FromId, A), std::make_pair(ToId, TI));
8509467b48Spatrick   }
8609467b48Spatrick }
8709467b48Spatrick 
constructDfa()8809467b48Spatrick void DfaEmitter::constructDfa() {
8909467b48Spatrick   DfaState Initial(1, /*NFA initial state=*/0);
9009467b48Spatrick   DfaStates.insert(Initial);
9109467b48Spatrick 
9209467b48Spatrick   // Note that UniqueVector starts indices at 1, not zero.
9309467b48Spatrick   unsigned DfaStateId = 1;
9409467b48Spatrick   while (DfaStateId <= DfaStates.size()) {
9509467b48Spatrick     DfaState S = DfaStates[DfaStateId];
9609467b48Spatrick     visitDfaState(S);
9709467b48Spatrick     DfaStateId++;
9809467b48Spatrick   }
9909467b48Spatrick }
10009467b48Spatrick 
emit(StringRef Name,raw_ostream & OS)10109467b48Spatrick void DfaEmitter::emit(StringRef Name, raw_ostream &OS) {
10209467b48Spatrick   constructDfa();
10309467b48Spatrick 
10409467b48Spatrick   OS << "// Input NFA has " << NfaStates.size() << " states with "
10509467b48Spatrick      << NumNfaTransitions << " transitions.\n";
10609467b48Spatrick   OS << "// Generated DFA has " << DfaStates.size() << " states with "
10709467b48Spatrick      << DfaTransitions.size() << " transitions.\n\n";
10809467b48Spatrick 
10909467b48Spatrick   // Implementation note: We don't bake a simple std::pair<> here as it requires
11009467b48Spatrick   // significantly more effort to parse. A simple test with a large array of
11109467b48Spatrick   // struct-pairs (N=100000) took clang-10 6s to parse. The same array of
11209467b48Spatrick   // std::pair<uint64_t, uint64_t> took 242s. Instead we allow the user to
11309467b48Spatrick   // define the pair type.
11409467b48Spatrick   //
11509467b48Spatrick   // FIXME: It may make sense to emit these as ULEB sequences instead of
11609467b48Spatrick   // pairs of uint64_t.
11709467b48Spatrick   OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n";
11809467b48Spatrick   OS << "// transition implies a set of NFA transitions. These are referred\n";
11909467b48Spatrick   OS << "// to by index in " << Name << "Transitions[].\n";
12009467b48Spatrick 
12109467b48Spatrick   SequenceToOffsetTable<DfaTransitionInfo> Table;
12209467b48Spatrick   std::map<DfaTransitionInfo, unsigned> EmittedIndices;
12309467b48Spatrick   for (auto &T : DfaTransitions)
12409467b48Spatrick     Table.add(T.second.second);
12509467b48Spatrick   Table.layout();
126097a140dSpatrick   OS << "const std::array<NfaStatePair, " << Table.size() << "> " << Name
12709467b48Spatrick      << "TransitionInfo = {{\n";
12809467b48Spatrick   Table.emit(
12909467b48Spatrick       OS,
13009467b48Spatrick       [](raw_ostream &OS, std::pair<uint64_t, uint64_t> P) {
13109467b48Spatrick         OS << "{" << P.first << ", " << P.second << "}";
13209467b48Spatrick       },
13309467b48Spatrick       "{0ULL, 0ULL}");
13409467b48Spatrick 
13509467b48Spatrick   OS << "}};\n\n";
13609467b48Spatrick 
13709467b48Spatrick   OS << "// A transition in the generated " << Name << " DFA.\n";
13809467b48Spatrick   OS << "struct " << Name << "Transition {\n";
13909467b48Spatrick   OS << "  unsigned FromDfaState; // The transitioned-from DFA state.\n";
14009467b48Spatrick   OS << "  ";
14109467b48Spatrick   printActionType(OS);
14209467b48Spatrick   OS << " Action;       // The input symbol that causes this transition.\n";
14309467b48Spatrick   OS << "  unsigned ToDfaState;   // The transitioned-to DFA state.\n";
14409467b48Spatrick   OS << "  unsigned InfoIdx;      // Start index into " << Name
14509467b48Spatrick      << "TransitionInfo.\n";
14609467b48Spatrick   OS << "};\n\n";
14709467b48Spatrick 
14809467b48Spatrick   OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n";
14909467b48Spatrick   OS << "// The initial state is 1, not zero.\n";
150097a140dSpatrick   OS << "const std::array<" << Name << "Transition, "
151097a140dSpatrick      << DfaTransitions.size() << "> " << Name << "Transitions = {{\n";
15209467b48Spatrick   for (auto &KV : DfaTransitions) {
15309467b48Spatrick     dfa_state_type From = KV.first.first;
15409467b48Spatrick     dfa_state_type To = KV.second.first;
15509467b48Spatrick     action_type A = KV.first.second;
15609467b48Spatrick     unsigned InfoIdx = Table.get(KV.second.second);
15709467b48Spatrick     OS << "  {" << From << ", ";
15809467b48Spatrick     printActionValue(A, OS);
15909467b48Spatrick     OS << ", " << To << ", " << InfoIdx << "},\n";
16009467b48Spatrick   }
16109467b48Spatrick   OS << "\n}};\n\n";
16209467b48Spatrick }
16309467b48Spatrick 
printActionType(raw_ostream & OS)16409467b48Spatrick void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; }
16509467b48Spatrick 
printActionValue(action_type A,raw_ostream & OS)16609467b48Spatrick void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; }
16709467b48Spatrick 
16809467b48Spatrick //===----------------------------------------------------------------------===//
16909467b48Spatrick // AutomatonEmitter implementation
17009467b48Spatrick //===----------------------------------------------------------------------===//
17109467b48Spatrick 
17209467b48Spatrick namespace {
17309467b48Spatrick 
174*d415bd75Srobert using Action = std::variant<Record *, unsigned, std::string>;
17509467b48Spatrick using ActionTuple = std::vector<Action>;
17609467b48Spatrick class Automaton;
17709467b48Spatrick 
17809467b48Spatrick class Transition {
17909467b48Spatrick   uint64_t NewState;
18009467b48Spatrick   // The tuple of actions that causes this transition.
18109467b48Spatrick   ActionTuple Actions;
18209467b48Spatrick   // The types of the actions; this is the same across all transitions.
18309467b48Spatrick   SmallVector<std::string, 4> Types;
18409467b48Spatrick 
18509467b48Spatrick public:
18609467b48Spatrick   Transition(Record *R, Automaton *Parent);
getActions()18709467b48Spatrick   const ActionTuple &getActions() { return Actions; }
getTypes()18809467b48Spatrick   SmallVector<std::string, 4> getTypes() { return Types; }
18909467b48Spatrick 
19009467b48Spatrick   bool canTransitionFrom(uint64_t State);
19109467b48Spatrick   uint64_t transitionFrom(uint64_t State);
19209467b48Spatrick };
19309467b48Spatrick 
19409467b48Spatrick class Automaton {
19509467b48Spatrick   RecordKeeper &Records;
19609467b48Spatrick   Record *R;
19709467b48Spatrick   std::vector<Transition> Transitions;
19809467b48Spatrick   /// All possible action tuples, uniqued.
19909467b48Spatrick   UniqueVector<ActionTuple> Actions;
20009467b48Spatrick   /// The fields within each Transition object to find the action symbols.
20109467b48Spatrick   std::vector<StringRef> ActionSymbolFields;
20209467b48Spatrick 
20309467b48Spatrick public:
20409467b48Spatrick   Automaton(RecordKeeper &Records, Record *R);
20509467b48Spatrick   void emit(raw_ostream &OS);
20609467b48Spatrick 
getActionSymbolFields()20709467b48Spatrick   ArrayRef<StringRef> getActionSymbolFields() { return ActionSymbolFields; }
20809467b48Spatrick   /// If the type of action A has been overridden (there exists a field
20909467b48Spatrick   /// "TypeOf_A") return that, otherwise return the empty string.
21009467b48Spatrick   StringRef getActionSymbolType(StringRef A);
21109467b48Spatrick };
21209467b48Spatrick 
21309467b48Spatrick class AutomatonEmitter {
21409467b48Spatrick   RecordKeeper &Records;
21509467b48Spatrick 
21609467b48Spatrick public:
AutomatonEmitter(RecordKeeper & R)21709467b48Spatrick   AutomatonEmitter(RecordKeeper &R) : Records(R) {}
21809467b48Spatrick   void run(raw_ostream &OS);
21909467b48Spatrick };
22009467b48Spatrick 
22109467b48Spatrick /// A DfaEmitter implementation that can print our variant action type.
22209467b48Spatrick class CustomDfaEmitter : public DfaEmitter {
22309467b48Spatrick   const UniqueVector<ActionTuple> &Actions;
22409467b48Spatrick   std::string TypeName;
22509467b48Spatrick 
22609467b48Spatrick public:
CustomDfaEmitter(const UniqueVector<ActionTuple> & Actions,StringRef TypeName)22709467b48Spatrick   CustomDfaEmitter(const UniqueVector<ActionTuple> &Actions, StringRef TypeName)
22809467b48Spatrick       : Actions(Actions), TypeName(TypeName) {}
22909467b48Spatrick 
23009467b48Spatrick   void printActionType(raw_ostream &OS) override;
23109467b48Spatrick   void printActionValue(action_type A, raw_ostream &OS) override;
23209467b48Spatrick };
23309467b48Spatrick } // namespace
23409467b48Spatrick 
run(raw_ostream & OS)23509467b48Spatrick void AutomatonEmitter::run(raw_ostream &OS) {
23609467b48Spatrick   for (Record *R : Records.getAllDerivedDefinitions("GenericAutomaton")) {
23709467b48Spatrick     Automaton A(Records, R);
23809467b48Spatrick     OS << "#ifdef GET_" << R->getName() << "_DECL\n";
23909467b48Spatrick     A.emit(OS);
24009467b48Spatrick     OS << "#endif  // GET_" << R->getName() << "_DECL\n";
24109467b48Spatrick   }
24209467b48Spatrick }
24309467b48Spatrick 
Automaton(RecordKeeper & Records,Record * R)24409467b48Spatrick Automaton::Automaton(RecordKeeper &Records, Record *R)
24509467b48Spatrick     : Records(Records), R(R) {
24609467b48Spatrick   LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n");
24709467b48Spatrick   ActionSymbolFields = R->getValueAsListOfStrings("SymbolFields");
24809467b48Spatrick }
24909467b48Spatrick 
emit(raw_ostream & OS)25009467b48Spatrick void Automaton::emit(raw_ostream &OS) {
25109467b48Spatrick   StringRef TransitionClass = R->getValueAsString("TransitionClass");
25209467b48Spatrick   for (Record *T : Records.getAllDerivedDefinitions(TransitionClass)) {
25309467b48Spatrick     assert(T->isSubClassOf("Transition"));
25409467b48Spatrick     Transitions.emplace_back(T, this);
25509467b48Spatrick     Actions.insert(Transitions.back().getActions());
25609467b48Spatrick   }
25709467b48Spatrick 
25809467b48Spatrick   LLVM_DEBUG(dbgs() << "  Action alphabet cardinality: " << Actions.size()
25909467b48Spatrick                     << "\n");
26009467b48Spatrick   LLVM_DEBUG(dbgs() << "  Each state has " << Transitions.size()
26109467b48Spatrick                     << " potential transitions.\n");
26209467b48Spatrick 
26309467b48Spatrick   StringRef Name = R->getName();
26409467b48Spatrick 
26509467b48Spatrick   CustomDfaEmitter Emitter(Actions, std::string(Name) + "Action");
26609467b48Spatrick   // Starting from the initial state, build up a list of possible states and
26709467b48Spatrick   // transitions.
26809467b48Spatrick   std::deque<uint64_t> Worklist(1, 0);
26909467b48Spatrick   std::set<uint64_t> SeenStates;
27009467b48Spatrick   unsigned NumTransitions = 0;
27109467b48Spatrick   SeenStates.insert(Worklist.front());
27209467b48Spatrick   while (!Worklist.empty()) {
27309467b48Spatrick     uint64_t State = Worklist.front();
27409467b48Spatrick     Worklist.pop_front();
27509467b48Spatrick     for (Transition &T : Transitions) {
27609467b48Spatrick       if (!T.canTransitionFrom(State))
27709467b48Spatrick         continue;
27809467b48Spatrick       uint64_t NewState = T.transitionFrom(State);
27909467b48Spatrick       if (SeenStates.emplace(NewState).second)
28009467b48Spatrick         Worklist.emplace_back(NewState);
28109467b48Spatrick       ++NumTransitions;
28209467b48Spatrick       Emitter.addTransition(State, NewState, Actions.idFor(T.getActions()));
28309467b48Spatrick     }
28409467b48Spatrick   }
28509467b48Spatrick   LLVM_DEBUG(dbgs() << "  NFA automaton has " << SeenStates.size()
28609467b48Spatrick                     << " states with " << NumTransitions << " transitions.\n");
287*d415bd75Srobert   (void) NumTransitions;
28809467b48Spatrick 
28909467b48Spatrick   const auto &ActionTypes = Transitions.back().getTypes();
29009467b48Spatrick   OS << "// The type of an action in the " << Name << " automaton.\n";
29109467b48Spatrick   if (ActionTypes.size() == 1) {
29209467b48Spatrick     OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n";
29309467b48Spatrick   } else {
29409467b48Spatrick     OS << "using " << Name << "Action = std::tuple<" << join(ActionTypes, ", ")
29509467b48Spatrick        << ">;\n";
29609467b48Spatrick   }
29709467b48Spatrick   OS << "\n";
29809467b48Spatrick 
29909467b48Spatrick   Emitter.emit(Name, OS);
30009467b48Spatrick }
30109467b48Spatrick 
getActionSymbolType(StringRef A)30209467b48Spatrick StringRef Automaton::getActionSymbolType(StringRef A) {
30309467b48Spatrick   Twine Ty = "TypeOf_" + A;
30409467b48Spatrick   if (!R->getValue(Ty.str()))
30509467b48Spatrick     return "";
30609467b48Spatrick   return R->getValueAsString(Ty.str());
30709467b48Spatrick }
30809467b48Spatrick 
Transition(Record * R,Automaton * Parent)30909467b48Spatrick Transition::Transition(Record *R, Automaton *Parent) {
31009467b48Spatrick   BitsInit *NewStateInit = R->getValueAsBitsInit("NewState");
31109467b48Spatrick   NewState = 0;
31209467b48Spatrick   assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 &&
31309467b48Spatrick          "State cannot be represented in 64 bits!");
31409467b48Spatrick   for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) {
31509467b48Spatrick     if (auto *Bit = dyn_cast<BitInit>(NewStateInit->getBit(I))) {
31609467b48Spatrick       if (Bit->getValue())
31709467b48Spatrick         NewState |= 1ULL << I;
31809467b48Spatrick     }
31909467b48Spatrick   }
32009467b48Spatrick 
32109467b48Spatrick   for (StringRef A : Parent->getActionSymbolFields()) {
32209467b48Spatrick     RecordVal *SymbolV = R->getValue(A);
32309467b48Spatrick     if (auto *Ty = dyn_cast<RecordRecTy>(SymbolV->getType())) {
324*d415bd75Srobert       Actions.emplace_back(R->getValueAsDef(A));
32509467b48Spatrick       Types.emplace_back(Ty->getAsString());
32609467b48Spatrick     } else if (isa<IntRecTy>(SymbolV->getType())) {
327*d415bd75Srobert       Actions.emplace_back(static_cast<unsigned>(R->getValueAsInt(A)));
32809467b48Spatrick       Types.emplace_back("unsigned");
32973471bf0Spatrick     } else if (isa<StringRecTy>(SymbolV->getType())) {
330*d415bd75Srobert       Actions.emplace_back(std::string(R->getValueAsString(A)));
33109467b48Spatrick       Types.emplace_back("std::string");
33209467b48Spatrick     } else {
33309467b48Spatrick       report_fatal_error("Unhandled symbol type!");
33409467b48Spatrick     }
33509467b48Spatrick 
33609467b48Spatrick     StringRef TypeOverride = Parent->getActionSymbolType(A);
33709467b48Spatrick     if (!TypeOverride.empty())
338097a140dSpatrick       Types.back() = std::string(TypeOverride);
33909467b48Spatrick   }
34009467b48Spatrick }
34109467b48Spatrick 
canTransitionFrom(uint64_t State)34209467b48Spatrick bool Transition::canTransitionFrom(uint64_t State) {
34309467b48Spatrick   if ((State & NewState) == 0)
34409467b48Spatrick     // The bits we want to set are not set;
34509467b48Spatrick     return true;
34609467b48Spatrick   return false;
34709467b48Spatrick }
34809467b48Spatrick 
transitionFrom(uint64_t State)34909467b48Spatrick uint64_t Transition::transitionFrom(uint64_t State) {
35009467b48Spatrick   return State | NewState;
35109467b48Spatrick }
35209467b48Spatrick 
printActionType(raw_ostream & OS)35309467b48Spatrick void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; }
35409467b48Spatrick 
printActionValue(action_type A,raw_ostream & OS)35509467b48Spatrick void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) {
35609467b48Spatrick   const ActionTuple &AT = Actions[A];
35709467b48Spatrick   if (AT.size() > 1)
35809467b48Spatrick     OS << "std::make_tuple(";
35973471bf0Spatrick   ListSeparator LS;
36009467b48Spatrick   for (const auto &SingleAction : AT) {
36173471bf0Spatrick     OS << LS;
362*d415bd75Srobert     if (const auto *R = std::get_if<Record *>(&SingleAction))
363*d415bd75Srobert       OS << (*R)->getName();
364*d415bd75Srobert     else if (const auto *S = std::get_if<std::string>(&SingleAction))
365*d415bd75Srobert       OS << '"' << *S << '"';
366*d415bd75Srobert     else
367*d415bd75Srobert       OS << std::get<unsigned>(SingleAction);
36809467b48Spatrick   }
36909467b48Spatrick   if (AT.size() > 1)
37009467b48Spatrick     OS << ")";
37109467b48Spatrick }
37209467b48Spatrick 
37309467b48Spatrick namespace llvm {
37409467b48Spatrick 
EmitAutomata(RecordKeeper & RK,raw_ostream & OS)37509467b48Spatrick void EmitAutomata(RecordKeeper &RK, raw_ostream &OS) {
37609467b48Spatrick   AutomatonEmitter(RK).run(OS);
37709467b48Spatrick }
37809467b48Spatrick 
37909467b48Spatrick } // namespace llvm
380