//===- DFAEmitter.cpp - Finite state automaton emitter --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This class can produce a generic deterministic finite state automaton (DFA),
// given a set of possible states and transitions.
//
// The input transitions can be nondeterministic - this class will produce the
// deterministic equivalent state machine.
//
// The generated code can run the DFA and produce an accepted / not accepted
// state and also produce, given a sequence of transitions that results in an
// accepted state, the sequence of intermediate states. This is useful if the
// initial automaton was nondeterministic - it allows mapping back from the DFA
// to the NFA.
20*09467b48Spatrick // 21*09467b48Spatrick //===----------------------------------------------------------------------===// 22*09467b48Spatrick #define DEBUG_TYPE "dfa-emitter" 23*09467b48Spatrick 24*09467b48Spatrick #include "DFAEmitter.h" 25*09467b48Spatrick #include "CodeGenTarget.h" 26*09467b48Spatrick #include "SequenceToOffsetTable.h" 27*09467b48Spatrick #include "TableGenBackends.h" 28*09467b48Spatrick #include "llvm/ADT/SmallVector.h" 29*09467b48Spatrick #include "llvm/ADT/StringExtras.h" 30*09467b48Spatrick #include "llvm/ADT/UniqueVector.h" 31*09467b48Spatrick #include "llvm/Support/Debug.h" 32*09467b48Spatrick #include "llvm/Support/raw_ostream.h" 33*09467b48Spatrick #include "llvm/TableGen/Record.h" 34*09467b48Spatrick #include "llvm/TableGen/TableGenBackend.h" 35*09467b48Spatrick #include <cassert> 36*09467b48Spatrick #include <cstdint> 37*09467b48Spatrick #include <map> 38*09467b48Spatrick #include <set> 39*09467b48Spatrick #include <string> 40*09467b48Spatrick #include <vector> 41*09467b48Spatrick 42*09467b48Spatrick using namespace llvm; 43*09467b48Spatrick 44*09467b48Spatrick //===----------------------------------------------------------------------===// 45*09467b48Spatrick // DfaEmitter implementation. This is independent of the GenAutomaton backend. 46*09467b48Spatrick //===----------------------------------------------------------------------===// 47*09467b48Spatrick 48*09467b48Spatrick void DfaEmitter::addTransition(state_type From, state_type To, action_type A) { 49*09467b48Spatrick Actions.insert(A); 50*09467b48Spatrick NfaStates.insert(From); 51*09467b48Spatrick NfaStates.insert(To); 52*09467b48Spatrick NfaTransitions[{From, A}].push_back(To); 53*09467b48Spatrick ++NumNfaTransitions; 54*09467b48Spatrick } 55*09467b48Spatrick 56*09467b48Spatrick void DfaEmitter::visitDfaState(const DfaState &DS) { 57*09467b48Spatrick // For every possible action... 
58*09467b48Spatrick auto FromId = DfaStates.idFor(DS); 59*09467b48Spatrick for (action_type A : Actions) { 60*09467b48Spatrick DfaState NewStates; 61*09467b48Spatrick DfaTransitionInfo TI; 62*09467b48Spatrick // For every represented state, word pair in the original NFA... 63*09467b48Spatrick for (state_type FromState : DS) { 64*09467b48Spatrick // If this action is possible from this state add the transitioned-to 65*09467b48Spatrick // states to NewStates. 66*09467b48Spatrick auto I = NfaTransitions.find({FromState, A}); 67*09467b48Spatrick if (I == NfaTransitions.end()) 68*09467b48Spatrick continue; 69*09467b48Spatrick for (state_type &ToState : I->second) { 70*09467b48Spatrick NewStates.push_back(ToState); 71*09467b48Spatrick TI.emplace_back(FromState, ToState); 72*09467b48Spatrick } 73*09467b48Spatrick } 74*09467b48Spatrick if (NewStates.empty()) 75*09467b48Spatrick continue; 76*09467b48Spatrick // Sort and unique. 77*09467b48Spatrick sort(NewStates); 78*09467b48Spatrick NewStates.erase(std::unique(NewStates.begin(), NewStates.end()), 79*09467b48Spatrick NewStates.end()); 80*09467b48Spatrick sort(TI); 81*09467b48Spatrick TI.erase(std::unique(TI.begin(), TI.end()), TI.end()); 82*09467b48Spatrick unsigned ToId = DfaStates.insert(NewStates); 83*09467b48Spatrick DfaTransitions.emplace(std::make_pair(FromId, A), std::make_pair(ToId, TI)); 84*09467b48Spatrick } 85*09467b48Spatrick } 86*09467b48Spatrick 87*09467b48Spatrick void DfaEmitter::constructDfa() { 88*09467b48Spatrick DfaState Initial(1, /*NFA initial state=*/0); 89*09467b48Spatrick DfaStates.insert(Initial); 90*09467b48Spatrick 91*09467b48Spatrick // Note that UniqueVector starts indices at 1, not zero. 
92*09467b48Spatrick unsigned DfaStateId = 1; 93*09467b48Spatrick while (DfaStateId <= DfaStates.size()) { 94*09467b48Spatrick DfaState S = DfaStates[DfaStateId]; 95*09467b48Spatrick visitDfaState(S); 96*09467b48Spatrick DfaStateId++; 97*09467b48Spatrick } 98*09467b48Spatrick } 99*09467b48Spatrick 100*09467b48Spatrick void DfaEmitter::emit(StringRef Name, raw_ostream &OS) { 101*09467b48Spatrick constructDfa(); 102*09467b48Spatrick 103*09467b48Spatrick OS << "// Input NFA has " << NfaStates.size() << " states with " 104*09467b48Spatrick << NumNfaTransitions << " transitions.\n"; 105*09467b48Spatrick OS << "// Generated DFA has " << DfaStates.size() << " states with " 106*09467b48Spatrick << DfaTransitions.size() << " transitions.\n\n"; 107*09467b48Spatrick 108*09467b48Spatrick // Implementation note: We don't bake a simple std::pair<> here as it requires 109*09467b48Spatrick // significantly more effort to parse. A simple test with a large array of 110*09467b48Spatrick // struct-pairs (N=100000) took clang-10 6s to parse. The same array of 111*09467b48Spatrick // std::pair<uint64_t, uint64_t> took 242s. Instead we allow the user to 112*09467b48Spatrick // define the pair type. 113*09467b48Spatrick // 114*09467b48Spatrick // FIXME: It may make sense to emit these as ULEB sequences instead of 115*09467b48Spatrick // pairs of uint64_t. 116*09467b48Spatrick OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n"; 117*09467b48Spatrick OS << "// transition implies a set of NFA transitions. 
These are referred\n"; 118*09467b48Spatrick OS << "// to by index in " << Name << "Transitions[].\n"; 119*09467b48Spatrick 120*09467b48Spatrick SequenceToOffsetTable<DfaTransitionInfo> Table; 121*09467b48Spatrick std::map<DfaTransitionInfo, unsigned> EmittedIndices; 122*09467b48Spatrick for (auto &T : DfaTransitions) 123*09467b48Spatrick Table.add(T.second.second); 124*09467b48Spatrick Table.layout(); 125*09467b48Spatrick OS << "std::array<NfaStatePair, " << Table.size() << "> " << Name 126*09467b48Spatrick << "TransitionInfo = {{\n"; 127*09467b48Spatrick Table.emit( 128*09467b48Spatrick OS, 129*09467b48Spatrick [](raw_ostream &OS, std::pair<uint64_t, uint64_t> P) { 130*09467b48Spatrick OS << "{" << P.first << ", " << P.second << "}"; 131*09467b48Spatrick }, 132*09467b48Spatrick "{0ULL, 0ULL}"); 133*09467b48Spatrick 134*09467b48Spatrick OS << "}};\n\n"; 135*09467b48Spatrick 136*09467b48Spatrick OS << "// A transition in the generated " << Name << " DFA.\n"; 137*09467b48Spatrick OS << "struct " << Name << "Transition {\n"; 138*09467b48Spatrick OS << " unsigned FromDfaState; // The transitioned-from DFA state.\n"; 139*09467b48Spatrick OS << " "; 140*09467b48Spatrick printActionType(OS); 141*09467b48Spatrick OS << " Action; // The input symbol that causes this transition.\n"; 142*09467b48Spatrick OS << " unsigned ToDfaState; // The transitioned-to DFA state.\n"; 143*09467b48Spatrick OS << " unsigned InfoIdx; // Start index into " << Name 144*09467b48Spatrick << "TransitionInfo.\n"; 145*09467b48Spatrick OS << "};\n\n"; 146*09467b48Spatrick 147*09467b48Spatrick OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n"; 148*09467b48Spatrick OS << "// The initial state is 1, not zero.\n"; 149*09467b48Spatrick OS << "std::array<" << Name << "Transition, " << DfaTransitions.size() << "> " 150*09467b48Spatrick << Name << "Transitions = {{\n"; 151*09467b48Spatrick for (auto &KV : DfaTransitions) { 152*09467b48Spatrick dfa_state_type From = KV.first.first; 
153*09467b48Spatrick dfa_state_type To = KV.second.first; 154*09467b48Spatrick action_type A = KV.first.second; 155*09467b48Spatrick unsigned InfoIdx = Table.get(KV.second.second); 156*09467b48Spatrick OS << " {" << From << ", "; 157*09467b48Spatrick printActionValue(A, OS); 158*09467b48Spatrick OS << ", " << To << ", " << InfoIdx << "},\n"; 159*09467b48Spatrick } 160*09467b48Spatrick OS << "\n}};\n\n"; 161*09467b48Spatrick } 162*09467b48Spatrick 163*09467b48Spatrick void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; } 164*09467b48Spatrick 165*09467b48Spatrick void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; } 166*09467b48Spatrick 167*09467b48Spatrick //===----------------------------------------------------------------------===// 168*09467b48Spatrick // AutomatonEmitter implementation 169*09467b48Spatrick //===----------------------------------------------------------------------===// 170*09467b48Spatrick 171*09467b48Spatrick namespace { 172*09467b48Spatrick // FIXME: This entire discriminated union could be removed with c++17: 173*09467b48Spatrick // using Action = std::variant<Record *, unsigned, std::string>; 174*09467b48Spatrick struct Action { 175*09467b48Spatrick Record *R = nullptr; 176*09467b48Spatrick unsigned I = 0; 177*09467b48Spatrick std::string S = nullptr; 178*09467b48Spatrick 179*09467b48Spatrick Action() = default; 180*09467b48Spatrick Action(Record *R, unsigned I, std::string S) : R(R), I(I), S(S) {} 181*09467b48Spatrick 182*09467b48Spatrick void print(raw_ostream &OS) const { 183*09467b48Spatrick if (R) 184*09467b48Spatrick OS << R->getName(); 185*09467b48Spatrick else if (!S.empty()) 186*09467b48Spatrick OS << '"' << S << '"'; 187*09467b48Spatrick else 188*09467b48Spatrick OS << I; 189*09467b48Spatrick } 190*09467b48Spatrick bool operator<(const Action &Other) const { 191*09467b48Spatrick return std::make_tuple(R, I, S) < 192*09467b48Spatrick std::make_tuple(Other.R, Other.I, Other.S); 
193*09467b48Spatrick } 194*09467b48Spatrick }; 195*09467b48Spatrick 196*09467b48Spatrick using ActionTuple = std::vector<Action>; 197*09467b48Spatrick class Automaton; 198*09467b48Spatrick 199*09467b48Spatrick class Transition { 200*09467b48Spatrick uint64_t NewState; 201*09467b48Spatrick // The tuple of actions that causes this transition. 202*09467b48Spatrick ActionTuple Actions; 203*09467b48Spatrick // The types of the actions; this is the same across all transitions. 204*09467b48Spatrick SmallVector<std::string, 4> Types; 205*09467b48Spatrick 206*09467b48Spatrick public: 207*09467b48Spatrick Transition(Record *R, Automaton *Parent); 208*09467b48Spatrick const ActionTuple &getActions() { return Actions; } 209*09467b48Spatrick SmallVector<std::string, 4> getTypes() { return Types; } 210*09467b48Spatrick 211*09467b48Spatrick bool canTransitionFrom(uint64_t State); 212*09467b48Spatrick uint64_t transitionFrom(uint64_t State); 213*09467b48Spatrick }; 214*09467b48Spatrick 215*09467b48Spatrick class Automaton { 216*09467b48Spatrick RecordKeeper &Records; 217*09467b48Spatrick Record *R; 218*09467b48Spatrick std::vector<Transition> Transitions; 219*09467b48Spatrick /// All possible action tuples, uniqued. 220*09467b48Spatrick UniqueVector<ActionTuple> Actions; 221*09467b48Spatrick /// The fields within each Transition object to find the action symbols. 222*09467b48Spatrick std::vector<StringRef> ActionSymbolFields; 223*09467b48Spatrick 224*09467b48Spatrick public: 225*09467b48Spatrick Automaton(RecordKeeper &Records, Record *R); 226*09467b48Spatrick void emit(raw_ostream &OS); 227*09467b48Spatrick 228*09467b48Spatrick ArrayRef<StringRef> getActionSymbolFields() { return ActionSymbolFields; } 229*09467b48Spatrick /// If the type of action A has been overridden (there exists a field 230*09467b48Spatrick /// "TypeOf_A") return that, otherwise return the empty string. 
231*09467b48Spatrick StringRef getActionSymbolType(StringRef A); 232*09467b48Spatrick }; 233*09467b48Spatrick 234*09467b48Spatrick class AutomatonEmitter { 235*09467b48Spatrick RecordKeeper &Records; 236*09467b48Spatrick 237*09467b48Spatrick public: 238*09467b48Spatrick AutomatonEmitter(RecordKeeper &R) : Records(R) {} 239*09467b48Spatrick void run(raw_ostream &OS); 240*09467b48Spatrick }; 241*09467b48Spatrick 242*09467b48Spatrick /// A DfaEmitter implementation that can print our variant action type. 243*09467b48Spatrick class CustomDfaEmitter : public DfaEmitter { 244*09467b48Spatrick const UniqueVector<ActionTuple> &Actions; 245*09467b48Spatrick std::string TypeName; 246*09467b48Spatrick 247*09467b48Spatrick public: 248*09467b48Spatrick CustomDfaEmitter(const UniqueVector<ActionTuple> &Actions, StringRef TypeName) 249*09467b48Spatrick : Actions(Actions), TypeName(TypeName) {} 250*09467b48Spatrick 251*09467b48Spatrick void printActionType(raw_ostream &OS) override; 252*09467b48Spatrick void printActionValue(action_type A, raw_ostream &OS) override; 253*09467b48Spatrick }; 254*09467b48Spatrick } // namespace 255*09467b48Spatrick 256*09467b48Spatrick void AutomatonEmitter::run(raw_ostream &OS) { 257*09467b48Spatrick for (Record *R : Records.getAllDerivedDefinitions("GenericAutomaton")) { 258*09467b48Spatrick Automaton A(Records, R); 259*09467b48Spatrick OS << "#ifdef GET_" << R->getName() << "_DECL\n"; 260*09467b48Spatrick A.emit(OS); 261*09467b48Spatrick OS << "#endif // GET_" << R->getName() << "_DECL\n"; 262*09467b48Spatrick } 263*09467b48Spatrick } 264*09467b48Spatrick 265*09467b48Spatrick Automaton::Automaton(RecordKeeper &Records, Record *R) 266*09467b48Spatrick : Records(Records), R(R) { 267*09467b48Spatrick LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n"); 268*09467b48Spatrick ActionSymbolFields = R->getValueAsListOfStrings("SymbolFields"); 269*09467b48Spatrick } 270*09467b48Spatrick 271*09467b48Spatrick void 
Automaton::emit(raw_ostream &OS) { 272*09467b48Spatrick StringRef TransitionClass = R->getValueAsString("TransitionClass"); 273*09467b48Spatrick for (Record *T : Records.getAllDerivedDefinitions(TransitionClass)) { 274*09467b48Spatrick assert(T->isSubClassOf("Transition")); 275*09467b48Spatrick Transitions.emplace_back(T, this); 276*09467b48Spatrick Actions.insert(Transitions.back().getActions()); 277*09467b48Spatrick } 278*09467b48Spatrick 279*09467b48Spatrick LLVM_DEBUG(dbgs() << " Action alphabet cardinality: " << Actions.size() 280*09467b48Spatrick << "\n"); 281*09467b48Spatrick LLVM_DEBUG(dbgs() << " Each state has " << Transitions.size() 282*09467b48Spatrick << " potential transitions.\n"); 283*09467b48Spatrick 284*09467b48Spatrick StringRef Name = R->getName(); 285*09467b48Spatrick 286*09467b48Spatrick CustomDfaEmitter Emitter(Actions, std::string(Name) + "Action"); 287*09467b48Spatrick // Starting from the initial state, build up a list of possible states and 288*09467b48Spatrick // transitions. 
289*09467b48Spatrick std::deque<uint64_t> Worklist(1, 0); 290*09467b48Spatrick std::set<uint64_t> SeenStates; 291*09467b48Spatrick unsigned NumTransitions = 0; 292*09467b48Spatrick SeenStates.insert(Worklist.front()); 293*09467b48Spatrick while (!Worklist.empty()) { 294*09467b48Spatrick uint64_t State = Worklist.front(); 295*09467b48Spatrick Worklist.pop_front(); 296*09467b48Spatrick for (Transition &T : Transitions) { 297*09467b48Spatrick if (!T.canTransitionFrom(State)) 298*09467b48Spatrick continue; 299*09467b48Spatrick uint64_t NewState = T.transitionFrom(State); 300*09467b48Spatrick if (SeenStates.emplace(NewState).second) 301*09467b48Spatrick Worklist.emplace_back(NewState); 302*09467b48Spatrick ++NumTransitions; 303*09467b48Spatrick Emitter.addTransition(State, NewState, Actions.idFor(T.getActions())); 304*09467b48Spatrick } 305*09467b48Spatrick } 306*09467b48Spatrick LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size() 307*09467b48Spatrick << " states with " << NumTransitions << " transitions.\n"); 308*09467b48Spatrick 309*09467b48Spatrick const auto &ActionTypes = Transitions.back().getTypes(); 310*09467b48Spatrick OS << "// The type of an action in the " << Name << " automaton.\n"; 311*09467b48Spatrick if (ActionTypes.size() == 1) { 312*09467b48Spatrick OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n"; 313*09467b48Spatrick } else { 314*09467b48Spatrick OS << "using " << Name << "Action = std::tuple<" << join(ActionTypes, ", ") 315*09467b48Spatrick << ">;\n"; 316*09467b48Spatrick } 317*09467b48Spatrick OS << "\n"; 318*09467b48Spatrick 319*09467b48Spatrick Emitter.emit(Name, OS); 320*09467b48Spatrick } 321*09467b48Spatrick 322*09467b48Spatrick StringRef Automaton::getActionSymbolType(StringRef A) { 323*09467b48Spatrick Twine Ty = "TypeOf_" + A; 324*09467b48Spatrick if (!R->getValue(Ty.str())) 325*09467b48Spatrick return ""; 326*09467b48Spatrick return R->getValueAsString(Ty.str()); 327*09467b48Spatrick } 328*09467b48Spatrick 
329*09467b48Spatrick Transition::Transition(Record *R, Automaton *Parent) { 330*09467b48Spatrick BitsInit *NewStateInit = R->getValueAsBitsInit("NewState"); 331*09467b48Spatrick NewState = 0; 332*09467b48Spatrick assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 && 333*09467b48Spatrick "State cannot be represented in 64 bits!"); 334*09467b48Spatrick for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) { 335*09467b48Spatrick if (auto *Bit = dyn_cast<BitInit>(NewStateInit->getBit(I))) { 336*09467b48Spatrick if (Bit->getValue()) 337*09467b48Spatrick NewState |= 1ULL << I; 338*09467b48Spatrick } 339*09467b48Spatrick } 340*09467b48Spatrick 341*09467b48Spatrick for (StringRef A : Parent->getActionSymbolFields()) { 342*09467b48Spatrick RecordVal *SymbolV = R->getValue(A); 343*09467b48Spatrick if (auto *Ty = dyn_cast<RecordRecTy>(SymbolV->getType())) { 344*09467b48Spatrick Actions.emplace_back(R->getValueAsDef(A), 0, ""); 345*09467b48Spatrick Types.emplace_back(Ty->getAsString()); 346*09467b48Spatrick } else if (isa<IntRecTy>(SymbolV->getType())) { 347*09467b48Spatrick Actions.emplace_back(nullptr, R->getValueAsInt(A), ""); 348*09467b48Spatrick Types.emplace_back("unsigned"); 349*09467b48Spatrick } else if (isa<StringRecTy>(SymbolV->getType()) || 350*09467b48Spatrick isa<CodeRecTy>(SymbolV->getType())) { 351*09467b48Spatrick Actions.emplace_back(nullptr, 0, R->getValueAsString(A)); 352*09467b48Spatrick Types.emplace_back("std::string"); 353*09467b48Spatrick } else { 354*09467b48Spatrick report_fatal_error("Unhandled symbol type!"); 355*09467b48Spatrick } 356*09467b48Spatrick 357*09467b48Spatrick StringRef TypeOverride = Parent->getActionSymbolType(A); 358*09467b48Spatrick if (!TypeOverride.empty()) 359*09467b48Spatrick Types.back() = TypeOverride; 360*09467b48Spatrick } 361*09467b48Spatrick } 362*09467b48Spatrick 363*09467b48Spatrick bool Transition::canTransitionFrom(uint64_t State) { 364*09467b48Spatrick if ((State & NewState) == 0) 365*09467b48Spatrick // 
The bits we want to set are not set; 366*09467b48Spatrick return true; 367*09467b48Spatrick return false; 368*09467b48Spatrick } 369*09467b48Spatrick 370*09467b48Spatrick uint64_t Transition::transitionFrom(uint64_t State) { 371*09467b48Spatrick return State | NewState; 372*09467b48Spatrick } 373*09467b48Spatrick 374*09467b48Spatrick void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; } 375*09467b48Spatrick 376*09467b48Spatrick void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) { 377*09467b48Spatrick const ActionTuple &AT = Actions[A]; 378*09467b48Spatrick if (AT.size() > 1) 379*09467b48Spatrick OS << "std::make_tuple("; 380*09467b48Spatrick bool First = true; 381*09467b48Spatrick for (const auto &SingleAction : AT) { 382*09467b48Spatrick if (!First) 383*09467b48Spatrick OS << ", "; 384*09467b48Spatrick First = false; 385*09467b48Spatrick SingleAction.print(OS); 386*09467b48Spatrick } 387*09467b48Spatrick if (AT.size() > 1) 388*09467b48Spatrick OS << ")"; 389*09467b48Spatrick } 390*09467b48Spatrick 391*09467b48Spatrick namespace llvm { 392*09467b48Spatrick 393*09467b48Spatrick void EmitAutomata(RecordKeeper &RK, raw_ostream &OS) { 394*09467b48Spatrick AutomatonEmitter(RK).run(OS); 395*09467b48Spatrick } 396*09467b48Spatrick 397*09467b48Spatrick } // namespace llvm 398