//===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains the ARM definition DAG scheduling mutations which
/// change inter-instruction latencies
//
//===----------------------------------------------------------------------===//

#include "ARMLatencyMutations.h"
#include "ARMSubtarget.h"
#include "Thumb2InstrInfo.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include <algorithm>
#include <array>
#include <initializer_list>
#include <memory>

namespace llvm {

namespace {

// Precompute information about opcodes to speed up pass

class InstructionInformation {
protected:
  struct IInfo {
    bool HasBRegAddr : 1;      // B-side of addr gen is a register
    bool HasBRegAddrShift : 1; // B-side of addr gen has a shift
    bool IsDivide : 1;         // Some form of integer divide
    bool IsInlineShiftALU : 1; // Inline shift+ALU
    bool IsMultiply : 1;       // Some form of integer multiply
    bool IsMVEIntMAC : 1;      // MVE 8/16/32-bit integer MAC operation
    bool IsNonSubwordLoad : 1; // Load which is a word or larger
    bool IsShift : 1;          // Shift operation
    bool IsRev : 1;            // REV operation
    bool ProducesQP : 1;       // Produces a vector register result
    bool ProducesDP : 1;       // Produces a double-precision register result
    bool ProducesSP : 1;       // Produces a single-precision register result
    bool ConsumesQP : 1;       // Consumes a vector register result
    bool ConsumesDP : 1;       // Consumes a double-precision register result
    bool ConsumesSP : 1;       // Consumes a single-precision register result
    unsigned MVEIntMACMatched; // Matched operand type (for MVE)
    unsigned AddressOpMask;    // Mask indicating which operands go into AGU
    IInfo()
        : HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),
          IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),
          IsNonSubwordLoad(false), IsShift(false), IsRev(false),
          ProducesQP(false), ProducesDP(false), ProducesSP(false),
          ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),
          MVEIntMACMatched(0), AddressOpMask(0) {}
  };
  typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;
  IInfoArray Info;

public:
  // Always available information
  unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }
  bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }
  bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }
  bool isDivide(unsigned Op) { return Info[Op].IsDivide; }
  bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }
  bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }
  bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }
  bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }
  bool isRev(unsigned Op) { return Info[Op].IsRev; }
  bool isShift(unsigned Op) { return Info[Op].IsShift; }

  // Information below is only available if markDPProducersConsumers() has
  // been called.
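  // Note: SP/DP/QP here refer to operands in the single-precision (S),
  // double-precision (D) and 128-bit MVE/NEON (Q) register classes, as
  // classified by markDPProducersConsumers().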
  bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }
  bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }
  bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }
  bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }
  bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }
  bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }

  bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {
    return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;
  }

  InstructionInformation(const ARMBaseInstrInfo *TII);

protected:
  void markDPProducersConsumers(const ARMBaseInstrInfo *TII);
};

InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) {
  using namespace ARM;

  std::initializer_list<unsigned> hasBRegAddrList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
      tLDRr,  tLDRBr,  tLDRHr,  tSTRr,  tSTRBr,  tSTRHr,
  };
  for (auto op : hasBRegAddrList) {
    Info[op].HasBRegAddr = true;
  }

  std::initializer_list<unsigned> hasBRegAddrShiftList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
  };
  for (auto op : hasBRegAddrShiftList) {
    Info[op].HasBRegAddrShift = true;
  }

  Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;

  std::initializer_list<unsigned> isInlineShiftALUList = {
      t2ADCrs,  t2ADDSrs, t2ADDrs,  t2BICrs, t2EORrs,
      t2ORNrs,  t2RSBSrs, t2RSBrs,  t2SBCrs, t2SUBrs,
      t2SUBSrs, t2CMPrs,  t2CMNzrs, t2TEQrs, t2TSTrs,
  };
  for (auto op : isInlineShiftALUList) {
    Info[op].IsInlineShiftALU = true;
  }

  Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;

  std::initializer_list<unsigned> isMultiplyList = {
      t2MUL,     t2MLA,     t2MLS,     t2SMLABB,  t2SMLABT,  t2SMLAD,
      t2SMLADX,  t2SMLAL,   t2SMLALBB, t2SMLALBT, t2SMLALD,  t2SMLALDX,
      t2SMLALTB, t2SMLALTT, t2SMLATB,  t2SMLATT,  t2SMLAWT,  t2SMLSD,
      t2SMLSDX,  t2SMLSLD,  t2SMLSLDX, t2SMMLA,   t2SMMLAR,  t2SMMLS,
      t2SMMLSR,  t2SMMUL,   t2SMMULR,  t2SMUAD,   t2SMUADX,  t2SMULBB,
      t2SMULBT,  t2SMULL,   t2SMULTB,  t2SMULTT,  t2SMULWT,  t2SMUSD,
      t2SMUSDX,  t2UMAAL,   t2UMLAL,   t2UMULL,   tMUL,
  };
  for (auto op : isMultiplyList) {
    Info[op].IsMultiply = true;
  }

  std::initializer_list<unsigned> isMVEIntMACList = {
      MVE_VMLAS_qr_i16,    MVE_VMLAS_qr_i32,    MVE_VMLAS_qr_i8,
      MVE_VMLA_qr_i16,     MVE_VMLA_qr_i32,     MVE_VMLA_qr_i8,
      MVE_VQDMLAH_qrs16,   MVE_VQDMLAH_qrs32,   MVE_VQDMLAH_qrs8,
      MVE_VQDMLASH_qrs16,  MVE_VQDMLASH_qrs32,  MVE_VQDMLASH_qrs8,
      MVE_VQRDMLAH_qrs16,  MVE_VQRDMLAH_qrs32,  MVE_VQRDMLAH_qrs8,
      MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,
      MVE_VQDMLADHXs16,    MVE_VQDMLADHXs32,    MVE_VQDMLADHXs8,
      MVE_VQDMLADHs16,     MVE_VQDMLADHs32,     MVE_VQDMLADHs8,
      MVE_VQDMLSDHXs16,    MVE_VQDMLSDHXs32,    MVE_VQDMLSDHXs8,
      MVE_VQDMLSDHs16,     MVE_VQDMLSDHs32,     MVE_VQDMLSDHs8,
      MVE_VQRDMLADHXs16,   MVE_VQRDMLADHXs32,   MVE_VQRDMLADHXs8,
      MVE_VQRDMLADHs16,    MVE_VQRDMLADHs32,    MVE_VQRDMLADHs8,
      MVE_VQRDMLSDHXs16,   MVE_VQRDMLSDHXs32,   MVE_VQRDMLSDHXs8,
      MVE_VQRDMLSDHs16,    MVE_VQRDMLSDHs32,    MVE_VQRDMLSDHs8,
  };
  for (auto op : isMVEIntMACList) {
    Info[op].IsMVEIntMAC = true;
  }

  std::initializer_list<unsigned> isNonSubwordLoadList = {
      t2LDRi12,   t2LDRi8,     t2LDR_POST, t2LDR_PRE, t2LDRpci,
      t2LDRs,     t2LDRDi8,    t2LDRD_POST, t2LDRD_PRE,
      tLDRi,      tLDRpci,     tLDRr,      tLDRspi,
  };
  for (auto op : isNonSubwordLoadList) {
    Info[op].IsNonSubwordLoad = true;
  }

  std::initializer_list<unsigned> isRevList = {
      t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,
  };
  for (auto op : isRevList) {
    Info[op].IsRev = true;
  }

  std::initializer_list<unsigned> isShiftList = {
      t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,
      tASRri,  tASRrr,  tLSLSri, tLSLri,  tLSLrr,  tLSRri,  tLSRrr,  tROR,
  };
  for (auto op : isShiftList) {
    Info[op].IsShift = true;
  }
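  // The three lists below drive the AddressOpMask computation further down:
  // bit k of the mask marks MachineInstr operand k as feeding address
  // generation.  Address1List covers the plain load/store forms
  // (operands 1-2), Address2List the pre/post-indexed forms that carry a
  // writeback def (operands 2-3), and Address3List the writeback
  // dual-register forms (operands 3-4).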
  std::initializer_list<unsigned> Address1List = {
      t2LDRBi12,  t2LDRBi8,  t2LDRBpci,  t2LDRBs,
      t2LDRHi12,  t2LDRHi8,  t2LDRHpci,  t2LDRHs,
      t2LDRSBi12, t2LDRSBi8, t2LDRSBpci, t2LDRSBs,
      t2LDRSHi12, t2LDRSHi8, t2LDRSHpci, t2LDRSHs,
      t2LDRi12,   t2LDRi8,   t2LDRpci,   t2LDRs,
      tLDRBi,     tLDRBr,    tLDRHi,     tLDRHr,
      tLDRSB,     tLDRSH,    tLDRi,      tLDRpci,
      tLDRr,      tLDRspi,
      t2STRBi12,  t2STRBi8,  t2STRBs,
      t2STRHi12,  t2STRHi8,  t2STRHs,
      t2STRi12,   t2STRi8,   t2STRs,
      tSTRBi,     tSTRBr,    tSTRHi,     tSTRHr,
      tSTRi,      tSTRr,     tSTRspi,
      VLDRD,      VLDRH,     VLDRS,
      VSTRD,      VSTRH,     VSTRS,
      MVE_VLD20_16,    MVE_VLD20_32,    MVE_VLD20_8,
      MVE_VLD21_16,    MVE_VLD21_32,    MVE_VLD21_8,
      MVE_VLD40_16,    MVE_VLD40_32,    MVE_VLD40_8,
      MVE_VLD41_16,    MVE_VLD41_32,    MVE_VLD41_8,
      MVE_VLD42_16,    MVE_VLD42_32,    MVE_VLD42_8,
      MVE_VLD43_16,    MVE_VLD43_32,    MVE_VLD43_8,
      MVE_VLDRBS16,    MVE_VLDRBS16_rq, MVE_VLDRBS32,    MVE_VLDRBS32_rq,
      MVE_VLDRBU16,    MVE_VLDRBU16_rq, MVE_VLDRBU32,    MVE_VLDRBU32_rq,
      MVE_VLDRBU8,     MVE_VLDRBU8_rq,
      MVE_VLDRDU64_qi, MVE_VLDRDU64_rq, MVE_VLDRDU64_rq_u,
      MVE_VLDRHS32,    MVE_VLDRHS32_rq, MVE_VLDRHS32_rq_u,
      MVE_VLDRHU16,    MVE_VLDRHU16_rq, MVE_VLDRHU16_rq_u,
      MVE_VLDRHU32,    MVE_VLDRHU32_rq, MVE_VLDRHU32_rq_u,
      MVE_VLDRWU32,    MVE_VLDRWU32_qi, MVE_VLDRWU32_rq, MVE_VLDRWU32_rq_u,
      MVE_VST20_16,    MVE_VST20_32,    MVE_VST20_8,
      MVE_VST21_16,    MVE_VST21_32,    MVE_VST21_8,
      MVE_VST40_16,    MVE_VST40_32,    MVE_VST40_8,
      MVE_VST41_16,    MVE_VST41_32,    MVE_VST41_8,
      MVE_VST42_16,    MVE_VST42_32,    MVE_VST42_8,
      MVE_VST43_16,    MVE_VST43_32,    MVE_VST43_8,
      MVE_VSTRB16,     MVE_VSTRB16_rq,  MVE_VSTRB32,     MVE_VSTRB32_rq,
      MVE_VSTRBU8,     MVE_VSTRB8_rq,
      MVE_VSTRD64_qi,  MVE_VSTRD64_rq,  MVE_VSTRD64_rq_u,
      MVE_VSTRH32,     MVE_VSTRH32_rq,  MVE_VSTRH32_rq_u,
      MVE_VSTRHU16,    MVE_VSTRH16_rq,  MVE_VSTRH16_rq_u,
      MVE_VSTRWU32,    MVE_VSTRW32_qi,  MVE_VSTRW32_rq,  MVE_VSTRW32_rq_u,
  };
  std::initializer_list<unsigned> Address2List = {
      t2LDRB_POST,        t2LDRB_PRE,          t2LDRDi8,
      t2LDRH_POST,        t2LDRH_PRE,          t2LDRSB_POST,
      t2LDRSB_PRE,        t2LDRSH_POST,        t2LDRSH_PRE,
      t2LDR_POST,         t2LDR_PRE,           t2STRB_POST,
      t2STRB_PRE,         t2STRDi8,            t2STRH_POST,
      t2STRH_PRE,         t2STR_POST,          t2STR_PRE,
      MVE_VLD20_16_wb,    MVE_VLD20_32_wb,     MVE_VLD20_8_wb,
      MVE_VLD21_16_wb,    MVE_VLD21_32_wb,     MVE_VLD21_8_wb,
      MVE_VLD40_16_wb,    MVE_VLD40_32_wb,     MVE_VLD40_8_wb,
      MVE_VLD41_16_wb,    MVE_VLD41_32_wb,     MVE_VLD41_8_wb,
      MVE_VLD42_16_wb,    MVE_VLD42_32_wb,     MVE_VLD42_8_wb,
      MVE_VLD43_16_wb,    MVE_VLD43_32_wb,     MVE_VLD43_8_wb,
      MVE_VLDRBS16_post,  MVE_VLDRBS16_pre,    MVE_VLDRBS32_post,
      MVE_VLDRBS32_pre,   MVE_VLDRBU16_post,   MVE_VLDRBU16_pre,
      MVE_VLDRBU32_post,  MVE_VLDRBU32_pre,    MVE_VLDRBU8_post,
      MVE_VLDRBU8_pre,    MVE_VLDRDU64_qi_pre, MVE_VLDRHS32_post,
      MVE_VLDRHS32_pre,   MVE_VLDRHU16_post,   MVE_VLDRHU16_pre,
      MVE_VLDRHU32_post,  MVE_VLDRHU32_pre,    MVE_VLDRWU32_post,
      MVE_VLDRWU32_pre,   MVE_VLDRWU32_qi_pre,
      MVE_VST20_16_wb,    MVE_VST20_32_wb,     MVE_VST20_8_wb,
      MVE_VST21_16_wb,    MVE_VST21_32_wb,     MVE_VST21_8_wb,
      MVE_VST40_16_wb,    MVE_VST40_32_wb,     MVE_VST40_8_wb,
      MVE_VST41_16_wb,    MVE_VST41_32_wb,     MVE_VST41_8_wb,
      MVE_VST42_16_wb,    MVE_VST42_32_wb,     MVE_VST42_8_wb,
      MVE_VST43_16_wb,    MVE_VST43_32_wb,     MVE_VST43_8_wb,
      MVE_VSTRB16_post,   MVE_VSTRB16_pre,     MVE_VSTRB32_post,
      MVE_VSTRB32_pre,    MVE_VSTRBU8_post,    MVE_VSTRBU8_pre,
      MVE_VSTRD64_qi_pre, MVE_VSTRH32_post,    MVE_VSTRH32_pre,
      MVE_VSTRHU16_post,  MVE_VSTRHU16_pre,    MVE_VSTRWU32_post,
      MVE_VSTRWU32_pre,   MVE_VSTRW32_qi_pre,
  };
  std::initializer_list<unsigned> Address3List = {
      t2LDRD_POST,
      t2LDRD_PRE,
      t2STRD_POST,
      t2STRD_PRE,
  };
  // Compute a mask of which operands are involved in address computation
  for (auto &op : Address1List) {
    Info[op].AddressOpMask = 0x6;
  }
  for (auto &op : Address2List) {
    Info[op].AddressOpMask = 0xc;
  }
  for (auto &op : Address3List) {
    Info[op].AddressOpMask = 0x18;
  }
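  // For the register-offset forms that can also apply a shift, the shift
  // amount operand (operand 3 in these encodings) participates in address
  // generation as well, so add it to the mask.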
  for (auto &op : hasBRegAddrShiftList) {
    Info[op].AddressOpMask |= 0x8;
  }
}

void InstructionInformation::markDPProducersConsumers(
    const ARMBaseInstrInfo *TII) {
  // Learn about all instructions which have FP source/dest registers
  for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {
    const MCInstrDesc &MID = TII->get(MI);
    auto Operands = MID.operands();
    for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {
      bool MarkQP = false, MarkDP = false, MarkSP = false;
      switch (Operands[OI].RegClass) {
      case ARM::MQPRRegClassID:
      case ARM::DPRRegClassID:
      case ARM::DPR_8RegClassID:
      case ARM::DPR_VFP2RegClassID:
      case ARM::DPairRegClassID:
      case ARM::DPairSpcRegClassID:
      case ARM::DQuadRegClassID:
      case ARM::DQuadSpcRegClassID:
      case ARM::DTripleRegClassID:
      case ARM::DTripleSpcRegClassID:
        MarkDP = true;
        break;
      case ARM::QPRRegClassID:
      case ARM::QPR_8RegClassID:
      case ARM::QPR_VFP2RegClassID:
      case ARM::QQPRRegClassID:
      case ARM::QQQQPRRegClassID:
        MarkQP = true;
        break;
      case ARM::SPRRegClassID:
      case ARM::SPR_8RegClassID:
      case ARM::FPWithVPRRegClassID:
        MarkSP = true;
        break;
      default:
        break;
      }
      if (MarkQP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesQP = true;
        else
          Info[MI].ConsumesQP = true;
      }
      if (MarkDP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesDP = true;
        else
          Info[MI].ConsumesDP = true;
      }
      if (MarkSP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesSP = true;
        else
          Info[MI].ConsumesSP = true;
      }
    }
  }
}

} // anonymous namespace

static bool hasImplicitCPSRUse(const MachineInstr *MI) {
  return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);
}

void ARMOverrideBypasses::setBidirLatencies(SUnit &SrcSU, SDep &SrcDep,
                                            unsigned latency) {
  // Update both the successor edge on SrcSU and the mirrored predecessor
  // edge on the dependent SUnit so the two views stay consistent.
  SDep Reverse = SrcDep;
  Reverse.setSUnit(&SrcSU);
  for (SDep &PDep : SrcDep.getSUnit()->Preds) {
    if (PDep == Reverse) {
      PDep.setLatency(latency);
      SrcDep.getSUnit()->setDepthDirty();
      break;
    }
  }
  SrcDep.setLatency(latency);
  SrcSU.setHeightDirty();
}

// Condition codes come in inverse pairs that differ only in the low bit
// (e.g. EQ/NE), so masking with 0xe treats a predicate and its inverse as
// the same condition; only genuinely different conditions mismatch.
static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) {
  return (a & 0xe) != (b & 0xe);
}

// Set output dependences to zero latency for processors which can
// simultaneously issue to the same register.  Returns true if a change
// was made.
bool ARMOverrideBypasses::zeroOutputDependences(SUnit &ISU, SDep &Dep) {
  if (Dep.getKind() == SDep::Output) {
    setBidirLatencies(ISU, Dep, 0);
    return true;
  }
  return false;
}

// The graph doesn't look inside of bundles to determine their
// scheduling boundaries and reports zero latency into and out of them
// (except for CPSR into the bundle, which has latency 1).
// Make some better scheduling assumptions:
// 1) CPSR uses have zero latency; other uses have incoming latency 1
// 2) CPSR defs retain a latency of zero; others have a latency of 1.
//
// Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise
unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) {
  SUnit &DepSU = *Dep.getSUnit();
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  const MachineInstr *DstMI = DepSU.getInstr();
  unsigned DstOpcode = DstMI->getOpcode();

  if (DstOpcode == ARM::BUNDLE && TII->isPredicated(*DstMI)) {
    setBidirLatencies(
        ISU, Dep,
        (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 0 : 1);
    return 1;
  }
  if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(*SrcMI) &&
      Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) {
    setBidirLatencies(ISU, Dep, 1);
    return 2;
  }
  return 0;
}

// Determine whether there is a memory RAW hazard here and set up latency
// accordingly
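// Only the first memory operand of the store and of the load is examined, so
// the must-alias check below is best-effort rather than exhaustive.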
bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep,
                                          unsigned latency) {
  if (!Dep.isNormalMemory())
    return false;
  auto &SrcInst = *ISU.getInstr();
  auto &DstInst = *Dep.getSUnit()->getInstr();
  if (!SrcInst.mayStore() || !DstInst.mayLoad())
    return false;

  auto SrcMO = *SrcInst.memoperands().begin();
  auto DstMO = *DstInst.memoperands().begin();
  auto SrcVal = SrcMO->getValue();
  auto DstVal = DstMO->getValue();
  auto SrcPseudoVal = SrcMO->getPseudoValue();
  auto DstPseudoVal = DstMO->getPseudoValue();
  if (SrcVal && DstVal &&
      AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&
      SrcMO->getOffset() == DstMO->getOffset()) {
    setBidirLatencies(ISU, Dep, latency);
    return true;
  } else if (SrcPseudoVal && DstPseudoVal &&
             SrcPseudoVal->kind() == DstPseudoVal->kind() &&
             SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {
    // Spills/fills
    auto FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);
    auto FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);
    if (FS0 == FS1) {
      setBidirLatencies(ISU, Dep, latency);
      return true;
    }
  }
  return false;
}

namespace {

std::unique_ptr<InstructionInformation> II;

class CortexM7InstructionInformation : public InstructionInformation {
public:
  CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)
      : InstructionInformation(TII) {}
};

class CortexM7Overrides : public ARMOverrideBypasses {
public:
  CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
      : ARMOverrideBypasses(TII, AA) {
    if (!II)
      II.reset(new CortexM7InstructionInformation(TII));
  }

  void modifyBypasses(SUnit &) override;
};

void CortexM7Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

  // Walk the successors looking for latency overrides that are needed
  for (SDep &Dep : ISU.Succs) {

    // Output dependences should have 0 latency, as M7 is able to
    // schedule writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    if (memoryRAWHazard(ISU, Dep, 4))
      continue;

    // Ignore dependencies other than data
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &DepSU = *Dep.getSUnit();
    if (DepSU.isBoundaryNode())
      continue;

    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *DstMI = DepSU.getInstr();
    unsigned DstOpcode = DstMI->getOpcode();

    // Word loads into any multiply or divide instruction are considered
    // unable to bypass their scheduling stage.  Didn't do this in the .td
    // file because we cannot easily create a read advance that is 0 from
    // certain writer classes and 1 from all the rest.
    // (The other way around would have been easy.)
    if (isNSWload && (II->isMultiply(DstOpcode) || II->isDivide(DstOpcode)))
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    // Word loads into the B operand of a load/store are likewise unable to
    // bypass their scheduling stage.  Cannot do this in the .td file because
    // we need to decide between -1 and -2 for ReadAdvance.
    if (isNSWload && II->hasBRegAddr(DstOpcode) &&
        DstMI->getOperand(2).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
    // Multiplies into any address generation cannot bypass from EX3.  Cannot
    // do this in the .td file because we need to decide between -1 and -2 for
    // ReadAdvance.
    if (II->isMultiply(SrcOpcode)) {
      unsigned OpMask = II->getAddressOpMask(DstOpcode) >> 1;
      for (unsigned i = 1; OpMask; ++i, OpMask >>= 1) {
        if ((OpMask & 1) && DstMI->getOperand(i).isReg() &&
            DstMI->getOperand(i).getReg() == Dep.getReg()) {
          setBidirLatencies(ISU, Dep, 4); // first legal bypass is EX4->EX1
          break;
        }
      }
    }

    // Mismatched conditional producers take longer on M7; they end up looking
    // like they were produced at EX3 and read at IS.
    if (TII->isPredicated(*SrcMI) && Dep.isAssignedRegDep() &&
        (SrcOpcode == ARM::BUNDLE ||
         mismatchedPred(TII->getPredicate(*SrcMI),
                        TII->getPredicate(*DstMI)))) {
      unsigned Lat = 1;
      // Operand A of shift+ALU is treated as an EX1 read instead of EX2.
      if (II->isInlineShiftALU(DstOpcode) && DstMI->getOperand(3).getImm() &&
          DstMI->getOperand(1).getReg() == Dep.getReg())
        Lat = 2;
      Lat = std::min(3u, Dep.getLatency() + Lat);
      setBidirLatencies(ISU, Dep, std::max(Dep.getLatency(), Lat));
    }

    // A CC setter into a conditional producer shouldn't have a latency of
    // more than 1 unless it's due to an implicit read.  (All the "true"
    // readers of the condition code use an implicit read, and predicates use
    // an explicit one.)
    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
      setBidirLatencies(ISU, Dep, 1);

    // REV instructions cannot bypass directly into the EX1 shifter.  The code
    // is slightly inexact as it doesn't attempt to ensure that the bypass is
    // to the shifter operands.
    if (II->isRev(SrcOpcode)) {
      if (II->isInlineShiftALU(DstOpcode))
        setBidirLatencies(ISU, Dep, 2);
      else if (II->isShift(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
    }
  }
}

class M85InstructionInformation : public InstructionInformation {
public:
  M85InstructionInformation(const ARMBaseInstrInfo *t)
      : InstructionInformation(t) {
    markDPProducersConsumers(t);
  }
};

class M85Overrides : public ARMOverrideBypasses {
public:
  M85Overrides(const ARMBaseInstrInfo *t, AAResults *a)
      : ARMOverrideBypasses(t, a) {
    if (!II)
      II.reset(new M85InstructionInformation(t));
  }

  void modifyBypasses(SUnit &) override;

private:
  unsigned computeBypassStage(const MCSchedClassDesc *SCD);
  signed modifyMixedWidthFP(const MachineInstr *SrcMI,
                            const MachineInstr *DstMI, unsigned RegID,
                            const MCSchedClassDesc *SCD);
};

unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) {
  auto SM = DAG->getSchedModel();
  unsigned DefIdx = 0; // just look for the first output's timing
  if (DefIdx < SCDesc->NumWriteLatencyEntries) {
    // Lookup the definition's write latency in SubtargetInfo.
    const MCWriteLatencyEntry *WLEntry =
        SM->getSubtargetInfo()->getWriteLatencyEntry(SCDesc, DefIdx);
    unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000;
    if (Latency == 4)
      return 2;
    else if (Latency == 5)
      return 3;
    else if (Latency > 3)
      return 3;
    else
      return Latency;
  }
  return 2;
}

// Latency changes for bypassing between FP registers of different sizes:
//
// Note that mixed DP/SP are unlikely because of the semantics
// of C.  Mixed MVE/SP are quite common when MVE intrinsics are used.
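// The value returned is a signed adjustment that modifyBypasses() adds to the
// existing edge latency (clamping at zero); a return of 0 leaves the edge
// unchanged.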
signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,
                                        const MachineInstr *DstMI,
                                        unsigned RegID,
                                        const MCSchedClassDesc *SCD) {

  if (!II->producesSP(SrcMI->getOpcode()) &&
      !II->producesDP(SrcMI->getOpcode()) &&
      !II->producesQP(SrcMI->getOpcode()))
    return 0;

  if (Register::isVirtualRegister(RegID)) {
    if (II->producesSP(SrcMI->getOpcode()) &&
        II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 5 - computeBypassStage(SCD);
    } else if (II->producesSP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
          return 5 - computeBypassStage(SCD) -
                 ((OP.getSubReg() == ARM::ssub_2 ||
                   OP.getSubReg() == ARM::ssub_3)
                      ? 1
                      : 0);
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return -1;
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 5 - computeBypassStage(SCD);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
          return 5 - computeBypassStage(SCD) +
                 ((OP.getSubReg() == ARM::ssub_2 ||
                   OP.getSubReg() == ARM::ssub_3)
                      ? 1
                      : 0);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 1;
    }
  } else if (Register::isPhysicalRegister(RegID)) {
    // Note that when the producer is narrower, not all of the producers
    // may be present in the scheduling graph; somewhere earlier in the
    // compiler, an implicit def/use of the aliased full register gets
    // added to the producer, and so only that producer is seen as *the*
    // single producer.  This behavior also has the unfortunate effect of
    // serializing the producers in the compiler's view of things.
    if (II->producesSP(SrcMI->getOpcode()) &&
        II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
            OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
             (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
          return 5 - computeBypassStage(SCD);
    } else if (II->producesSP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
            OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
             (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
          return 5 - computeBypassStage(SCD) -
                 (((OP.getReg() - ARM::S0) / 2) % 2 ? 1 : 0);
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 &&
            OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID))
          return -1;
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
        return 5 - computeBypassStage(SCD);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
        return 5 - computeBypassStage(SCD) +
               (((RegID - ARM::S0) / 2) % 2 ? 1 : 0);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesDP(DstMI->getOpcode())) {
      if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2)
        return 1;
    }
  }
  return 0;
}

void M85Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

  // Walk the successors looking for latency overrides that are needed
  for (SDep &Dep : ISU.Succs) {

    // Output dependences should have 0 latency, as CortexM85 is able to
    // schedule writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    if (memoryRAWHazard(ISU, Dep, 3))
      continue;

    // Ignore dependencies other than data or strong ordering.
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &DepSU = *Dep.getSUnit();
    if (DepSU.isBoundaryNode())
      continue;

    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *DstMI = DepSU.getInstr();
    unsigned DstOpcode = DstMI->getOpcode();

    // Word loads into the B operand of a load/store with a shifted offset
    // cannot bypass their scheduling stage.  Cannot do this in the .td file
    // because we need to decide between -1 and -2 for ReadAdvance.
    if (isNSWload && II->hasBRegAddrShift(DstOpcode) &&
        DstMI->getOperand(3).getImm() != 0 && // shift operand
        DstMI->getOperand(2).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    if (isNSWload && isMVEVectorInstruction(DstMI)) {
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
    }

    // Matched MVE integer MAC pairs get one cycle less latency on the
    // accumulator operand.
    if (II->isMVEIntMAC(DstOpcode) &&
        II->isMVEIntMACMatched(SrcOpcode, DstOpcode) &&
        DstMI->getOperand(0).isReg() &&
        DstMI->getOperand(0).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() - 1);

    // A CC setter into a conditional producer shouldn't have a latency of
    // more than 0 unless it's due to an implicit read.
    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
      setBidirLatencies(ISU, Dep, 0);

    if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, Dep.getReg(),
                                         DAG->getSchedClass(&ISU)))
      setBidirLatencies(ISU, Dep,
                        std::max(0, signed(Dep.getLatency()) + ALat));

    if (II->isRev(SrcOpcode)) {
      if (II->isInlineShiftALU(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
      else if (II->isShift(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
    }
  }
}

// Add M55 specific overrides for latencies between instructions.  Currently it:
// - Adds an extra cycle latency between MVE VMLAV and scalar instructions.
class CortexM55Overrides : public ARMOverrideBypasses {
public:
  CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
      : ARMOverrideBypasses(TII, AA) {}

  void modifyBypasses(SUnit &SU) override {
    MachineInstr *SrcMI = SU.getInstr();
    if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction))
      return;

    for (SDep &Dep : SU.Succs) {
      if (Dep.getKind() != SDep::Data)
        continue;
      SUnit &DepSU = *Dep.getSUnit();
      if (DepSU.isBoundaryNode())
        continue;
      MachineInstr *DstMI = DepSU.getInstr();

      if (!isMVEVectorInstruction(DstMI) && !DstMI->mayStore())
        setBidirLatencies(SU, Dep, 3);
    }
  }
};

} // end anonymous namespace

void ARMOverrideBypasses::apply(ScheduleDAGInstrs *DAGInstrs) {
  DAG = DAGInstrs;
  for (SUnit &ISU : DAGInstrs->SUnits) {
    if (ISU.isBoundaryNode())
      continue;
    modifyBypasses(ISU);
  }
  if (DAGInstrs->ExitSU.getInstr())
    modifyBypasses(DAGInstrs->ExitSU);
}

std::unique_ptr<ScheduleDAGMutation>
createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA) {
  if (ST.isCortexM85())
    return std::make_unique<M85Overrides>(ST.getInstrInfo(), AA);
  else if (ST.isCortexM7())
    return std::make_unique<CortexM7Overrides>(ST.getInstrInfo(), AA);
  else if (ST.isCortexM55())
    return std::make_unique<CortexM55Overrides>(ST.getInstrInfo(), AA);
  return nullptr;
}

} // end namespace llvm