15ffd83dbSDimitry Andric //=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===// 25ffd83dbSDimitry Andric // 35ffd83dbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 45ffd83dbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 55ffd83dbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 65ffd83dbSDimitry Andric // 75ffd83dbSDimitry Andric //===----------------------------------------------------------------------===// 85ffd83dbSDimitry Andric // 95ffd83dbSDimitry Andric // This pass does combining of machine instructions at the generic MI level, 105ffd83dbSDimitry Andric // before the legalizer. 115ffd83dbSDimitry Andric // 125ffd83dbSDimitry Andric //===----------------------------------------------------------------------===// 135ffd83dbSDimitry Andric 14fe6060f1SDimitry Andric #include "AArch64GlobalISelUtils.h" 155ffd83dbSDimitry Andric #include "AArch64TargetMachine.h" 1681ad6265SDimitry Andric #include "llvm/CodeGen/GlobalISel/CSEInfo.h" 175ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/Combiner.h" 185ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" 195ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" 2006c3fb27SDimitry Andric #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" 215ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 225ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 23fe6060f1SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 247a6dacacSDimitry Andric #include "llvm/CodeGen/GlobalISel/Utils.h" 255ffd83dbSDimitry Andric #include "llvm/CodeGen/MachineDominators.h" 26fe6060f1SDimitry Andric #include "llvm/CodeGen/MachineFunction.h" 275ffd83dbSDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h" 28fe6060f1SDimitry Andric #include "llvm/CodeGen/MachineRegisterInfo.h" 295ffd83dbSDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h" 30fe6060f1SDimitry Andric #include "llvm/IR/Instructions.h" 315ffd83dbSDimitry Andric #include "llvm/Support/Debug.h" 325ffd83dbSDimitry Andric 3306c3fb27SDimitry Andric #define GET_GICOMBINER_DEPS 3406c3fb27SDimitry Andric #include "AArch64GenPreLegalizeGICombiner.inc" 3506c3fb27SDimitry Andric #undef GET_GICOMBINER_DEPS 3606c3fb27SDimitry Andric 375ffd83dbSDimitry Andric #define DEBUG_TYPE "aarch64-prelegalizer-combiner" 385ffd83dbSDimitry Andric 395ffd83dbSDimitry Andric using namespace llvm; 405ffd83dbSDimitry Andric using namespace MIPatternMatch; 415ffd83dbSDimitry Andric 4206c3fb27SDimitry Andric namespace { 4306c3fb27SDimitry Andric 4406c3fb27SDimitry Andric #define GET_GICOMBINER_TYPES 4506c3fb27SDimitry Andric #include "AArch64GenPreLegalizeGICombiner.inc" 4606c3fb27SDimitry Andric #undef GET_GICOMBINER_TYPES 4706c3fb27SDimitry Andric 485ffd83dbSDimitry Andric /// Return true if a G_FCONSTANT instruction is known to be better-represented 495ffd83dbSDimitry Andric /// as a G_CONSTANT. 5006c3fb27SDimitry Andric bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) { 515ffd83dbSDimitry Andric assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); 525ffd83dbSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 535ffd83dbSDimitry Andric const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); 545ffd83dbSDimitry Andric if (DstSize != 32 && DstSize != 64) 555ffd83dbSDimitry Andric return false; 565ffd83dbSDimitry Andric 575ffd83dbSDimitry Andric // When we're storing a value, it doesn't matter what register bank it's on. 585ffd83dbSDimitry Andric // Since not all floating point constants can be materialized using a fmov, 595ffd83dbSDimitry Andric // it makes more sense to just use a GPR. 605ffd83dbSDimitry Andric return all_of(MRI.use_nodbg_instructions(DstReg), 615ffd83dbSDimitry Andric [](const MachineInstr &Use) { return Use.mayStore(); }); 625ffd83dbSDimitry Andric } 635ffd83dbSDimitry Andric 645ffd83dbSDimitry Andric /// Change a G_FCONSTANT into a G_CONSTANT. 6506c3fb27SDimitry Andric void applyFConstantToConstant(MachineInstr &MI) { 665ffd83dbSDimitry Andric assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT); 675ffd83dbSDimitry Andric MachineIRBuilder MIB(MI); 685ffd83dbSDimitry Andric const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF(); 695ffd83dbSDimitry Andric MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt()); 705ffd83dbSDimitry Andric MI.eraseFromParent(); 715ffd83dbSDimitry Andric } 725ffd83dbSDimitry Andric 73fe6060f1SDimitry Andric /// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits 74fe6060f1SDimitry Andric /// are sign bits. In this case, we can transform the G_ICMP to directly compare 75fe6060f1SDimitry Andric /// the wide value with a zero. 7606c3fb27SDimitry Andric bool matchICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, 77fe6060f1SDimitry Andric GISelKnownBits *KB, Register &MatchInfo) { 78fe6060f1SDimitry Andric assert(MI.getOpcode() == TargetOpcode::G_ICMP && KB); 79fe6060f1SDimitry Andric 80fe6060f1SDimitry Andric auto Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate(); 81fe6060f1SDimitry Andric if (!ICmpInst::isEquality(Pred)) 82fe6060f1SDimitry Andric return false; 83fe6060f1SDimitry Andric 84fe6060f1SDimitry Andric Register LHS = MI.getOperand(2).getReg(); 85fe6060f1SDimitry Andric LLT LHSTy = MRI.getType(LHS); 86fe6060f1SDimitry Andric if (!LHSTy.isScalar()) 87fe6060f1SDimitry Andric return false; 88fe6060f1SDimitry Andric 89fe6060f1SDimitry Andric Register RHS = MI.getOperand(3).getReg(); 90fe6060f1SDimitry Andric Register WideReg; 91fe6060f1SDimitry Andric 92fe6060f1SDimitry Andric if (!mi_match(LHS, MRI, m_GTrunc(m_Reg(WideReg))) || 93fe6060f1SDimitry Andric !mi_match(RHS, MRI, m_SpecificICst(0))) 94fe6060f1SDimitry Andric return false; 95fe6060f1SDimitry Andric 96fe6060f1SDimitry Andric LLT WideTy = MRI.getType(WideReg); 97fe6060f1SDimitry Andric if (KB->computeNumSignBits(WideReg) <= 98fe6060f1SDimitry Andric WideTy.getSizeInBits() - LHSTy.getSizeInBits()) 99fe6060f1SDimitry Andric return false; 100fe6060f1SDimitry Andric 101fe6060f1SDimitry Andric MatchInfo = WideReg; 102fe6060f1SDimitry Andric return true; 103fe6060f1SDimitry Andric } 104fe6060f1SDimitry Andric 10506c3fb27SDimitry Andric void applyICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, 106fe6060f1SDimitry Andric MachineIRBuilder &Builder, 10706c3fb27SDimitry Andric GISelChangeObserver &Observer, Register &WideReg) { 108fe6060f1SDimitry Andric assert(MI.getOpcode() == TargetOpcode::G_ICMP); 109fe6060f1SDimitry Andric 110fe6060f1SDimitry Andric LLT WideTy = MRI.getType(WideReg); 111fe6060f1SDimitry Andric // We're going to directly use the wide register as the LHS, and then use an 112fe6060f1SDimitry Andric // equivalent size zero for RHS. 113fe6060f1SDimitry Andric Builder.setInstrAndDebugLoc(MI); 114fe6060f1SDimitry Andric auto WideZero = Builder.buildConstant(WideTy, 0); 115fe6060f1SDimitry Andric Observer.changingInstr(MI); 116fe6060f1SDimitry Andric MI.getOperand(2).setReg(WideReg); 117fe6060f1SDimitry Andric MI.getOperand(3).setReg(WideZero.getReg(0)); 118fe6060f1SDimitry Andric Observer.changedInstr(MI); 119fe6060f1SDimitry Andric } 120fe6060f1SDimitry Andric 121fe6060f1SDimitry Andric /// \returns true if it is possible to fold a constant into a G_GLOBAL_VALUE. 122fe6060f1SDimitry Andric /// 123fe6060f1SDimitry Andric /// e.g. 124fe6060f1SDimitry Andric /// 125fe6060f1SDimitry Andric /// %g = G_GLOBAL_VALUE @x -> %g = G_GLOBAL_VALUE @x + cst 12606c3fb27SDimitry Andric bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI, 127fe6060f1SDimitry Andric std::pair<uint64_t, uint64_t> &MatchInfo) { 128fe6060f1SDimitry Andric assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE); 129fe6060f1SDimitry Andric MachineFunction &MF = *MI.getMF(); 130fe6060f1SDimitry Andric auto &GlobalOp = MI.getOperand(1); 131fe6060f1SDimitry Andric auto *GV = GlobalOp.getGlobal(); 132fe6060f1SDimitry Andric if (GV->isThreadLocal()) 133fe6060f1SDimitry Andric return false; 134fe6060f1SDimitry Andric 135fe6060f1SDimitry Andric // Don't allow anything that could represent offsets etc. 136fe6060f1SDimitry Andric if (MF.getSubtarget<AArch64Subtarget>().ClassifyGlobalReference( 137fe6060f1SDimitry Andric GV, MF.getTarget()) != AArch64II::MO_NO_FLAG) 138fe6060f1SDimitry Andric return false; 139fe6060f1SDimitry Andric 140fe6060f1SDimitry Andric // Look for a G_GLOBAL_VALUE only used by G_PTR_ADDs against constants: 141fe6060f1SDimitry Andric // 142fe6060f1SDimitry Andric // %g = G_GLOBAL_VALUE @x 143fe6060f1SDimitry Andric // %ptr1 = G_PTR_ADD %g, cst1 144fe6060f1SDimitry Andric // %ptr2 = G_PTR_ADD %g, cst2 145fe6060f1SDimitry Andric // ... 146fe6060f1SDimitry Andric // %ptrN = G_PTR_ADD %g, cstN 147fe6060f1SDimitry Andric // 148fe6060f1SDimitry Andric // Identify the *smallest* constant. We want to be able to form this: 149fe6060f1SDimitry Andric // 150fe6060f1SDimitry Andric // %offset_g = G_GLOBAL_VALUE @x + min_cst 151fe6060f1SDimitry Andric // %g = G_PTR_ADD %offset_g, -min_cst 152fe6060f1SDimitry Andric // %ptr1 = G_PTR_ADD %g, cst1 153fe6060f1SDimitry Andric // ... 154fe6060f1SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 155fe6060f1SDimitry Andric uint64_t MinOffset = -1ull; 156fe6060f1SDimitry Andric for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) { 157fe6060f1SDimitry Andric if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD) 158fe6060f1SDimitry Andric return false; 159349cc55cSDimitry Andric auto Cst = getIConstantVRegValWithLookThrough( 160349cc55cSDimitry Andric UseInstr.getOperand(2).getReg(), MRI); 161fe6060f1SDimitry Andric if (!Cst) 162fe6060f1SDimitry Andric return false; 163fe6060f1SDimitry Andric MinOffset = std::min(MinOffset, Cst->Value.getZExtValue()); 164fe6060f1SDimitry Andric } 165fe6060f1SDimitry Andric 166fe6060f1SDimitry Andric // Require that the new offset is larger than the existing one to avoid 167fe6060f1SDimitry Andric // infinite loops. 168fe6060f1SDimitry Andric uint64_t CurrOffset = GlobalOp.getOffset(); 169fe6060f1SDimitry Andric uint64_t NewOffset = MinOffset + CurrOffset; 170fe6060f1SDimitry Andric if (NewOffset <= CurrOffset) 171fe6060f1SDimitry Andric return false; 172fe6060f1SDimitry Andric 173fe6060f1SDimitry Andric // Check whether folding this offset is legal. It must not go out of bounds of 174fe6060f1SDimitry Andric // the referenced object to avoid violating the code model, and must be 1753a9a9c0cSDimitry Andric // smaller than 2^20 because this is the largest offset expressible in all 1763a9a9c0cSDimitry Andric // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF 1773a9a9c0cSDimitry Andric // stores an immediate signed 21 bit offset.) 178fe6060f1SDimitry Andric // 179fe6060f1SDimitry Andric // This check also prevents us from folding negative offsets, which will end 180fe6060f1SDimitry Andric // up being treated in the same way as large positive ones. They could also 181fe6060f1SDimitry Andric // cause code model violations, and aren't really common enough to matter. 1823a9a9c0cSDimitry Andric if (NewOffset >= (1 << 20)) 183fe6060f1SDimitry Andric return false; 184fe6060f1SDimitry Andric 185fe6060f1SDimitry Andric Type *T = GV->getValueType(); 186fe6060f1SDimitry Andric if (!T->isSized() || 187*0fca6ea1SDimitry Andric NewOffset > GV->getDataLayout().getTypeAllocSize(T)) 188fe6060f1SDimitry Andric return false; 189fe6060f1SDimitry Andric MatchInfo = std::make_pair(NewOffset, MinOffset); 190fe6060f1SDimitry Andric return true; 191fe6060f1SDimitry Andric } 192fe6060f1SDimitry Andric 19306c3fb27SDimitry Andric void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI, 19406c3fb27SDimitry Andric MachineIRBuilder &B, GISelChangeObserver &Observer, 195fe6060f1SDimitry Andric std::pair<uint64_t, uint64_t> &MatchInfo) { 196fe6060f1SDimitry Andric // Change: 197fe6060f1SDimitry Andric // 198fe6060f1SDimitry Andric // %g = G_GLOBAL_VALUE @x 199fe6060f1SDimitry Andric // %ptr1 = G_PTR_ADD %g, cst1 200fe6060f1SDimitry Andric // %ptr2 = G_PTR_ADD %g, cst2 201fe6060f1SDimitry Andric // ... 202fe6060f1SDimitry Andric // %ptrN = G_PTR_ADD %g, cstN 203fe6060f1SDimitry Andric // 204fe6060f1SDimitry Andric // To: 205fe6060f1SDimitry Andric // 206fe6060f1SDimitry Andric // %offset_g = G_GLOBAL_VALUE @x + min_cst 207fe6060f1SDimitry Andric // %g = G_PTR_ADD %offset_g, -min_cst 208fe6060f1SDimitry Andric // %ptr1 = G_PTR_ADD %g, cst1 209fe6060f1SDimitry Andric // ... 210fe6060f1SDimitry Andric // %ptrN = G_PTR_ADD %g, cstN 211fe6060f1SDimitry Andric // 212fe6060f1SDimitry Andric // Then, the original G_PTR_ADDs should be folded later on so that they look 213fe6060f1SDimitry Andric // like this: 214fe6060f1SDimitry Andric // 215fe6060f1SDimitry Andric // %ptrN = G_PTR_ADD %offset_g, cstN - min_cst 216fe6060f1SDimitry Andric uint64_t Offset, MinOffset; 217fe6060f1SDimitry Andric std::tie(Offset, MinOffset) = MatchInfo; 2185f757f3fSDimitry Andric B.setInstrAndDebugLoc(*std::next(MI.getIterator())); 219fe6060f1SDimitry Andric Observer.changingInstr(MI); 220fe6060f1SDimitry Andric auto &GlobalOp = MI.getOperand(1); 221fe6060f1SDimitry Andric auto *GV = GlobalOp.getGlobal(); 222fe6060f1SDimitry Andric GlobalOp.ChangeToGA(GV, Offset, GlobalOp.getTargetFlags()); 223fe6060f1SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 224fe6060f1SDimitry Andric Register NewGVDst = MRI.cloneVirtualRegister(Dst); 225fe6060f1SDimitry Andric MI.getOperand(0).setReg(NewGVDst); 226fe6060f1SDimitry Andric Observer.changedInstr(MI); 227fe6060f1SDimitry Andric B.buildPtrAdd( 228fe6060f1SDimitry Andric Dst, NewGVDst, 229fe6060f1SDimitry Andric B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset))); 230fe6060f1SDimitry Andric } 231fe6060f1SDimitry Andric 2325f757f3fSDimitry Andric // Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add(udot(x, y)) 2335f757f3fSDimitry Andric // Or vecreduce_add(ext(x)) -> vecreduce_add(udot(x, 1)) 2345f757f3fSDimitry Andric // Similar to performVecReduceAddCombine in SelectionDAG 2355f757f3fSDimitry Andric bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, 2365f757f3fSDimitry Andric const AArch64Subtarget &STI, 2375f757f3fSDimitry Andric std::tuple<Register, Register, bool> &MatchInfo) { 2385f757f3fSDimitry Andric assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD && 2395f757f3fSDimitry Andric "Expected a G_VECREDUCE_ADD instruction"); 2405f757f3fSDimitry Andric assert(STI.hasDotProd() && "Target should have Dot Product feature"); 2415f757f3fSDimitry Andric 2425f757f3fSDimitry Andric MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI); 2435f757f3fSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 2445f757f3fSDimitry Andric Register MidReg = I1->getOperand(0).getReg(); 2455f757f3fSDimitry Andric LLT DstTy = MRI.getType(DstReg); 2465f757f3fSDimitry Andric LLT MidTy = MRI.getType(MidReg); 2475f757f3fSDimitry Andric if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32) 2485f757f3fSDimitry Andric return false; 2495f757f3fSDimitry Andric 2505f757f3fSDimitry Andric LLT SrcTy; 2515f757f3fSDimitry Andric auto I1Opc = I1->getOpcode(); 2525f757f3fSDimitry Andric if (I1Opc == TargetOpcode::G_MUL) { 2535f757f3fSDimitry Andric // If result of this has more than 1 use, then there is no point in creating 2545f757f3fSDimitry Andric // udot instruction 2555f757f3fSDimitry Andric if (!MRI.hasOneNonDBGUse(MidReg)) 2565f757f3fSDimitry Andric return false; 2575f757f3fSDimitry Andric 2585f757f3fSDimitry Andric MachineInstr *ExtMI1 = 2595f757f3fSDimitry Andric getDefIgnoringCopies(I1->getOperand(1).getReg(), MRI); 2605f757f3fSDimitry Andric MachineInstr *ExtMI2 = 2615f757f3fSDimitry Andric getDefIgnoringCopies(I1->getOperand(2).getReg(), MRI); 2625f757f3fSDimitry Andric LLT Ext1DstTy = MRI.getType(ExtMI1->getOperand(0).getReg()); 2635f757f3fSDimitry Andric LLT Ext2DstTy = MRI.getType(ExtMI2->getOperand(0).getReg()); 2645f757f3fSDimitry Andric 2655f757f3fSDimitry Andric if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || Ext1DstTy != Ext2DstTy) 2665f757f3fSDimitry Andric return false; 2675f757f3fSDimitry Andric I1Opc = ExtMI1->getOpcode(); 2685f757f3fSDimitry Andric SrcTy = MRI.getType(ExtMI1->getOperand(1).getReg()); 2695f757f3fSDimitry Andric std::get<0>(MatchInfo) = ExtMI1->getOperand(1).getReg(); 2705f757f3fSDimitry Andric std::get<1>(MatchInfo) = ExtMI2->getOperand(1).getReg(); 2715f757f3fSDimitry Andric } else { 2725f757f3fSDimitry Andric SrcTy = MRI.getType(I1->getOperand(1).getReg()); 2735f757f3fSDimitry Andric std::get<0>(MatchInfo) = I1->getOperand(1).getReg(); 2745f757f3fSDimitry Andric std::get<1>(MatchInfo) = 0; 2755f757f3fSDimitry Andric } 2765f757f3fSDimitry Andric 2775f757f3fSDimitry Andric if (I1Opc == TargetOpcode::G_ZEXT) 2785f757f3fSDimitry Andric std::get<2>(MatchInfo) = 0; 2795f757f3fSDimitry Andric else if (I1Opc == TargetOpcode::G_SEXT) 2805f757f3fSDimitry Andric std::get<2>(MatchInfo) = 1; 2815f757f3fSDimitry Andric else 2825f757f3fSDimitry Andric return false; 2835f757f3fSDimitry Andric 2845f757f3fSDimitry Andric if (SrcTy.getScalarSizeInBits() != 8 || SrcTy.getNumElements() % 8 != 0) 2855f757f3fSDimitry Andric return false; 2865f757f3fSDimitry Andric 2875f757f3fSDimitry Andric return true; 2885f757f3fSDimitry Andric } 2895f757f3fSDimitry Andric 2905f757f3fSDimitry Andric void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI, 2915f757f3fSDimitry Andric MachineIRBuilder &Builder, 2925f757f3fSDimitry Andric GISelChangeObserver &Observer, 2935f757f3fSDimitry Andric const AArch64Subtarget &STI, 2945f757f3fSDimitry Andric std::tuple<Register, Register, bool> &MatchInfo) { 2955f757f3fSDimitry Andric assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD && 2965f757f3fSDimitry Andric "Expected a G_VECREDUCE_ADD instruction"); 2975f757f3fSDimitry Andric assert(STI.hasDotProd() && "Target should have Dot Product feature"); 2985f757f3fSDimitry Andric 2995f757f3fSDimitry Andric // Initialise the variables 3005f757f3fSDimitry Andric unsigned DotOpcode = 3015f757f3fSDimitry Andric std::get<2>(MatchInfo) ? AArch64::G_SDOT : AArch64::G_UDOT; 3025f757f3fSDimitry Andric Register Ext1SrcReg = std::get<0>(MatchInfo); 3035f757f3fSDimitry Andric 3045f757f3fSDimitry Andric // If there is one source register, create a vector of 0s as the second 3055f757f3fSDimitry Andric // source register 3065f757f3fSDimitry Andric Register Ext2SrcReg; 3075f757f3fSDimitry Andric if (std::get<1>(MatchInfo) == 0) 3085f757f3fSDimitry Andric Ext2SrcReg = Builder.buildConstant(MRI.getType(Ext1SrcReg), 1) 3095f757f3fSDimitry Andric ->getOperand(0) 3105f757f3fSDimitry Andric .getReg(); 3115f757f3fSDimitry Andric else 3125f757f3fSDimitry Andric Ext2SrcReg = std::get<1>(MatchInfo); 3135f757f3fSDimitry Andric 3145f757f3fSDimitry Andric // Find out how many DOT instructions are needed 3155f757f3fSDimitry Andric LLT SrcTy = MRI.getType(Ext1SrcReg); 3165f757f3fSDimitry Andric LLT MidTy; 3175f757f3fSDimitry Andric unsigned NumOfDotMI; 3185f757f3fSDimitry Andric if (SrcTy.getNumElements() % 16 == 0) { 3195f757f3fSDimitry Andric NumOfDotMI = SrcTy.getNumElements() / 16; 3205f757f3fSDimitry Andric MidTy = LLT::fixed_vector(4, 32); 3215f757f3fSDimitry Andric } else if (SrcTy.getNumElements() % 8 == 0) { 3225f757f3fSDimitry Andric NumOfDotMI = SrcTy.getNumElements() / 8; 3235f757f3fSDimitry Andric MidTy = LLT::fixed_vector(2, 32); 3245f757f3fSDimitry Andric } else { 3255f757f3fSDimitry Andric llvm_unreachable("Source type number of elements is not multiple of 8"); 3265f757f3fSDimitry Andric } 3275f757f3fSDimitry Andric 3285f757f3fSDimitry Andric // Handle case where one DOT instruction is needed 3295f757f3fSDimitry Andric if (NumOfDotMI == 1) { 3305f757f3fSDimitry Andric auto Zeroes = Builder.buildConstant(MidTy, 0)->getOperand(0).getReg(); 3315f757f3fSDimitry Andric auto Dot = Builder.buildInstr(DotOpcode, {MidTy}, 3325f757f3fSDimitry Andric {Zeroes, Ext1SrcReg, Ext2SrcReg}); 3335f757f3fSDimitry Andric Builder.buildVecReduceAdd(MI.getOperand(0), Dot->getOperand(0)); 3345f757f3fSDimitry Andric } else { 3355f757f3fSDimitry Andric // If not pad the last v8 element with 0s to a v16 3365f757f3fSDimitry Andric SmallVector<Register, 4> Ext1UnmergeReg; 3375f757f3fSDimitry Andric SmallVector<Register, 4> Ext2UnmergeReg; 3385f757f3fSDimitry Andric if (SrcTy.getNumElements() % 16 != 0) { 3397a6dacacSDimitry Andric SmallVector<Register> Leftover1; 3407a6dacacSDimitry Andric SmallVector<Register> Leftover2; 3415f757f3fSDimitry Andric 3427a6dacacSDimitry Andric // Split the elements into v16i8 and v8i8 3437a6dacacSDimitry Andric LLT MainTy = LLT::fixed_vector(16, 8); 3447a6dacacSDimitry Andric LLT LeftoverTy1, LeftoverTy2; 3457a6dacacSDimitry Andric if ((!extractParts(Ext1SrcReg, MRI.getType(Ext1SrcReg), MainTy, 3467a6dacacSDimitry Andric LeftoverTy1, Ext1UnmergeReg, Leftover1, Builder, 3477a6dacacSDimitry Andric MRI)) || 3487a6dacacSDimitry Andric (!extractParts(Ext2SrcReg, MRI.getType(Ext2SrcReg), MainTy, 3497a6dacacSDimitry Andric LeftoverTy2, Ext2UnmergeReg, Leftover2, Builder, 3507a6dacacSDimitry Andric MRI))) { 3517a6dacacSDimitry Andric llvm_unreachable("Unable to split this vector properly"); 3525f757f3fSDimitry Andric } 3535f757f3fSDimitry Andric 3547a6dacacSDimitry Andric // Pad the leftover v8i8 vector with register of 0s of type v8i8 3557a6dacacSDimitry Andric Register v8Zeroes = Builder.buildConstant(LLT::fixed_vector(8, 8), 0) 3567a6dacacSDimitry Andric ->getOperand(0) 3577a6dacacSDimitry Andric .getReg(); 3585f757f3fSDimitry Andric 3595f757f3fSDimitry Andric Ext1UnmergeReg.push_back( 3605f757f3fSDimitry Andric Builder 3617a6dacacSDimitry Andric .buildMergeLikeInstr(LLT::fixed_vector(16, 8), 3627a6dacacSDimitry Andric {Leftover1[0], v8Zeroes}) 3635f757f3fSDimitry Andric .getReg(0)); 3645f757f3fSDimitry Andric Ext2UnmergeReg.push_back( 3655f757f3fSDimitry Andric Builder 3667a6dacacSDimitry Andric .buildMergeLikeInstr(LLT::fixed_vector(16, 8), 3677a6dacacSDimitry Andric {Leftover2[0], v8Zeroes}) 3685f757f3fSDimitry Andric .getReg(0)); 3697a6dacacSDimitry Andric 3705f757f3fSDimitry Andric } else { 3715f757f3fSDimitry Andric // Unmerge the source vectors to v16i8 3727a6dacacSDimitry Andric unsigned SrcNumElts = SrcTy.getNumElements(); 3737a6dacacSDimitry Andric extractParts(Ext1SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16, 3747a6dacacSDimitry Andric Ext1UnmergeReg, Builder, MRI); 3757a6dacacSDimitry Andric extractParts(Ext2SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16, 3767a6dacacSDimitry Andric Ext2UnmergeReg, Builder, MRI); 3775f757f3fSDimitry Andric } 3785f757f3fSDimitry Andric 3795f757f3fSDimitry Andric // Build the UDOT instructions 3805f757f3fSDimitry Andric SmallVector<Register, 2> DotReg; 3815f757f3fSDimitry Andric unsigned NumElements = 0; 3825f757f3fSDimitry Andric for (unsigned i = 0; i < Ext1UnmergeReg.size(); i++) { 3835f757f3fSDimitry Andric LLT ZeroesLLT; 3845f757f3fSDimitry Andric // Check if it is 16 or 8 elements. Set Zeroes to the according size 3855f757f3fSDimitry Andric if (MRI.getType(Ext1UnmergeReg[i]).getNumElements() == 16) { 3865f757f3fSDimitry Andric ZeroesLLT = LLT::fixed_vector(4, 32); 3875f757f3fSDimitry Andric NumElements += 4; 3885f757f3fSDimitry Andric } else { 3895f757f3fSDimitry Andric ZeroesLLT = LLT::fixed_vector(2, 32); 3905f757f3fSDimitry Andric NumElements += 2; 3915f757f3fSDimitry Andric } 3925f757f3fSDimitry Andric auto Zeroes = Builder.buildConstant(ZeroesLLT, 0)->getOperand(0).getReg(); 3935f757f3fSDimitry Andric DotReg.push_back( 3945f757f3fSDimitry Andric Builder 3955f757f3fSDimitry Andric .buildInstr(DotOpcode, {MRI.getType(Zeroes)}, 3965f757f3fSDimitry Andric {Zeroes, Ext1UnmergeReg[i], Ext2UnmergeReg[i]}) 3975f757f3fSDimitry Andric .getReg(0)); 3985f757f3fSDimitry Andric } 3995f757f3fSDimitry Andric 4005f757f3fSDimitry Andric // Merge the output 4015f757f3fSDimitry Andric auto ConcatMI = 4025f757f3fSDimitry Andric Builder.buildConcatVectors(LLT::fixed_vector(NumElements, 32), DotReg); 4035f757f3fSDimitry Andric 4045f757f3fSDimitry Andric // Put it through a vector reduction 4055f757f3fSDimitry Andric Builder.buildVecReduceAdd(MI.getOperand(0).getReg(), 4065f757f3fSDimitry Andric ConcatMI->getOperand(0).getReg()); 4075f757f3fSDimitry Andric } 4085f757f3fSDimitry Andric 4095f757f3fSDimitry Andric // Erase the dead instructions 4105f757f3fSDimitry Andric MI.eraseFromParent(); 4115f757f3fSDimitry Andric } 4125f757f3fSDimitry Andric 4137a6dacacSDimitry Andric // Matches {U/S}ADDV(ext(x)) => {U/S}ADDLV(x) 4147a6dacacSDimitry Andric // Ensure that the type coming from the extend instruction is the right size 4157a6dacacSDimitry Andric bool matchExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI, 4167a6dacacSDimitry Andric std::pair<Register, bool> &MatchInfo) { 4177a6dacacSDimitry Andric assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD && 4187a6dacacSDimitry Andric "Expected G_VECREDUCE_ADD Opcode"); 4197a6dacacSDimitry Andric 4207a6dacacSDimitry Andric // Check if the last instruction is an extend 4217a6dacacSDimitry Andric MachineInstr *ExtMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI); 4227a6dacacSDimitry Andric auto ExtOpc = ExtMI->getOpcode(); 4237a6dacacSDimitry Andric 4247a6dacacSDimitry Andric if (ExtOpc == TargetOpcode::G_ZEXT) 4257a6dacacSDimitry Andric std::get<1>(MatchInfo) = 0; 4267a6dacacSDimitry Andric else if (ExtOpc == TargetOpcode::G_SEXT) 4277a6dacacSDimitry Andric std::get<1>(MatchInfo) = 1; 4287a6dacacSDimitry Andric else 4297a6dacacSDimitry Andric return false; 4307a6dacacSDimitry Andric 4317a6dacacSDimitry Andric // Check if the source register is a valid type 4327a6dacacSDimitry Andric Register ExtSrcReg = ExtMI->getOperand(1).getReg(); 4337a6dacacSDimitry Andric LLT ExtSrcTy = MRI.getType(ExtSrcReg); 4347a6dacacSDimitry Andric LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 4357a6dacacSDimitry Andric if ((DstTy.getScalarSizeInBits() == 16 && 4367a6dacacSDimitry Andric ExtSrcTy.getNumElements() % 8 == 0 && ExtSrcTy.getNumElements() < 256) || 4377a6dacacSDimitry Andric (DstTy.getScalarSizeInBits() == 32 && 4387a6dacacSDimitry Andric ExtSrcTy.getNumElements() % 4 == 0) || 4397a6dacacSDimitry Andric (DstTy.getScalarSizeInBits() == 64 && 4407a6dacacSDimitry Andric ExtSrcTy.getNumElements() % 4 == 0)) { 4417a6dacacSDimitry Andric std::get<0>(MatchInfo) = ExtSrcReg; 4427a6dacacSDimitry Andric return true; 4437a6dacacSDimitry Andric } 4447a6dacacSDimitry Andric return false; 4457a6dacacSDimitry Andric } 4467a6dacacSDimitry Andric 4477a6dacacSDimitry Andric void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI, 4487a6dacacSDimitry Andric MachineIRBuilder &B, GISelChangeObserver &Observer, 4497a6dacacSDimitry Andric std::pair<Register, bool> &MatchInfo) { 4507a6dacacSDimitry Andric assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD && 4517a6dacacSDimitry Andric "Expected G_VECREDUCE_ADD Opcode"); 4527a6dacacSDimitry Andric 4537a6dacacSDimitry Andric unsigned Opc = std::get<1>(MatchInfo) ? AArch64::G_SADDLV : AArch64::G_UADDLV; 4547a6dacacSDimitry Andric Register SrcReg = std::get<0>(MatchInfo); 4557a6dacacSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 4567a6dacacSDimitry Andric LLT SrcTy = MRI.getType(SrcReg); 4577a6dacacSDimitry Andric LLT DstTy = MRI.getType(DstReg); 4587a6dacacSDimitry Andric 4597a6dacacSDimitry Andric // If SrcTy has more elements than expected, split them into multiple 4607a6dacacSDimitry Andric // insructions and sum the results 4617a6dacacSDimitry Andric LLT MainTy; 4627a6dacacSDimitry Andric SmallVector<Register, 1> WorkingRegisters; 4637a6dacacSDimitry Andric unsigned SrcScalSize = SrcTy.getScalarSizeInBits(); 4647a6dacacSDimitry Andric unsigned SrcNumElem = SrcTy.getNumElements(); 4657a6dacacSDimitry Andric if ((SrcScalSize == 8 && SrcNumElem > 16) || 4667a6dacacSDimitry Andric (SrcScalSize == 16 && SrcNumElem > 8) || 4677a6dacacSDimitry Andric (SrcScalSize == 32 && SrcNumElem > 4)) { 4687a6dacacSDimitry Andric 4697a6dacacSDimitry Andric LLT LeftoverTy; 4707a6dacacSDimitry Andric SmallVector<Register, 4> LeftoverRegs; 4717a6dacacSDimitry Andric if (SrcScalSize == 8) 4727a6dacacSDimitry Andric MainTy = LLT::fixed_vector(16, 8); 4737a6dacacSDimitry Andric else if (SrcScalSize == 16) 4747a6dacacSDimitry Andric MainTy = LLT::fixed_vector(8, 16); 4757a6dacacSDimitry Andric else if (SrcScalSize == 32) 4767a6dacacSDimitry Andric MainTy = LLT::fixed_vector(4, 32); 4777a6dacacSDimitry Andric else 4787a6dacacSDimitry Andric llvm_unreachable("Source's Scalar Size not supported"); 4797a6dacacSDimitry Andric 4807a6dacacSDimitry Andric // Extract the parts and put each extracted sources through U/SADDLV and put 4817a6dacacSDimitry Andric // the values inside a small vec 4827a6dacacSDimitry Andric extractParts(SrcReg, SrcTy, MainTy, LeftoverTy, WorkingRegisters, 4837a6dacacSDimitry Andric LeftoverRegs, B, MRI); 4847a6dacacSDimitry Andric for (unsigned I = 0; I < LeftoverRegs.size(); I++) { 4857a6dacacSDimitry Andric WorkingRegisters.push_back(LeftoverRegs[I]); 4867a6dacacSDimitry Andric } 4877a6dacacSDimitry Andric } else { 4887a6dacacSDimitry Andric WorkingRegisters.push_back(SrcReg); 4897a6dacacSDimitry Andric MainTy = SrcTy; 4907a6dacacSDimitry Andric } 4917a6dacacSDimitry Andric 4927a6dacacSDimitry Andric unsigned MidScalarSize = MainTy.getScalarSizeInBits() * 2; 4937a6dacacSDimitry Andric LLT MidScalarLLT = LLT::scalar(MidScalarSize); 4947a6dacacSDimitry Andric Register zeroReg = B.buildConstant(LLT::scalar(64), 0).getReg(0); 4957a6dacacSDimitry Andric for (unsigned I = 0; I < WorkingRegisters.size(); I++) { 4967a6dacacSDimitry Andric // If the number of elements is too small to build an instruction, extend 4977a6dacacSDimitry Andric // its size before applying addlv 4987a6dacacSDimitry Andric LLT WorkingRegTy = MRI.getType(WorkingRegisters[I]); 4997a6dacacSDimitry Andric if ((WorkingRegTy.getScalarSizeInBits() == 8) && 5007a6dacacSDimitry Andric (WorkingRegTy.getNumElements() == 4)) { 5017a6dacacSDimitry Andric WorkingRegisters[I] = 5027a6dacacSDimitry Andric B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT 5037a6dacacSDimitry Andric : TargetOpcode::G_ZEXT, 5047a6dacacSDimitry Andric {LLT::fixed_vector(4, 16)}, {WorkingRegisters[I]}) 5057a6dacacSDimitry Andric .getReg(0); 5067a6dacacSDimitry Andric } 5077a6dacacSDimitry Andric 5087a6dacacSDimitry Andric // Generate the {U/S}ADDLV instruction, whose output is always double of the 5097a6dacacSDimitry Andric // Src's Scalar size 5107a6dacacSDimitry Andric LLT addlvTy = MidScalarSize <= 32 ? LLT::fixed_vector(4, 32) 5117a6dacacSDimitry Andric : LLT::fixed_vector(2, 64); 5127a6dacacSDimitry Andric Register addlvReg = 5137a6dacacSDimitry Andric B.buildInstr(Opc, {addlvTy}, {WorkingRegisters[I]}).getReg(0); 5147a6dacacSDimitry Andric 5157a6dacacSDimitry Andric // The output from {U/S}ADDLV gets placed in the lowest lane of a v4i32 or 5167a6dacacSDimitry Andric // v2i64 register. 5177a6dacacSDimitry Andric // i16, i32 results uses v4i32 registers 5187a6dacacSDimitry Andric // i64 results uses v2i64 registers 5197a6dacacSDimitry Andric // Therefore we have to extract/truncate the the value to the right type 5207a6dacacSDimitry Andric if (MidScalarSize == 32 || MidScalarSize == 64) { 5217a6dacacSDimitry Andric WorkingRegisters[I] = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, 5227a6dacacSDimitry Andric {MidScalarLLT}, {addlvReg, zeroReg}) 5237a6dacacSDimitry Andric .getReg(0); 5247a6dacacSDimitry Andric } else { 5257a6dacacSDimitry Andric Register extractReg = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, 5267a6dacacSDimitry Andric {LLT::scalar(32)}, {addlvReg, zeroReg}) 5277a6dacacSDimitry Andric .getReg(0); 5287a6dacacSDimitry Andric WorkingRegisters[I] = 5297a6dacacSDimitry Andric B.buildTrunc({MidScalarLLT}, {extractReg}).getReg(0); 5307a6dacacSDimitry Andric } 5317a6dacacSDimitry Andric } 5327a6dacacSDimitry Andric 5337a6dacacSDimitry Andric Register outReg; 5347a6dacacSDimitry Andric if (WorkingRegisters.size() > 1) { 5357a6dacacSDimitry Andric outReg = B.buildAdd(MidScalarLLT, WorkingRegisters[0], WorkingRegisters[1]) 5367a6dacacSDimitry Andric .getReg(0); 5377a6dacacSDimitry Andric for (unsigned I = 2; I < WorkingRegisters.size(); I++) { 5387a6dacacSDimitry Andric outReg = B.buildAdd(MidScalarLLT, outReg, WorkingRegisters[I]).getReg(0); 5397a6dacacSDimitry Andric } 5407a6dacacSDimitry Andric } else { 5417a6dacacSDimitry Andric outReg = WorkingRegisters[0]; 5427a6dacacSDimitry Andric } 5437a6dacacSDimitry Andric 5447a6dacacSDimitry Andric if (DstTy.getScalarSizeInBits() > MidScalarSize) { 5457a6dacacSDimitry Andric // Handle the scalar value if the DstTy's Scalar Size is more than double 5467a6dacacSDimitry Andric // Src's ScalarType 5477a6dacacSDimitry Andric B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT 5487a6dacacSDimitry Andric : TargetOpcode::G_ZEXT, 5497a6dacacSDimitry Andric {DstReg}, {outReg}); 5507a6dacacSDimitry Andric } else { 5517a6dacacSDimitry Andric B.buildCopy(DstReg, outReg); 5527a6dacacSDimitry Andric } 5537a6dacacSDimitry Andric 5547a6dacacSDimitry Andric MI.eraseFromParent(); 5557a6dacacSDimitry Andric } 5567a6dacacSDimitry Andric 557*0fca6ea1SDimitry Andric // Pushes ADD/SUB through extend instructions to decrease the number of extend 558*0fca6ea1SDimitry Andric // instruction at the end by allowing selection of {s|u}addl sooner 559*0fca6ea1SDimitry Andric 560*0fca6ea1SDimitry Andric // i32 add(i32 ext i8, i32 ext i8) => i32 ext(i16 add(i16 ext i8, i16 ext i8)) 561*0fca6ea1SDimitry Andric bool matchPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI, 562*0fca6ea1SDimitry Andric Register DstReg, Register SrcReg1, Register SrcReg2) { 563*0fca6ea1SDimitry Andric assert((MI.getOpcode() == TargetOpcode::G_ADD || 564*0fca6ea1SDimitry Andric MI.getOpcode() == TargetOpcode::G_SUB) && 565*0fca6ea1SDimitry Andric "Expected a G_ADD or G_SUB instruction\n"); 566*0fca6ea1SDimitry Andric 567*0fca6ea1SDimitry Andric // Deal with vector types only 568*0fca6ea1SDimitry Andric LLT DstTy = MRI.getType(DstReg); 569*0fca6ea1SDimitry Andric if (!DstTy.isVector()) 570*0fca6ea1SDimitry Andric return false; 571*0fca6ea1SDimitry Andric 572*0fca6ea1SDimitry Andric // Return true if G_{S|Z}EXT instruction is more than 2* source 573*0fca6ea1SDimitry Andric Register ExtDstReg = MI.getOperand(1).getReg(); 574*0fca6ea1SDimitry Andric LLT Ext1SrcTy = MRI.getType(SrcReg1); 575*0fca6ea1SDimitry Andric LLT Ext2SrcTy = MRI.getType(SrcReg2); 576*0fca6ea1SDimitry Andric unsigned ExtDstScal = MRI.getType(ExtDstReg).getScalarSizeInBits(); 577*0fca6ea1SDimitry Andric unsigned Ext1SrcScal = Ext1SrcTy.getScalarSizeInBits(); 578*0fca6ea1SDimitry Andric if (((Ext1SrcScal == 8 && ExtDstScal == 32) || 579*0fca6ea1SDimitry Andric ((Ext1SrcScal == 8 || Ext1SrcScal == 16) && ExtDstScal == 64)) && 580*0fca6ea1SDimitry Andric Ext1SrcTy == Ext2SrcTy) 581*0fca6ea1SDimitry Andric return true; 582*0fca6ea1SDimitry Andric 583*0fca6ea1SDimitry Andric return false; 584*0fca6ea1SDimitry Andric } 585*0fca6ea1SDimitry Andric 586*0fca6ea1SDimitry Andric void applyPushAddSubExt(MachineInstr &MI, MachineRegisterInfo &MRI, 587*0fca6ea1SDimitry Andric MachineIRBuilder &B, bool isSExt, Register DstReg, 588*0fca6ea1SDimitry Andric Register SrcReg1, Register SrcReg2) { 589*0fca6ea1SDimitry Andric LLT SrcTy = MRI.getType(SrcReg1); 590*0fca6ea1SDimitry Andric LLT MidTy = SrcTy.changeElementSize(SrcTy.getScalarSizeInBits() * 2); 591*0fca6ea1SDimitry Andric unsigned Opc = isSExt ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT; 592*0fca6ea1SDimitry Andric Register Ext1Reg = B.buildInstr(Opc, {MidTy}, {SrcReg1}).getReg(0); 593*0fca6ea1SDimitry Andric Register Ext2Reg = B.buildInstr(Opc, {MidTy}, {SrcReg2}).getReg(0); 594*0fca6ea1SDimitry Andric Register AddReg = 595*0fca6ea1SDimitry Andric B.buildInstr(MI.getOpcode(), {MidTy}, {Ext1Reg, Ext2Reg}).getReg(0); 596*0fca6ea1SDimitry Andric 597*0fca6ea1SDimitry Andric // G_SUB has to sign-extend the result. 598*0fca6ea1SDimitry Andric // G_ADD needs to sext from sext and can sext or zext from zext, so the 599*0fca6ea1SDimitry Andric // original opcode is used. 600*0fca6ea1SDimitry Andric if (MI.getOpcode() == TargetOpcode::G_ADD) 601*0fca6ea1SDimitry Andric B.buildInstr(Opc, {DstReg}, {AddReg}); 602*0fca6ea1SDimitry Andric else 603*0fca6ea1SDimitry Andric B.buildSExt(DstReg, AddReg); 604*0fca6ea1SDimitry Andric 605*0fca6ea1SDimitry Andric MI.eraseFromParent(); 606*0fca6ea1SDimitry Andric } 607*0fca6ea1SDimitry Andric 60806c3fb27SDimitry Andric bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B, 60906c3fb27SDimitry Andric CombinerHelper &Helper, GISelChangeObserver &Observer) { 610349cc55cSDimitry Andric // Try simplify G_UADDO with 8 or 16 bit operands to wide G_ADD and TBNZ if 611349cc55cSDimitry Andric // result is only used in the no-overflow case. It is restricted to cases 612349cc55cSDimitry Andric // where we know that the high-bits of the operands are 0. If there's an 6135f757f3fSDimitry Andric // overflow, then the 9th or 17th bit must be set, which can be checked 614349cc55cSDimitry Andric // using TBNZ. 615349cc55cSDimitry Andric // 616349cc55cSDimitry Andric // Change (for UADDOs on 8 and 16 bits): 617349cc55cSDimitry Andric // 618349cc55cSDimitry Andric // %z0 = G_ASSERT_ZEXT _ 619349cc55cSDimitry Andric // %op0 = G_TRUNC %z0 620349cc55cSDimitry Andric // %z1 = G_ASSERT_ZEXT _ 621349cc55cSDimitry Andric // %op1 = G_TRUNC %z1 622349cc55cSDimitry Andric // %val, %cond = G_UADDO %op0, %op1 623349cc55cSDimitry Andric // G_BRCOND %cond, %error.bb 624349cc55cSDimitry Andric // 625349cc55cSDimitry Andric // error.bb: 626349cc55cSDimitry Andric // (no successors and no uses of %val) 627349cc55cSDimitry Andric // 628349cc55cSDimitry Andric // To: 629349cc55cSDimitry Andric // 630349cc55cSDimitry Andric // %z0 = G_ASSERT_ZEXT _ 631349cc55cSDimitry Andric // %z1 = G_ASSERT_ZEXT _ 632349cc55cSDimitry Andric // %add = G_ADD %z0, %z1 633349cc55cSDimitry Andric // %val = G_TRUNC %add 634349cc55cSDimitry Andric // %bit = G_AND %add, 1 << scalar-size-in-bits(%op1) 635349cc55cSDimitry Andric // %cond = G_ICMP NE, %bit, 0 636349cc55cSDimitry Andric // G_BRCOND %cond, %error.bb 637349cc55cSDimitry Andric 638349cc55cSDimitry Andric auto &MRI = *B.getMRI(); 639349cc55cSDimitry Andric 640349cc55cSDimitry Andric MachineOperand *DefOp0 = MRI.getOneDef(MI.getOperand(2).getReg()); 641349cc55cSDimitry Andric MachineOperand *DefOp1 = MRI.getOneDef(MI.getOperand(3).getReg()); 642349cc55cSDimitry Andric Register Op0Wide; 643349cc55cSDimitry Andric Register Op1Wide; 644349cc55cSDimitry Andric if (!mi_match(DefOp0->getParent(), MRI, m_GTrunc(m_Reg(Op0Wide))) || 645349cc55cSDimitry Andric !mi_match(DefOp1->getParent(), MRI, m_GTrunc(m_Reg(Op1Wide)))) 646349cc55cSDimitry Andric return false; 647349cc55cSDimitry Andric LLT WideTy0 = MRI.getType(Op0Wide); 648349cc55cSDimitry Andric LLT WideTy1 = MRI.getType(Op1Wide); 649349cc55cSDimitry Andric Register ResVal = MI.getOperand(0).getReg(); 650349cc55cSDimitry Andric LLT OpTy = MRI.getType(ResVal); 651349cc55cSDimitry Andric MachineInstr *Op0WideDef = MRI.getVRegDef(Op0Wide); 652349cc55cSDimitry Andric MachineInstr *Op1WideDef = MRI.getVRegDef(Op1Wide); 653349cc55cSDimitry Andric 654349cc55cSDimitry Andric unsigned OpTySize = OpTy.getScalarSizeInBits(); 655349cc55cSDimitry Andric // First check that the G_TRUNC feeding the G_UADDO are no-ops, because the 656349cc55cSDimitry Andric // inputs have been zero-extended. 657349cc55cSDimitry Andric if (Op0WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT || 658349cc55cSDimitry Andric Op1WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT || 659349cc55cSDimitry Andric OpTySize != Op0WideDef->getOperand(2).getImm() || 660349cc55cSDimitry Andric OpTySize != Op1WideDef->getOperand(2).getImm()) 661349cc55cSDimitry Andric return false; 662349cc55cSDimitry Andric 663349cc55cSDimitry Andric // Only scalar UADDO with either 8 or 16 bit operands are handled. 664349cc55cSDimitry Andric if (!WideTy0.isScalar() || !WideTy1.isScalar() || WideTy0 != WideTy1 || 665349cc55cSDimitry Andric OpTySize >= WideTy0.getScalarSizeInBits() || 666349cc55cSDimitry Andric (OpTySize != 8 && OpTySize != 16)) 667349cc55cSDimitry Andric return false; 668349cc55cSDimitry Andric 669349cc55cSDimitry Andric // The overflow-status result must be used by a branch only. 670349cc55cSDimitry Andric Register ResStatus = MI.getOperand(1).getReg(); 671349cc55cSDimitry Andric if (!MRI.hasOneNonDBGUse(ResStatus)) 672349cc55cSDimitry Andric return false; 673349cc55cSDimitry Andric MachineInstr *CondUser = &*MRI.use_instr_nodbg_begin(ResStatus); 674349cc55cSDimitry Andric if (CondUser->getOpcode() != TargetOpcode::G_BRCOND) 675349cc55cSDimitry Andric return false; 676349cc55cSDimitry Andric 677349cc55cSDimitry Andric // Make sure the computed result is only used in the no-overflow blocks. 678349cc55cSDimitry Andric MachineBasicBlock *CurrentMBB = MI.getParent(); 679349cc55cSDimitry Andric MachineBasicBlock *FailMBB = CondUser->getOperand(1).getMBB(); 680349cc55cSDimitry Andric if (!FailMBB->succ_empty() || CondUser->getParent() != CurrentMBB) 681349cc55cSDimitry Andric return false; 682349cc55cSDimitry Andric if (any_of(MRI.use_nodbg_instructions(ResVal), 683349cc55cSDimitry Andric [&MI, FailMBB, CurrentMBB](MachineInstr &I) { 684349cc55cSDimitry Andric return &MI != &I && 685349cc55cSDimitry Andric (I.getParent() == FailMBB || I.getParent() == CurrentMBB); 686349cc55cSDimitry Andric })) 687349cc55cSDimitry Andric return false; 688349cc55cSDimitry Andric 689349cc55cSDimitry Andric // Remove G_ADDO. 690349cc55cSDimitry Andric B.setInstrAndDebugLoc(*MI.getNextNode()); 691349cc55cSDimitry Andric MI.eraseFromParent(); 692349cc55cSDimitry Andric 693349cc55cSDimitry Andric // Emit wide add. 694349cc55cSDimitry Andric Register AddDst = MRI.cloneVirtualRegister(Op0Wide); 695349cc55cSDimitry Andric B.buildInstr(TargetOpcode::G_ADD, {AddDst}, {Op0Wide, Op1Wide}); 696349cc55cSDimitry Andric 697349cc55cSDimitry Andric // Emit check of the 9th or 17th bit and update users (the branch). This will 698349cc55cSDimitry Andric // later be folded to TBNZ. 699349cc55cSDimitry Andric Register CondBit = MRI.cloneVirtualRegister(Op0Wide); 700349cc55cSDimitry Andric B.buildAnd( 701349cc55cSDimitry Andric CondBit, AddDst, 702349cc55cSDimitry Andric B.buildConstant(LLT::scalar(32), OpTySize == 8 ? 1 << 8 : 1 << 16)); 703349cc55cSDimitry Andric B.buildICmp(CmpInst::ICMP_NE, ResStatus, CondBit, 704349cc55cSDimitry Andric B.buildConstant(LLT::scalar(32), 0)); 705349cc55cSDimitry Andric 706349cc55cSDimitry Andric // Update ZEXts users of the result value. Because all uses are in the 707349cc55cSDimitry Andric // no-overflow case, we know that the top bits are 0 and we can ignore ZExts. 708349cc55cSDimitry Andric B.buildZExtOrTrunc(ResVal, AddDst); 709349cc55cSDimitry Andric for (MachineOperand &U : make_early_inc_range(MRI.use_operands(ResVal))) { 710349cc55cSDimitry Andric Register WideReg; 711349cc55cSDimitry Andric if (mi_match(U.getParent(), MRI, m_GZExt(m_Reg(WideReg)))) { 712349cc55cSDimitry Andric auto OldR = U.getParent()->getOperand(0).getReg(); 713349cc55cSDimitry Andric Observer.erasingInstr(*U.getParent()); 714349cc55cSDimitry Andric U.getParent()->eraseFromParent(); 715349cc55cSDimitry Andric Helper.replaceRegWith(MRI, OldR, AddDst); 716349cc55cSDimitry Andric } 717349cc55cSDimitry Andric } 718349cc55cSDimitry Andric 719349cc55cSDimitry Andric return true; 720349cc55cSDimitry Andric } 721349cc55cSDimitry Andric 7225f757f3fSDimitry Andric class AArch64PreLegalizerCombinerImpl : public Combiner { 7235ffd83dbSDimitry Andric protected: 7245f757f3fSDimitry Andric // TODO: Make CombinerHelper methods const. 7255f757f3fSDimitry Andric mutable CombinerHelper Helper; 72606c3fb27SDimitry Andric const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig; 72706c3fb27SDimitry Andric const AArch64Subtarget &STI; 7285ffd83dbSDimitry Andric 7295ffd83dbSDimitry Andric public: 73006c3fb27SDimitry Andric AArch64PreLegalizerCombinerImpl( 7315f757f3fSDimitry Andric MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, 7325f757f3fSDimitry Andric GISelKnownBits &KB, GISelCSEInfo *CSEInfo, 73306c3fb27SDimitry Andric const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig, 7345f757f3fSDimitry Andric const AArch64Subtarget &STI, MachineDominatorTree *MDT, 7355f757f3fSDimitry Andric const LegalizerInfo *LI); 73606c3fb27SDimitry Andric 73706c3fb27SDimitry Andric static const char *getName() { return "AArch6400PreLegalizerCombiner"; } 73806c3fb27SDimitry Andric 7395f757f3fSDimitry Andric bool tryCombineAll(MachineInstr &I) const override; 7405f757f3fSDimitry Andric 7415f757f3fSDimitry Andric bool tryCombineAllImpl(MachineInstr &I) const; 74206c3fb27SDimitry Andric 74306c3fb27SDimitry Andric private: 74406c3fb27SDimitry Andric #define GET_GICOMBINER_CLASS_MEMBERS 74506c3fb27SDimitry Andric #include "AArch64GenPreLegalizeGICombiner.inc" 74606c3fb27SDimitry Andric #undef GET_GICOMBINER_CLASS_MEMBERS 7475ffd83dbSDimitry Andric }; 7485ffd83dbSDimitry Andric 74906c3fb27SDimitry Andric #define GET_GICOMBINER_IMPL 7505ffd83dbSDimitry Andric #include "AArch64GenPreLegalizeGICombiner.inc" 75106c3fb27SDimitry Andric #undef GET_GICOMBINER_IMPL 7525ffd83dbSDimitry Andric 75306c3fb27SDimitry Andric AArch64PreLegalizerCombinerImpl::AArch64PreLegalizerCombinerImpl( 7545f757f3fSDimitry Andric MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, 7555f757f3fSDimitry Andric GISelKnownBits &KB, GISelCSEInfo *CSEInfo, 75606c3fb27SDimitry Andric const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig, 7575f757f3fSDimitry Andric const AArch64Subtarget &STI, MachineDominatorTree *MDT, 7585f757f3fSDimitry Andric const LegalizerInfo *LI) 7595f757f3fSDimitry Andric : Combiner(MF, CInfo, TPC, &KB, CSEInfo), 7605f757f3fSDimitry Andric Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI), 7615f757f3fSDimitry Andric RuleConfig(RuleConfig), STI(STI), 76206c3fb27SDimitry Andric #define GET_GICOMBINER_CONSTRUCTOR_INITS 7635ffd83dbSDimitry Andric #include "AArch64GenPreLegalizeGICombiner.inc" 76406c3fb27SDimitry Andric #undef GET_GICOMBINER_CONSTRUCTOR_INITS 76506c3fb27SDimitry Andric { 76606c3fb27SDimitry Andric } 7675ffd83dbSDimitry Andric 7685f757f3fSDimitry Andric bool AArch64PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const { 7695f757f3fSDimitry Andric if (tryCombineAllImpl(MI)) 7705ffd83dbSDimitry Andric return true; 7715ffd83dbSDimitry Andric 772fe6060f1SDimitry Andric unsigned Opc = MI.getOpcode(); 773fe6060f1SDimitry Andric switch (Opc) { 7745ffd83dbSDimitry Andric case TargetOpcode::G_SHUFFLE_VECTOR: 7755ffd83dbSDimitry Andric return Helper.tryCombineShuffleVector(MI); 776349cc55cSDimitry Andric case TargetOpcode::G_UADDO: 777349cc55cSDimitry Andric return tryToSimplifyUADDO(MI, B, Helper, Observer); 778fe6060f1SDimitry Andric case TargetOpcode::G_MEMCPY_INLINE: 779fe6060f1SDimitry Andric return Helper.tryEmitMemcpyInline(MI); 780e8d8bef9SDimitry Andric case TargetOpcode::G_MEMCPY: 781e8d8bef9SDimitry Andric case TargetOpcode::G_MEMMOVE: 782e8d8bef9SDimitry Andric case TargetOpcode::G_MEMSET: { 783e8d8bef9SDimitry Andric // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other 784e8d8bef9SDimitry Andric // heuristics decide. 7855f757f3fSDimitry Andric unsigned MaxLen = CInfo.EnableOpt ? 0 : 32; 786e8d8bef9SDimitry Andric // Try to inline memcpy type calls if optimizations are enabled. 787fe6060f1SDimitry Andric if (Helper.tryCombineMemCpyFamily(MI, MaxLen)) 788fe6060f1SDimitry Andric return true; 789fe6060f1SDimitry Andric if (Opc == TargetOpcode::G_MEMSET) 7905f757f3fSDimitry Andric return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, CInfo.EnableMinSize); 791fe6060f1SDimitry Andric return false; 792e8d8bef9SDimitry Andric } 7935ffd83dbSDimitry Andric } 7945ffd83dbSDimitry Andric 7955ffd83dbSDimitry Andric return false; 7965ffd83dbSDimitry Andric } 7975ffd83dbSDimitry Andric 7985ffd83dbSDimitry Andric // Pass boilerplate 7995ffd83dbSDimitry Andric // ================ 8005ffd83dbSDimitry Andric 8015ffd83dbSDimitry Andric class AArch64PreLegalizerCombiner : public MachineFunctionPass { 8025ffd83dbSDimitry Andric public: 8035ffd83dbSDimitry Andric static char ID; 8045ffd83dbSDimitry Andric 805fe6060f1SDimitry Andric AArch64PreLegalizerCombiner(); 8065ffd83dbSDimitry Andric 80706c3fb27SDimitry Andric StringRef getPassName() const override { 80806c3fb27SDimitry Andric return "AArch64PreLegalizerCombiner"; 80906c3fb27SDimitry Andric } 8105ffd83dbSDimitry Andric 8115ffd83dbSDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override; 8125ffd83dbSDimitry Andric 8135ffd83dbSDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override; 8145f757f3fSDimitry Andric 8155f757f3fSDimitry Andric private: 8165f757f3fSDimitry Andric AArch64PreLegalizerCombinerImplRuleConfig RuleConfig; 8175ffd83dbSDimitry Andric }; 8185ffd83dbSDimitry Andric } // end anonymous namespace 8195ffd83dbSDimitry Andric 8205ffd83dbSDimitry Andric void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { 8215ffd83dbSDimitry Andric AU.addRequired<TargetPassConfig>(); 8225ffd83dbSDimitry Andric AU.setPreservesCFG(); 8235ffd83dbSDimitry Andric getSelectionDAGFallbackAnalysisUsage(AU); 8245ffd83dbSDimitry Andric AU.addRequired<GISelKnownBitsAnalysis>(); 8255ffd83dbSDimitry Andric AU.addPreserved<GISelKnownBitsAnalysis>(); 826*0fca6ea1SDimitry Andric AU.addRequired<MachineDominatorTreeWrapperPass>(); 827*0fca6ea1SDimitry Andric AU.addPreserved<MachineDominatorTreeWrapperPass>(); 828fe6060f1SDimitry Andric AU.addRequired<GISelCSEAnalysisWrapperPass>(); 829fe6060f1SDimitry Andric AU.addPreserved<GISelCSEAnalysisWrapperPass>(); 8305ffd83dbSDimitry Andric MachineFunctionPass::getAnalysisUsage(AU); 8315ffd83dbSDimitry Andric } 8325ffd83dbSDimitry Andric 833fe6060f1SDimitry Andric AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner() 834fe6060f1SDimitry Andric : MachineFunctionPass(ID) { 8355ffd83dbSDimitry Andric initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); 8365f757f3fSDimitry Andric 8375f757f3fSDimitry Andric if (!RuleConfig.parseCommandLineOption()) 8385f757f3fSDimitry Andric report_fatal_error("Invalid rule identifier"); 8395ffd83dbSDimitry Andric } 8405ffd83dbSDimitry Andric 8415ffd83dbSDimitry Andric bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { 8425ffd83dbSDimitry Andric if (MF.getProperties().hasProperty( 8435ffd83dbSDimitry Andric MachineFunctionProperties::Property::FailedISel)) 8445ffd83dbSDimitry Andric return false; 845fe6060f1SDimitry Andric auto &TPC = getAnalysis<TargetPassConfig>(); 846fe6060f1SDimitry Andric 847fe6060f1SDimitry Andric // Enable CSE. 848fe6060f1SDimitry Andric GISelCSEAnalysisWrapper &Wrapper = 849fe6060f1SDimitry Andric getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper(); 850fe6060f1SDimitry Andric auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig()); 851fe6060f1SDimitry Andric 8525f757f3fSDimitry Andric const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>(); 8535f757f3fSDimitry Andric const auto *LI = ST.getLegalizerInfo(); 8545f757f3fSDimitry Andric 8555ffd83dbSDimitry Andric const Function &F = MF.getFunction(); 8565ffd83dbSDimitry Andric bool EnableOpt = 8575f757f3fSDimitry Andric MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); 8585ffd83dbSDimitry Andric GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); 859*0fca6ea1SDimitry Andric MachineDominatorTree *MDT = 860*0fca6ea1SDimitry Andric &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); 8615f757f3fSDimitry Andric CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, 8625f757f3fSDimitry Andric /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(), 8635f757f3fSDimitry Andric F.hasMinSize()); 8645f757f3fSDimitry Andric AArch64PreLegalizerCombinerImpl Impl(MF, CInfo, &TPC, *KB, CSEInfo, 8655f757f3fSDimitry Andric RuleConfig, ST, MDT, LI); 8665f757f3fSDimitry Andric return Impl.combineMachineInstrs(); 8675ffd83dbSDimitry Andric } 8685ffd83dbSDimitry Andric 8695ffd83dbSDimitry Andric char AArch64PreLegalizerCombiner::ID = 0; 8705ffd83dbSDimitry Andric INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE, 8715ffd83dbSDimitry Andric "Combine AArch64 machine instrs before legalization", 8725ffd83dbSDimitry Andric false, false) 8735ffd83dbSDimitry Andric INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) 8745ffd83dbSDimitry Andric INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) 875fe6060f1SDimitry Andric INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass) 8765ffd83dbSDimitry Andric INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE, 8775ffd83dbSDimitry Andric "Combine AArch64 machine instrs before legalization", false, 8785ffd83dbSDimitry Andric false) 8795ffd83dbSDimitry Andric 8805ffd83dbSDimitry Andric namespace llvm { 881fe6060f1SDimitry Andric FunctionPass *createAArch64PreLegalizerCombiner() { 882fe6060f1SDimitry Andric return new AArch64PreLegalizerCombiner(); 8835ffd83dbSDimitry Andric } 8845ffd83dbSDimitry Andric } // end namespace llvm 885