//=== AArch64PostLegalizerCombiner.cpp --------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// Post-legalization combines on generic MachineInstrs.
///
/// The combines here must preserve instruction legality.
///
/// Lowering combines (e.g. pseudo matching) should be handled by
/// AArch64PostLegalizerLowering.
///
/// Combines which don't rely on instruction legality should go in the
/// AArch64PreLegalizerCombiner.
///
//===----------------------------------------------------------------------===//

#include "AArch64TargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"

#define GET_GICOMBINER_DEPS
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "aarch64-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

#define GET_GICOMBINER_TYPES
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES
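// Note on the generated includes: the AArch64GenPostLegalizeGICombiner.inc
// fragments pulled in throughout this file are emitted by tablegen from the
// combine rule definitions; among other things they provide the
// AArch64PostLegalizerCombinerImplRuleConfig type and the generated
// tryCombineAll() used by the combiner implementation below.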
/// This combine tries to do what performExtractVectorEltCombine does in SDAG.
/// Rewrite for pairwise fadd pattern
///   (s32 (g_extract_vector_elt
///           (g_fadd (vXs32 Other)
///                   (g_vector_shuffle (vXs32 Other) undef <1,X,...> )) 0))
/// ->
///   (s32 (g_fadd (g_extract_vector_elt (vXs32 Other) 0)
///                (g_extract_vector_elt (vXs32 Other) 1)))
bool matchExtractVecEltPairwiseAdd(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    std::tuple<unsigned, LLT, Register> &MatchInfo) {
  Register Src1 = MI.getOperand(1).getReg();
  Register Src2 = MI.getOperand(2).getReg();
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

  auto Cst = getIConstantVRegValWithLookThrough(Src2, MRI);
  if (!Cst || Cst->Value != 0)
    return false;
  // SDAG also checks for FullFP16, but this looks to be beneficial anyway.

  // Now check for an fadd operation. TODO: expand this for integer add?
  auto *FAddMI = getOpcodeDef(TargetOpcode::G_FADD, Src1, MRI);
  if (!FAddMI)
    return false;

  // If we add support for integer add, must restrict these types to just s64.
  unsigned DstSize = DstTy.getSizeInBits();
  if (DstSize != 16 && DstSize != 32 && DstSize != 64)
    return false;

  Register Src1Op1 = FAddMI->getOperand(1).getReg();
  Register Src1Op2 = FAddMI->getOperand(2).getReg();
  MachineInstr *Shuffle =
      getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op2, MRI);
  MachineInstr *Other = MRI.getVRegDef(Src1Op1);
  if (!Shuffle) {
    Shuffle = getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op1, MRI);
    Other = MRI.getVRegDef(Src1Op2);
  }

  // We're looking for a shuffle that moves the second element to index 0.
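  // E.g. (illustrative MIR, for a <2 x s32> input %v):
  //   %shuf:_(<2 x s32>) = G_SHUFFLE_VECTOR %v(<2 x s32>), %undef,
  //                            shufflemask(1, undef)
  //   %fadd:_(<2 x s32>) = G_FADD %v, %shuf
  //   %res:_(s32) = G_EXTRACT_VECTOR_ELT %fadd(<2 x s32>), 0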
  if (Shuffle && Shuffle->getOperand(3).getShuffleMask()[0] == 1 &&
      Other == MRI.getVRegDef(Shuffle->getOperand(1).getReg())) {
    std::get<0>(MatchInfo) = TargetOpcode::G_FADD;
    std::get<1>(MatchInfo) = DstTy;
    std::get<2>(MatchInfo) = Other->getOperand(0).getReg();
    return true;
  }
  return false;
}

void applyExtractVecEltPairwiseAdd(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    std::tuple<unsigned, LLT, Register> &MatchInfo) {
  unsigned Opc = std::get<0>(MatchInfo);
  assert(Opc == TargetOpcode::G_FADD && "Unexpected opcode!");
  // We want to generate two extracts of elements 0 and 1, and add them.
  LLT Ty = std::get<1>(MatchInfo);
  Register Src = std::get<2>(MatchInfo);
  LLT s64 = LLT::scalar(64);
  B.setInstrAndDebugLoc(MI);
  auto Elt0 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 0));
  auto Elt1 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 1));
  B.buildInstr(Opc, {MI.getOperand(0).getReg()}, {Elt0, Elt1});
  MI.eraseFromParent();
}

bool isSignExtended(Register R, MachineRegisterInfo &MRI) {
  // TODO: check if extended build vector as well.
  unsigned Opc = MRI.getVRegDef(R)->getOpcode();
  return Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG;
}

bool isZeroExtended(Register R, MachineRegisterInfo &MRI) {
  // TODO: check if extended build vector as well.
  return MRI.getVRegDef(R)->getOpcode() == TargetOpcode::G_ZEXT;
}

bool matchAArch64MulConstCombine(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) {
  assert(MI.getOpcode() == TargetOpcode::G_MUL);
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  Register Dst = MI.getOperand(0).getReg();
  const LLT Ty = MRI.getType(LHS);

  // The below optimizations require a constant RHS.
  auto Const = getIConstantVRegValWithLookThrough(RHS, MRI);
  if (!Const)
    return false;

  APInt ConstValue = Const->Value.sext(Ty.getSizeInBits());
  // The following code is ported from AArch64ISelLowering.
  // Multiplication of a power of two plus/minus one can be done more
  // cheaply as shift+add/sub. For now, this is true unilaterally. If
  // future CPUs have a cheaper MADD instruction, this may need to be
  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
  // 64-bit is 5 cycles, so this is always a win.
  // More aggressively, some multiplications N0 * C can be lowered to
  // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
  // e.g. 6=3*2=(2+1)*2.
  // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
  // which equals (1+2)*16-(1+2).
  // TrailingZeroes is used to test if the mul can be lowered to
  // shift+add+shift.
  unsigned TrailingZeroes = ConstValue.countr_zero();
  if (TrailingZeroes) {
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into smul or umul.
    if (MRI.hasOneNonDBGUse(LHS) &&
        (isSignExtended(LHS, MRI) || isZeroExtended(LHS, MRI)))
      return false;
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into madd or msub.
    if (MRI.hasOneNonDBGUse(Dst)) {
      MachineInstr &UseMI = *MRI.use_instr_begin(Dst);
      unsigned UseOpc = UseMI.getOpcode();
      if (UseOpc == TargetOpcode::G_ADD || UseOpc == TargetOpcode::G_PTR_ADD ||
          UseOpc == TargetOpcode::G_SUB)
        return false;
    }
  }
  // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
  // and shift+add+shift.
  APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);

  unsigned ShiftAmt, AddSubOpc;
  // Is the shifted value the LHS operand of the add/sub?
  bool ShiftValUseIsLHS = true;
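  // Worked example (illustrative): C = 6 = (2+1)*2 gives TrailingZeroes = 1
  // and ShiftedConstValue = 3, so SCVMinus1 = 2 below is a power of two and
  // the mul lowers to (shl (add (shl x, 1), x), 1).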
  // Do we need to negate the result?
  bool NegateResult = false;

  if (ConstValue.isNonNegative()) {
    // (mul x, 2^N + 1) => (add (shl x, N), x)
    // (mul x, 2^N - 1) => (sub (shl x, N), x)
    // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
    APInt SCVMinus1 = ShiftedConstValue - 1;
    APInt CVPlus1 = ConstValue + 1;
    if (SCVMinus1.isPowerOf2()) {
      ShiftAmt = SCVMinus1.logBase2();
      AddSubOpc = TargetOpcode::G_ADD;
    } else if (CVPlus1.isPowerOf2()) {
      ShiftAmt = CVPlus1.logBase2();
      AddSubOpc = TargetOpcode::G_SUB;
    } else
      return false;
  } else {
    // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
    // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
    APInt CVNegPlus1 = -ConstValue + 1;
    APInt CVNegMinus1 = -ConstValue - 1;
    if (CVNegPlus1.isPowerOf2()) {
      ShiftAmt = CVNegPlus1.logBase2();
      AddSubOpc = TargetOpcode::G_SUB;
      ShiftValUseIsLHS = false;
    } else if (CVNegMinus1.isPowerOf2()) {
      ShiftAmt = CVNegMinus1.logBase2();
      AddSubOpc = TargetOpcode::G_ADD;
      NegateResult = true;
    } else
      return false;
  }

  if (NegateResult && TrailingZeroes)
    return false;

  ApplyFn = [=](MachineIRBuilder &B, Register DstReg) {
    auto Shift = B.buildConstant(LLT::scalar(64), ShiftAmt);
    auto ShiftedVal = B.buildShl(Ty, LHS, Shift);

    Register AddSubLHS = ShiftValUseIsLHS ? ShiftedVal.getReg(0) : LHS;
    Register AddSubRHS = ShiftValUseIsLHS ? LHS : ShiftedVal.getReg(0);
    auto Res = B.buildInstr(AddSubOpc, {Ty}, {AddSubLHS, AddSubRHS});
    assert(!(NegateResult && TrailingZeroes) &&
           "NegateResult and TrailingZeroes cannot both be true for now.");
    // Negate the result.
    if (NegateResult) {
      B.buildSub(DstReg, B.buildConstant(Ty, 0), Res);
      return;
    }
    // Shift the result.
    if (TrailingZeroes) {
      B.buildShl(DstReg, Res, B.buildConstant(LLT::scalar(64), TrailingZeroes));
      return;
    }
    B.buildCopy(DstReg, Res.getReg(0));
  };
  return true;
}

void applyAArch64MulConstCombine(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) {
  B.setInstrAndDebugLoc(MI);
  ApplyFn(B, MI.getOperand(0).getReg());
  MI.eraseFromParent();
}

/// Try to fold a G_MERGE_VALUES of 2 s32 sources, where the second source
/// is a zero, into a G_ZEXT of the first.
bool matchFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI) {
  auto &Merge = cast<GMerge>(MI);
  LLT SrcTy = MRI.getType(Merge.getSourceReg(0));
  if (SrcTy != LLT::scalar(32) || Merge.getNumSources() != 2)
    return false;
  return mi_match(Merge.getSourceReg(1), MRI, m_SpecificICst(0));
}

void applyFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineIRBuilder &B, GISelChangeObserver &Observer) {
  // Mutate %d(s64) = G_MERGE_VALUES %a(s32), 0(s32)
  //  ->
  // %d(s64) = G_ZEXT %a(s32)
  Observer.changingInstr(MI);
  MI.setDesc(B.getTII().get(TargetOpcode::G_ZEXT));
  MI.removeOperand(2);
  Observer.changedInstr(MI);
}

/// \returns True if a G_ANYEXT instruction \p MI should be mutated to a G_ZEXT
/// instruction.
bool matchMutateAnyExtToZExt(MachineInstr &MI, MachineRegisterInfo &MRI) {
  // If this is coming from a scalar compare then we can use a G_ZEXT instead of
  // a G_ANYEXT:
  //
  //  %cmp:_(s32) = G_[I|F]CMP ... <-- produces 0/1.
  //  %ext:_(s64) = G_ANYEXT %cmp(s32)
  //
  // By doing this, we can leverage more KnownBits combines.
  assert(MI.getOpcode() == TargetOpcode::G_ANYEXT);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  return MRI.getType(Dst).isScalar() &&
         mi_match(Src, MRI,
                  m_any_of(m_GICmp(m_Pred(), m_Reg(), m_Reg()),
                           m_GFCmp(m_Pred(), m_Reg(), m_Reg())));
}

void applyMutateAnyExtToZExt(MachineInstr &MI, MachineRegisterInfo &MRI,
                             MachineIRBuilder &B,
                             GISelChangeObserver &Observer) {
  Observer.changingInstr(MI);
  MI.setDesc(B.getTII().get(TargetOpcode::G_ZEXT));
  Observer.changedInstr(MI);
}

/// Match a 128-bit store of zero and split it into two 64-bit stores, for
/// size/performance reasons.
bool matchSplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI) {
  GStore &Store = cast<GStore>(MI);
  if (!Store.isSimple())
    return false;
  LLT ValTy = MRI.getType(Store.getValueReg());
  if (ValTy.isScalableVector())
    return false;
  if (!ValTy.isVector() || ValTy.getSizeInBits() != 128)
    return false;
  if (Store.getMemSizeInBits() != ValTy.getSizeInBits())
    return false; // Don't split truncating stores.
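  // (For illustration, the rewrite this match enables: a store of a
  // <2 x s64> zero splat at %p becomes two G_STOREs of an s64 zero at
  // %p and %p + 8; see applySplitStoreZero128 below.)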
  if (!MRI.hasOneNonDBGUse(Store.getValueReg()))
    return false;
  auto MaybeCst = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(Store.getValueReg()), MRI);
  return MaybeCst && MaybeCst->isZero();
}

void applySplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &B,
                            GISelChangeObserver &Observer) {
  B.setInstrAndDebugLoc(MI);
  GStore &Store = cast<GStore>(MI);
  assert(MRI.getType(Store.getValueReg()).isVector() &&
         "Expected a vector store value");
  LLT NewTy = LLT::scalar(64);
  Register PtrReg = Store.getPointerReg();
  auto Zero = B.buildConstant(NewTy, 0);
  auto HighPtr = B.buildPtrAdd(MRI.getType(PtrReg), PtrReg,
                               B.buildConstant(LLT::scalar(64), 8));
  auto &MF = *MI.getMF();
  auto *LowMMO = MF.getMachineMemOperand(&Store.getMMO(), 0, NewTy);
  auto *HighMMO = MF.getMachineMemOperand(&Store.getMMO(), 8, NewTy);
  B.buildStore(Zero, PtrReg, *LowMMO);
  B.buildStore(Zero, HighPtr, *HighMMO);
  Store.eraseFromParent();
}

bool matchOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
                  std::tuple<Register, Register, Register> &MatchInfo) {
  const LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  if (!DstTy.isVector())
    return false;

  Register AO1, AO2, BVO1, BVO2;
  if (!mi_match(MI, MRI,
                m_GOr(m_GAnd(m_Reg(AO1), m_Reg(BVO1)),
                      m_GAnd(m_Reg(AO2), m_Reg(BVO2)))))
    return false;

  auto *BV1 = getOpcodeDef<GBuildVector>(BVO1, MRI);
  auto *BV2 = getOpcodeDef<GBuildVector>(BVO2, MRI);
  if (!BV1 || !BV2)
    return false;

  for (int I = 0, E = DstTy.getNumElements(); I < E; I++) {
    auto ValAndVReg1 =
        getIConstantVRegValWithLookThrough(BV1->getSourceReg(I), MRI);
    auto ValAndVReg2 =
        getIConstantVRegValWithLookThrough(BV2->getSourceReg(I), MRI);
    if (!ValAndVReg1 || !ValAndVReg2 ||
        ValAndVReg1->Value != ~ValAndVReg2->Value)
      return false;
  }

  MatchInfo = {AO1, AO2, BVO1};
  return true;
}
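// The matched pattern is a bitwise select: (or (and A1, C), (and A2, ~C))
// where C and ~C are constant build vectors. applyOrToBSP rewrites it to
// AArch64's G_BSP pseudo, which is later selected to the BSL/BIT/BIF family.
// Roughly (illustrative):
//   %or:_(<4 x s32>) = G_OR (G_AND %a, %c), (G_AND %b, %not_c)
//   -->
//   %or:_(<4 x s32>) = G_BSP %c, %a, %b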
void applyOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
                  MachineIRBuilder &B,
                  std::tuple<Register, Register, Register> &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  B.buildInstr(
      AArch64::G_BSP, {MI.getOperand(0).getReg()},
      {std::get<2>(MatchInfo), std::get<0>(MatchInfo), std::get<1>(MatchInfo)});
  MI.eraseFromParent();
}

// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz: the shift/and
// isolates the sign bit of each half-width lane and the multiply smears it
// into a full half-width mask, i.e. a signed compare-less-than-zero on the
// half-width elements. (The constants shown are for 32-bit lanes; the code
// below checks the equivalent constants for each supported element size.)
bool matchCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
                         Register &SrcReg) {
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

  if (DstTy != LLT::fixed_vector(2, 64) && DstTy != LLT::fixed_vector(2, 32) &&
      DstTy != LLT::fixed_vector(4, 32) && DstTy != LLT::fixed_vector(4, 16) &&
      DstTy != LLT::fixed_vector(8, 16))
    return false;

  auto AndMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  if (AndMI->getOpcode() != TargetOpcode::G_AND)
    return false;
  auto LShrMI = getDefIgnoringCopies(AndMI->getOperand(1).getReg(), MRI);
  if (LShrMI->getOpcode() != TargetOpcode::G_LSHR)
    return false;

  // Check the constant splat values
  auto V1 = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(MI.getOperand(2).getReg()), MRI);
  auto V2 = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(AndMI->getOperand(2).getReg()), MRI);
  auto V3 = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(LShrMI->getOperand(2).getReg()), MRI);
  if (!V1.has_value() || !V2.has_value() || !V3.has_value())
    return false;
  unsigned HalfSize = DstTy.getScalarSizeInBits() / 2;
  if (!V1.value().isMask(HalfSize) || V2.value() != (1ULL | 1ULL << HalfSize) ||
      V3 != (HalfSize - 1))
    return false;

  SrcReg = LShrMI->getOperand(1).getReg();

  return true;
}

void applyCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
                         MachineIRBuilder &B, Register &SrcReg) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT HalfTy =
      DstTy.changeElementCount(DstTy.getElementCount().multiplyCoefficientBy(2))
          .changeElementSize(DstTy.getScalarSizeInBits() / 2);

  Register ZeroVec = B.buildConstant(HalfTy, 0).getReg(0);
  Register CastReg =
      B.buildInstr(TargetOpcode::G_BITCAST, {HalfTy}, {SrcReg}).getReg(0);
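  // Compare the half-width view of the source against zero: lanes whose sign
  // bit is set become all-ones, reproducing the mask that the matched
  // mul/and/shift sequence computed.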
  Register CMLTReg =
      B.buildICmp(CmpInst::Predicate::ICMP_SLT, HalfTy, CastReg, ZeroVec)
          .getReg(0);

  B.buildInstr(TargetOpcode::G_BITCAST, {DstReg}, {CMLTReg}).getReg(0);
  MI.eraseFromParent();
}

class AArch64PostLegalizerCombinerImpl : public Combiner {
protected:
  // TODO: Make CombinerHelper methods const.
  mutable CombinerHelper Helper;
  const AArch64PostLegalizerCombinerImplRuleConfig &RuleConfig;
  const AArch64Subtarget &STI;

public:
  AArch64PostLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AArch64PostLegalizerCombinerImplRuleConfig &RuleConfig,
      const AArch64Subtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AArch64PostLegalizerCombiner"; }

  bool tryCombineAll(MachineInstr &I) const override;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
};

#define GET_GICOMBINER_IMPL
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_IMPL

AArch64PostLegalizerCombinerImpl::AArch64PostLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AArch64PostLegalizerCombinerImplRuleConfig &RuleConfig,
    const AArch64Subtarget &STI, MachineDominatorTree *MDT,
    const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo),
      Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
      RuleConfig(RuleConfig), STI(STI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

class AArch64PostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AArch64PostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AArch64PostLegalizerCombiner";
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
  AArch64PostLegalizerCombinerImplRuleConfig RuleConfig;

  struct StoreInfo {
    GStore *St = nullptr;
    // The G_PTR_ADD that's used by the store. We keep this to cache the
    // MachineInstr def.
    GPtrAdd *Ptr = nullptr;
    // The signed offset to the Ptr instruction.
    int64_t Offset = 0;
    LLT StoredType;
  };
  bool tryOptimizeConsecStores(SmallVectorImpl<StoreInfo> &Stores,
                               CSEMIRBuilder &MIB);

  bool optimizeConsecutiveMemOpAddressing(MachineFunction &MF,
                                          CSEMIRBuilder &MIB);
};
} // end anonymous namespace

void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    AU.addRequired<GISelCSEAnalysisWrapperPass>();
    AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PostLegalizerCombiner::AArch64PostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAArch64PostLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  assert(MF.getProperties().hasProperty(
             MachineFunctionProperties::Property::Legalized) &&
         "Expected a legalized function?");
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
  const auto *LI = ST.getLegalizerInfo();

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr
                : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());

  CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
                     F.hasMinSize());
  AArch64PostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo,
                                        RuleConfig, ST, MDT, LI);
  bool Changed = Impl.combineMachineInstrs();

  auto MIB = CSEMIRBuilder(MF);
  MIB.setCSEInfo(CSEInfo);
  Changed |= optimizeConsecutiveMemOpAddressing(MF, MIB);
  return Changed;
}

bool AArch64PostLegalizerCombiner::tryOptimizeConsecStores(
    SmallVectorImpl<StoreInfo> &Stores, CSEMIRBuilder &MIB) {
  if (Stores.size() <= 2)
    return false;

  // Profitability checks:
  int64_t BaseOffset = Stores[0].Offset;
  unsigned NumPairsExpected = Stores.size() / 2;
  unsigned TotalInstsExpected = NumPairsExpected + (Stores.size() % 2);
  // Size savings will depend on whether we can fold the offset as an
  // immediate of an ADD.
  auto &TLI = *MIB.getMF().getSubtarget().getTargetLowering();
  if (!TLI.isLegalAddImmediate(BaseOffset))
    TotalInstsExpected++;
  int SavingsExpected = Stores.size() - TotalInstsExpected;
  if (SavingsExpected <= 0)
    return false;

  auto &MRI = MIB.getMF().getRegInfo();

  // We have a series of consecutive stores. Factor out the common base
  // pointer and rewrite the offsets.
  Register NewBase = Stores[0].Ptr->getReg(0);
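  // E.g. (illustrative): stores at base+4128, base+4144 and base+4160 become
  // newbase+0, newbase+16 and newbase+32, where newbase is the G_PTR_ADD that
  // produced base+4128; the rewritten offsets fit STP's immediate range.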
  for (auto &SInfo : Stores) {
    // Compute a new pointer with the new base ptr and adjusted offset.
    MIB.setInstrAndDebugLoc(*SInfo.St);
    auto NewOff = MIB.buildConstant(LLT::scalar(64), SInfo.Offset - BaseOffset);
    auto NewPtr = MIB.buildPtrAdd(MRI.getType(SInfo.St->getPointerReg()),
                                  NewBase, NewOff);
    if (MIB.getObserver())
      MIB.getObserver()->changingInstr(*SInfo.St);
    SInfo.St->getOperand(1).setReg(NewPtr.getReg(0));
    if (MIB.getObserver())
      MIB.getObserver()->changedInstr(*SInfo.St);
  }
  LLVM_DEBUG(dbgs() << "Split a series of " << Stores.size()
                    << " stores into a base pointer and offsets.\n");
  return true;
}

static cl::opt<bool>
    EnableConsecutiveMemOpOpt("aarch64-postlegalizer-consecutive-memops",
                              cl::init(true), cl::Hidden,
                              cl::desc("Enable consecutive memop optimization "
                                       "in AArch64PostLegalizerCombiner"));

bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
    MachineFunction &MF, CSEMIRBuilder &MIB) {
  // This combine needs to run after all reassociations/folds on pointer
  // addressing have been done, specifically those that combine two G_PTR_ADDs
  // with constant offsets into a single G_PTR_ADD with a combined offset.
  // The goal of this optimization is to undo that combine in the case where
  // doing so has prevented the formation of pair stores due to illegal
  // addressing modes of STP. The reason that we do it here is because
  // it's much easier to undo the transformation of a series of consecutive
  // mem ops, than it is to detect when doing it would be a bad idea looking
  // at a single G_PTR_ADD in the reassociation/ptradd_immed_chain combine.
  //
  // An example:
  //   G_STORE %11:_(<2 x s64>), %base:_(p0) :: (store (<2 x s64>), align 1)
  //   %off1:_(s64) = G_CONSTANT i64 4128
  //   %p1:_(p0) = G_PTR_ADD %base:_, %off1:_(s64)
  //   G_STORE %11:_(<2 x s64>), %p1:_(p0) :: (store (<2 x s64>), align 1)
  //   %off2:_(s64) = G_CONSTANT i64 4144
  //   %p2:_(p0) = G_PTR_ADD %base:_, %off2:_(s64)
  //   G_STORE %11:_(<2 x s64>), %p2:_(p0) :: (store (<2 x s64>), align 1)
  //   %off3:_(s64) = G_CONSTANT i64 4160
  //   %p3:_(p0) = G_PTR_ADD %base:_, %off3:_(s64)
  //   G_STORE %11:_(<2 x s64>), %p3:_(p0) :: (store (<2 x s64>), align 1)
  bool Changed = false;
  auto &MRI = MF.getRegInfo();

  if (!EnableConsecutiveMemOpOpt)
    return Changed;

  SmallVector<StoreInfo, 8> Stores;
  // If we see a load, then we keep track of any values defined by it.
  // In the following example, STP formation will fail anyway because
  // the latter store is using a load result that appears after the
  // prior store. In this situation, if we factor out the offset then
  // we increase code size for no benefit.
  //   G_STORE %v1:_(s64), %base:_(p0) :: (store (s64))
  //   %v2:_(s64) = G_LOAD %ldptr:_(p0) :: (load (s64))
  //   G_STORE %v2:_(s64), %base:_(p0) :: (store (s64))
  SmallVector<Register> LoadValsSinceLastStore;

  auto storeIsValid = [&](StoreInfo &Last, StoreInfo New) {
    // Check if this store is consecutive to the last one.
    if (Last.Ptr->getBaseReg() != New.Ptr->getBaseReg() ||
        (Last.Offset + static_cast<int64_t>(Last.StoredType.getSizeInBytes()) !=
         New.Offset) ||
        Last.StoredType != New.StoredType)
      return false;

    // Check if this store is using a load result that appears after the
    // last store. If so, bail out.
    if (any_of(LoadValsSinceLastStore, [&](Register LoadVal) {
          return New.St->getValueReg() == LoadVal;
        }))
      return false;

    // Check if the current offset would be too large for STP.
    // If not, then STP formation should be able to handle it, so we don't
    // need to do anything.
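    // (Background note: STP takes a 7-bit signed immediate scaled by the
    // access size, so the largest positive offsets are 252, 504 and 1008 for
    // 32-, 64- and 128-bit accesses respectively, as encoded below.)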
    int64_t MaxLegalOffset;
    switch (New.StoredType.getSizeInBits()) {
    case 32:
      MaxLegalOffset = 252;
      break;
    case 64:
      MaxLegalOffset = 504;
      break;
    case 128:
      MaxLegalOffset = 1008;
      break;
    default:
      llvm_unreachable("Unexpected stored type size");
    }
    if (New.Offset < MaxLegalOffset)
      return false;

    // If factoring it out still wouldn't help then don't bother.
    return New.Offset - Stores[0].Offset <= MaxLegalOffset;
  };

  auto resetState = [&]() {
    Stores.clear();
    LoadValsSinceLastStore.clear();
  };

  for (auto &MBB : MF) {
    // We're looking inside a single BB at a time since the memset pattern
    // should only be in a single block.
    resetState();
    for (auto &MI : MBB) {
      // Skip scalable vectors.
      if (auto *LdSt = dyn_cast<GLoadStore>(&MI);
          LdSt && MRI.getType(LdSt->getOperand(0).getReg()).isScalableVector())
        continue;

      if (auto *St = dyn_cast<GStore>(&MI)) {
        Register PtrBaseReg;
        APInt Offset;
        LLT StoredValTy = MRI.getType(St->getValueReg());
        unsigned ValSize = StoredValTy.getSizeInBits();
        if (ValSize < 32 || St->getMMO().getSizeInBits() != ValSize)
          continue;

        Register PtrReg = St->getPointerReg();
        if (mi_match(
                PtrReg, MRI,
                m_OneNonDBGUse(m_GPtrAdd(m_Reg(PtrBaseReg), m_ICst(Offset))))) {
          GPtrAdd *PtrAdd = cast<GPtrAdd>(MRI.getVRegDef(PtrReg));
          StoreInfo New = {St, PtrAdd, Offset.getSExtValue(), StoredValTy};

          if (Stores.empty()) {
            Stores.push_back(New);
            continue;
          }

          // Check if this store is a valid continuation of the sequence.
          auto &Last = Stores.back();
          if (storeIsValid(Last, New)) {
            Stores.push_back(New);
            LoadValsSinceLastStore.clear(); // Reset the load value tracking.
          } else {
            // The store isn't valid to consider for the prior sequence,
            // so try to optimize what we have so far and start a new sequence.
            Changed |= tryOptimizeConsecStores(Stores, MIB);
            resetState();
            Stores.push_back(New);
          }
        }
      } else if (auto *Ld = dyn_cast<GLoad>(&MI)) {
        LoadValsSinceLastStore.push_back(Ld->getDstReg());
      }
    }
    Changed |= tryOptimizeConsecStores(Stores, MIB);
    resetState();
  }

  return Changed;
}

char AArch64PostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AArch64 MachineInstrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AArch64PostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AArch64 MachineInstrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PostLegalizerCombiner(bool IsOptNone) {
  return new AArch64PostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm