1*0fca6ea1SDimitry Andric //===-------- LoopIdiomVectorize.cpp - Loop idiom vectorization -----------===// 2*0fca6ea1SDimitry Andric // 3*0fca6ea1SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*0fca6ea1SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 5*0fca6ea1SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*0fca6ea1SDimitry Andric // 7*0fca6ea1SDimitry Andric //===----------------------------------------------------------------------===// 8*0fca6ea1SDimitry Andric // 9*0fca6ea1SDimitry Andric // This pass implements a pass that recognizes certain loop idioms and 10*0fca6ea1SDimitry Andric // transforms them into more optimized versions of the same loop. In cases 11*0fca6ea1SDimitry Andric // where this happens, it can be a significant performance win. 12*0fca6ea1SDimitry Andric // 13*0fca6ea1SDimitry Andric // We currently only recognize one loop that finds the first mismatched byte 14*0fca6ea1SDimitry Andric // in an array and returns the index, i.e. something like: 15*0fca6ea1SDimitry Andric // 16*0fca6ea1SDimitry Andric // while (++i != n) { 17*0fca6ea1SDimitry Andric // if (a[i] != b[i]) 18*0fca6ea1SDimitry Andric // break; 19*0fca6ea1SDimitry Andric // } 20*0fca6ea1SDimitry Andric // 21*0fca6ea1SDimitry Andric // In this example we can actually vectorize the loop despite the early exit, 22*0fca6ea1SDimitry Andric // although the loop vectorizer does not support it. It requires some extra 23*0fca6ea1SDimitry Andric // checks to deal with the possibility of faulting loads when crossing page 24*0fca6ea1SDimitry Andric // boundaries. However, even with these checks it is still profitable to do the 25*0fca6ea1SDimitry Andric // transformation. 26*0fca6ea1SDimitry Andric // 27*0fca6ea1SDimitry Andric //===----------------------------------------------------------------------===// 28*0fca6ea1SDimitry Andric // 29*0fca6ea1SDimitry Andric // NOTE: This Pass matches a really specific loop pattern because it's only 30*0fca6ea1SDimitry Andric // supposed to be a temporary solution until our LoopVectorizer is powerful 31*0fca6ea1SDimitry Andric // enought to vectorize it automatically. 32*0fca6ea1SDimitry Andric // 33*0fca6ea1SDimitry Andric // TODO List: 34*0fca6ea1SDimitry Andric // 35*0fca6ea1SDimitry Andric // * Add support for the inverse case where we scan for a matching element. 36*0fca6ea1SDimitry Andric // * Permit 64-bit induction variable types. 37*0fca6ea1SDimitry Andric // * Recognize loops that increment the IV *after* comparing bytes. 38*0fca6ea1SDimitry Andric // * Allow 32-bit sign-extends of the IV used by the GEP. 39*0fca6ea1SDimitry Andric // 40*0fca6ea1SDimitry Andric //===----------------------------------------------------------------------===// 41*0fca6ea1SDimitry Andric 42*0fca6ea1SDimitry Andric #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" 43*0fca6ea1SDimitry Andric #include "llvm/Analysis/DomTreeUpdater.h" 44*0fca6ea1SDimitry Andric #include "llvm/Analysis/LoopPass.h" 45*0fca6ea1SDimitry Andric #include "llvm/Analysis/TargetTransformInfo.h" 46*0fca6ea1SDimitry Andric #include "llvm/IR/Dominators.h" 47*0fca6ea1SDimitry Andric #include "llvm/IR/IRBuilder.h" 48*0fca6ea1SDimitry Andric #include "llvm/IR/Intrinsics.h" 49*0fca6ea1SDimitry Andric #include "llvm/IR/MDBuilder.h" 50*0fca6ea1SDimitry Andric #include "llvm/IR/PatternMatch.h" 51*0fca6ea1SDimitry Andric #include "llvm/Transforms/Utils/BasicBlockUtils.h" 52*0fca6ea1SDimitry Andric 53*0fca6ea1SDimitry Andric using namespace llvm; 54*0fca6ea1SDimitry Andric using namespace PatternMatch; 55*0fca6ea1SDimitry Andric 56*0fca6ea1SDimitry Andric #define DEBUG_TYPE "loop-idiom-vectorize" 57*0fca6ea1SDimitry Andric 58*0fca6ea1SDimitry Andric static cl::opt<bool> DisableAll("disable-loop-idiom-vectorize-all", cl::Hidden, 59*0fca6ea1SDimitry Andric cl::init(false), 60*0fca6ea1SDimitry Andric cl::desc("Disable Loop Idiom Vectorize Pass.")); 61*0fca6ea1SDimitry Andric 62*0fca6ea1SDimitry Andric static cl::opt<LoopIdiomVectorizeStyle> 63*0fca6ea1SDimitry Andric LITVecStyle("loop-idiom-vectorize-style", cl::Hidden, 64*0fca6ea1SDimitry Andric cl::desc("The vectorization style for loop idiom transform."), 65*0fca6ea1SDimitry Andric cl::values(clEnumValN(LoopIdiomVectorizeStyle::Masked, "masked", 66*0fca6ea1SDimitry Andric "Use masked vector intrinsics"), 67*0fca6ea1SDimitry Andric clEnumValN(LoopIdiomVectorizeStyle::Predicated, 68*0fca6ea1SDimitry Andric "predicated", "Use VP intrinsics")), 69*0fca6ea1SDimitry Andric cl::init(LoopIdiomVectorizeStyle::Masked)); 70*0fca6ea1SDimitry Andric 71*0fca6ea1SDimitry Andric static cl::opt<bool> 72*0fca6ea1SDimitry Andric DisableByteCmp("disable-loop-idiom-vectorize-bytecmp", cl::Hidden, 73*0fca6ea1SDimitry Andric cl::init(false), 74*0fca6ea1SDimitry Andric cl::desc("Proceed with Loop Idiom Vectorize Pass, but do " 75*0fca6ea1SDimitry Andric "not convert byte-compare loop(s).")); 76*0fca6ea1SDimitry Andric 77*0fca6ea1SDimitry Andric static cl::opt<unsigned> 78*0fca6ea1SDimitry Andric ByteCmpVF("loop-idiom-vectorize-bytecmp-vf", cl::Hidden, 79*0fca6ea1SDimitry Andric cl::desc("The vectorization factor for byte-compare patterns."), 80*0fca6ea1SDimitry Andric cl::init(16)); 81*0fca6ea1SDimitry Andric 82*0fca6ea1SDimitry Andric static cl::opt<bool> 83*0fca6ea1SDimitry Andric VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false), 84*0fca6ea1SDimitry Andric cl::desc("Verify loops generated Loop Idiom Vectorize Pass.")); 85*0fca6ea1SDimitry Andric 86*0fca6ea1SDimitry Andric namespace { 87*0fca6ea1SDimitry Andric class LoopIdiomVectorize { 88*0fca6ea1SDimitry Andric LoopIdiomVectorizeStyle VectorizeStyle; 89*0fca6ea1SDimitry Andric unsigned ByteCompareVF; 90*0fca6ea1SDimitry Andric Loop *CurLoop = nullptr; 91*0fca6ea1SDimitry Andric DominatorTree *DT; 92*0fca6ea1SDimitry Andric LoopInfo *LI; 93*0fca6ea1SDimitry Andric const TargetTransformInfo *TTI; 94*0fca6ea1SDimitry Andric const DataLayout *DL; 95*0fca6ea1SDimitry Andric 96*0fca6ea1SDimitry Andric // Blocks that will be used for inserting vectorized code. 97*0fca6ea1SDimitry Andric BasicBlock *EndBlock = nullptr; 98*0fca6ea1SDimitry Andric BasicBlock *VectorLoopPreheaderBlock = nullptr; 99*0fca6ea1SDimitry Andric BasicBlock *VectorLoopStartBlock = nullptr; 100*0fca6ea1SDimitry Andric BasicBlock *VectorLoopMismatchBlock = nullptr; 101*0fca6ea1SDimitry Andric BasicBlock *VectorLoopIncBlock = nullptr; 102*0fca6ea1SDimitry Andric 103*0fca6ea1SDimitry Andric public: 104*0fca6ea1SDimitry Andric LoopIdiomVectorize(LoopIdiomVectorizeStyle S, unsigned VF, DominatorTree *DT, 105*0fca6ea1SDimitry Andric LoopInfo *LI, const TargetTransformInfo *TTI, 106*0fca6ea1SDimitry Andric const DataLayout *DL) 107*0fca6ea1SDimitry Andric : VectorizeStyle(S), ByteCompareVF(VF), DT(DT), LI(LI), TTI(TTI), DL(DL) { 108*0fca6ea1SDimitry Andric } 109*0fca6ea1SDimitry Andric 110*0fca6ea1SDimitry Andric bool run(Loop *L); 111*0fca6ea1SDimitry Andric 112*0fca6ea1SDimitry Andric private: 113*0fca6ea1SDimitry Andric /// \name Countable Loop Idiom Handling 114*0fca6ea1SDimitry Andric /// @{ 115*0fca6ea1SDimitry Andric 116*0fca6ea1SDimitry Andric bool runOnCountableLoop(); 117*0fca6ea1SDimitry Andric bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, 118*0fca6ea1SDimitry Andric SmallVectorImpl<BasicBlock *> &ExitBlocks); 119*0fca6ea1SDimitry Andric 120*0fca6ea1SDimitry Andric bool recognizeByteCompare(); 121*0fca6ea1SDimitry Andric 122*0fca6ea1SDimitry Andric Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU, 123*0fca6ea1SDimitry Andric GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, 124*0fca6ea1SDimitry Andric Instruction *Index, Value *Start, Value *MaxLen); 125*0fca6ea1SDimitry Andric 126*0fca6ea1SDimitry Andric Value *createMaskedFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU, 127*0fca6ea1SDimitry Andric GetElementPtrInst *GEPA, 128*0fca6ea1SDimitry Andric GetElementPtrInst *GEPB, Value *ExtStart, 129*0fca6ea1SDimitry Andric Value *ExtEnd); 130*0fca6ea1SDimitry Andric Value *createPredicatedFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU, 131*0fca6ea1SDimitry Andric GetElementPtrInst *GEPA, 132*0fca6ea1SDimitry Andric GetElementPtrInst *GEPB, Value *ExtStart, 133*0fca6ea1SDimitry Andric Value *ExtEnd); 134*0fca6ea1SDimitry Andric 135*0fca6ea1SDimitry Andric void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, 136*0fca6ea1SDimitry Andric PHINode *IndPhi, Value *MaxLen, Instruction *Index, 137*0fca6ea1SDimitry Andric Value *Start, bool IncIdx, BasicBlock *FoundBB, 138*0fca6ea1SDimitry Andric BasicBlock *EndBB); 139*0fca6ea1SDimitry Andric /// @} 140*0fca6ea1SDimitry Andric }; 141*0fca6ea1SDimitry Andric } // anonymous namespace 142*0fca6ea1SDimitry Andric 143*0fca6ea1SDimitry Andric PreservedAnalyses LoopIdiomVectorizePass::run(Loop &L, LoopAnalysisManager &AM, 144*0fca6ea1SDimitry Andric LoopStandardAnalysisResults &AR, 145*0fca6ea1SDimitry Andric LPMUpdater &) { 146*0fca6ea1SDimitry Andric if (DisableAll) 147*0fca6ea1SDimitry Andric return PreservedAnalyses::all(); 148*0fca6ea1SDimitry Andric 149*0fca6ea1SDimitry Andric const auto *DL = &L.getHeader()->getDataLayout(); 150*0fca6ea1SDimitry Andric 151*0fca6ea1SDimitry Andric LoopIdiomVectorizeStyle VecStyle = VectorizeStyle; 152*0fca6ea1SDimitry Andric if (LITVecStyle.getNumOccurrences()) 153*0fca6ea1SDimitry Andric VecStyle = LITVecStyle; 154*0fca6ea1SDimitry Andric 155*0fca6ea1SDimitry Andric unsigned BCVF = ByteCompareVF; 156*0fca6ea1SDimitry Andric if (ByteCmpVF.getNumOccurrences()) 157*0fca6ea1SDimitry Andric BCVF = ByteCmpVF; 158*0fca6ea1SDimitry Andric 159*0fca6ea1SDimitry Andric LoopIdiomVectorize LIV(VecStyle, BCVF, &AR.DT, &AR.LI, &AR.TTI, DL); 160*0fca6ea1SDimitry Andric if (!LIV.run(&L)) 161*0fca6ea1SDimitry Andric return PreservedAnalyses::all(); 162*0fca6ea1SDimitry Andric 163*0fca6ea1SDimitry Andric return PreservedAnalyses::none(); 164*0fca6ea1SDimitry Andric } 165*0fca6ea1SDimitry Andric 166*0fca6ea1SDimitry Andric //===----------------------------------------------------------------------===// 167*0fca6ea1SDimitry Andric // 168*0fca6ea1SDimitry Andric // Implementation of LoopIdiomVectorize 169*0fca6ea1SDimitry Andric // 170*0fca6ea1SDimitry Andric //===----------------------------------------------------------------------===// 171*0fca6ea1SDimitry Andric 172*0fca6ea1SDimitry Andric bool LoopIdiomVectorize::run(Loop *L) { 173*0fca6ea1SDimitry Andric CurLoop = L; 174*0fca6ea1SDimitry Andric 175*0fca6ea1SDimitry Andric Function &F = *L->getHeader()->getParent(); 176*0fca6ea1SDimitry Andric if (DisableAll || F.hasOptSize()) 177*0fca6ea1SDimitry Andric return false; 178*0fca6ea1SDimitry Andric 179*0fca6ea1SDimitry Andric if (F.hasFnAttribute(Attribute::NoImplicitFloat)) { 180*0fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE << " is disabled on " << F.getName() 181*0fca6ea1SDimitry Andric << " due to its NoImplicitFloat attribute"); 182*0fca6ea1SDimitry Andric return false; 183*0fca6ea1SDimitry Andric } 184*0fca6ea1SDimitry Andric 185*0fca6ea1SDimitry Andric // If the loop could not be converted to canonical form, it must have an 186*0fca6ea1SDimitry Andric // indirectbr in it, just give up. 187*0fca6ea1SDimitry Andric if (!L->getLoopPreheader()) 188*0fca6ea1SDimitry Andric return false; 189*0fca6ea1SDimitry Andric 190*0fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %" 191*0fca6ea1SDimitry Andric << CurLoop->getHeader()->getName() << "\n"); 192*0fca6ea1SDimitry Andric 193*0fca6ea1SDimitry Andric return recognizeByteCompare(); 194*0fca6ea1SDimitry Andric } 195*0fca6ea1SDimitry Andric 196*0fca6ea1SDimitry Andric bool LoopIdiomVectorize::recognizeByteCompare() { 197*0fca6ea1SDimitry Andric // Currently the transformation only works on scalable vector types, although 198*0fca6ea1SDimitry Andric // there is no fundamental reason why it cannot be made to work for fixed 199*0fca6ea1SDimitry Andric // width too. 200*0fca6ea1SDimitry Andric 201*0fca6ea1SDimitry Andric // We also need to know the minimum page size for the target in order to 202*0fca6ea1SDimitry Andric // generate runtime memory checks to ensure the vector version won't fault. 203*0fca6ea1SDimitry Andric if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() || 204*0fca6ea1SDimitry Andric DisableByteCmp) 205*0fca6ea1SDimitry Andric return false; 206*0fca6ea1SDimitry Andric 207*0fca6ea1SDimitry Andric BasicBlock *Header = CurLoop->getHeader(); 208*0fca6ea1SDimitry Andric 209*0fca6ea1SDimitry Andric // In LoopIdiomVectorize::run we have already checked that the loop 210*0fca6ea1SDimitry Andric // has a preheader so we can assume it's in a canonical form. 211*0fca6ea1SDimitry Andric if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2) 212*0fca6ea1SDimitry Andric return false; 213*0fca6ea1SDimitry Andric 214*0fca6ea1SDimitry Andric PHINode *PN = dyn_cast<PHINode>(&Header->front()); 215*0fca6ea1SDimitry Andric if (!PN || PN->getNumIncomingValues() != 2) 216*0fca6ea1SDimitry Andric return false; 217*0fca6ea1SDimitry Andric 218*0fca6ea1SDimitry Andric auto LoopBlocks = CurLoop->getBlocks(); 219*0fca6ea1SDimitry Andric // The first block in the loop should contain only 4 instructions, e.g. 220*0fca6ea1SDimitry Andric // 221*0fca6ea1SDimitry Andric // while.cond: 222*0fca6ea1SDimitry Andric // %res.phi = phi i32 [ %start, %ph ], [ %inc, %while.body ] 223*0fca6ea1SDimitry Andric // %inc = add i32 %res.phi, 1 224*0fca6ea1SDimitry Andric // %cmp.not = icmp eq i32 %inc, %n 225*0fca6ea1SDimitry Andric // br i1 %cmp.not, label %while.end, label %while.body 226*0fca6ea1SDimitry Andric // 227*0fca6ea1SDimitry Andric if (LoopBlocks[0]->sizeWithoutDebug() > 4) 228*0fca6ea1SDimitry Andric return false; 229*0fca6ea1SDimitry Andric 230*0fca6ea1SDimitry Andric // The second block should contain 7 instructions, e.g. 231*0fca6ea1SDimitry Andric // 232*0fca6ea1SDimitry Andric // while.body: 233*0fca6ea1SDimitry Andric // %idx = zext i32 %inc to i64 234*0fca6ea1SDimitry Andric // %idx.a = getelementptr inbounds i8, ptr %a, i64 %idx 235*0fca6ea1SDimitry Andric // %load.a = load i8, ptr %idx.a 236*0fca6ea1SDimitry Andric // %idx.b = getelementptr inbounds i8, ptr %b, i64 %idx 237*0fca6ea1SDimitry Andric // %load.b = load i8, ptr %idx.b 238*0fca6ea1SDimitry Andric // %cmp.not.ld = icmp eq i8 %load.a, %load.b 239*0fca6ea1SDimitry Andric // br i1 %cmp.not.ld, label %while.cond, label %while.end 240*0fca6ea1SDimitry Andric // 241*0fca6ea1SDimitry Andric if (LoopBlocks[1]->sizeWithoutDebug() > 7) 242*0fca6ea1SDimitry Andric return false; 243*0fca6ea1SDimitry Andric 244*0fca6ea1SDimitry Andric // The incoming value to the PHI node from the loop should be an add of 1. 245*0fca6ea1SDimitry Andric Value *StartIdx = nullptr; 246*0fca6ea1SDimitry Andric Instruction *Index = nullptr; 247*0fca6ea1SDimitry Andric if (!CurLoop->contains(PN->getIncomingBlock(0))) { 248*0fca6ea1SDimitry Andric StartIdx = PN->getIncomingValue(0); 249*0fca6ea1SDimitry Andric Index = dyn_cast<Instruction>(PN->getIncomingValue(1)); 250*0fca6ea1SDimitry Andric } else { 251*0fca6ea1SDimitry Andric StartIdx = PN->getIncomingValue(1); 252*0fca6ea1SDimitry Andric Index = dyn_cast<Instruction>(PN->getIncomingValue(0)); 253*0fca6ea1SDimitry Andric } 254*0fca6ea1SDimitry Andric 255*0fca6ea1SDimitry Andric // Limit to 32-bit types for now 256*0fca6ea1SDimitry Andric if (!Index || !Index->getType()->isIntegerTy(32) || 257*0fca6ea1SDimitry Andric !match(Index, m_c_Add(m_Specific(PN), m_One()))) 258*0fca6ea1SDimitry Andric return false; 259*0fca6ea1SDimitry Andric 260*0fca6ea1SDimitry Andric // If we match the pattern, PN and Index will be replaced with the result of 261*0fca6ea1SDimitry Andric // the cttz.elts intrinsic. If any other instructions are used outside of 262*0fca6ea1SDimitry Andric // the loop, we cannot replace it. 263*0fca6ea1SDimitry Andric for (BasicBlock *BB : LoopBlocks) 264*0fca6ea1SDimitry Andric for (Instruction &I : *BB) 265*0fca6ea1SDimitry Andric if (&I != PN && &I != Index) 266*0fca6ea1SDimitry Andric for (User *U : I.users()) 267*0fca6ea1SDimitry Andric if (!CurLoop->contains(cast<Instruction>(U))) 268*0fca6ea1SDimitry Andric return false; 269*0fca6ea1SDimitry Andric 270*0fca6ea1SDimitry Andric // Match the branch instruction for the header 271*0fca6ea1SDimitry Andric ICmpInst::Predicate Pred; 272*0fca6ea1SDimitry Andric Value *MaxLen; 273*0fca6ea1SDimitry Andric BasicBlock *EndBB, *WhileBB; 274*0fca6ea1SDimitry Andric if (!match(Header->getTerminator(), 275*0fca6ea1SDimitry Andric m_Br(m_ICmp(Pred, m_Specific(Index), m_Value(MaxLen)), 276*0fca6ea1SDimitry Andric m_BasicBlock(EndBB), m_BasicBlock(WhileBB))) || 277*0fca6ea1SDimitry Andric Pred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(WhileBB)) 278*0fca6ea1SDimitry Andric return false; 279*0fca6ea1SDimitry Andric 280*0fca6ea1SDimitry Andric // WhileBB should contain the pattern of load & compare instructions. Match 281*0fca6ea1SDimitry Andric // the pattern and find the GEP instructions used by the loads. 282*0fca6ea1SDimitry Andric ICmpInst::Predicate WhilePred; 283*0fca6ea1SDimitry Andric BasicBlock *FoundBB; 284*0fca6ea1SDimitry Andric BasicBlock *TrueBB; 285*0fca6ea1SDimitry Andric Value *LoadA, *LoadB; 286*0fca6ea1SDimitry Andric if (!match(WhileBB->getTerminator(), 287*0fca6ea1SDimitry Andric m_Br(m_ICmp(WhilePred, m_Value(LoadA), m_Value(LoadB)), 288*0fca6ea1SDimitry Andric m_BasicBlock(TrueBB), m_BasicBlock(FoundBB))) || 289*0fca6ea1SDimitry Andric WhilePred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(TrueBB)) 290*0fca6ea1SDimitry Andric return false; 291*0fca6ea1SDimitry Andric 292*0fca6ea1SDimitry Andric Value *A, *B; 293*0fca6ea1SDimitry Andric if (!match(LoadA, m_Load(m_Value(A))) || !match(LoadB, m_Load(m_Value(B)))) 294*0fca6ea1SDimitry Andric return false; 295*0fca6ea1SDimitry Andric 296*0fca6ea1SDimitry Andric LoadInst *LoadAI = cast<LoadInst>(LoadA); 297*0fca6ea1SDimitry Andric LoadInst *LoadBI = cast<LoadInst>(LoadB); 298*0fca6ea1SDimitry Andric if (!LoadAI->isSimple() || !LoadBI->isSimple()) 299*0fca6ea1SDimitry Andric return false; 300*0fca6ea1SDimitry Andric 301*0fca6ea1SDimitry Andric GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(A); 302*0fca6ea1SDimitry Andric GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(B); 303*0fca6ea1SDimitry Andric 304*0fca6ea1SDimitry Andric if (!GEPA || !GEPB) 305*0fca6ea1SDimitry Andric return false; 306*0fca6ea1SDimitry Andric 307*0fca6ea1SDimitry Andric Value *PtrA = GEPA->getPointerOperand(); 308*0fca6ea1SDimitry Andric Value *PtrB = GEPB->getPointerOperand(); 309*0fca6ea1SDimitry Andric 310*0fca6ea1SDimitry Andric // Check we are loading i8 values from two loop invariant pointers 311*0fca6ea1SDimitry Andric if (!CurLoop->isLoopInvariant(PtrA) || !CurLoop->isLoopInvariant(PtrB) || 312*0fca6ea1SDimitry Andric !GEPA->getResultElementType()->isIntegerTy(8) || 313*0fca6ea1SDimitry Andric !GEPB->getResultElementType()->isIntegerTy(8) || 314*0fca6ea1SDimitry Andric !LoadAI->getType()->isIntegerTy(8) || 315*0fca6ea1SDimitry Andric !LoadBI->getType()->isIntegerTy(8) || PtrA == PtrB) 316*0fca6ea1SDimitry Andric return false; 317*0fca6ea1SDimitry Andric 318*0fca6ea1SDimitry Andric // Check that the index to the GEPs is the index we found earlier 319*0fca6ea1SDimitry Andric if (GEPA->getNumIndices() > 1 || GEPB->getNumIndices() > 1) 320*0fca6ea1SDimitry Andric return false; 321*0fca6ea1SDimitry Andric 322*0fca6ea1SDimitry Andric Value *IdxA = GEPA->getOperand(GEPA->getNumIndices()); 323*0fca6ea1SDimitry Andric Value *IdxB = GEPB->getOperand(GEPB->getNumIndices()); 324*0fca6ea1SDimitry Andric if (IdxA != IdxB || !match(IdxA, m_ZExt(m_Specific(Index)))) 325*0fca6ea1SDimitry Andric return false; 326*0fca6ea1SDimitry Andric 327*0fca6ea1SDimitry Andric // We only ever expect the pre-incremented index value to be used inside the 328*0fca6ea1SDimitry Andric // loop. 329*0fca6ea1SDimitry Andric if (!PN->hasOneUse()) 330*0fca6ea1SDimitry Andric return false; 331*0fca6ea1SDimitry Andric 332*0fca6ea1SDimitry Andric // Ensure that when the Found and End blocks are identical the PHIs have the 333*0fca6ea1SDimitry Andric // supported format. We don't currently allow cases like this: 334*0fca6ea1SDimitry Andric // while.cond: 335*0fca6ea1SDimitry Andric // ... 336*0fca6ea1SDimitry Andric // br i1 %cmp.not, label %while.end, label %while.body 337*0fca6ea1SDimitry Andric // 338*0fca6ea1SDimitry Andric // while.body: 339*0fca6ea1SDimitry Andric // ... 340*0fca6ea1SDimitry Andric // br i1 %cmp.not2, label %while.cond, label %while.end 341*0fca6ea1SDimitry Andric // 342*0fca6ea1SDimitry Andric // while.end: 343*0fca6ea1SDimitry Andric // %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ] 344*0fca6ea1SDimitry Andric // 345*0fca6ea1SDimitry Andric // Where the incoming values for %final_ptr are unique and from each of the 346*0fca6ea1SDimitry Andric // loop blocks, but not actually defined in the loop. This requires extra 347*0fca6ea1SDimitry Andric // work setting up the byte.compare block, i.e. by introducing a select to 348*0fca6ea1SDimitry Andric // choose the correct value. 349*0fca6ea1SDimitry Andric // TODO: We could add support for this in future. 350*0fca6ea1SDimitry Andric if (FoundBB == EndBB) { 351*0fca6ea1SDimitry Andric for (PHINode &EndPN : EndBB->phis()) { 352*0fca6ea1SDimitry Andric Value *WhileCondVal = EndPN.getIncomingValueForBlock(Header); 353*0fca6ea1SDimitry Andric Value *WhileBodyVal = EndPN.getIncomingValueForBlock(WhileBB); 354*0fca6ea1SDimitry Andric 355*0fca6ea1SDimitry Andric // The value of the index when leaving the while.cond block is always the 356*0fca6ea1SDimitry Andric // same as the end value (MaxLen) so we permit either. The value when 357*0fca6ea1SDimitry Andric // leaving the while.body block should only be the index. Otherwise for 358*0fca6ea1SDimitry Andric // any other values we only allow ones that are same for both blocks. 359*0fca6ea1SDimitry Andric if (WhileCondVal != WhileBodyVal && 360*0fca6ea1SDimitry Andric ((WhileCondVal != Index && WhileCondVal != MaxLen) || 361*0fca6ea1SDimitry Andric (WhileBodyVal != Index))) 362*0fca6ea1SDimitry Andric return false; 363*0fca6ea1SDimitry Andric } 364*0fca6ea1SDimitry Andric } 365*0fca6ea1SDimitry Andric 366*0fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n" 367*0fca6ea1SDimitry Andric << *(EndBB->getParent()) << "\n\n"); 368*0fca6ea1SDimitry Andric 369*0fca6ea1SDimitry Andric // The index is incremented before the GEP/Load pair so we need to 370*0fca6ea1SDimitry Andric // add 1 to the start value. 371*0fca6ea1SDimitry Andric transformByteCompare(GEPA, GEPB, PN, MaxLen, Index, StartIdx, /*IncIdx=*/true, 372*0fca6ea1SDimitry Andric FoundBB, EndBB); 373*0fca6ea1SDimitry Andric return true; 374*0fca6ea1SDimitry Andric } 375*0fca6ea1SDimitry Andric 376*0fca6ea1SDimitry Andric Value *LoopIdiomVectorize::createMaskedFindMismatch( 377*0fca6ea1SDimitry Andric IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA, 378*0fca6ea1SDimitry Andric GetElementPtrInst *GEPB, Value *ExtStart, Value *ExtEnd) { 379*0fca6ea1SDimitry Andric Type *I64Type = Builder.getInt64Ty(); 380*0fca6ea1SDimitry Andric Type *ResType = Builder.getInt32Ty(); 381*0fca6ea1SDimitry Andric Type *LoadType = Builder.getInt8Ty(); 382*0fca6ea1SDimitry Andric Value *PtrA = GEPA->getPointerOperand(); 383*0fca6ea1SDimitry Andric Value *PtrB = GEPB->getPointerOperand(); 384*0fca6ea1SDimitry Andric 385*0fca6ea1SDimitry Andric ScalableVectorType *PredVTy = 386*0fca6ea1SDimitry Andric ScalableVectorType::get(Builder.getInt1Ty(), ByteCompareVF); 387*0fca6ea1SDimitry Andric 388*0fca6ea1SDimitry Andric Value *InitialPred = Builder.CreateIntrinsic( 389*0fca6ea1SDimitry Andric Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd}); 390*0fca6ea1SDimitry Andric 391*0fca6ea1SDimitry Andric Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {}); 392*0fca6ea1SDimitry Andric VecLen = 393*0fca6ea1SDimitry Andric Builder.CreateMul(VecLen, ConstantInt::get(I64Type, ByteCompareVF), "", 394*0fca6ea1SDimitry Andric /*HasNUW=*/true, /*HasNSW=*/true); 395*0fca6ea1SDimitry Andric 396*0fca6ea1SDimitry Andric Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(), 397*0fca6ea1SDimitry Andric Builder.getInt1(false)); 398*0fca6ea1SDimitry Andric 399*0fca6ea1SDimitry Andric BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock); 400*0fca6ea1SDimitry Andric Builder.Insert(JumpToVectorLoop); 401*0fca6ea1SDimitry Andric 402*0fca6ea1SDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock, 403*0fca6ea1SDimitry Andric VectorLoopStartBlock}}); 404*0fca6ea1SDimitry Andric 405*0fca6ea1SDimitry Andric // Set up the first vector loop block by creating the PHIs, doing the vector 406*0fca6ea1SDimitry Andric // loads and comparing the vectors. 407*0fca6ea1SDimitry Andric Builder.SetInsertPoint(VectorLoopStartBlock); 408*0fca6ea1SDimitry Andric PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_vec_loop_pred"); 409*0fca6ea1SDimitry Andric LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock); 410*0fca6ea1SDimitry Andric PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index"); 411*0fca6ea1SDimitry Andric VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock); 412*0fca6ea1SDimitry Andric Type *VectorLoadType = 413*0fca6ea1SDimitry Andric ScalableVectorType::get(Builder.getInt8Ty(), ByteCompareVF); 414*0fca6ea1SDimitry Andric Value *Passthru = ConstantInt::getNullValue(VectorLoadType); 415*0fca6ea1SDimitry Andric 416*0fca6ea1SDimitry Andric Value *VectorLhsGep = 417*0fca6ea1SDimitry Andric Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi, "", GEPA->isInBounds()); 418*0fca6ea1SDimitry Andric Value *VectorLhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorLhsGep, 419*0fca6ea1SDimitry Andric Align(1), LoopPred, Passthru); 420*0fca6ea1SDimitry Andric 421*0fca6ea1SDimitry Andric Value *VectorRhsGep = 422*0fca6ea1SDimitry Andric Builder.CreateGEP(LoadType, PtrB, VectorIndexPhi, "", GEPB->isInBounds()); 423*0fca6ea1SDimitry Andric Value *VectorRhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorRhsGep, 424*0fca6ea1SDimitry Andric Align(1), LoopPred, Passthru); 425*0fca6ea1SDimitry Andric 426*0fca6ea1SDimitry Andric Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad); 427*0fca6ea1SDimitry Andric VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse); 428*0fca6ea1SDimitry Andric Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp); 429*0fca6ea1SDimitry Andric BranchInst *VectorEarlyExit = BranchInst::Create( 430*0fca6ea1SDimitry Andric VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes); 431*0fca6ea1SDimitry Andric Builder.Insert(VectorEarlyExit); 432*0fca6ea1SDimitry Andric 433*0fca6ea1SDimitry Andric DTU.applyUpdates( 434*0fca6ea1SDimitry Andric {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock}, 435*0fca6ea1SDimitry Andric {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}}); 436*0fca6ea1SDimitry Andric 437*0fca6ea1SDimitry Andric // Increment the index counter and calculate the predicate for the next 438*0fca6ea1SDimitry Andric // iteration of the loop. We branch back to the start of the loop if there 439*0fca6ea1SDimitry Andric // is at least one active lane. 440*0fca6ea1SDimitry Andric Builder.SetInsertPoint(VectorLoopIncBlock); 441*0fca6ea1SDimitry Andric Value *NewVectorIndexPhi = 442*0fca6ea1SDimitry Andric Builder.CreateAdd(VectorIndexPhi, VecLen, "", 443*0fca6ea1SDimitry Andric /*HasNUW=*/true, /*HasNSW=*/true); 444*0fca6ea1SDimitry Andric VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock); 445*0fca6ea1SDimitry Andric Value *NewPred = 446*0fca6ea1SDimitry Andric Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, 447*0fca6ea1SDimitry Andric {PredVTy, I64Type}, {NewVectorIndexPhi, ExtEnd}); 448*0fca6ea1SDimitry Andric LoopPred->addIncoming(NewPred, VectorLoopIncBlock); 449*0fca6ea1SDimitry Andric 450*0fca6ea1SDimitry Andric Value *PredHasActiveLanes = 451*0fca6ea1SDimitry Andric Builder.CreateExtractElement(NewPred, uint64_t(0)); 452*0fca6ea1SDimitry Andric BranchInst *VectorLoopBranchBack = 453*0fca6ea1SDimitry Andric BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes); 454*0fca6ea1SDimitry Andric Builder.Insert(VectorLoopBranchBack); 455*0fca6ea1SDimitry Andric 456*0fca6ea1SDimitry Andric DTU.applyUpdates( 457*0fca6ea1SDimitry Andric {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock}, 458*0fca6ea1SDimitry Andric {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}}); 459*0fca6ea1SDimitry Andric 460*0fca6ea1SDimitry Andric // If we found a mismatch then we need to calculate which lane in the vector 461*0fca6ea1SDimitry Andric // had a mismatch and add that on to the current loop index. 462*0fca6ea1SDimitry Andric Builder.SetInsertPoint(VectorLoopMismatchBlock); 463*0fca6ea1SDimitry Andric PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_vec_found_pred"); 464*0fca6ea1SDimitry Andric FoundPred->addIncoming(VectorMatchCmp, VectorLoopStartBlock); 465*0fca6ea1SDimitry Andric PHINode *LastLoopPred = 466*0fca6ea1SDimitry Andric Builder.CreatePHI(PredVTy, 1, "mismatch_vec_last_loop_pred"); 467*0fca6ea1SDimitry Andric LastLoopPred->addIncoming(LoopPred, VectorLoopStartBlock); 468*0fca6ea1SDimitry Andric PHINode *VectorFoundIndex = 469*0fca6ea1SDimitry Andric Builder.CreatePHI(I64Type, 1, "mismatch_vec_found_index"); 470*0fca6ea1SDimitry Andric VectorFoundIndex->addIncoming(VectorIndexPhi, VectorLoopStartBlock); 471*0fca6ea1SDimitry Andric 472*0fca6ea1SDimitry Andric Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred); 473*0fca6ea1SDimitry Andric Value *Ctz = Builder.CreateIntrinsic( 474*0fca6ea1SDimitry Andric Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()}, 475*0fca6ea1SDimitry Andric {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)}); 476*0fca6ea1SDimitry Andric Ctz = Builder.CreateZExt(Ctz, I64Type); 477*0fca6ea1SDimitry Andric Value *VectorLoopRes64 = Builder.CreateAdd(VectorFoundIndex, Ctz, "", 478*0fca6ea1SDimitry Andric /*HasNUW=*/true, /*HasNSW=*/true); 479*0fca6ea1SDimitry Andric return Builder.CreateTrunc(VectorLoopRes64, ResType); 480*0fca6ea1SDimitry Andric } 481*0fca6ea1SDimitry Andric 482*0fca6ea1SDimitry Andric Value *LoopIdiomVectorize::createPredicatedFindMismatch( 483*0fca6ea1SDimitry Andric IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA, 484*0fca6ea1SDimitry Andric GetElementPtrInst *GEPB, Value *ExtStart, Value *ExtEnd) { 485*0fca6ea1SDimitry Andric Type *I64Type = Builder.getInt64Ty(); 486*0fca6ea1SDimitry Andric Type *I32Type = Builder.getInt32Ty(); 487*0fca6ea1SDimitry Andric Type *ResType = I32Type; 488*0fca6ea1SDimitry Andric Type *LoadType = Builder.getInt8Ty(); 489*0fca6ea1SDimitry Andric Value *PtrA = GEPA->getPointerOperand(); 490*0fca6ea1SDimitry Andric Value *PtrB = GEPB->getPointerOperand(); 491*0fca6ea1SDimitry Andric 492*0fca6ea1SDimitry Andric auto *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock); 493*0fca6ea1SDimitry Andric Builder.Insert(JumpToVectorLoop); 494*0fca6ea1SDimitry Andric 495*0fca6ea1SDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock, 496*0fca6ea1SDimitry Andric VectorLoopStartBlock}}); 497*0fca6ea1SDimitry Andric 498*0fca6ea1SDimitry Andric // Set up the first Vector loop block by creating the PHIs, doing the vector 499*0fca6ea1SDimitry Andric // loads and comparing the vectors. 500*0fca6ea1SDimitry Andric Builder.SetInsertPoint(VectorLoopStartBlock); 501*0fca6ea1SDimitry Andric auto *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vector_index"); 502*0fca6ea1SDimitry Andric VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock); 503*0fca6ea1SDimitry Andric 504*0fca6ea1SDimitry Andric // Calculate AVL by subtracting the vector loop index from the trip count 505*0fca6ea1SDimitry Andric Value *AVL = Builder.CreateSub(ExtEnd, VectorIndexPhi, "avl", /*HasNUW=*/true, 506*0fca6ea1SDimitry Andric /*HasNSW=*/true); 507*0fca6ea1SDimitry Andric 508*0fca6ea1SDimitry Andric auto *VectorLoadType = ScalableVectorType::get(LoadType, ByteCompareVF); 509*0fca6ea1SDimitry Andric auto *VF = ConstantInt::get(I32Type, ByteCompareVF); 510*0fca6ea1SDimitry Andric 511*0fca6ea1SDimitry Andric Value *VL = Builder.CreateIntrinsic(Intrinsic::experimental_get_vector_length, 512*0fca6ea1SDimitry Andric {I64Type}, {AVL, VF, Builder.getTrue()}); 513*0fca6ea1SDimitry Andric Value *GepOffset = VectorIndexPhi; 514*0fca6ea1SDimitry Andric 515*0fca6ea1SDimitry Andric Value *VectorLhsGep = 516*0fca6ea1SDimitry Andric Builder.CreateGEP(LoadType, PtrA, GepOffset, "", GEPA->isInBounds()); 517*0fca6ea1SDimitry Andric VectorType *TrueMaskTy = 518*0fca6ea1SDimitry Andric VectorType::get(Builder.getInt1Ty(), VectorLoadType->getElementCount()); 519*0fca6ea1SDimitry Andric Value *AllTrueMask = Constant::getAllOnesValue(TrueMaskTy); 520*0fca6ea1SDimitry Andric Value *VectorLhsLoad = Builder.CreateIntrinsic( 521*0fca6ea1SDimitry Andric Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()}, 522*0fca6ea1SDimitry Andric {VectorLhsGep, AllTrueMask, VL}, nullptr, "lhs.load"); 523*0fca6ea1SDimitry Andric 524*0fca6ea1SDimitry Andric Value *VectorRhsGep = 525*0fca6ea1SDimitry Andric Builder.CreateGEP(LoadType, PtrB, GepOffset, "", GEPB->isInBounds()); 526*0fca6ea1SDimitry Andric Value *VectorRhsLoad = Builder.CreateIntrinsic( 527*0fca6ea1SDimitry Andric Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()}, 528*0fca6ea1SDimitry Andric {VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load"); 529*0fca6ea1SDimitry Andric 530*0fca6ea1SDimitry Andric StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE); 531*0fca6ea1SDimitry Andric auto *PredicateMDS = MDString::get(VectorLhsLoad->getContext(), PredicateStr); 532*0fca6ea1SDimitry Andric Value *Pred = MetadataAsValue::get(VectorLhsLoad->getContext(), PredicateMDS); 533*0fca6ea1SDimitry Andric Value *VectorMatchCmp = Builder.CreateIntrinsic( 534*0fca6ea1SDimitry Andric Intrinsic::vp_icmp, {VectorLhsLoad->getType()}, 535*0fca6ea1SDimitry Andric {VectorLhsLoad, VectorRhsLoad, Pred, AllTrueMask, VL}, nullptr, 536*0fca6ea1SDimitry Andric "mismatch.cmp"); 537*0fca6ea1SDimitry Andric Value *CTZ = Builder.CreateIntrinsic( 538*0fca6ea1SDimitry Andric Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()}, 539*0fca6ea1SDimitry Andric {VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(false), AllTrueMask, 540*0fca6ea1SDimitry Andric VL}); 541*0fca6ea1SDimitry Andric Value *MismatchFound = Builder.CreateICmpNE(CTZ, VL); 542*0fca6ea1SDimitry Andric auto *VectorEarlyExit = BranchInst::Create(VectorLoopMismatchBlock, 543*0fca6ea1SDimitry Andric VectorLoopIncBlock, MismatchFound); 544*0fca6ea1SDimitry Andric Builder.Insert(VectorEarlyExit); 545*0fca6ea1SDimitry Andric 546*0fca6ea1SDimitry Andric DTU.applyUpdates( 547*0fca6ea1SDimitry Andric {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock}, 548*0fca6ea1SDimitry Andric {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}}); 549*0fca6ea1SDimitry Andric 550*0fca6ea1SDimitry Andric // Increment the index counter and calculate the predicate for the next 551*0fca6ea1SDimitry Andric // iteration of the loop. We branch back to the start of the loop if there 552*0fca6ea1SDimitry Andric // is at least one active lane. 553*0fca6ea1SDimitry Andric Builder.SetInsertPoint(VectorLoopIncBlock); 554*0fca6ea1SDimitry Andric Value *VL64 = Builder.CreateZExt(VL, I64Type); 555*0fca6ea1SDimitry Andric Value *NewVectorIndexPhi = 556*0fca6ea1SDimitry Andric Builder.CreateAdd(VectorIndexPhi, VL64, "", 557*0fca6ea1SDimitry Andric /*HasNUW=*/true, /*HasNSW=*/true); 558*0fca6ea1SDimitry Andric VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock); 559*0fca6ea1SDimitry Andric Value *ExitCond = Builder.CreateICmpNE(NewVectorIndexPhi, ExtEnd); 560*0fca6ea1SDimitry Andric auto *VectorLoopBranchBack = 561*0fca6ea1SDimitry Andric BranchInst::Create(VectorLoopStartBlock, EndBlock, ExitCond); 562*0fca6ea1SDimitry Andric Builder.Insert(VectorLoopBranchBack); 563*0fca6ea1SDimitry Andric 564*0fca6ea1SDimitry Andric DTU.applyUpdates( 565*0fca6ea1SDimitry Andric {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock}, 566*0fca6ea1SDimitry Andric {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}}); 567*0fca6ea1SDimitry Andric 568*0fca6ea1SDimitry Andric // If we found a mismatch then we need to calculate which lane in the vector 569*0fca6ea1SDimitry Andric // had a mismatch and add that on to the current loop index. 570*0fca6ea1SDimitry Andric Builder.SetInsertPoint(VectorLoopMismatchBlock); 571*0fca6ea1SDimitry Andric 572*0fca6ea1SDimitry Andric // Add LCSSA phis for CTZ and VectorIndexPhi. 573*0fca6ea1SDimitry Andric auto *CTZLCSSAPhi = Builder.CreatePHI(CTZ->getType(), 1, "ctz"); 574*0fca6ea1SDimitry Andric CTZLCSSAPhi->addIncoming(CTZ, VectorLoopStartBlock); 575*0fca6ea1SDimitry Andric auto *VectorIndexLCSSAPhi = 576*0fca6ea1SDimitry Andric Builder.CreatePHI(VectorIndexPhi->getType(), 1, "mismatch_vector_index"); 577*0fca6ea1SDimitry Andric VectorIndexLCSSAPhi->addIncoming(VectorIndexPhi, VectorLoopStartBlock); 578*0fca6ea1SDimitry Andric 579*0fca6ea1SDimitry Andric Value *CTZI64 = Builder.CreateZExt(CTZLCSSAPhi, I64Type); 580*0fca6ea1SDimitry Andric Value *VectorLoopRes64 = Builder.CreateAdd(VectorIndexLCSSAPhi, CTZI64, "", 581*0fca6ea1SDimitry Andric /*HasNUW=*/true, /*HasNSW=*/true); 582*0fca6ea1SDimitry Andric return Builder.CreateTrunc(VectorLoopRes64, ResType); 583*0fca6ea1SDimitry Andric } 584*0fca6ea1SDimitry Andric 585*0fca6ea1SDimitry Andric Value *LoopIdiomVectorize::expandFindMismatch( 586*0fca6ea1SDimitry Andric IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA, 587*0fca6ea1SDimitry Andric GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) { 588*0fca6ea1SDimitry Andric Value *PtrA = GEPA->getPointerOperand(); 589*0fca6ea1SDimitry Andric Value *PtrB = GEPB->getPointerOperand(); 590*0fca6ea1SDimitry Andric 591*0fca6ea1SDimitry Andric // Get the arguments and types for the intrinsic. 592*0fca6ea1SDimitry Andric BasicBlock *Preheader = CurLoop->getLoopPreheader(); 593*0fca6ea1SDimitry Andric BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator()); 594*0fca6ea1SDimitry Andric LLVMContext &Ctx = PHBranch->getContext(); 595*0fca6ea1SDimitry Andric Type *LoadType = Type::getInt8Ty(Ctx); 596*0fca6ea1SDimitry Andric Type *ResType = Builder.getInt32Ty(); 597*0fca6ea1SDimitry Andric 598*0fca6ea1SDimitry Andric // Split block in the original loop preheader. 599*0fca6ea1SDimitry Andric EndBlock = SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end"); 600*0fca6ea1SDimitry Andric 601*0fca6ea1SDimitry Andric // Create the blocks that we're going to need: 602*0fca6ea1SDimitry Andric // 1. A block for checking the zero-extended length exceeds 0 603*0fca6ea1SDimitry Andric // 2. A block to check that the start and end addresses of a given array 604*0fca6ea1SDimitry Andric // lie on the same page. 605*0fca6ea1SDimitry Andric // 3. The vector loop preheader. 606*0fca6ea1SDimitry Andric // 4. The first vector loop block. 607*0fca6ea1SDimitry Andric // 5. The vector loop increment block. 608*0fca6ea1SDimitry Andric // 6. A block we can jump to from the vector loop when a mismatch is found. 609*0fca6ea1SDimitry Andric // 7. The first block of the scalar loop itself, containing PHIs , loads 610*0fca6ea1SDimitry Andric // and cmp. 611*0fca6ea1SDimitry Andric // 8. A scalar loop increment block to increment the PHIs and go back 612*0fca6ea1SDimitry Andric // around the loop. 613*0fca6ea1SDimitry Andric 614*0fca6ea1SDimitry Andric BasicBlock *MinItCheckBlock = BasicBlock::Create( 615*0fca6ea1SDimitry Andric Ctx, "mismatch_min_it_check", EndBlock->getParent(), EndBlock); 616*0fca6ea1SDimitry Andric 617*0fca6ea1SDimitry Andric // Update the terminator added by SplitBlock to branch to the first block 618*0fca6ea1SDimitry Andric Preheader->getTerminator()->setSuccessor(0, MinItCheckBlock); 619*0fca6ea1SDimitry Andric 620*0fca6ea1SDimitry Andric BasicBlock *MemCheckBlock = BasicBlock::Create( 621*0fca6ea1SDimitry Andric Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock); 622*0fca6ea1SDimitry Andric 623*0fca6ea1SDimitry Andric VectorLoopPreheaderBlock = BasicBlock::Create( 624*0fca6ea1SDimitry Andric Ctx, "mismatch_vec_loop_preheader", EndBlock->getParent(), EndBlock); 625*0fca6ea1SDimitry Andric 626*0fca6ea1SDimitry Andric VectorLoopStartBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop", 627*0fca6ea1SDimitry Andric EndBlock->getParent(), EndBlock); 628*0fca6ea1SDimitry Andric 629*0fca6ea1SDimitry Andric VectorLoopIncBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_inc", 630*0fca6ea1SDimitry Andric EndBlock->getParent(), EndBlock); 631*0fca6ea1SDimitry Andric 632*0fca6ea1SDimitry Andric VectorLoopMismatchBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_found", 633*0fca6ea1SDimitry Andric EndBlock->getParent(), EndBlock); 634*0fca6ea1SDimitry Andric 635*0fca6ea1SDimitry Andric BasicBlock *LoopPreHeaderBlock = BasicBlock::Create( 636*0fca6ea1SDimitry Andric Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock); 637*0fca6ea1SDimitry Andric 638*0fca6ea1SDimitry Andric BasicBlock *LoopStartBlock = 639*0fca6ea1SDimitry Andric BasicBlock::Create(Ctx, "mismatch_loop", EndBlock->getParent(), EndBlock); 640*0fca6ea1SDimitry Andric 641*0fca6ea1SDimitry Andric BasicBlock *LoopIncBlock = BasicBlock::Create( 642*0fca6ea1SDimitry Andric Ctx, "mismatch_loop_inc", EndBlock->getParent(), EndBlock); 643*0fca6ea1SDimitry Andric 644*0fca6ea1SDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock}, 645*0fca6ea1SDimitry Andric {DominatorTree::Delete, Preheader, EndBlock}}); 646*0fca6ea1SDimitry Andric 647*0fca6ea1SDimitry Andric // Update LoopInfo with the new vector & scalar loops. 648*0fca6ea1SDimitry Andric auto VectorLoop = LI->AllocateLoop(); 649*0fca6ea1SDimitry Andric auto ScalarLoop = LI->AllocateLoop(); 650*0fca6ea1SDimitry Andric 651*0fca6ea1SDimitry Andric if (CurLoop->getParentLoop()) { 652*0fca6ea1SDimitry Andric CurLoop->getParentLoop()->addBasicBlockToLoop(MinItCheckBlock, *LI); 653*0fca6ea1SDimitry Andric CurLoop->getParentLoop()->addBasicBlockToLoop(MemCheckBlock, *LI); 654*0fca6ea1SDimitry Andric CurLoop->getParentLoop()->addBasicBlockToLoop(VectorLoopPreheaderBlock, 655*0fca6ea1SDimitry Andric *LI); 656*0fca6ea1SDimitry Andric CurLoop->getParentLoop()->addChildLoop(VectorLoop); 657*0fca6ea1SDimitry Andric CurLoop->getParentLoop()->addBasicBlockToLoop(VectorLoopMismatchBlock, *LI); 658*0fca6ea1SDimitry Andric CurLoop->getParentLoop()->addBasicBlockToLoop(LoopPreHeaderBlock, *LI); 659*0fca6ea1SDimitry Andric CurLoop->getParentLoop()->addChildLoop(ScalarLoop); 660*0fca6ea1SDimitry Andric } else { 661*0fca6ea1SDimitry Andric LI->addTopLevelLoop(VectorLoop); 662*0fca6ea1SDimitry Andric LI->addTopLevelLoop(ScalarLoop); 663*0fca6ea1SDimitry Andric } 664*0fca6ea1SDimitry Andric 665*0fca6ea1SDimitry Andric // Add the new basic blocks to their associated loops. 666*0fca6ea1SDimitry Andric VectorLoop->addBasicBlockToLoop(VectorLoopStartBlock, *LI); 667*0fca6ea1SDimitry Andric VectorLoop->addBasicBlockToLoop(VectorLoopIncBlock, *LI); 668*0fca6ea1SDimitry Andric 669*0fca6ea1SDimitry Andric ScalarLoop->addBasicBlockToLoop(LoopStartBlock, *LI); 670*0fca6ea1SDimitry Andric ScalarLoop->addBasicBlockToLoop(LoopIncBlock, *LI); 671*0fca6ea1SDimitry Andric 672*0fca6ea1SDimitry Andric // Set up some types and constants that we intend to reuse. 673*0fca6ea1SDimitry Andric Type *I64Type = Builder.getInt64Ty(); 674*0fca6ea1SDimitry Andric 675*0fca6ea1SDimitry Andric // Check the zero-extended iteration count > 0 676*0fca6ea1SDimitry Andric Builder.SetInsertPoint(MinItCheckBlock); 677*0fca6ea1SDimitry Andric Value *ExtStart = Builder.CreateZExt(Start, I64Type); 678*0fca6ea1SDimitry Andric Value *ExtEnd = Builder.CreateZExt(MaxLen, I64Type); 679*0fca6ea1SDimitry Andric // This check doesn't really cost us very much. 680*0fca6ea1SDimitry Andric 681*0fca6ea1SDimitry Andric Value *LimitCheck = Builder.CreateICmpULE(Start, MaxLen); 682*0fca6ea1SDimitry Andric BranchInst *MinItCheckBr = 683*0fca6ea1SDimitry Andric BranchInst::Create(MemCheckBlock, LoopPreHeaderBlock, LimitCheck); 684*0fca6ea1SDimitry Andric MinItCheckBr->setMetadata( 685*0fca6ea1SDimitry Andric LLVMContext::MD_prof, 686*0fca6ea1SDimitry Andric MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1)); 687*0fca6ea1SDimitry Andric Builder.Insert(MinItCheckBr); 688*0fca6ea1SDimitry Andric 689*0fca6ea1SDimitry Andric DTU.applyUpdates( 690*0fca6ea1SDimitry Andric {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock}, 691*0fca6ea1SDimitry Andric {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}}); 692*0fca6ea1SDimitry Andric 693*0fca6ea1SDimitry Andric // For each of the arrays, check the start/end addresses are on the same 694*0fca6ea1SDimitry Andric // page. 695*0fca6ea1SDimitry Andric Builder.SetInsertPoint(MemCheckBlock); 696*0fca6ea1SDimitry Andric 697*0fca6ea1SDimitry Andric // The early exit in the original loop means that when performing vector 698*0fca6ea1SDimitry Andric // loads we are potentially reading ahead of the early exit. So we could 699*0fca6ea1SDimitry Andric // fault if crossing a page boundary. Therefore, we create runtime memory 700*0fca6ea1SDimitry Andric // checks based on the minimum page size as follows: 701*0fca6ea1SDimitry Andric // 1. Calculate the addresses of the first memory accesses in the loop, 702*0fca6ea1SDimitry Andric // i.e. LhsStart and RhsStart. 703*0fca6ea1SDimitry Andric // 2. Get the last accessed addresses in the loop, i.e. LhsEnd and RhsEnd. 704*0fca6ea1SDimitry Andric // 3. Determine which pages correspond to all the memory accesses, i.e 705*0fca6ea1SDimitry Andric // LhsStartPage, LhsEndPage, RhsStartPage, RhsEndPage. 706*0fca6ea1SDimitry Andric // 4. If LhsStartPage == LhsEndPage and RhsStartPage == RhsEndPage, then 707*0fca6ea1SDimitry Andric // we know we won't cross any page boundaries in the loop so we can 708*0fca6ea1SDimitry Andric // enter the vector loop! Otherwise we fall back on the scalar loop. 709*0fca6ea1SDimitry Andric Value *LhsStartGEP = Builder.CreateGEP(LoadType, PtrA, ExtStart); 710*0fca6ea1SDimitry Andric Value *RhsStartGEP = Builder.CreateGEP(LoadType, PtrB, ExtStart); 711*0fca6ea1SDimitry Andric Value *RhsStart = Builder.CreatePtrToInt(RhsStartGEP, I64Type); 712*0fca6ea1SDimitry Andric Value *LhsStart = Builder.CreatePtrToInt(LhsStartGEP, I64Type); 713*0fca6ea1SDimitry Andric Value *LhsEndGEP = Builder.CreateGEP(LoadType, PtrA, ExtEnd); 714*0fca6ea1SDimitry Andric Value *RhsEndGEP = Builder.CreateGEP(LoadType, PtrB, ExtEnd); 715*0fca6ea1SDimitry Andric Value *LhsEnd = Builder.CreatePtrToInt(LhsEndGEP, I64Type); 716*0fca6ea1SDimitry Andric Value *RhsEnd = Builder.CreatePtrToInt(RhsEndGEP, I64Type); 717*0fca6ea1SDimitry Andric 718*0fca6ea1SDimitry Andric const uint64_t MinPageSize = TTI->getMinPageSize().value(); 719*0fca6ea1SDimitry Andric const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize); 720*0fca6ea1SDimitry Andric Value *LhsStartPage = Builder.CreateLShr(LhsStart, AddrShiftAmt); 721*0fca6ea1SDimitry Andric Value *LhsEndPage = Builder.CreateLShr(LhsEnd, AddrShiftAmt); 722*0fca6ea1SDimitry Andric Value *RhsStartPage = Builder.CreateLShr(RhsStart, AddrShiftAmt); 723*0fca6ea1SDimitry Andric Value *RhsEndPage = Builder.CreateLShr(RhsEnd, AddrShiftAmt); 724*0fca6ea1SDimitry Andric Value *LhsPageCmp = Builder.CreateICmpNE(LhsStartPage, LhsEndPage); 725*0fca6ea1SDimitry Andric Value *RhsPageCmp = Builder.CreateICmpNE(RhsStartPage, RhsEndPage); 726*0fca6ea1SDimitry Andric 727*0fca6ea1SDimitry Andric Value *CombinedPageCmp = Builder.CreateOr(LhsPageCmp, RhsPageCmp); 728*0fca6ea1SDimitry Andric BranchInst *CombinedPageCmpCmpBr = BranchInst::Create( 729*0fca6ea1SDimitry Andric LoopPreHeaderBlock, VectorLoopPreheaderBlock, CombinedPageCmp); 730*0fca6ea1SDimitry Andric CombinedPageCmpCmpBr->setMetadata( 731*0fca6ea1SDimitry Andric LLVMContext::MD_prof, MDBuilder(CombinedPageCmpCmpBr->getContext()) 732*0fca6ea1SDimitry Andric .createBranchWeights(10, 90)); 733*0fca6ea1SDimitry Andric Builder.Insert(CombinedPageCmpCmpBr); 734*0fca6ea1SDimitry Andric 735*0fca6ea1SDimitry Andric DTU.applyUpdates( 736*0fca6ea1SDimitry Andric {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock}, 737*0fca6ea1SDimitry Andric {DominatorTree::Insert, MemCheckBlock, VectorLoopPreheaderBlock}}); 738*0fca6ea1SDimitry Andric 739*0fca6ea1SDimitry Andric // Set up the vector loop preheader, i.e. calculate initial loop predicate, 740*0fca6ea1SDimitry Andric // zero-extend MaxLen to 64-bits, determine the number of vector elements 741*0fca6ea1SDimitry Andric // processed in each iteration, etc. 742*0fca6ea1SDimitry Andric Builder.SetInsertPoint(VectorLoopPreheaderBlock); 743*0fca6ea1SDimitry Andric 744*0fca6ea1SDimitry Andric // At this point we know two things must be true: 745*0fca6ea1SDimitry Andric // 1. Start <= End 746*0fca6ea1SDimitry Andric // 2. ExtMaxLen <= MinPageSize due to the page checks. 747*0fca6ea1SDimitry Andric // Therefore, we know that we can use a 64-bit induction variable that 748*0fca6ea1SDimitry Andric // starts from 0 -> ExtMaxLen and it will not overflow. 749*0fca6ea1SDimitry Andric Value *VectorLoopRes = nullptr; 750*0fca6ea1SDimitry Andric switch (VectorizeStyle) { 751*0fca6ea1SDimitry Andric case LoopIdiomVectorizeStyle::Masked: 752*0fca6ea1SDimitry Andric VectorLoopRes = 753*0fca6ea1SDimitry Andric createMaskedFindMismatch(Builder, DTU, GEPA, GEPB, ExtStart, ExtEnd); 754*0fca6ea1SDimitry Andric break; 755*0fca6ea1SDimitry Andric case LoopIdiomVectorizeStyle::Predicated: 756*0fca6ea1SDimitry Andric VectorLoopRes = createPredicatedFindMismatch(Builder, DTU, GEPA, GEPB, 757*0fca6ea1SDimitry Andric ExtStart, ExtEnd); 758*0fca6ea1SDimitry Andric break; 759*0fca6ea1SDimitry Andric } 760*0fca6ea1SDimitry Andric 761*0fca6ea1SDimitry Andric Builder.Insert(BranchInst::Create(EndBlock)); 762*0fca6ea1SDimitry Andric 763*0fca6ea1SDimitry Andric DTU.applyUpdates( 764*0fca6ea1SDimitry Andric {{DominatorTree::Insert, VectorLoopMismatchBlock, EndBlock}}); 765*0fca6ea1SDimitry Andric 766*0fca6ea1SDimitry Andric // Generate code for scalar loop. 767*0fca6ea1SDimitry Andric Builder.SetInsertPoint(LoopPreHeaderBlock); 768*0fca6ea1SDimitry Andric Builder.Insert(BranchInst::Create(LoopStartBlock)); 769*0fca6ea1SDimitry Andric 770*0fca6ea1SDimitry Andric DTU.applyUpdates( 771*0fca6ea1SDimitry Andric {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}}); 772*0fca6ea1SDimitry Andric 773*0fca6ea1SDimitry Andric Builder.SetInsertPoint(LoopStartBlock); 774*0fca6ea1SDimitry Andric PHINode *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index"); 775*0fca6ea1SDimitry Andric IndexPhi->addIncoming(Start, LoopPreHeaderBlock); 776*0fca6ea1SDimitry Andric 777*0fca6ea1SDimitry Andric // Otherwise compare the values 778*0fca6ea1SDimitry Andric // Load bytes from each array and compare them. 779*0fca6ea1SDimitry Andric Value *GepOffset = Builder.CreateZExt(IndexPhi, I64Type); 780*0fca6ea1SDimitry Andric 781*0fca6ea1SDimitry Andric Value *LhsGep = 782*0fca6ea1SDimitry Andric Builder.CreateGEP(LoadType, PtrA, GepOffset, "", GEPA->isInBounds()); 783*0fca6ea1SDimitry Andric Value *LhsLoad = Builder.CreateLoad(LoadType, LhsGep); 784*0fca6ea1SDimitry Andric 785*0fca6ea1SDimitry Andric Value *RhsGep = 786*0fca6ea1SDimitry Andric Builder.CreateGEP(LoadType, PtrB, GepOffset, "", GEPB->isInBounds()); 787*0fca6ea1SDimitry Andric Value *RhsLoad = Builder.CreateLoad(LoadType, RhsGep); 788*0fca6ea1SDimitry Andric 789*0fca6ea1SDimitry Andric Value *MatchCmp = Builder.CreateICmpEQ(LhsLoad, RhsLoad); 790*0fca6ea1SDimitry Andric // If we have a mismatch then exit the loop ... 791*0fca6ea1SDimitry Andric BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp); 792*0fca6ea1SDimitry Andric Builder.Insert(MatchCmpBr); 793*0fca6ea1SDimitry Andric 794*0fca6ea1SDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock}, 795*0fca6ea1SDimitry Andric {DominatorTree::Insert, LoopStartBlock, EndBlock}}); 796*0fca6ea1SDimitry Andric 797*0fca6ea1SDimitry Andric // Have we reached the maximum permitted length for the loop? 798*0fca6ea1SDimitry Andric Builder.SetInsertPoint(LoopIncBlock); 799*0fca6ea1SDimitry Andric Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "", 800*0fca6ea1SDimitry Andric /*HasNUW=*/Index->hasNoUnsignedWrap(), 801*0fca6ea1SDimitry Andric /*HasNSW=*/Index->hasNoSignedWrap()); 802*0fca6ea1SDimitry Andric IndexPhi->addIncoming(PhiInc, LoopIncBlock); 803*0fca6ea1SDimitry Andric Value *IVCmp = Builder.CreateICmpEQ(PhiInc, MaxLen); 804*0fca6ea1SDimitry Andric BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp); 805*0fca6ea1SDimitry Andric Builder.Insert(IVCmpBr); 806*0fca6ea1SDimitry Andric 807*0fca6ea1SDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock}, 808*0fca6ea1SDimitry Andric {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}}); 809*0fca6ea1SDimitry Andric 810*0fca6ea1SDimitry Andric // In the end block we need to insert a PHI node to deal with three cases: 811*0fca6ea1SDimitry Andric // 1. We didn't find a mismatch in the scalar loop, so we return MaxLen. 812*0fca6ea1SDimitry Andric // 2. We exitted the scalar loop early due to a mismatch and need to return 813*0fca6ea1SDimitry Andric // the index that we found. 814*0fca6ea1SDimitry Andric // 3. We didn't find a mismatch in the vector loop, so we return MaxLen. 815*0fca6ea1SDimitry Andric // 4. We exitted the vector loop early due to a mismatch and need to return 816*0fca6ea1SDimitry Andric // the index that we found. 817*0fca6ea1SDimitry Andric Builder.SetInsertPoint(EndBlock, EndBlock->getFirstInsertionPt()); 818*0fca6ea1SDimitry Andric PHINode *ResPhi = Builder.CreatePHI(ResType, 4, "mismatch_result"); 819*0fca6ea1SDimitry Andric ResPhi->addIncoming(MaxLen, LoopIncBlock); 820*0fca6ea1SDimitry Andric ResPhi->addIncoming(IndexPhi, LoopStartBlock); 821*0fca6ea1SDimitry Andric ResPhi->addIncoming(MaxLen, VectorLoopIncBlock); 822*0fca6ea1SDimitry Andric ResPhi->addIncoming(VectorLoopRes, VectorLoopMismatchBlock); 823*0fca6ea1SDimitry Andric 824*0fca6ea1SDimitry Andric Value *FinalRes = Builder.CreateTrunc(ResPhi, ResType); 825*0fca6ea1SDimitry Andric 826*0fca6ea1SDimitry Andric if (VerifyLoops) { 827*0fca6ea1SDimitry Andric ScalarLoop->verifyLoop(); 828*0fca6ea1SDimitry Andric VectorLoop->verifyLoop(); 829*0fca6ea1SDimitry Andric if (!VectorLoop->isRecursivelyLCSSAForm(*DT, *LI)) 830*0fca6ea1SDimitry Andric report_fatal_error("Loops must remain in LCSSA form!"); 831*0fca6ea1SDimitry Andric if (!ScalarLoop->isRecursivelyLCSSAForm(*DT, *LI)) 832*0fca6ea1SDimitry Andric report_fatal_error("Loops must remain in LCSSA form!"); 833*0fca6ea1SDimitry Andric } 834*0fca6ea1SDimitry Andric 835*0fca6ea1SDimitry Andric return FinalRes; 836*0fca6ea1SDimitry Andric } 837*0fca6ea1SDimitry Andric 838*0fca6ea1SDimitry Andric void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA, 839*0fca6ea1SDimitry Andric GetElementPtrInst *GEPB, 840*0fca6ea1SDimitry Andric PHINode *IndPhi, Value *MaxLen, 841*0fca6ea1SDimitry Andric Instruction *Index, Value *Start, 842*0fca6ea1SDimitry Andric bool IncIdx, BasicBlock *FoundBB, 843*0fca6ea1SDimitry Andric BasicBlock *EndBB) { 844*0fca6ea1SDimitry Andric 845*0fca6ea1SDimitry Andric // Insert the byte compare code at the end of the preheader block 846*0fca6ea1SDimitry Andric BasicBlock *Preheader = CurLoop->getLoopPreheader(); 847*0fca6ea1SDimitry Andric BasicBlock *Header = CurLoop->getHeader(); 848*0fca6ea1SDimitry Andric BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator()); 849*0fca6ea1SDimitry Andric IRBuilder<> Builder(PHBranch); 850*0fca6ea1SDimitry Andric DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); 851*0fca6ea1SDimitry Andric Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); 852*0fca6ea1SDimitry Andric 853*0fca6ea1SDimitry Andric // Increment the pointer if this was done before the loads in the loop. 854*0fca6ea1SDimitry Andric if (IncIdx) 855*0fca6ea1SDimitry Andric Start = Builder.CreateAdd(Start, ConstantInt::get(Start->getType(), 1)); 856*0fca6ea1SDimitry Andric 857*0fca6ea1SDimitry Andric Value *ByteCmpRes = 858*0fca6ea1SDimitry Andric expandFindMismatch(Builder, DTU, GEPA, GEPB, Index, Start, MaxLen); 859*0fca6ea1SDimitry Andric 860*0fca6ea1SDimitry Andric // Replaces uses of index & induction Phi with intrinsic (we already 861*0fca6ea1SDimitry Andric // checked that the the first instruction of Header is the Phi above). 862*0fca6ea1SDimitry Andric assert(IndPhi->hasOneUse() && "Index phi node has more than one use!"); 863*0fca6ea1SDimitry Andric Index->replaceAllUsesWith(ByteCmpRes); 864*0fca6ea1SDimitry Andric 865*0fca6ea1SDimitry Andric assert(PHBranch->isUnconditional() && 866*0fca6ea1SDimitry Andric "Expected preheader to terminate with an unconditional branch."); 867*0fca6ea1SDimitry Andric 868*0fca6ea1SDimitry Andric // If no mismatch was found, we can jump to the end block. Create a 869*0fca6ea1SDimitry Andric // new basic block for the compare instruction. 870*0fca6ea1SDimitry Andric auto *CmpBB = BasicBlock::Create(Preheader->getContext(), "byte.compare", 871*0fca6ea1SDimitry Andric Preheader->getParent()); 872*0fca6ea1SDimitry Andric CmpBB->moveBefore(EndBB); 873*0fca6ea1SDimitry Andric 874*0fca6ea1SDimitry Andric // Replace the branch in the preheader with an always-true conditional branch. 875*0fca6ea1SDimitry Andric // This ensures there is still a reference to the original loop. 876*0fca6ea1SDimitry Andric Builder.CreateCondBr(Builder.getTrue(), CmpBB, Header); 877*0fca6ea1SDimitry Andric PHBranch->eraseFromParent(); 878*0fca6ea1SDimitry Andric 879*0fca6ea1SDimitry Andric BasicBlock *MismatchEnd = cast<Instruction>(ByteCmpRes)->getParent(); 880*0fca6ea1SDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, MismatchEnd, CmpBB}}); 881*0fca6ea1SDimitry Andric 882*0fca6ea1SDimitry Andric // Create the branch to either the end or found block depending on the value 883*0fca6ea1SDimitry Andric // returned by the intrinsic. 884*0fca6ea1SDimitry Andric Builder.SetInsertPoint(CmpBB); 885*0fca6ea1SDimitry Andric if (FoundBB != EndBB) { 886*0fca6ea1SDimitry Andric Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen); 887*0fca6ea1SDimitry Andric Builder.CreateCondBr(FoundCmp, EndBB, FoundBB); 888*0fca6ea1SDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}, 889*0fca6ea1SDimitry Andric {DominatorTree::Insert, CmpBB, EndBB}}); 890*0fca6ea1SDimitry Andric 891*0fca6ea1SDimitry Andric } else { 892*0fca6ea1SDimitry Andric Builder.CreateBr(FoundBB); 893*0fca6ea1SDimitry Andric DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}}); 894*0fca6ea1SDimitry Andric } 895*0fca6ea1SDimitry Andric 896*0fca6ea1SDimitry Andric auto fixSuccessorPhis = [&](BasicBlock *SuccBB) { 897*0fca6ea1SDimitry Andric for (PHINode &PN : SuccBB->phis()) { 898*0fca6ea1SDimitry Andric // At this point we've already replaced all uses of the result from the 899*0fca6ea1SDimitry Andric // loop with ByteCmp. Look through the incoming values to find ByteCmp, 900*0fca6ea1SDimitry Andric // meaning this is a Phi collecting the results of the byte compare. 901*0fca6ea1SDimitry Andric bool ResPhi = false; 902*0fca6ea1SDimitry Andric for (Value *Op : PN.incoming_values()) 903*0fca6ea1SDimitry Andric if (Op == ByteCmpRes) { 904*0fca6ea1SDimitry Andric ResPhi = true; 905*0fca6ea1SDimitry Andric break; 906*0fca6ea1SDimitry Andric } 907*0fca6ea1SDimitry Andric 908*0fca6ea1SDimitry Andric // Any PHI that depended upon the result of the byte compare needs a new 909*0fca6ea1SDimitry Andric // incoming value from CmpBB. This is because the original loop will get 910*0fca6ea1SDimitry Andric // deleted. 911*0fca6ea1SDimitry Andric if (ResPhi) 912*0fca6ea1SDimitry Andric PN.addIncoming(ByteCmpRes, CmpBB); 913*0fca6ea1SDimitry Andric else { 914*0fca6ea1SDimitry Andric // There should be no other outside uses of other values in the 915*0fca6ea1SDimitry Andric // original loop. Any incoming values should either: 916*0fca6ea1SDimitry Andric // 1. Be for blocks outside the loop, which aren't interesting. Or .. 917*0fca6ea1SDimitry Andric // 2. These are from blocks in the loop with values defined outside 918*0fca6ea1SDimitry Andric // the loop. We should a similar incoming value from CmpBB. 919*0fca6ea1SDimitry Andric for (BasicBlock *BB : PN.blocks()) 920*0fca6ea1SDimitry Andric if (CurLoop->contains(BB)) { 921*0fca6ea1SDimitry Andric PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB); 922*0fca6ea1SDimitry Andric break; 923*0fca6ea1SDimitry Andric } 924*0fca6ea1SDimitry Andric } 925*0fca6ea1SDimitry Andric } 926*0fca6ea1SDimitry Andric }; 927*0fca6ea1SDimitry Andric 928*0fca6ea1SDimitry Andric // Ensure all Phis in the successors of CmpBB have an incoming value from it. 929*0fca6ea1SDimitry Andric fixSuccessorPhis(EndBB); 930*0fca6ea1SDimitry Andric if (EndBB != FoundBB) 931*0fca6ea1SDimitry Andric fixSuccessorPhis(FoundBB); 932*0fca6ea1SDimitry Andric 933*0fca6ea1SDimitry Andric // The new CmpBB block isn't part of the loop, but will need to be added to 934*0fca6ea1SDimitry Andric // the outer loop if there is one. 935*0fca6ea1SDimitry Andric if (!CurLoop->isOutermost()) 936*0fca6ea1SDimitry Andric CurLoop->getParentLoop()->addBasicBlockToLoop(CmpBB, *LI); 937*0fca6ea1SDimitry Andric 938*0fca6ea1SDimitry Andric if (VerifyLoops && CurLoop->getParentLoop()) { 939*0fca6ea1SDimitry Andric CurLoop->getParentLoop()->verifyLoop(); 940*0fca6ea1SDimitry Andric if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI)) 941*0fca6ea1SDimitry Andric report_fatal_error("Loops must remain in LCSSA form!"); 942*0fca6ea1SDimitry Andric } 943*0fca6ea1SDimitry Andric } 944