xref: /freebsd-src/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1*0fca6ea1SDimitry Andric //===-------- LoopIdiomVectorize.cpp - Loop idiom vectorization -----------===//
2*0fca6ea1SDimitry Andric //
3*0fca6ea1SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*0fca6ea1SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5*0fca6ea1SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*0fca6ea1SDimitry Andric //
7*0fca6ea1SDimitry Andric //===----------------------------------------------------------------------===//
8*0fca6ea1SDimitry Andric //
9*0fca6ea1SDimitry Andric // This pass implements a pass that recognizes certain loop idioms and
10*0fca6ea1SDimitry Andric // transforms them into more optimized versions of the same loop. In cases
11*0fca6ea1SDimitry Andric // where this happens, it can be a significant performance win.
12*0fca6ea1SDimitry Andric //
13*0fca6ea1SDimitry Andric // We currently only recognize one loop that finds the first mismatched byte
14*0fca6ea1SDimitry Andric // in an array and returns the index, i.e. something like:
15*0fca6ea1SDimitry Andric //
16*0fca6ea1SDimitry Andric //  while (++i != n) {
17*0fca6ea1SDimitry Andric //    if (a[i] != b[i])
18*0fca6ea1SDimitry Andric //      break;
19*0fca6ea1SDimitry Andric //  }
20*0fca6ea1SDimitry Andric //
21*0fca6ea1SDimitry Andric // In this example we can actually vectorize the loop despite the early exit,
22*0fca6ea1SDimitry Andric // although the loop vectorizer does not support it. It requires some extra
23*0fca6ea1SDimitry Andric // checks to deal with the possibility of faulting loads when crossing page
24*0fca6ea1SDimitry Andric // boundaries. However, even with these checks it is still profitable to do the
25*0fca6ea1SDimitry Andric // transformation.
26*0fca6ea1SDimitry Andric //
27*0fca6ea1SDimitry Andric //===----------------------------------------------------------------------===//
28*0fca6ea1SDimitry Andric //
29*0fca6ea1SDimitry Andric // NOTE: This Pass matches a really specific loop pattern because it's only
30*0fca6ea1SDimitry Andric // supposed to be a temporary solution until our LoopVectorizer is powerful
31*0fca6ea1SDimitry Andric // enought to vectorize it automatically.
32*0fca6ea1SDimitry Andric //
33*0fca6ea1SDimitry Andric // TODO List:
34*0fca6ea1SDimitry Andric //
35*0fca6ea1SDimitry Andric // * Add support for the inverse case where we scan for a matching element.
36*0fca6ea1SDimitry Andric // * Permit 64-bit induction variable types.
37*0fca6ea1SDimitry Andric // * Recognize loops that increment the IV *after* comparing bytes.
38*0fca6ea1SDimitry Andric // * Allow 32-bit sign-extends of the IV used by the GEP.
39*0fca6ea1SDimitry Andric //
40*0fca6ea1SDimitry Andric //===----------------------------------------------------------------------===//
41*0fca6ea1SDimitry Andric 
42*0fca6ea1SDimitry Andric #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
43*0fca6ea1SDimitry Andric #include "llvm/Analysis/DomTreeUpdater.h"
44*0fca6ea1SDimitry Andric #include "llvm/Analysis/LoopPass.h"
45*0fca6ea1SDimitry Andric #include "llvm/Analysis/TargetTransformInfo.h"
46*0fca6ea1SDimitry Andric #include "llvm/IR/Dominators.h"
47*0fca6ea1SDimitry Andric #include "llvm/IR/IRBuilder.h"
48*0fca6ea1SDimitry Andric #include "llvm/IR/Intrinsics.h"
49*0fca6ea1SDimitry Andric #include "llvm/IR/MDBuilder.h"
50*0fca6ea1SDimitry Andric #include "llvm/IR/PatternMatch.h"
51*0fca6ea1SDimitry Andric #include "llvm/Transforms/Utils/BasicBlockUtils.h"
52*0fca6ea1SDimitry Andric 
53*0fca6ea1SDimitry Andric using namespace llvm;
54*0fca6ea1SDimitry Andric using namespace PatternMatch;
55*0fca6ea1SDimitry Andric 
56*0fca6ea1SDimitry Andric #define DEBUG_TYPE "loop-idiom-vectorize"
57*0fca6ea1SDimitry Andric 
58*0fca6ea1SDimitry Andric static cl::opt<bool> DisableAll("disable-loop-idiom-vectorize-all", cl::Hidden,
59*0fca6ea1SDimitry Andric                                 cl::init(false),
60*0fca6ea1SDimitry Andric                                 cl::desc("Disable Loop Idiom Vectorize Pass."));
61*0fca6ea1SDimitry Andric 
62*0fca6ea1SDimitry Andric static cl::opt<LoopIdiomVectorizeStyle>
63*0fca6ea1SDimitry Andric     LITVecStyle("loop-idiom-vectorize-style", cl::Hidden,
64*0fca6ea1SDimitry Andric                 cl::desc("The vectorization style for loop idiom transform."),
65*0fca6ea1SDimitry Andric                 cl::values(clEnumValN(LoopIdiomVectorizeStyle::Masked, "masked",
66*0fca6ea1SDimitry Andric                                       "Use masked vector intrinsics"),
67*0fca6ea1SDimitry Andric                            clEnumValN(LoopIdiomVectorizeStyle::Predicated,
68*0fca6ea1SDimitry Andric                                       "predicated", "Use VP intrinsics")),
69*0fca6ea1SDimitry Andric                 cl::init(LoopIdiomVectorizeStyle::Masked));
70*0fca6ea1SDimitry Andric 
71*0fca6ea1SDimitry Andric static cl::opt<bool>
72*0fca6ea1SDimitry Andric     DisableByteCmp("disable-loop-idiom-vectorize-bytecmp", cl::Hidden,
73*0fca6ea1SDimitry Andric                    cl::init(false),
74*0fca6ea1SDimitry Andric                    cl::desc("Proceed with Loop Idiom Vectorize Pass, but do "
75*0fca6ea1SDimitry Andric                             "not convert byte-compare loop(s)."));
76*0fca6ea1SDimitry Andric 
77*0fca6ea1SDimitry Andric static cl::opt<unsigned>
78*0fca6ea1SDimitry Andric     ByteCmpVF("loop-idiom-vectorize-bytecmp-vf", cl::Hidden,
79*0fca6ea1SDimitry Andric               cl::desc("The vectorization factor for byte-compare patterns."),
80*0fca6ea1SDimitry Andric               cl::init(16));
81*0fca6ea1SDimitry Andric 
82*0fca6ea1SDimitry Andric static cl::opt<bool>
83*0fca6ea1SDimitry Andric     VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false),
84*0fca6ea1SDimitry Andric                 cl::desc("Verify loops generated Loop Idiom Vectorize Pass."));
85*0fca6ea1SDimitry Andric 
86*0fca6ea1SDimitry Andric namespace {
87*0fca6ea1SDimitry Andric class LoopIdiomVectorize {
88*0fca6ea1SDimitry Andric   LoopIdiomVectorizeStyle VectorizeStyle;
89*0fca6ea1SDimitry Andric   unsigned ByteCompareVF;
90*0fca6ea1SDimitry Andric   Loop *CurLoop = nullptr;
91*0fca6ea1SDimitry Andric   DominatorTree *DT;
92*0fca6ea1SDimitry Andric   LoopInfo *LI;
93*0fca6ea1SDimitry Andric   const TargetTransformInfo *TTI;
94*0fca6ea1SDimitry Andric   const DataLayout *DL;
95*0fca6ea1SDimitry Andric 
96*0fca6ea1SDimitry Andric   // Blocks that will be used for inserting vectorized code.
97*0fca6ea1SDimitry Andric   BasicBlock *EndBlock = nullptr;
98*0fca6ea1SDimitry Andric   BasicBlock *VectorLoopPreheaderBlock = nullptr;
99*0fca6ea1SDimitry Andric   BasicBlock *VectorLoopStartBlock = nullptr;
100*0fca6ea1SDimitry Andric   BasicBlock *VectorLoopMismatchBlock = nullptr;
101*0fca6ea1SDimitry Andric   BasicBlock *VectorLoopIncBlock = nullptr;
102*0fca6ea1SDimitry Andric 
103*0fca6ea1SDimitry Andric public:
104*0fca6ea1SDimitry Andric   LoopIdiomVectorize(LoopIdiomVectorizeStyle S, unsigned VF, DominatorTree *DT,
105*0fca6ea1SDimitry Andric                      LoopInfo *LI, const TargetTransformInfo *TTI,
106*0fca6ea1SDimitry Andric                      const DataLayout *DL)
107*0fca6ea1SDimitry Andric       : VectorizeStyle(S), ByteCompareVF(VF), DT(DT), LI(LI), TTI(TTI), DL(DL) {
108*0fca6ea1SDimitry Andric   }
109*0fca6ea1SDimitry Andric 
110*0fca6ea1SDimitry Andric   bool run(Loop *L);
111*0fca6ea1SDimitry Andric 
112*0fca6ea1SDimitry Andric private:
113*0fca6ea1SDimitry Andric   /// \name Countable Loop Idiom Handling
114*0fca6ea1SDimitry Andric   /// @{
115*0fca6ea1SDimitry Andric 
116*0fca6ea1SDimitry Andric   bool runOnCountableLoop();
117*0fca6ea1SDimitry Andric   bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
118*0fca6ea1SDimitry Andric                       SmallVectorImpl<BasicBlock *> &ExitBlocks);
119*0fca6ea1SDimitry Andric 
120*0fca6ea1SDimitry Andric   bool recognizeByteCompare();
121*0fca6ea1SDimitry Andric 
122*0fca6ea1SDimitry Andric   Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
123*0fca6ea1SDimitry Andric                             GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
124*0fca6ea1SDimitry Andric                             Instruction *Index, Value *Start, Value *MaxLen);
125*0fca6ea1SDimitry Andric 
126*0fca6ea1SDimitry Andric   Value *createMaskedFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
127*0fca6ea1SDimitry Andric                                   GetElementPtrInst *GEPA,
128*0fca6ea1SDimitry Andric                                   GetElementPtrInst *GEPB, Value *ExtStart,
129*0fca6ea1SDimitry Andric                                   Value *ExtEnd);
130*0fca6ea1SDimitry Andric   Value *createPredicatedFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
131*0fca6ea1SDimitry Andric                                       GetElementPtrInst *GEPA,
132*0fca6ea1SDimitry Andric                                       GetElementPtrInst *GEPB, Value *ExtStart,
133*0fca6ea1SDimitry Andric                                       Value *ExtEnd);
134*0fca6ea1SDimitry Andric 
135*0fca6ea1SDimitry Andric   void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
136*0fca6ea1SDimitry Andric                             PHINode *IndPhi, Value *MaxLen, Instruction *Index,
137*0fca6ea1SDimitry Andric                             Value *Start, bool IncIdx, BasicBlock *FoundBB,
138*0fca6ea1SDimitry Andric                             BasicBlock *EndBB);
139*0fca6ea1SDimitry Andric   /// @}
140*0fca6ea1SDimitry Andric };
141*0fca6ea1SDimitry Andric } // anonymous namespace
142*0fca6ea1SDimitry Andric 
143*0fca6ea1SDimitry Andric PreservedAnalyses LoopIdiomVectorizePass::run(Loop &L, LoopAnalysisManager &AM,
144*0fca6ea1SDimitry Andric                                               LoopStandardAnalysisResults &AR,
145*0fca6ea1SDimitry Andric                                               LPMUpdater &) {
146*0fca6ea1SDimitry Andric   if (DisableAll)
147*0fca6ea1SDimitry Andric     return PreservedAnalyses::all();
148*0fca6ea1SDimitry Andric 
149*0fca6ea1SDimitry Andric   const auto *DL = &L.getHeader()->getDataLayout();
150*0fca6ea1SDimitry Andric 
151*0fca6ea1SDimitry Andric   LoopIdiomVectorizeStyle VecStyle = VectorizeStyle;
152*0fca6ea1SDimitry Andric   if (LITVecStyle.getNumOccurrences())
153*0fca6ea1SDimitry Andric     VecStyle = LITVecStyle;
154*0fca6ea1SDimitry Andric 
155*0fca6ea1SDimitry Andric   unsigned BCVF = ByteCompareVF;
156*0fca6ea1SDimitry Andric   if (ByteCmpVF.getNumOccurrences())
157*0fca6ea1SDimitry Andric     BCVF = ByteCmpVF;
158*0fca6ea1SDimitry Andric 
159*0fca6ea1SDimitry Andric   LoopIdiomVectorize LIV(VecStyle, BCVF, &AR.DT, &AR.LI, &AR.TTI, DL);
160*0fca6ea1SDimitry Andric   if (!LIV.run(&L))
161*0fca6ea1SDimitry Andric     return PreservedAnalyses::all();
162*0fca6ea1SDimitry Andric 
163*0fca6ea1SDimitry Andric   return PreservedAnalyses::none();
164*0fca6ea1SDimitry Andric }
165*0fca6ea1SDimitry Andric 
166*0fca6ea1SDimitry Andric //===----------------------------------------------------------------------===//
167*0fca6ea1SDimitry Andric //
168*0fca6ea1SDimitry Andric //          Implementation of LoopIdiomVectorize
169*0fca6ea1SDimitry Andric //
170*0fca6ea1SDimitry Andric //===----------------------------------------------------------------------===//
171*0fca6ea1SDimitry Andric 
172*0fca6ea1SDimitry Andric bool LoopIdiomVectorize::run(Loop *L) {
173*0fca6ea1SDimitry Andric   CurLoop = L;
174*0fca6ea1SDimitry Andric 
175*0fca6ea1SDimitry Andric   Function &F = *L->getHeader()->getParent();
176*0fca6ea1SDimitry Andric   if (DisableAll || F.hasOptSize())
177*0fca6ea1SDimitry Andric     return false;
178*0fca6ea1SDimitry Andric 
179*0fca6ea1SDimitry Andric   if (F.hasFnAttribute(Attribute::NoImplicitFloat)) {
180*0fca6ea1SDimitry Andric     LLVM_DEBUG(dbgs() << DEBUG_TYPE << " is disabled on " << F.getName()
181*0fca6ea1SDimitry Andric                       << " due to its NoImplicitFloat attribute");
182*0fca6ea1SDimitry Andric     return false;
183*0fca6ea1SDimitry Andric   }
184*0fca6ea1SDimitry Andric 
185*0fca6ea1SDimitry Andric   // If the loop could not be converted to canonical form, it must have an
186*0fca6ea1SDimitry Andric   // indirectbr in it, just give up.
187*0fca6ea1SDimitry Andric   if (!L->getLoopPreheader())
188*0fca6ea1SDimitry Andric     return false;
189*0fca6ea1SDimitry Andric 
190*0fca6ea1SDimitry Andric   LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %"
191*0fca6ea1SDimitry Andric                     << CurLoop->getHeader()->getName() << "\n");
192*0fca6ea1SDimitry Andric 
193*0fca6ea1SDimitry Andric   return recognizeByteCompare();
194*0fca6ea1SDimitry Andric }
195*0fca6ea1SDimitry Andric 
196*0fca6ea1SDimitry Andric bool LoopIdiomVectorize::recognizeByteCompare() {
197*0fca6ea1SDimitry Andric   // Currently the transformation only works on scalable vector types, although
198*0fca6ea1SDimitry Andric   // there is no fundamental reason why it cannot be made to work for fixed
199*0fca6ea1SDimitry Andric   // width too.
200*0fca6ea1SDimitry Andric 
201*0fca6ea1SDimitry Andric   // We also need to know the minimum page size for the target in order to
202*0fca6ea1SDimitry Andric   // generate runtime memory checks to ensure the vector version won't fault.
203*0fca6ea1SDimitry Andric   if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() ||
204*0fca6ea1SDimitry Andric       DisableByteCmp)
205*0fca6ea1SDimitry Andric     return false;
206*0fca6ea1SDimitry Andric 
207*0fca6ea1SDimitry Andric   BasicBlock *Header = CurLoop->getHeader();
208*0fca6ea1SDimitry Andric 
209*0fca6ea1SDimitry Andric   // In LoopIdiomVectorize::run we have already checked that the loop
210*0fca6ea1SDimitry Andric   // has a preheader so we can assume it's in a canonical form.
211*0fca6ea1SDimitry Andric   if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2)
212*0fca6ea1SDimitry Andric     return false;
213*0fca6ea1SDimitry Andric 
214*0fca6ea1SDimitry Andric   PHINode *PN = dyn_cast<PHINode>(&Header->front());
215*0fca6ea1SDimitry Andric   if (!PN || PN->getNumIncomingValues() != 2)
216*0fca6ea1SDimitry Andric     return false;
217*0fca6ea1SDimitry Andric 
218*0fca6ea1SDimitry Andric   auto LoopBlocks = CurLoop->getBlocks();
219*0fca6ea1SDimitry Andric   // The first block in the loop should contain only 4 instructions, e.g.
220*0fca6ea1SDimitry Andric   //
221*0fca6ea1SDimitry Andric   //  while.cond:
222*0fca6ea1SDimitry Andric   //   %res.phi = phi i32 [ %start, %ph ], [ %inc, %while.body ]
223*0fca6ea1SDimitry Andric   //   %inc = add i32 %res.phi, 1
224*0fca6ea1SDimitry Andric   //   %cmp.not = icmp eq i32 %inc, %n
225*0fca6ea1SDimitry Andric   //   br i1 %cmp.not, label %while.end, label %while.body
226*0fca6ea1SDimitry Andric   //
227*0fca6ea1SDimitry Andric   if (LoopBlocks[0]->sizeWithoutDebug() > 4)
228*0fca6ea1SDimitry Andric     return false;
229*0fca6ea1SDimitry Andric 
230*0fca6ea1SDimitry Andric   // The second block should contain 7 instructions, e.g.
231*0fca6ea1SDimitry Andric   //
232*0fca6ea1SDimitry Andric   // while.body:
233*0fca6ea1SDimitry Andric   //   %idx = zext i32 %inc to i64
234*0fca6ea1SDimitry Andric   //   %idx.a = getelementptr inbounds i8, ptr %a, i64 %idx
235*0fca6ea1SDimitry Andric   //   %load.a = load i8, ptr %idx.a
236*0fca6ea1SDimitry Andric   //   %idx.b = getelementptr inbounds i8, ptr %b, i64 %idx
237*0fca6ea1SDimitry Andric   //   %load.b = load i8, ptr %idx.b
238*0fca6ea1SDimitry Andric   //   %cmp.not.ld = icmp eq i8 %load.a, %load.b
239*0fca6ea1SDimitry Andric   //   br i1 %cmp.not.ld, label %while.cond, label %while.end
240*0fca6ea1SDimitry Andric   //
241*0fca6ea1SDimitry Andric   if (LoopBlocks[1]->sizeWithoutDebug() > 7)
242*0fca6ea1SDimitry Andric     return false;
243*0fca6ea1SDimitry Andric 
244*0fca6ea1SDimitry Andric   // The incoming value to the PHI node from the loop should be an add of 1.
245*0fca6ea1SDimitry Andric   Value *StartIdx = nullptr;
246*0fca6ea1SDimitry Andric   Instruction *Index = nullptr;
247*0fca6ea1SDimitry Andric   if (!CurLoop->contains(PN->getIncomingBlock(0))) {
248*0fca6ea1SDimitry Andric     StartIdx = PN->getIncomingValue(0);
249*0fca6ea1SDimitry Andric     Index = dyn_cast<Instruction>(PN->getIncomingValue(1));
250*0fca6ea1SDimitry Andric   } else {
251*0fca6ea1SDimitry Andric     StartIdx = PN->getIncomingValue(1);
252*0fca6ea1SDimitry Andric     Index = dyn_cast<Instruction>(PN->getIncomingValue(0));
253*0fca6ea1SDimitry Andric   }
254*0fca6ea1SDimitry Andric 
255*0fca6ea1SDimitry Andric   // Limit to 32-bit types for now
256*0fca6ea1SDimitry Andric   if (!Index || !Index->getType()->isIntegerTy(32) ||
257*0fca6ea1SDimitry Andric       !match(Index, m_c_Add(m_Specific(PN), m_One())))
258*0fca6ea1SDimitry Andric     return false;
259*0fca6ea1SDimitry Andric 
260*0fca6ea1SDimitry Andric   // If we match the pattern, PN and Index will be replaced with the result of
261*0fca6ea1SDimitry Andric   // the cttz.elts intrinsic. If any other instructions are used outside of
262*0fca6ea1SDimitry Andric   // the loop, we cannot replace it.
263*0fca6ea1SDimitry Andric   for (BasicBlock *BB : LoopBlocks)
264*0fca6ea1SDimitry Andric     for (Instruction &I : *BB)
265*0fca6ea1SDimitry Andric       if (&I != PN && &I != Index)
266*0fca6ea1SDimitry Andric         for (User *U : I.users())
267*0fca6ea1SDimitry Andric           if (!CurLoop->contains(cast<Instruction>(U)))
268*0fca6ea1SDimitry Andric             return false;
269*0fca6ea1SDimitry Andric 
270*0fca6ea1SDimitry Andric   // Match the branch instruction for the header
271*0fca6ea1SDimitry Andric   ICmpInst::Predicate Pred;
272*0fca6ea1SDimitry Andric   Value *MaxLen;
273*0fca6ea1SDimitry Andric   BasicBlock *EndBB, *WhileBB;
274*0fca6ea1SDimitry Andric   if (!match(Header->getTerminator(),
275*0fca6ea1SDimitry Andric              m_Br(m_ICmp(Pred, m_Specific(Index), m_Value(MaxLen)),
276*0fca6ea1SDimitry Andric                   m_BasicBlock(EndBB), m_BasicBlock(WhileBB))) ||
277*0fca6ea1SDimitry Andric       Pred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(WhileBB))
278*0fca6ea1SDimitry Andric     return false;
279*0fca6ea1SDimitry Andric 
280*0fca6ea1SDimitry Andric   // WhileBB should contain the pattern of load & compare instructions. Match
281*0fca6ea1SDimitry Andric   // the pattern and find the GEP instructions used by the loads.
282*0fca6ea1SDimitry Andric   ICmpInst::Predicate WhilePred;
283*0fca6ea1SDimitry Andric   BasicBlock *FoundBB;
284*0fca6ea1SDimitry Andric   BasicBlock *TrueBB;
285*0fca6ea1SDimitry Andric   Value *LoadA, *LoadB;
286*0fca6ea1SDimitry Andric   if (!match(WhileBB->getTerminator(),
287*0fca6ea1SDimitry Andric              m_Br(m_ICmp(WhilePred, m_Value(LoadA), m_Value(LoadB)),
288*0fca6ea1SDimitry Andric                   m_BasicBlock(TrueBB), m_BasicBlock(FoundBB))) ||
289*0fca6ea1SDimitry Andric       WhilePred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(TrueBB))
290*0fca6ea1SDimitry Andric     return false;
291*0fca6ea1SDimitry Andric 
292*0fca6ea1SDimitry Andric   Value *A, *B;
293*0fca6ea1SDimitry Andric   if (!match(LoadA, m_Load(m_Value(A))) || !match(LoadB, m_Load(m_Value(B))))
294*0fca6ea1SDimitry Andric     return false;
295*0fca6ea1SDimitry Andric 
296*0fca6ea1SDimitry Andric   LoadInst *LoadAI = cast<LoadInst>(LoadA);
297*0fca6ea1SDimitry Andric   LoadInst *LoadBI = cast<LoadInst>(LoadB);
298*0fca6ea1SDimitry Andric   if (!LoadAI->isSimple() || !LoadBI->isSimple())
299*0fca6ea1SDimitry Andric     return false;
300*0fca6ea1SDimitry Andric 
301*0fca6ea1SDimitry Andric   GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(A);
302*0fca6ea1SDimitry Andric   GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(B);
303*0fca6ea1SDimitry Andric 
304*0fca6ea1SDimitry Andric   if (!GEPA || !GEPB)
305*0fca6ea1SDimitry Andric     return false;
306*0fca6ea1SDimitry Andric 
307*0fca6ea1SDimitry Andric   Value *PtrA = GEPA->getPointerOperand();
308*0fca6ea1SDimitry Andric   Value *PtrB = GEPB->getPointerOperand();
309*0fca6ea1SDimitry Andric 
310*0fca6ea1SDimitry Andric   // Check we are loading i8 values from two loop invariant pointers
311*0fca6ea1SDimitry Andric   if (!CurLoop->isLoopInvariant(PtrA) || !CurLoop->isLoopInvariant(PtrB) ||
312*0fca6ea1SDimitry Andric       !GEPA->getResultElementType()->isIntegerTy(8) ||
313*0fca6ea1SDimitry Andric       !GEPB->getResultElementType()->isIntegerTy(8) ||
314*0fca6ea1SDimitry Andric       !LoadAI->getType()->isIntegerTy(8) ||
315*0fca6ea1SDimitry Andric       !LoadBI->getType()->isIntegerTy(8) || PtrA == PtrB)
316*0fca6ea1SDimitry Andric     return false;
317*0fca6ea1SDimitry Andric 
318*0fca6ea1SDimitry Andric   // Check that the index to the GEPs is the index we found earlier
319*0fca6ea1SDimitry Andric   if (GEPA->getNumIndices() > 1 || GEPB->getNumIndices() > 1)
320*0fca6ea1SDimitry Andric     return false;
321*0fca6ea1SDimitry Andric 
322*0fca6ea1SDimitry Andric   Value *IdxA = GEPA->getOperand(GEPA->getNumIndices());
323*0fca6ea1SDimitry Andric   Value *IdxB = GEPB->getOperand(GEPB->getNumIndices());
324*0fca6ea1SDimitry Andric   if (IdxA != IdxB || !match(IdxA, m_ZExt(m_Specific(Index))))
325*0fca6ea1SDimitry Andric     return false;
326*0fca6ea1SDimitry Andric 
327*0fca6ea1SDimitry Andric   // We only ever expect the pre-incremented index value to be used inside the
328*0fca6ea1SDimitry Andric   // loop.
329*0fca6ea1SDimitry Andric   if (!PN->hasOneUse())
330*0fca6ea1SDimitry Andric     return false;
331*0fca6ea1SDimitry Andric 
332*0fca6ea1SDimitry Andric   // Ensure that when the Found and End blocks are identical the PHIs have the
333*0fca6ea1SDimitry Andric   // supported format. We don't currently allow cases like this:
334*0fca6ea1SDimitry Andric   // while.cond:
335*0fca6ea1SDimitry Andric   //   ...
336*0fca6ea1SDimitry Andric   //   br i1 %cmp.not, label %while.end, label %while.body
337*0fca6ea1SDimitry Andric   //
338*0fca6ea1SDimitry Andric   // while.body:
339*0fca6ea1SDimitry Andric   //   ...
340*0fca6ea1SDimitry Andric   //   br i1 %cmp.not2, label %while.cond, label %while.end
341*0fca6ea1SDimitry Andric   //
342*0fca6ea1SDimitry Andric   // while.end:
343*0fca6ea1SDimitry Andric   //   %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ]
344*0fca6ea1SDimitry Andric   //
345*0fca6ea1SDimitry Andric   // Where the incoming values for %final_ptr are unique and from each of the
346*0fca6ea1SDimitry Andric   // loop blocks, but not actually defined in the loop. This requires extra
347*0fca6ea1SDimitry Andric   // work setting up the byte.compare block, i.e. by introducing a select to
348*0fca6ea1SDimitry Andric   // choose the correct value.
349*0fca6ea1SDimitry Andric   // TODO: We could add support for this in future.
350*0fca6ea1SDimitry Andric   if (FoundBB == EndBB) {
351*0fca6ea1SDimitry Andric     for (PHINode &EndPN : EndBB->phis()) {
352*0fca6ea1SDimitry Andric       Value *WhileCondVal = EndPN.getIncomingValueForBlock(Header);
353*0fca6ea1SDimitry Andric       Value *WhileBodyVal = EndPN.getIncomingValueForBlock(WhileBB);
354*0fca6ea1SDimitry Andric 
355*0fca6ea1SDimitry Andric       // The value of the index when leaving the while.cond block is always the
356*0fca6ea1SDimitry Andric       // same as the end value (MaxLen) so we permit either. The value when
357*0fca6ea1SDimitry Andric       // leaving the while.body block should only be the index. Otherwise for
358*0fca6ea1SDimitry Andric       // any other values we only allow ones that are same for both blocks.
359*0fca6ea1SDimitry Andric       if (WhileCondVal != WhileBodyVal &&
360*0fca6ea1SDimitry Andric           ((WhileCondVal != Index && WhileCondVal != MaxLen) ||
361*0fca6ea1SDimitry Andric            (WhileBodyVal != Index)))
362*0fca6ea1SDimitry Andric         return false;
363*0fca6ea1SDimitry Andric     }
364*0fca6ea1SDimitry Andric   }
365*0fca6ea1SDimitry Andric 
366*0fca6ea1SDimitry Andric   LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n"
367*0fca6ea1SDimitry Andric                     << *(EndBB->getParent()) << "\n\n");
368*0fca6ea1SDimitry Andric 
369*0fca6ea1SDimitry Andric   // The index is incremented before the GEP/Load pair so we need to
370*0fca6ea1SDimitry Andric   // add 1 to the start value.
371*0fca6ea1SDimitry Andric   transformByteCompare(GEPA, GEPB, PN, MaxLen, Index, StartIdx, /*IncIdx=*/true,
372*0fca6ea1SDimitry Andric                        FoundBB, EndBB);
373*0fca6ea1SDimitry Andric   return true;
374*0fca6ea1SDimitry Andric }
375*0fca6ea1SDimitry Andric 
376*0fca6ea1SDimitry Andric Value *LoopIdiomVectorize::createMaskedFindMismatch(
377*0fca6ea1SDimitry Andric     IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
378*0fca6ea1SDimitry Andric     GetElementPtrInst *GEPB, Value *ExtStart, Value *ExtEnd) {
379*0fca6ea1SDimitry Andric   Type *I64Type = Builder.getInt64Ty();
380*0fca6ea1SDimitry Andric   Type *ResType = Builder.getInt32Ty();
381*0fca6ea1SDimitry Andric   Type *LoadType = Builder.getInt8Ty();
382*0fca6ea1SDimitry Andric   Value *PtrA = GEPA->getPointerOperand();
383*0fca6ea1SDimitry Andric   Value *PtrB = GEPB->getPointerOperand();
384*0fca6ea1SDimitry Andric 
385*0fca6ea1SDimitry Andric   ScalableVectorType *PredVTy =
386*0fca6ea1SDimitry Andric       ScalableVectorType::get(Builder.getInt1Ty(), ByteCompareVF);
387*0fca6ea1SDimitry Andric 
388*0fca6ea1SDimitry Andric   Value *InitialPred = Builder.CreateIntrinsic(
389*0fca6ea1SDimitry Andric       Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
390*0fca6ea1SDimitry Andric 
391*0fca6ea1SDimitry Andric   Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
392*0fca6ea1SDimitry Andric   VecLen =
393*0fca6ea1SDimitry Andric       Builder.CreateMul(VecLen, ConstantInt::get(I64Type, ByteCompareVF), "",
394*0fca6ea1SDimitry Andric                         /*HasNUW=*/true, /*HasNSW=*/true);
395*0fca6ea1SDimitry Andric 
396*0fca6ea1SDimitry Andric   Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
397*0fca6ea1SDimitry Andric                                             Builder.getInt1(false));
398*0fca6ea1SDimitry Andric 
399*0fca6ea1SDimitry Andric   BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
400*0fca6ea1SDimitry Andric   Builder.Insert(JumpToVectorLoop);
401*0fca6ea1SDimitry Andric 
402*0fca6ea1SDimitry Andric   DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
403*0fca6ea1SDimitry Andric                      VectorLoopStartBlock}});
404*0fca6ea1SDimitry Andric 
405*0fca6ea1SDimitry Andric   // Set up the first vector loop block by creating the PHIs, doing the vector
406*0fca6ea1SDimitry Andric   // loads and comparing the vectors.
407*0fca6ea1SDimitry Andric   Builder.SetInsertPoint(VectorLoopStartBlock);
408*0fca6ea1SDimitry Andric   PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_vec_loop_pred");
409*0fca6ea1SDimitry Andric   LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock);
410*0fca6ea1SDimitry Andric   PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index");
411*0fca6ea1SDimitry Andric   VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
412*0fca6ea1SDimitry Andric   Type *VectorLoadType =
413*0fca6ea1SDimitry Andric       ScalableVectorType::get(Builder.getInt8Ty(), ByteCompareVF);
414*0fca6ea1SDimitry Andric   Value *Passthru = ConstantInt::getNullValue(VectorLoadType);
415*0fca6ea1SDimitry Andric 
416*0fca6ea1SDimitry Andric   Value *VectorLhsGep =
417*0fca6ea1SDimitry Andric       Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi, "", GEPA->isInBounds());
418*0fca6ea1SDimitry Andric   Value *VectorLhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorLhsGep,
419*0fca6ea1SDimitry Andric                                                   Align(1), LoopPred, Passthru);
420*0fca6ea1SDimitry Andric 
421*0fca6ea1SDimitry Andric   Value *VectorRhsGep =
422*0fca6ea1SDimitry Andric       Builder.CreateGEP(LoadType, PtrB, VectorIndexPhi, "", GEPB->isInBounds());
423*0fca6ea1SDimitry Andric   Value *VectorRhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorRhsGep,
424*0fca6ea1SDimitry Andric                                                   Align(1), LoopPred, Passthru);
425*0fca6ea1SDimitry Andric 
426*0fca6ea1SDimitry Andric   Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad);
427*0fca6ea1SDimitry Andric   VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse);
428*0fca6ea1SDimitry Andric   Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp);
429*0fca6ea1SDimitry Andric   BranchInst *VectorEarlyExit = BranchInst::Create(
430*0fca6ea1SDimitry Andric       VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes);
431*0fca6ea1SDimitry Andric   Builder.Insert(VectorEarlyExit);
432*0fca6ea1SDimitry Andric 
433*0fca6ea1SDimitry Andric   DTU.applyUpdates(
434*0fca6ea1SDimitry Andric       {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock},
435*0fca6ea1SDimitry Andric        {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}});
436*0fca6ea1SDimitry Andric 
437*0fca6ea1SDimitry Andric   // Increment the index counter and calculate the predicate for the next
438*0fca6ea1SDimitry Andric   // iteration of the loop. We branch back to the start of the loop if there
439*0fca6ea1SDimitry Andric   // is at least one active lane.
440*0fca6ea1SDimitry Andric   Builder.SetInsertPoint(VectorLoopIncBlock);
441*0fca6ea1SDimitry Andric   Value *NewVectorIndexPhi =
442*0fca6ea1SDimitry Andric       Builder.CreateAdd(VectorIndexPhi, VecLen, "",
443*0fca6ea1SDimitry Andric                         /*HasNUW=*/true, /*HasNSW=*/true);
444*0fca6ea1SDimitry Andric   VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
445*0fca6ea1SDimitry Andric   Value *NewPred =
446*0fca6ea1SDimitry Andric       Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
447*0fca6ea1SDimitry Andric                               {PredVTy, I64Type}, {NewVectorIndexPhi, ExtEnd});
448*0fca6ea1SDimitry Andric   LoopPred->addIncoming(NewPred, VectorLoopIncBlock);
449*0fca6ea1SDimitry Andric 
450*0fca6ea1SDimitry Andric   Value *PredHasActiveLanes =
451*0fca6ea1SDimitry Andric       Builder.CreateExtractElement(NewPred, uint64_t(0));
452*0fca6ea1SDimitry Andric   BranchInst *VectorLoopBranchBack =
453*0fca6ea1SDimitry Andric       BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes);
454*0fca6ea1SDimitry Andric   Builder.Insert(VectorLoopBranchBack);
455*0fca6ea1SDimitry Andric 
456*0fca6ea1SDimitry Andric   DTU.applyUpdates(
457*0fca6ea1SDimitry Andric       {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock},
458*0fca6ea1SDimitry Andric        {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}});
459*0fca6ea1SDimitry Andric 
460*0fca6ea1SDimitry Andric   // If we found a mismatch then we need to calculate which lane in the vector
461*0fca6ea1SDimitry Andric   // had a mismatch and add that on to the current loop index.
462*0fca6ea1SDimitry Andric   Builder.SetInsertPoint(VectorLoopMismatchBlock);
463*0fca6ea1SDimitry Andric   PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_vec_found_pred");
464*0fca6ea1SDimitry Andric   FoundPred->addIncoming(VectorMatchCmp, VectorLoopStartBlock);
465*0fca6ea1SDimitry Andric   PHINode *LastLoopPred =
466*0fca6ea1SDimitry Andric       Builder.CreatePHI(PredVTy, 1, "mismatch_vec_last_loop_pred");
467*0fca6ea1SDimitry Andric   LastLoopPred->addIncoming(LoopPred, VectorLoopStartBlock);
468*0fca6ea1SDimitry Andric   PHINode *VectorFoundIndex =
469*0fca6ea1SDimitry Andric       Builder.CreatePHI(I64Type, 1, "mismatch_vec_found_index");
470*0fca6ea1SDimitry Andric   VectorFoundIndex->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
471*0fca6ea1SDimitry Andric 
472*0fca6ea1SDimitry Andric   Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred);
473*0fca6ea1SDimitry Andric   Value *Ctz = Builder.CreateIntrinsic(
474*0fca6ea1SDimitry Andric       Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()},
475*0fca6ea1SDimitry Andric       {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)});
476*0fca6ea1SDimitry Andric   Ctz = Builder.CreateZExt(Ctz, I64Type);
477*0fca6ea1SDimitry Andric   Value *VectorLoopRes64 = Builder.CreateAdd(VectorFoundIndex, Ctz, "",
478*0fca6ea1SDimitry Andric                                              /*HasNUW=*/true, /*HasNSW=*/true);
479*0fca6ea1SDimitry Andric   return Builder.CreateTrunc(VectorLoopRes64, ResType);
480*0fca6ea1SDimitry Andric }
481*0fca6ea1SDimitry Andric 
482*0fca6ea1SDimitry Andric Value *LoopIdiomVectorize::createPredicatedFindMismatch(
483*0fca6ea1SDimitry Andric     IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
484*0fca6ea1SDimitry Andric     GetElementPtrInst *GEPB, Value *ExtStart, Value *ExtEnd) {
485*0fca6ea1SDimitry Andric   Type *I64Type = Builder.getInt64Ty();
486*0fca6ea1SDimitry Andric   Type *I32Type = Builder.getInt32Ty();
487*0fca6ea1SDimitry Andric   Type *ResType = I32Type;
488*0fca6ea1SDimitry Andric   Type *LoadType = Builder.getInt8Ty();
489*0fca6ea1SDimitry Andric   Value *PtrA = GEPA->getPointerOperand();
490*0fca6ea1SDimitry Andric   Value *PtrB = GEPB->getPointerOperand();
491*0fca6ea1SDimitry Andric 
492*0fca6ea1SDimitry Andric   auto *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
493*0fca6ea1SDimitry Andric   Builder.Insert(JumpToVectorLoop);
494*0fca6ea1SDimitry Andric 
495*0fca6ea1SDimitry Andric   DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
496*0fca6ea1SDimitry Andric                      VectorLoopStartBlock}});
497*0fca6ea1SDimitry Andric 
498*0fca6ea1SDimitry Andric   // Set up the first Vector loop block by creating the PHIs, doing the vector
499*0fca6ea1SDimitry Andric   // loads and comparing the vectors.
500*0fca6ea1SDimitry Andric   Builder.SetInsertPoint(VectorLoopStartBlock);
501*0fca6ea1SDimitry Andric   auto *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vector_index");
502*0fca6ea1SDimitry Andric   VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
503*0fca6ea1SDimitry Andric 
504*0fca6ea1SDimitry Andric   // Calculate AVL by subtracting the vector loop index from the trip count
505*0fca6ea1SDimitry Andric   Value *AVL = Builder.CreateSub(ExtEnd, VectorIndexPhi, "avl", /*HasNUW=*/true,
506*0fca6ea1SDimitry Andric                                  /*HasNSW=*/true);
507*0fca6ea1SDimitry Andric 
508*0fca6ea1SDimitry Andric   auto *VectorLoadType = ScalableVectorType::get(LoadType, ByteCompareVF);
509*0fca6ea1SDimitry Andric   auto *VF = ConstantInt::get(I32Type, ByteCompareVF);
510*0fca6ea1SDimitry Andric 
511*0fca6ea1SDimitry Andric   Value *VL = Builder.CreateIntrinsic(Intrinsic::experimental_get_vector_length,
512*0fca6ea1SDimitry Andric                                       {I64Type}, {AVL, VF, Builder.getTrue()});
513*0fca6ea1SDimitry Andric   Value *GepOffset = VectorIndexPhi;
514*0fca6ea1SDimitry Andric 
515*0fca6ea1SDimitry Andric   Value *VectorLhsGep =
516*0fca6ea1SDimitry Andric       Builder.CreateGEP(LoadType, PtrA, GepOffset, "", GEPA->isInBounds());
517*0fca6ea1SDimitry Andric   VectorType *TrueMaskTy =
518*0fca6ea1SDimitry Andric       VectorType::get(Builder.getInt1Ty(), VectorLoadType->getElementCount());
519*0fca6ea1SDimitry Andric   Value *AllTrueMask = Constant::getAllOnesValue(TrueMaskTy);
520*0fca6ea1SDimitry Andric   Value *VectorLhsLoad = Builder.CreateIntrinsic(
521*0fca6ea1SDimitry Andric       Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
522*0fca6ea1SDimitry Andric       {VectorLhsGep, AllTrueMask, VL}, nullptr, "lhs.load");
523*0fca6ea1SDimitry Andric 
524*0fca6ea1SDimitry Andric   Value *VectorRhsGep =
525*0fca6ea1SDimitry Andric       Builder.CreateGEP(LoadType, PtrB, GepOffset, "", GEPB->isInBounds());
526*0fca6ea1SDimitry Andric   Value *VectorRhsLoad = Builder.CreateIntrinsic(
527*0fca6ea1SDimitry Andric       Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
528*0fca6ea1SDimitry Andric       {VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load");
529*0fca6ea1SDimitry Andric 
530*0fca6ea1SDimitry Andric   StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE);
531*0fca6ea1SDimitry Andric   auto *PredicateMDS = MDString::get(VectorLhsLoad->getContext(), PredicateStr);
532*0fca6ea1SDimitry Andric   Value *Pred = MetadataAsValue::get(VectorLhsLoad->getContext(), PredicateMDS);
533*0fca6ea1SDimitry Andric   Value *VectorMatchCmp = Builder.CreateIntrinsic(
534*0fca6ea1SDimitry Andric       Intrinsic::vp_icmp, {VectorLhsLoad->getType()},
535*0fca6ea1SDimitry Andric       {VectorLhsLoad, VectorRhsLoad, Pred, AllTrueMask, VL}, nullptr,
536*0fca6ea1SDimitry Andric       "mismatch.cmp");
537*0fca6ea1SDimitry Andric   Value *CTZ = Builder.CreateIntrinsic(
538*0fca6ea1SDimitry Andric       Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()},
539*0fca6ea1SDimitry Andric       {VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(false), AllTrueMask,
540*0fca6ea1SDimitry Andric        VL});
541*0fca6ea1SDimitry Andric   Value *MismatchFound = Builder.CreateICmpNE(CTZ, VL);
542*0fca6ea1SDimitry Andric   auto *VectorEarlyExit = BranchInst::Create(VectorLoopMismatchBlock,
543*0fca6ea1SDimitry Andric                                              VectorLoopIncBlock, MismatchFound);
544*0fca6ea1SDimitry Andric   Builder.Insert(VectorEarlyExit);
545*0fca6ea1SDimitry Andric 
546*0fca6ea1SDimitry Andric   DTU.applyUpdates(
547*0fca6ea1SDimitry Andric       {{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock},
548*0fca6ea1SDimitry Andric        {DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}});
549*0fca6ea1SDimitry Andric 
550*0fca6ea1SDimitry Andric   // Increment the index counter and calculate the predicate for the next
551*0fca6ea1SDimitry Andric   // iteration of the loop. We branch back to the start of the loop if there
552*0fca6ea1SDimitry Andric   // is at least one active lane.
553*0fca6ea1SDimitry Andric   Builder.SetInsertPoint(VectorLoopIncBlock);
554*0fca6ea1SDimitry Andric   Value *VL64 = Builder.CreateZExt(VL, I64Type);
555*0fca6ea1SDimitry Andric   Value *NewVectorIndexPhi =
556*0fca6ea1SDimitry Andric       Builder.CreateAdd(VectorIndexPhi, VL64, "",
557*0fca6ea1SDimitry Andric                         /*HasNUW=*/true, /*HasNSW=*/true);
558*0fca6ea1SDimitry Andric   VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
559*0fca6ea1SDimitry Andric   Value *ExitCond = Builder.CreateICmpNE(NewVectorIndexPhi, ExtEnd);
560*0fca6ea1SDimitry Andric   auto *VectorLoopBranchBack =
561*0fca6ea1SDimitry Andric       BranchInst::Create(VectorLoopStartBlock, EndBlock, ExitCond);
562*0fca6ea1SDimitry Andric   Builder.Insert(VectorLoopBranchBack);
563*0fca6ea1SDimitry Andric 
564*0fca6ea1SDimitry Andric   DTU.applyUpdates(
565*0fca6ea1SDimitry Andric       {{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock},
566*0fca6ea1SDimitry Andric        {DominatorTree::Insert, VectorLoopIncBlock, EndBlock}});
567*0fca6ea1SDimitry Andric 
568*0fca6ea1SDimitry Andric   // If we found a mismatch then we need to calculate which lane in the vector
569*0fca6ea1SDimitry Andric   // had a mismatch and add that on to the current loop index.
570*0fca6ea1SDimitry Andric   Builder.SetInsertPoint(VectorLoopMismatchBlock);
571*0fca6ea1SDimitry Andric 
572*0fca6ea1SDimitry Andric   // Add LCSSA phis for CTZ and VectorIndexPhi.
573*0fca6ea1SDimitry Andric   auto *CTZLCSSAPhi = Builder.CreatePHI(CTZ->getType(), 1, "ctz");
574*0fca6ea1SDimitry Andric   CTZLCSSAPhi->addIncoming(CTZ, VectorLoopStartBlock);
575*0fca6ea1SDimitry Andric   auto *VectorIndexLCSSAPhi =
576*0fca6ea1SDimitry Andric       Builder.CreatePHI(VectorIndexPhi->getType(), 1, "mismatch_vector_index");
577*0fca6ea1SDimitry Andric   VectorIndexLCSSAPhi->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
578*0fca6ea1SDimitry Andric 
579*0fca6ea1SDimitry Andric   Value *CTZI64 = Builder.CreateZExt(CTZLCSSAPhi, I64Type);
580*0fca6ea1SDimitry Andric   Value *VectorLoopRes64 = Builder.CreateAdd(VectorIndexLCSSAPhi, CTZI64, "",
581*0fca6ea1SDimitry Andric                                              /*HasNUW=*/true, /*HasNSW=*/true);
582*0fca6ea1SDimitry Andric   return Builder.CreateTrunc(VectorLoopRes64, ResType);
583*0fca6ea1SDimitry Andric }
584*0fca6ea1SDimitry Andric 
585*0fca6ea1SDimitry Andric Value *LoopIdiomVectorize::expandFindMismatch(
586*0fca6ea1SDimitry Andric     IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
587*0fca6ea1SDimitry Andric     GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) {
588*0fca6ea1SDimitry Andric   Value *PtrA = GEPA->getPointerOperand();
589*0fca6ea1SDimitry Andric   Value *PtrB = GEPB->getPointerOperand();
590*0fca6ea1SDimitry Andric 
591*0fca6ea1SDimitry Andric   // Get the arguments and types for the intrinsic.
592*0fca6ea1SDimitry Andric   BasicBlock *Preheader = CurLoop->getLoopPreheader();
593*0fca6ea1SDimitry Andric   BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
594*0fca6ea1SDimitry Andric   LLVMContext &Ctx = PHBranch->getContext();
595*0fca6ea1SDimitry Andric   Type *LoadType = Type::getInt8Ty(Ctx);
596*0fca6ea1SDimitry Andric   Type *ResType = Builder.getInt32Ty();
597*0fca6ea1SDimitry Andric 
598*0fca6ea1SDimitry Andric   // Split block in the original loop preheader.
599*0fca6ea1SDimitry Andric   EndBlock = SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end");
600*0fca6ea1SDimitry Andric 
601*0fca6ea1SDimitry Andric   // Create the blocks that we're going to need:
602*0fca6ea1SDimitry Andric   //  1. A block for checking the zero-extended length exceeds 0
603*0fca6ea1SDimitry Andric   //  2. A block to check that the start and end addresses of a given array
604*0fca6ea1SDimitry Andric   //     lie on the same page.
605*0fca6ea1SDimitry Andric   //  3. The vector loop preheader.
606*0fca6ea1SDimitry Andric   //  4. The first vector loop block.
607*0fca6ea1SDimitry Andric   //  5. The vector loop increment block.
608*0fca6ea1SDimitry Andric   //  6. A block we can jump to from the vector loop when a mismatch is found.
609*0fca6ea1SDimitry Andric   //  7. The first block of the scalar loop itself, containing PHIs , loads
610*0fca6ea1SDimitry Andric   //  and cmp.
611*0fca6ea1SDimitry Andric   //  8. A scalar loop increment block to increment the PHIs and go back
612*0fca6ea1SDimitry Andric   //  around the loop.
613*0fca6ea1SDimitry Andric 
614*0fca6ea1SDimitry Andric   BasicBlock *MinItCheckBlock = BasicBlock::Create(
615*0fca6ea1SDimitry Andric       Ctx, "mismatch_min_it_check", EndBlock->getParent(), EndBlock);
616*0fca6ea1SDimitry Andric 
617*0fca6ea1SDimitry Andric   // Update the terminator added by SplitBlock to branch to the first block
618*0fca6ea1SDimitry Andric   Preheader->getTerminator()->setSuccessor(0, MinItCheckBlock);
619*0fca6ea1SDimitry Andric 
620*0fca6ea1SDimitry Andric   BasicBlock *MemCheckBlock = BasicBlock::Create(
621*0fca6ea1SDimitry Andric       Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock);
622*0fca6ea1SDimitry Andric 
623*0fca6ea1SDimitry Andric   VectorLoopPreheaderBlock = BasicBlock::Create(
624*0fca6ea1SDimitry Andric       Ctx, "mismatch_vec_loop_preheader", EndBlock->getParent(), EndBlock);
625*0fca6ea1SDimitry Andric 
626*0fca6ea1SDimitry Andric   VectorLoopStartBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop",
627*0fca6ea1SDimitry Andric                                             EndBlock->getParent(), EndBlock);
628*0fca6ea1SDimitry Andric 
629*0fca6ea1SDimitry Andric   VectorLoopIncBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_inc",
630*0fca6ea1SDimitry Andric                                           EndBlock->getParent(), EndBlock);
631*0fca6ea1SDimitry Andric 
632*0fca6ea1SDimitry Andric   VectorLoopMismatchBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_found",
633*0fca6ea1SDimitry Andric                                                EndBlock->getParent(), EndBlock);
634*0fca6ea1SDimitry Andric 
635*0fca6ea1SDimitry Andric   BasicBlock *LoopPreHeaderBlock = BasicBlock::Create(
636*0fca6ea1SDimitry Andric       Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock);
637*0fca6ea1SDimitry Andric 
638*0fca6ea1SDimitry Andric   BasicBlock *LoopStartBlock =
639*0fca6ea1SDimitry Andric       BasicBlock::Create(Ctx, "mismatch_loop", EndBlock->getParent(), EndBlock);
640*0fca6ea1SDimitry Andric 
641*0fca6ea1SDimitry Andric   BasicBlock *LoopIncBlock = BasicBlock::Create(
642*0fca6ea1SDimitry Andric       Ctx, "mismatch_loop_inc", EndBlock->getParent(), EndBlock);
643*0fca6ea1SDimitry Andric 
644*0fca6ea1SDimitry Andric   DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock},
645*0fca6ea1SDimitry Andric                     {DominatorTree::Delete, Preheader, EndBlock}});
646*0fca6ea1SDimitry Andric 
647*0fca6ea1SDimitry Andric   // Update LoopInfo with the new vector & scalar loops.
648*0fca6ea1SDimitry Andric   auto VectorLoop = LI->AllocateLoop();
649*0fca6ea1SDimitry Andric   auto ScalarLoop = LI->AllocateLoop();
650*0fca6ea1SDimitry Andric 
651*0fca6ea1SDimitry Andric   if (CurLoop->getParentLoop()) {
652*0fca6ea1SDimitry Andric     CurLoop->getParentLoop()->addBasicBlockToLoop(MinItCheckBlock, *LI);
653*0fca6ea1SDimitry Andric     CurLoop->getParentLoop()->addBasicBlockToLoop(MemCheckBlock, *LI);
654*0fca6ea1SDimitry Andric     CurLoop->getParentLoop()->addBasicBlockToLoop(VectorLoopPreheaderBlock,
655*0fca6ea1SDimitry Andric                                                   *LI);
656*0fca6ea1SDimitry Andric     CurLoop->getParentLoop()->addChildLoop(VectorLoop);
657*0fca6ea1SDimitry Andric     CurLoop->getParentLoop()->addBasicBlockToLoop(VectorLoopMismatchBlock, *LI);
658*0fca6ea1SDimitry Andric     CurLoop->getParentLoop()->addBasicBlockToLoop(LoopPreHeaderBlock, *LI);
659*0fca6ea1SDimitry Andric     CurLoop->getParentLoop()->addChildLoop(ScalarLoop);
660*0fca6ea1SDimitry Andric   } else {
661*0fca6ea1SDimitry Andric     LI->addTopLevelLoop(VectorLoop);
662*0fca6ea1SDimitry Andric     LI->addTopLevelLoop(ScalarLoop);
663*0fca6ea1SDimitry Andric   }
664*0fca6ea1SDimitry Andric 
665*0fca6ea1SDimitry Andric   // Add the new basic blocks to their associated loops.
666*0fca6ea1SDimitry Andric   VectorLoop->addBasicBlockToLoop(VectorLoopStartBlock, *LI);
667*0fca6ea1SDimitry Andric   VectorLoop->addBasicBlockToLoop(VectorLoopIncBlock, *LI);
668*0fca6ea1SDimitry Andric 
669*0fca6ea1SDimitry Andric   ScalarLoop->addBasicBlockToLoop(LoopStartBlock, *LI);
670*0fca6ea1SDimitry Andric   ScalarLoop->addBasicBlockToLoop(LoopIncBlock, *LI);
671*0fca6ea1SDimitry Andric 
672*0fca6ea1SDimitry Andric   // Set up some types and constants that we intend to reuse.
673*0fca6ea1SDimitry Andric   Type *I64Type = Builder.getInt64Ty();
674*0fca6ea1SDimitry Andric 
675*0fca6ea1SDimitry Andric   // Check the zero-extended iteration count > 0
676*0fca6ea1SDimitry Andric   Builder.SetInsertPoint(MinItCheckBlock);
677*0fca6ea1SDimitry Andric   Value *ExtStart = Builder.CreateZExt(Start, I64Type);
678*0fca6ea1SDimitry Andric   Value *ExtEnd = Builder.CreateZExt(MaxLen, I64Type);
679*0fca6ea1SDimitry Andric   // This check doesn't really cost us very much.
680*0fca6ea1SDimitry Andric 
681*0fca6ea1SDimitry Andric   Value *LimitCheck = Builder.CreateICmpULE(Start, MaxLen);
682*0fca6ea1SDimitry Andric   BranchInst *MinItCheckBr =
683*0fca6ea1SDimitry Andric       BranchInst::Create(MemCheckBlock, LoopPreHeaderBlock, LimitCheck);
684*0fca6ea1SDimitry Andric   MinItCheckBr->setMetadata(
685*0fca6ea1SDimitry Andric       LLVMContext::MD_prof,
686*0fca6ea1SDimitry Andric       MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1));
687*0fca6ea1SDimitry Andric   Builder.Insert(MinItCheckBr);
688*0fca6ea1SDimitry Andric 
689*0fca6ea1SDimitry Andric   DTU.applyUpdates(
690*0fca6ea1SDimitry Andric       {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock},
691*0fca6ea1SDimitry Andric        {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}});
692*0fca6ea1SDimitry Andric 
693*0fca6ea1SDimitry Andric   // For each of the arrays, check the start/end addresses are on the same
694*0fca6ea1SDimitry Andric   // page.
695*0fca6ea1SDimitry Andric   Builder.SetInsertPoint(MemCheckBlock);
696*0fca6ea1SDimitry Andric 
697*0fca6ea1SDimitry Andric   // The early exit in the original loop means that when performing vector
698*0fca6ea1SDimitry Andric   // loads we are potentially reading ahead of the early exit. So we could
699*0fca6ea1SDimitry Andric   // fault if crossing a page boundary. Therefore, we create runtime memory
700*0fca6ea1SDimitry Andric   // checks based on the minimum page size as follows:
701*0fca6ea1SDimitry Andric   //   1. Calculate the addresses of the first memory accesses in the loop,
702*0fca6ea1SDimitry Andric   //      i.e. LhsStart and RhsStart.
703*0fca6ea1SDimitry Andric   //   2. Get the last accessed addresses in the loop, i.e. LhsEnd and RhsEnd.
704*0fca6ea1SDimitry Andric   //   3. Determine which pages correspond to all the memory accesses, i.e
705*0fca6ea1SDimitry Andric   //      LhsStartPage, LhsEndPage, RhsStartPage, RhsEndPage.
706*0fca6ea1SDimitry Andric   //   4. If LhsStartPage == LhsEndPage and RhsStartPage == RhsEndPage, then
707*0fca6ea1SDimitry Andric   //      we know we won't cross any page boundaries in the loop so we can
708*0fca6ea1SDimitry Andric   //      enter the vector loop! Otherwise we fall back on the scalar loop.
709*0fca6ea1SDimitry Andric   Value *LhsStartGEP = Builder.CreateGEP(LoadType, PtrA, ExtStart);
710*0fca6ea1SDimitry Andric   Value *RhsStartGEP = Builder.CreateGEP(LoadType, PtrB, ExtStart);
711*0fca6ea1SDimitry Andric   Value *RhsStart = Builder.CreatePtrToInt(RhsStartGEP, I64Type);
712*0fca6ea1SDimitry Andric   Value *LhsStart = Builder.CreatePtrToInt(LhsStartGEP, I64Type);
713*0fca6ea1SDimitry Andric   Value *LhsEndGEP = Builder.CreateGEP(LoadType, PtrA, ExtEnd);
714*0fca6ea1SDimitry Andric   Value *RhsEndGEP = Builder.CreateGEP(LoadType, PtrB, ExtEnd);
715*0fca6ea1SDimitry Andric   Value *LhsEnd = Builder.CreatePtrToInt(LhsEndGEP, I64Type);
716*0fca6ea1SDimitry Andric   Value *RhsEnd = Builder.CreatePtrToInt(RhsEndGEP, I64Type);
717*0fca6ea1SDimitry Andric 
718*0fca6ea1SDimitry Andric   const uint64_t MinPageSize = TTI->getMinPageSize().value();
719*0fca6ea1SDimitry Andric   const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize);
720*0fca6ea1SDimitry Andric   Value *LhsStartPage = Builder.CreateLShr(LhsStart, AddrShiftAmt);
721*0fca6ea1SDimitry Andric   Value *LhsEndPage = Builder.CreateLShr(LhsEnd, AddrShiftAmt);
722*0fca6ea1SDimitry Andric   Value *RhsStartPage = Builder.CreateLShr(RhsStart, AddrShiftAmt);
723*0fca6ea1SDimitry Andric   Value *RhsEndPage = Builder.CreateLShr(RhsEnd, AddrShiftAmt);
724*0fca6ea1SDimitry Andric   Value *LhsPageCmp = Builder.CreateICmpNE(LhsStartPage, LhsEndPage);
725*0fca6ea1SDimitry Andric   Value *RhsPageCmp = Builder.CreateICmpNE(RhsStartPage, RhsEndPage);
726*0fca6ea1SDimitry Andric 
727*0fca6ea1SDimitry Andric   Value *CombinedPageCmp = Builder.CreateOr(LhsPageCmp, RhsPageCmp);
728*0fca6ea1SDimitry Andric   BranchInst *CombinedPageCmpCmpBr = BranchInst::Create(
729*0fca6ea1SDimitry Andric       LoopPreHeaderBlock, VectorLoopPreheaderBlock, CombinedPageCmp);
730*0fca6ea1SDimitry Andric   CombinedPageCmpCmpBr->setMetadata(
731*0fca6ea1SDimitry Andric       LLVMContext::MD_prof, MDBuilder(CombinedPageCmpCmpBr->getContext())
732*0fca6ea1SDimitry Andric                                 .createBranchWeights(10, 90));
733*0fca6ea1SDimitry Andric   Builder.Insert(CombinedPageCmpCmpBr);
734*0fca6ea1SDimitry Andric 
735*0fca6ea1SDimitry Andric   DTU.applyUpdates(
736*0fca6ea1SDimitry Andric       {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock},
737*0fca6ea1SDimitry Andric        {DominatorTree::Insert, MemCheckBlock, VectorLoopPreheaderBlock}});
738*0fca6ea1SDimitry Andric 
739*0fca6ea1SDimitry Andric   // Set up the vector loop preheader, i.e. calculate initial loop predicate,
740*0fca6ea1SDimitry Andric   // zero-extend MaxLen to 64-bits, determine the number of vector elements
741*0fca6ea1SDimitry Andric   // processed in each iteration, etc.
742*0fca6ea1SDimitry Andric   Builder.SetInsertPoint(VectorLoopPreheaderBlock);
743*0fca6ea1SDimitry Andric 
744*0fca6ea1SDimitry Andric   // At this point we know two things must be true:
745*0fca6ea1SDimitry Andric   //  1. Start <= End
746*0fca6ea1SDimitry Andric   //  2. ExtMaxLen <= MinPageSize due to the page checks.
747*0fca6ea1SDimitry Andric   // Therefore, we know that we can use a 64-bit induction variable that
748*0fca6ea1SDimitry Andric   // starts from 0 -> ExtMaxLen and it will not overflow.
749*0fca6ea1SDimitry Andric   Value *VectorLoopRes = nullptr;
750*0fca6ea1SDimitry Andric   switch (VectorizeStyle) {
751*0fca6ea1SDimitry Andric   case LoopIdiomVectorizeStyle::Masked:
752*0fca6ea1SDimitry Andric     VectorLoopRes =
753*0fca6ea1SDimitry Andric         createMaskedFindMismatch(Builder, DTU, GEPA, GEPB, ExtStart, ExtEnd);
754*0fca6ea1SDimitry Andric     break;
755*0fca6ea1SDimitry Andric   case LoopIdiomVectorizeStyle::Predicated:
756*0fca6ea1SDimitry Andric     VectorLoopRes = createPredicatedFindMismatch(Builder, DTU, GEPA, GEPB,
757*0fca6ea1SDimitry Andric                                                  ExtStart, ExtEnd);
758*0fca6ea1SDimitry Andric     break;
759*0fca6ea1SDimitry Andric   }
760*0fca6ea1SDimitry Andric 
761*0fca6ea1SDimitry Andric   Builder.Insert(BranchInst::Create(EndBlock));
762*0fca6ea1SDimitry Andric 
763*0fca6ea1SDimitry Andric   DTU.applyUpdates(
764*0fca6ea1SDimitry Andric       {{DominatorTree::Insert, VectorLoopMismatchBlock, EndBlock}});
765*0fca6ea1SDimitry Andric 
766*0fca6ea1SDimitry Andric   // Generate code for scalar loop.
767*0fca6ea1SDimitry Andric   Builder.SetInsertPoint(LoopPreHeaderBlock);
768*0fca6ea1SDimitry Andric   Builder.Insert(BranchInst::Create(LoopStartBlock));
769*0fca6ea1SDimitry Andric 
770*0fca6ea1SDimitry Andric   DTU.applyUpdates(
771*0fca6ea1SDimitry Andric       {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}});
772*0fca6ea1SDimitry Andric 
773*0fca6ea1SDimitry Andric   Builder.SetInsertPoint(LoopStartBlock);
774*0fca6ea1SDimitry Andric   PHINode *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index");
775*0fca6ea1SDimitry Andric   IndexPhi->addIncoming(Start, LoopPreHeaderBlock);
776*0fca6ea1SDimitry Andric 
777*0fca6ea1SDimitry Andric   // Otherwise compare the values
778*0fca6ea1SDimitry Andric   // Load bytes from each array and compare them.
779*0fca6ea1SDimitry Andric   Value *GepOffset = Builder.CreateZExt(IndexPhi, I64Type);
780*0fca6ea1SDimitry Andric 
781*0fca6ea1SDimitry Andric   Value *LhsGep =
782*0fca6ea1SDimitry Andric       Builder.CreateGEP(LoadType, PtrA, GepOffset, "", GEPA->isInBounds());
783*0fca6ea1SDimitry Andric   Value *LhsLoad = Builder.CreateLoad(LoadType, LhsGep);
784*0fca6ea1SDimitry Andric 
785*0fca6ea1SDimitry Andric   Value *RhsGep =
786*0fca6ea1SDimitry Andric       Builder.CreateGEP(LoadType, PtrB, GepOffset, "", GEPB->isInBounds());
787*0fca6ea1SDimitry Andric   Value *RhsLoad = Builder.CreateLoad(LoadType, RhsGep);
788*0fca6ea1SDimitry Andric 
789*0fca6ea1SDimitry Andric   Value *MatchCmp = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
790*0fca6ea1SDimitry Andric   // If we have a mismatch then exit the loop ...
791*0fca6ea1SDimitry Andric   BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp);
792*0fca6ea1SDimitry Andric   Builder.Insert(MatchCmpBr);
793*0fca6ea1SDimitry Andric 
794*0fca6ea1SDimitry Andric   DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock},
795*0fca6ea1SDimitry Andric                     {DominatorTree::Insert, LoopStartBlock, EndBlock}});
796*0fca6ea1SDimitry Andric 
797*0fca6ea1SDimitry Andric   // Have we reached the maximum permitted length for the loop?
798*0fca6ea1SDimitry Andric   Builder.SetInsertPoint(LoopIncBlock);
799*0fca6ea1SDimitry Andric   Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "",
800*0fca6ea1SDimitry Andric                                     /*HasNUW=*/Index->hasNoUnsignedWrap(),
801*0fca6ea1SDimitry Andric                                     /*HasNSW=*/Index->hasNoSignedWrap());
802*0fca6ea1SDimitry Andric   IndexPhi->addIncoming(PhiInc, LoopIncBlock);
803*0fca6ea1SDimitry Andric   Value *IVCmp = Builder.CreateICmpEQ(PhiInc, MaxLen);
804*0fca6ea1SDimitry Andric   BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp);
805*0fca6ea1SDimitry Andric   Builder.Insert(IVCmpBr);
806*0fca6ea1SDimitry Andric 
807*0fca6ea1SDimitry Andric   DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock},
808*0fca6ea1SDimitry Andric                     {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}});
809*0fca6ea1SDimitry Andric 
810*0fca6ea1SDimitry Andric   // In the end block we need to insert a PHI node to deal with three cases:
811*0fca6ea1SDimitry Andric   //  1. We didn't find a mismatch in the scalar loop, so we return MaxLen.
812*0fca6ea1SDimitry Andric   //  2. We exitted the scalar loop early due to a mismatch and need to return
813*0fca6ea1SDimitry Andric   //  the index that we found.
814*0fca6ea1SDimitry Andric   //  3. We didn't find a mismatch in the vector loop, so we return MaxLen.
815*0fca6ea1SDimitry Andric   //  4. We exitted the vector loop early due to a mismatch and need to return
816*0fca6ea1SDimitry Andric   //  the index that we found.
817*0fca6ea1SDimitry Andric   Builder.SetInsertPoint(EndBlock, EndBlock->getFirstInsertionPt());
818*0fca6ea1SDimitry Andric   PHINode *ResPhi = Builder.CreatePHI(ResType, 4, "mismatch_result");
819*0fca6ea1SDimitry Andric   ResPhi->addIncoming(MaxLen, LoopIncBlock);
820*0fca6ea1SDimitry Andric   ResPhi->addIncoming(IndexPhi, LoopStartBlock);
821*0fca6ea1SDimitry Andric   ResPhi->addIncoming(MaxLen, VectorLoopIncBlock);
822*0fca6ea1SDimitry Andric   ResPhi->addIncoming(VectorLoopRes, VectorLoopMismatchBlock);
823*0fca6ea1SDimitry Andric 
824*0fca6ea1SDimitry Andric   Value *FinalRes = Builder.CreateTrunc(ResPhi, ResType);
825*0fca6ea1SDimitry Andric 
826*0fca6ea1SDimitry Andric   if (VerifyLoops) {
827*0fca6ea1SDimitry Andric     ScalarLoop->verifyLoop();
828*0fca6ea1SDimitry Andric     VectorLoop->verifyLoop();
829*0fca6ea1SDimitry Andric     if (!VectorLoop->isRecursivelyLCSSAForm(*DT, *LI))
830*0fca6ea1SDimitry Andric       report_fatal_error("Loops must remain in LCSSA form!");
831*0fca6ea1SDimitry Andric     if (!ScalarLoop->isRecursivelyLCSSAForm(*DT, *LI))
832*0fca6ea1SDimitry Andric       report_fatal_error("Loops must remain in LCSSA form!");
833*0fca6ea1SDimitry Andric   }
834*0fca6ea1SDimitry Andric 
835*0fca6ea1SDimitry Andric   return FinalRes;
836*0fca6ea1SDimitry Andric }
837*0fca6ea1SDimitry Andric 
838*0fca6ea1SDimitry Andric void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
839*0fca6ea1SDimitry Andric                                               GetElementPtrInst *GEPB,
840*0fca6ea1SDimitry Andric                                               PHINode *IndPhi, Value *MaxLen,
841*0fca6ea1SDimitry Andric                                               Instruction *Index, Value *Start,
842*0fca6ea1SDimitry Andric                                               bool IncIdx, BasicBlock *FoundBB,
843*0fca6ea1SDimitry Andric                                               BasicBlock *EndBB) {
844*0fca6ea1SDimitry Andric 
845*0fca6ea1SDimitry Andric   // Insert the byte compare code at the end of the preheader block
846*0fca6ea1SDimitry Andric   BasicBlock *Preheader = CurLoop->getLoopPreheader();
847*0fca6ea1SDimitry Andric   BasicBlock *Header = CurLoop->getHeader();
848*0fca6ea1SDimitry Andric   BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
849*0fca6ea1SDimitry Andric   IRBuilder<> Builder(PHBranch);
850*0fca6ea1SDimitry Andric   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
851*0fca6ea1SDimitry Andric   Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
852*0fca6ea1SDimitry Andric 
853*0fca6ea1SDimitry Andric   // Increment the pointer if this was done before the loads in the loop.
854*0fca6ea1SDimitry Andric   if (IncIdx)
855*0fca6ea1SDimitry Andric     Start = Builder.CreateAdd(Start, ConstantInt::get(Start->getType(), 1));
856*0fca6ea1SDimitry Andric 
857*0fca6ea1SDimitry Andric   Value *ByteCmpRes =
858*0fca6ea1SDimitry Andric       expandFindMismatch(Builder, DTU, GEPA, GEPB, Index, Start, MaxLen);
859*0fca6ea1SDimitry Andric 
860*0fca6ea1SDimitry Andric   // Replaces uses of index & induction Phi with intrinsic (we already
861*0fca6ea1SDimitry Andric   // checked that the the first instruction of Header is the Phi above).
862*0fca6ea1SDimitry Andric   assert(IndPhi->hasOneUse() && "Index phi node has more than one use!");
863*0fca6ea1SDimitry Andric   Index->replaceAllUsesWith(ByteCmpRes);
864*0fca6ea1SDimitry Andric 
865*0fca6ea1SDimitry Andric   assert(PHBranch->isUnconditional() &&
866*0fca6ea1SDimitry Andric          "Expected preheader to terminate with an unconditional branch.");
867*0fca6ea1SDimitry Andric 
868*0fca6ea1SDimitry Andric   // If no mismatch was found, we can jump to the end block. Create a
869*0fca6ea1SDimitry Andric   // new basic block for the compare instruction.
870*0fca6ea1SDimitry Andric   auto *CmpBB = BasicBlock::Create(Preheader->getContext(), "byte.compare",
871*0fca6ea1SDimitry Andric                                    Preheader->getParent());
872*0fca6ea1SDimitry Andric   CmpBB->moveBefore(EndBB);
873*0fca6ea1SDimitry Andric 
874*0fca6ea1SDimitry Andric   // Replace the branch in the preheader with an always-true conditional branch.
875*0fca6ea1SDimitry Andric   // This ensures there is still a reference to the original loop.
876*0fca6ea1SDimitry Andric   Builder.CreateCondBr(Builder.getTrue(), CmpBB, Header);
877*0fca6ea1SDimitry Andric   PHBranch->eraseFromParent();
878*0fca6ea1SDimitry Andric 
879*0fca6ea1SDimitry Andric   BasicBlock *MismatchEnd = cast<Instruction>(ByteCmpRes)->getParent();
880*0fca6ea1SDimitry Andric   DTU.applyUpdates({{DominatorTree::Insert, MismatchEnd, CmpBB}});
881*0fca6ea1SDimitry Andric 
882*0fca6ea1SDimitry Andric   // Create the branch to either the end or found block depending on the value
883*0fca6ea1SDimitry Andric   // returned by the intrinsic.
884*0fca6ea1SDimitry Andric   Builder.SetInsertPoint(CmpBB);
885*0fca6ea1SDimitry Andric   if (FoundBB != EndBB) {
886*0fca6ea1SDimitry Andric     Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen);
887*0fca6ea1SDimitry Andric     Builder.CreateCondBr(FoundCmp, EndBB, FoundBB);
888*0fca6ea1SDimitry Andric     DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB},
889*0fca6ea1SDimitry Andric                       {DominatorTree::Insert, CmpBB, EndBB}});
890*0fca6ea1SDimitry Andric 
891*0fca6ea1SDimitry Andric   } else {
892*0fca6ea1SDimitry Andric     Builder.CreateBr(FoundBB);
893*0fca6ea1SDimitry Andric     DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}});
894*0fca6ea1SDimitry Andric   }
895*0fca6ea1SDimitry Andric 
896*0fca6ea1SDimitry Andric   auto fixSuccessorPhis = [&](BasicBlock *SuccBB) {
897*0fca6ea1SDimitry Andric     for (PHINode &PN : SuccBB->phis()) {
898*0fca6ea1SDimitry Andric       // At this point we've already replaced all uses of the result from the
899*0fca6ea1SDimitry Andric       // loop with ByteCmp. Look through the incoming values to find ByteCmp,
900*0fca6ea1SDimitry Andric       // meaning this is a Phi collecting the results of the byte compare.
901*0fca6ea1SDimitry Andric       bool ResPhi = false;
902*0fca6ea1SDimitry Andric       for (Value *Op : PN.incoming_values())
903*0fca6ea1SDimitry Andric         if (Op == ByteCmpRes) {
904*0fca6ea1SDimitry Andric           ResPhi = true;
905*0fca6ea1SDimitry Andric           break;
906*0fca6ea1SDimitry Andric         }
907*0fca6ea1SDimitry Andric 
908*0fca6ea1SDimitry Andric       // Any PHI that depended upon the result of the byte compare needs a new
909*0fca6ea1SDimitry Andric       // incoming value from CmpBB. This is because the original loop will get
910*0fca6ea1SDimitry Andric       // deleted.
911*0fca6ea1SDimitry Andric       if (ResPhi)
912*0fca6ea1SDimitry Andric         PN.addIncoming(ByteCmpRes, CmpBB);
913*0fca6ea1SDimitry Andric       else {
914*0fca6ea1SDimitry Andric         // There should be no other outside uses of other values in the
915*0fca6ea1SDimitry Andric         // original loop. Any incoming values should either:
916*0fca6ea1SDimitry Andric         //   1. Be for blocks outside the loop, which aren't interesting. Or ..
917*0fca6ea1SDimitry Andric         //   2. These are from blocks in the loop with values defined outside
918*0fca6ea1SDimitry Andric         //      the loop. We should a similar incoming value from CmpBB.
919*0fca6ea1SDimitry Andric         for (BasicBlock *BB : PN.blocks())
920*0fca6ea1SDimitry Andric           if (CurLoop->contains(BB)) {
921*0fca6ea1SDimitry Andric             PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB);
922*0fca6ea1SDimitry Andric             break;
923*0fca6ea1SDimitry Andric           }
924*0fca6ea1SDimitry Andric       }
925*0fca6ea1SDimitry Andric     }
926*0fca6ea1SDimitry Andric   };
927*0fca6ea1SDimitry Andric 
928*0fca6ea1SDimitry Andric   // Ensure all Phis in the successors of CmpBB have an incoming value from it.
929*0fca6ea1SDimitry Andric   fixSuccessorPhis(EndBB);
930*0fca6ea1SDimitry Andric   if (EndBB != FoundBB)
931*0fca6ea1SDimitry Andric     fixSuccessorPhis(FoundBB);
932*0fca6ea1SDimitry Andric 
933*0fca6ea1SDimitry Andric   // The new CmpBB block isn't part of the loop, but will need to be added to
934*0fca6ea1SDimitry Andric   // the outer loop if there is one.
935*0fca6ea1SDimitry Andric   if (!CurLoop->isOutermost())
936*0fca6ea1SDimitry Andric     CurLoop->getParentLoop()->addBasicBlockToLoop(CmpBB, *LI);
937*0fca6ea1SDimitry Andric 
938*0fca6ea1SDimitry Andric   if (VerifyLoops && CurLoop->getParentLoop()) {
939*0fca6ea1SDimitry Andric     CurLoop->getParentLoop()->verifyLoop();
940*0fca6ea1SDimitry Andric     if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI))
941*0fca6ea1SDimitry Andric       report_fatal_error("Loops must remain in LCSSA form!");
942*0fca6ea1SDimitry Andric   }
943*0fca6ea1SDimitry Andric }
944