10b57cec5SDimitry Andric //===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric // 90b57cec5SDimitry Andric // This pass implements an idiom recognizer that transforms simple loops into a 100b57cec5SDimitry Andric // non-loop form. In cases that this kicks in, it can be a significant 110b57cec5SDimitry Andric // performance win. 120b57cec5SDimitry Andric // 130b57cec5SDimitry Andric // If compiling for code size we avoid idiom recognition if the resulting 140b57cec5SDimitry Andric // code could be larger than the code for the original loop. One way this could 150b57cec5SDimitry Andric // happen is if the loop is not removable after idiom recognition due to the 160b57cec5SDimitry Andric // presence of non-idiom instructions. The initial implementation of the 170b57cec5SDimitry Andric // heuristics applies to idioms in multi-block loops. 180b57cec5SDimitry Andric // 190b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 200b57cec5SDimitry Andric // 210b57cec5SDimitry Andric // TODO List: 220b57cec5SDimitry Andric // 230b57cec5SDimitry Andric // Future loop memory idioms to recognize: 24fe6060f1SDimitry Andric // memcmp, strlen, etc. 250b57cec5SDimitry Andric // 260b57cec5SDimitry Andric // This could recognize common matrix multiplies and dot product idioms and 270b57cec5SDimitry Andric // replace them with calls to BLAS (if linked in??). 280b57cec5SDimitry Andric // 290b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 300b57cec5SDimitry Andric 310b57cec5SDimitry Andric #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" 320b57cec5SDimitry Andric #include "llvm/ADT/APInt.h" 330b57cec5SDimitry Andric #include "llvm/ADT/ArrayRef.h" 340b57cec5SDimitry Andric #include "llvm/ADT/DenseMap.h" 350b57cec5SDimitry Andric #include "llvm/ADT/MapVector.h" 360b57cec5SDimitry Andric #include "llvm/ADT/SetVector.h" 370b57cec5SDimitry Andric #include "llvm/ADT/SmallPtrSet.h" 380b57cec5SDimitry Andric #include "llvm/ADT/SmallVector.h" 390b57cec5SDimitry Andric #include "llvm/ADT/Statistic.h" 400b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h" 410b57cec5SDimitry Andric #include "llvm/Analysis/AliasAnalysis.h" 42e8d8bef9SDimitry Andric #include "llvm/Analysis/CmpInstAnalysis.h" 430b57cec5SDimitry Andric #include "llvm/Analysis/LoopAccessAnalysis.h" 440b57cec5SDimitry Andric #include "llvm/Analysis/LoopInfo.h" 450b57cec5SDimitry Andric #include "llvm/Analysis/LoopPass.h" 460b57cec5SDimitry Andric #include "llvm/Analysis/MemoryLocation.h" 475ffd83dbSDimitry Andric #include "llvm/Analysis/MemorySSA.h" 485ffd83dbSDimitry Andric #include "llvm/Analysis/MemorySSAUpdater.h" 495ffd83dbSDimitry Andric #include "llvm/Analysis/MustExecute.h" 500b57cec5SDimitry Andric #include "llvm/Analysis/OptimizationRemarkEmitter.h" 510b57cec5SDimitry Andric #include "llvm/Analysis/ScalarEvolution.h" 520b57cec5SDimitry Andric #include "llvm/Analysis/ScalarEvolutionExpressions.h" 530b57cec5SDimitry Andric #include "llvm/Analysis/TargetLibraryInfo.h" 540b57cec5SDimitry Andric #include "llvm/Analysis/TargetTransformInfo.h" 550b57cec5SDimitry Andric #include "llvm/Analysis/ValueTracking.h" 560b57cec5SDimitry Andric #include "llvm/IR/BasicBlock.h" 570b57cec5SDimitry Andric #include "llvm/IR/Constant.h" 580b57cec5SDimitry Andric #include "llvm/IR/Constants.h" 590b57cec5SDimitry Andric #include "llvm/IR/DataLayout.h" 600b57cec5SDimitry Andric #include "llvm/IR/DebugLoc.h" 610b57cec5SDimitry Andric #include "llvm/IR/DerivedTypes.h" 620b57cec5SDimitry Andric #include "llvm/IR/Dominators.h" 630b57cec5SDimitry Andric #include "llvm/IR/GlobalValue.h" 640b57cec5SDimitry Andric #include "llvm/IR/GlobalVariable.h" 650b57cec5SDimitry Andric #include "llvm/IR/IRBuilder.h" 660b57cec5SDimitry Andric #include "llvm/IR/InstrTypes.h" 670b57cec5SDimitry Andric #include "llvm/IR/Instruction.h" 680b57cec5SDimitry Andric #include "llvm/IR/Instructions.h" 690b57cec5SDimitry Andric #include "llvm/IR/IntrinsicInst.h" 700b57cec5SDimitry Andric #include "llvm/IR/Intrinsics.h" 710b57cec5SDimitry Andric #include "llvm/IR/LLVMContext.h" 720b57cec5SDimitry Andric #include "llvm/IR/Module.h" 730b57cec5SDimitry Andric #include "llvm/IR/PassManager.h" 74e8d8bef9SDimitry Andric #include "llvm/IR/PatternMatch.h" 750b57cec5SDimitry Andric #include "llvm/IR/Type.h" 760b57cec5SDimitry Andric #include "llvm/IR/User.h" 770b57cec5SDimitry Andric #include "llvm/IR/Value.h" 780b57cec5SDimitry Andric #include "llvm/IR/ValueHandle.h" 790b57cec5SDimitry Andric #include "llvm/Support/Casting.h" 800b57cec5SDimitry Andric #include "llvm/Support/CommandLine.h" 810b57cec5SDimitry Andric #include "llvm/Support/Debug.h" 82fe6060f1SDimitry Andric #include "llvm/Support/InstructionCost.h" 830b57cec5SDimitry Andric #include "llvm/Support/raw_ostream.h" 840b57cec5SDimitry Andric #include "llvm/Transforms/Utils/BuildLibCalls.h" 850b57cec5SDimitry Andric #include "llvm/Transforms/Utils/Local.h" 860b57cec5SDimitry Andric #include "llvm/Transforms/Utils/LoopUtils.h" 875ffd83dbSDimitry Andric #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 880b57cec5SDimitry Andric #include <algorithm> 890b57cec5SDimitry Andric #include <cassert> 900b57cec5SDimitry Andric #include <cstdint> 910b57cec5SDimitry Andric #include <utility> 920b57cec5SDimitry Andric #include <vector> 930b57cec5SDimitry Andric 940b57cec5SDimitry Andric using namespace llvm; 950b57cec5SDimitry Andric 960b57cec5SDimitry Andric #define DEBUG_TYPE "loop-idiom" 970b57cec5SDimitry Andric 980b57cec5SDimitry Andric STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); 990b57cec5SDimitry Andric STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); 100fe6060f1SDimitry Andric STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores"); 101e8d8bef9SDimitry Andric STATISTIC( 102e8d8bef9SDimitry Andric NumShiftUntilBitTest, 103e8d8bef9SDimitry Andric "Number of uncountable loops recognized as 'shift until bitttest' idiom"); 104fe6060f1SDimitry Andric STATISTIC(NumShiftUntilZero, 105fe6060f1SDimitry Andric "Number of uncountable loops recognized as 'shift until zero' idiom"); 106e8d8bef9SDimitry Andric 107e8d8bef9SDimitry Andric bool DisableLIRP::All; 108e8d8bef9SDimitry Andric static cl::opt<bool, true> 109e8d8bef9SDimitry Andric DisableLIRPAll("disable-" DEBUG_TYPE "-all", 110e8d8bef9SDimitry Andric cl::desc("Options to disable Loop Idiom Recognize Pass."), 111e8d8bef9SDimitry Andric cl::location(DisableLIRP::All), cl::init(false), 112e8d8bef9SDimitry Andric cl::ReallyHidden); 113e8d8bef9SDimitry Andric 114e8d8bef9SDimitry Andric bool DisableLIRP::Memset; 115e8d8bef9SDimitry Andric static cl::opt<bool, true> 116e8d8bef9SDimitry Andric DisableLIRPMemset("disable-" DEBUG_TYPE "-memset", 117e8d8bef9SDimitry Andric cl::desc("Proceed with loop idiom recognize pass, but do " 118e8d8bef9SDimitry Andric "not convert loop(s) to memset."), 119e8d8bef9SDimitry Andric cl::location(DisableLIRP::Memset), cl::init(false), 120e8d8bef9SDimitry Andric cl::ReallyHidden); 121e8d8bef9SDimitry Andric 122e8d8bef9SDimitry Andric bool DisableLIRP::Memcpy; 123e8d8bef9SDimitry Andric static cl::opt<bool, true> 124e8d8bef9SDimitry Andric DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy", 125e8d8bef9SDimitry Andric cl::desc("Proceed with loop idiom recognize pass, but do " 126e8d8bef9SDimitry Andric "not convert loop(s) to memcpy."), 127e8d8bef9SDimitry Andric cl::location(DisableLIRP::Memcpy), cl::init(false), 128e8d8bef9SDimitry Andric cl::ReallyHidden); 1290b57cec5SDimitry Andric 1300b57cec5SDimitry Andric static cl::opt<bool> UseLIRCodeSizeHeurs( 1310b57cec5SDimitry Andric "use-lir-code-size-heurs", 1320b57cec5SDimitry Andric cl::desc("Use loop idiom recognition code size heuristics when compiling" 1330b57cec5SDimitry Andric "with -Os/-Oz"), 1340b57cec5SDimitry Andric cl::init(true), cl::Hidden); 1350b57cec5SDimitry Andric 1360b57cec5SDimitry Andric namespace { 1370b57cec5SDimitry Andric 1380b57cec5SDimitry Andric class LoopIdiomRecognize { 1390b57cec5SDimitry Andric Loop *CurLoop = nullptr; 1400b57cec5SDimitry Andric AliasAnalysis *AA; 1410b57cec5SDimitry Andric DominatorTree *DT; 1420b57cec5SDimitry Andric LoopInfo *LI; 1430b57cec5SDimitry Andric ScalarEvolution *SE; 1440b57cec5SDimitry Andric TargetLibraryInfo *TLI; 1450b57cec5SDimitry Andric const TargetTransformInfo *TTI; 1460b57cec5SDimitry Andric const DataLayout *DL; 1470b57cec5SDimitry Andric OptimizationRemarkEmitter &ORE; 1480b57cec5SDimitry Andric bool ApplyCodeSizeHeuristics; 1495ffd83dbSDimitry Andric std::unique_ptr<MemorySSAUpdater> MSSAU; 1500b57cec5SDimitry Andric 1510b57cec5SDimitry Andric public: 1520b57cec5SDimitry Andric explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT, 1530b57cec5SDimitry Andric LoopInfo *LI, ScalarEvolution *SE, 1540b57cec5SDimitry Andric TargetLibraryInfo *TLI, 1555ffd83dbSDimitry Andric const TargetTransformInfo *TTI, MemorySSA *MSSA, 156480093f4SDimitry Andric const DataLayout *DL, 1570b57cec5SDimitry Andric OptimizationRemarkEmitter &ORE) 1585ffd83dbSDimitry Andric : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) { 1595ffd83dbSDimitry Andric if (MSSA) 1605ffd83dbSDimitry Andric MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); 1615ffd83dbSDimitry Andric } 1620b57cec5SDimitry Andric 1630b57cec5SDimitry Andric bool runOnLoop(Loop *L); 1640b57cec5SDimitry Andric 1650b57cec5SDimitry Andric private: 1660b57cec5SDimitry Andric using StoreList = SmallVector<StoreInst *, 8>; 1670b57cec5SDimitry Andric using StoreListMap = MapVector<Value *, StoreList>; 1680b57cec5SDimitry Andric 1690b57cec5SDimitry Andric StoreListMap StoreRefsForMemset; 1700b57cec5SDimitry Andric StoreListMap StoreRefsForMemsetPattern; 1710b57cec5SDimitry Andric StoreList StoreRefsForMemcpy; 1720b57cec5SDimitry Andric bool HasMemset; 1730b57cec5SDimitry Andric bool HasMemsetPattern; 1740b57cec5SDimitry Andric bool HasMemcpy; 1750b57cec5SDimitry Andric 1760b57cec5SDimitry Andric /// Return code for isLegalStore() 1770b57cec5SDimitry Andric enum LegalStoreKind { 1780b57cec5SDimitry Andric None = 0, 1790b57cec5SDimitry Andric Memset, 1800b57cec5SDimitry Andric MemsetPattern, 1810b57cec5SDimitry Andric Memcpy, 1820b57cec5SDimitry Andric UnorderedAtomicMemcpy, 1830b57cec5SDimitry Andric DontUse // Dummy retval never to be used. Allows catching errors in retval 1840b57cec5SDimitry Andric // handling. 1850b57cec5SDimitry Andric }; 1860b57cec5SDimitry Andric 1870b57cec5SDimitry Andric /// \name Countable Loop Idiom Handling 1880b57cec5SDimitry Andric /// @{ 1890b57cec5SDimitry Andric 1900b57cec5SDimitry Andric bool runOnCountableLoop(); 1910b57cec5SDimitry Andric bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, 1920b57cec5SDimitry Andric SmallVectorImpl<BasicBlock *> &ExitBlocks); 1930b57cec5SDimitry Andric 1940b57cec5SDimitry Andric void collectStores(BasicBlock *BB); 1950b57cec5SDimitry Andric LegalStoreKind isLegalStore(StoreInst *SI); 1960b57cec5SDimitry Andric enum class ForMemset { No, Yes }; 1970b57cec5SDimitry Andric bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount, 1980b57cec5SDimitry Andric ForMemset For); 199fe6060f1SDimitry Andric 200fe6060f1SDimitry Andric template <typename MemInst> 201fe6060f1SDimitry Andric bool processLoopMemIntrinsic( 202fe6060f1SDimitry Andric BasicBlock *BB, 203fe6060f1SDimitry Andric bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *), 204fe6060f1SDimitry Andric const SCEV *BECount); 205fe6060f1SDimitry Andric bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount); 2060b57cec5SDimitry Andric bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); 2070b57cec5SDimitry Andric 208349cc55cSDimitry Andric bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV, 209480093f4SDimitry Andric MaybeAlign StoreAlignment, Value *StoredVal, 2100b57cec5SDimitry Andric Instruction *TheStore, 2110b57cec5SDimitry Andric SmallPtrSetImpl<Instruction *> &Stores, 2120b57cec5SDimitry Andric const SCEVAddRecExpr *Ev, const SCEV *BECount, 213349cc55cSDimitry Andric bool IsNegStride, bool IsLoopMemset = false); 2140b57cec5SDimitry Andric bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount); 215fe6060f1SDimitry Andric bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr, 216349cc55cSDimitry Andric const SCEV *StoreSize, MaybeAlign StoreAlign, 217fe6060f1SDimitry Andric MaybeAlign LoadAlign, Instruction *TheStore, 218fe6060f1SDimitry Andric Instruction *TheLoad, 219fe6060f1SDimitry Andric const SCEVAddRecExpr *StoreEv, 220fe6060f1SDimitry Andric const SCEVAddRecExpr *LoadEv, 221fe6060f1SDimitry Andric const SCEV *BECount); 2220b57cec5SDimitry Andric bool avoidLIRForMultiBlockLoop(bool IsMemset = false, 2230b57cec5SDimitry Andric bool IsLoopMemset = false); 2240b57cec5SDimitry Andric 2250b57cec5SDimitry Andric /// @} 2260b57cec5SDimitry Andric /// \name Noncountable Loop Idiom Handling 2270b57cec5SDimitry Andric /// @{ 2280b57cec5SDimitry Andric 2290b57cec5SDimitry Andric bool runOnNoncountableLoop(); 2300b57cec5SDimitry Andric 2310b57cec5SDimitry Andric bool recognizePopcount(); 2320b57cec5SDimitry Andric void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, 2330b57cec5SDimitry Andric PHINode *CntPhi, Value *Var); 234*0fca6ea1SDimitry Andric bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX, 235*0fca6ea1SDimitry Andric bool ZeroCheck, size_t CanonicalSize); 236*0fca6ea1SDimitry Andric bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX, 237*0fca6ea1SDimitry Andric Instruction *DefX, PHINode *CntPhi, 238*0fca6ea1SDimitry Andric Instruction *CntInst); 2390b57cec5SDimitry Andric bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz 240*0fca6ea1SDimitry Andric bool recognizeShiftUntilLessThan(); 2410b57cec5SDimitry Andric void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB, 2420b57cec5SDimitry Andric Instruction *CntInst, PHINode *CntPhi, 2430b57cec5SDimitry Andric Value *Var, Instruction *DefX, 2440b57cec5SDimitry Andric const DebugLoc &DL, bool ZeroCheck, 245*0fca6ea1SDimitry Andric bool IsCntPhiUsedOutsideLoop, 246*0fca6ea1SDimitry Andric bool InsertSub = false); 2470b57cec5SDimitry Andric 248e8d8bef9SDimitry Andric bool recognizeShiftUntilBitTest(); 249fe6060f1SDimitry Andric bool recognizeShiftUntilZero(); 250e8d8bef9SDimitry Andric 2510b57cec5SDimitry Andric /// @} 2520b57cec5SDimitry Andric }; 2530b57cec5SDimitry Andric } // end anonymous namespace 2540b57cec5SDimitry Andric 2550b57cec5SDimitry Andric PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, 2560b57cec5SDimitry Andric LoopStandardAnalysisResults &AR, 257480093f4SDimitry Andric LPMUpdater &) { 258e8d8bef9SDimitry Andric if (DisableLIRP::All) 259e8d8bef9SDimitry Andric return PreservedAnalyses::all(); 260e8d8bef9SDimitry Andric 261*0fca6ea1SDimitry Andric const auto *DL = &L.getHeader()->getDataLayout(); 2620b57cec5SDimitry Andric 2635ffd83dbSDimitry Andric // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis 2645ffd83dbSDimitry Andric // pass. Function analyses need to be preserved across loop transformations 2655ffd83dbSDimitry Andric // but ORE cannot be preserved (see comment before the pass definition). 2665ffd83dbSDimitry Andric OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); 2670b57cec5SDimitry Andric 2685ffd83dbSDimitry Andric LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, 2695ffd83dbSDimitry Andric AR.MSSA, DL, ORE); 2700b57cec5SDimitry Andric if (!LIR.runOnLoop(&L)) 2710b57cec5SDimitry Andric return PreservedAnalyses::all(); 2720b57cec5SDimitry Andric 2735ffd83dbSDimitry Andric auto PA = getLoopPassPreservedAnalyses(); 2745ffd83dbSDimitry Andric if (AR.MSSA) 2755ffd83dbSDimitry Andric PA.preserve<MemorySSAAnalysis>(); 2765ffd83dbSDimitry Andric return PA; 2770b57cec5SDimitry Andric } 2780b57cec5SDimitry Andric 2790b57cec5SDimitry Andric static void deleteDeadInstruction(Instruction *I) { 28081ad6265SDimitry Andric I->replaceAllUsesWith(PoisonValue::get(I->getType())); 2810b57cec5SDimitry Andric I->eraseFromParent(); 2820b57cec5SDimitry Andric } 2830b57cec5SDimitry Andric 2840b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 2850b57cec5SDimitry Andric // 2860b57cec5SDimitry Andric // Implementation of LoopIdiomRecognize 2870b57cec5SDimitry Andric // 2880b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 2890b57cec5SDimitry Andric 2900b57cec5SDimitry Andric bool LoopIdiomRecognize::runOnLoop(Loop *L) { 2910b57cec5SDimitry Andric CurLoop = L; 2920b57cec5SDimitry Andric // If the loop could not be converted to canonical form, it must have an 2930b57cec5SDimitry Andric // indirectbr in it, just give up. 2940b57cec5SDimitry Andric if (!L->getLoopPreheader()) 2950b57cec5SDimitry Andric return false; 2960b57cec5SDimitry Andric 2970b57cec5SDimitry Andric // Disable loop idiom recognition if the function's name is a common idiom. 2980b57cec5SDimitry Andric StringRef Name = L->getHeader()->getParent()->getName(); 299480093f4SDimitry Andric if (Name == "memset" || Name == "memcpy") 3000b57cec5SDimitry Andric return false; 3010b57cec5SDimitry Andric 3020b57cec5SDimitry Andric // Determine if code size heuristics need to be applied. 3030b57cec5SDimitry Andric ApplyCodeSizeHeuristics = 3040b57cec5SDimitry Andric L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs; 3050b57cec5SDimitry Andric 3060b57cec5SDimitry Andric HasMemset = TLI->has(LibFunc_memset); 3070b57cec5SDimitry Andric HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); 3080b57cec5SDimitry Andric HasMemcpy = TLI->has(LibFunc_memcpy); 3090b57cec5SDimitry Andric 310480093f4SDimitry Andric if (HasMemset || HasMemsetPattern || HasMemcpy) 3110b57cec5SDimitry Andric if (SE->hasLoopInvariantBackedgeTakenCount(L)) 3120b57cec5SDimitry Andric return runOnCountableLoop(); 3130b57cec5SDimitry Andric 3140b57cec5SDimitry Andric return runOnNoncountableLoop(); 3150b57cec5SDimitry Andric } 3160b57cec5SDimitry Andric 3170b57cec5SDimitry Andric bool LoopIdiomRecognize::runOnCountableLoop() { 3180b57cec5SDimitry Andric const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop); 3190b57cec5SDimitry Andric assert(!isa<SCEVCouldNotCompute>(BECount) && 3200b57cec5SDimitry Andric "runOnCountableLoop() called on a loop without a predictable" 3210b57cec5SDimitry Andric "backedge-taken count"); 3220b57cec5SDimitry Andric 3230b57cec5SDimitry Andric // If this loop executes exactly one time, then it should be peeled, not 3240b57cec5SDimitry Andric // optimized by this pass. 3250b57cec5SDimitry Andric if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) 3260b57cec5SDimitry Andric if (BECst->getAPInt() == 0) 3270b57cec5SDimitry Andric return false; 3280b57cec5SDimitry Andric 3290b57cec5SDimitry Andric SmallVector<BasicBlock *, 8> ExitBlocks; 3300b57cec5SDimitry Andric CurLoop->getUniqueExitBlocks(ExitBlocks); 3310b57cec5SDimitry Andric 3320b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" 3330b57cec5SDimitry Andric << CurLoop->getHeader()->getParent()->getName() 3340b57cec5SDimitry Andric << "] Countable Loop %" << CurLoop->getHeader()->getName() 3350b57cec5SDimitry Andric << "\n"); 3360b57cec5SDimitry Andric 3370b57cec5SDimitry Andric // The following transforms hoist stores/memsets into the loop pre-header. 3385ffd83dbSDimitry Andric // Give up if the loop has instructions that may throw. 3390b57cec5SDimitry Andric SimpleLoopSafetyInfo SafetyInfo; 3400b57cec5SDimitry Andric SafetyInfo.computeLoopSafetyInfo(CurLoop); 3410b57cec5SDimitry Andric if (SafetyInfo.anyBlockMayThrow()) 3425ffd83dbSDimitry Andric return false; 3435ffd83dbSDimitry Andric 3445ffd83dbSDimitry Andric bool MadeChange = false; 3450b57cec5SDimitry Andric 3460b57cec5SDimitry Andric // Scan all the blocks in the loop that are not in subloops. 3470b57cec5SDimitry Andric for (auto *BB : CurLoop->getBlocks()) { 3480b57cec5SDimitry Andric // Ignore blocks in subloops. 3490b57cec5SDimitry Andric if (LI->getLoopFor(BB) != CurLoop) 3500b57cec5SDimitry Andric continue; 3510b57cec5SDimitry Andric 3520b57cec5SDimitry Andric MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks); 3530b57cec5SDimitry Andric } 3540b57cec5SDimitry Andric return MadeChange; 3550b57cec5SDimitry Andric } 3560b57cec5SDimitry Andric 3570b57cec5SDimitry Andric static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) { 3580b57cec5SDimitry Andric const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1)); 3590b57cec5SDimitry Andric return ConstStride->getAPInt(); 3600b57cec5SDimitry Andric } 3610b57cec5SDimitry Andric 3620b57cec5SDimitry Andric /// getMemSetPatternValue - If a strided store of the specified value is safe to 3630b57cec5SDimitry Andric /// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should 3640b57cec5SDimitry Andric /// be passed in. Otherwise, return null. 3650b57cec5SDimitry Andric /// 3660b57cec5SDimitry Andric /// Note that we don't ever attempt to use memset_pattern8 or 4, because these 3670b57cec5SDimitry Andric /// just replicate their input array and then pass on to memset_pattern16. 3680b57cec5SDimitry Andric static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { 3690b57cec5SDimitry Andric // FIXME: This could check for UndefValue because it can be merged into any 3700b57cec5SDimitry Andric // other valid pattern. 3710b57cec5SDimitry Andric 3720b57cec5SDimitry Andric // If the value isn't a constant, we can't promote it to being in a constant 3730b57cec5SDimitry Andric // array. We could theoretically do a store to an alloca or something, but 3740b57cec5SDimitry Andric // that doesn't seem worthwhile. 3750b57cec5SDimitry Andric Constant *C = dyn_cast<Constant>(V); 376bdd1243dSDimitry Andric if (!C || isa<ConstantExpr>(C)) 3770b57cec5SDimitry Andric return nullptr; 3780b57cec5SDimitry Andric 3790b57cec5SDimitry Andric // Only handle simple values that are a power of two bytes in size. 3800b57cec5SDimitry Andric uint64_t Size = DL->getTypeSizeInBits(V->getType()); 3810b57cec5SDimitry Andric if (Size == 0 || (Size & 7) || (Size & (Size - 1))) 3820b57cec5SDimitry Andric return nullptr; 3830b57cec5SDimitry Andric 3840b57cec5SDimitry Andric // Don't care enough about darwin/ppc to implement this. 3850b57cec5SDimitry Andric if (DL->isBigEndian()) 3860b57cec5SDimitry Andric return nullptr; 3870b57cec5SDimitry Andric 3880b57cec5SDimitry Andric // Convert to size in bytes. 3890b57cec5SDimitry Andric Size /= 8; 3900b57cec5SDimitry Andric 3910b57cec5SDimitry Andric // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see 3920b57cec5SDimitry Andric // if the top and bottom are the same (e.g. for vectors and large integers). 3930b57cec5SDimitry Andric if (Size > 16) 3940b57cec5SDimitry Andric return nullptr; 3950b57cec5SDimitry Andric 3960b57cec5SDimitry Andric // If the constant is exactly 16 bytes, just use it. 3970b57cec5SDimitry Andric if (Size == 16) 3980b57cec5SDimitry Andric return C; 3990b57cec5SDimitry Andric 4000b57cec5SDimitry Andric // Otherwise, we'll use an array of the constants. 4010b57cec5SDimitry Andric unsigned ArraySize = 16 / Size; 4020b57cec5SDimitry Andric ArrayType *AT = ArrayType::get(V->getType(), ArraySize); 4030b57cec5SDimitry Andric return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C)); 4040b57cec5SDimitry Andric } 4050b57cec5SDimitry Andric 4060b57cec5SDimitry Andric LoopIdiomRecognize::LegalStoreKind 4070b57cec5SDimitry Andric LoopIdiomRecognize::isLegalStore(StoreInst *SI) { 4080b57cec5SDimitry Andric // Don't touch volatile stores. 4090b57cec5SDimitry Andric if (SI->isVolatile()) 4100b57cec5SDimitry Andric return LegalStoreKind::None; 4110b57cec5SDimitry Andric // We only want simple or unordered-atomic stores. 4120b57cec5SDimitry Andric if (!SI->isUnordered()) 4130b57cec5SDimitry Andric return LegalStoreKind::None; 4140b57cec5SDimitry Andric 4150b57cec5SDimitry Andric // Avoid merging nontemporal stores. 4160b57cec5SDimitry Andric if (SI->getMetadata(LLVMContext::MD_nontemporal)) 4170b57cec5SDimitry Andric return LegalStoreKind::None; 4180b57cec5SDimitry Andric 4190b57cec5SDimitry Andric Value *StoredVal = SI->getValueOperand(); 4200b57cec5SDimitry Andric Value *StorePtr = SI->getPointerOperand(); 4210b57cec5SDimitry Andric 422e8d8bef9SDimitry Andric // Don't convert stores of non-integral pointer types to memsets (which stores 423e8d8bef9SDimitry Andric // integers). 424e8d8bef9SDimitry Andric if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType())) 425e8d8bef9SDimitry Andric return LegalStoreKind::None; 426e8d8bef9SDimitry Andric 4270b57cec5SDimitry Andric // Reject stores that are so large that they overflow an unsigned. 428e8d8bef9SDimitry Andric // When storing out scalable vectors we bail out for now, since the code 429e8d8bef9SDimitry Andric // below currently only works for constant strides. 430e8d8bef9SDimitry Andric TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); 431bdd1243dSDimitry Andric if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) || 432bdd1243dSDimitry Andric (SizeInBits.getFixedValue() >> 32) != 0) 4330b57cec5SDimitry Andric return LegalStoreKind::None; 4340b57cec5SDimitry Andric 4350b57cec5SDimitry Andric // See if the pointer expression is an AddRec like {base,+,1} on the current 4360b57cec5SDimitry Andric // loop, which indicates a strided store. If we have something else, it's a 4370b57cec5SDimitry Andric // random store we can't handle. 4380b57cec5SDimitry Andric const SCEVAddRecExpr *StoreEv = 4390b57cec5SDimitry Andric dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); 4400b57cec5SDimitry Andric if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) 4410b57cec5SDimitry Andric return LegalStoreKind::None; 4420b57cec5SDimitry Andric 4430b57cec5SDimitry Andric // Check to see if we have a constant stride. 4440b57cec5SDimitry Andric if (!isa<SCEVConstant>(StoreEv->getOperand(1))) 4450b57cec5SDimitry Andric return LegalStoreKind::None; 4460b57cec5SDimitry Andric 4470b57cec5SDimitry Andric // See if the store can be turned into a memset. 4480b57cec5SDimitry Andric 4490b57cec5SDimitry Andric // If the stored value is a byte-wise value (like i32 -1), then it may be 4500b57cec5SDimitry Andric // turned into a memset of i8 -1, assuming that all the consecutive bytes 4510b57cec5SDimitry Andric // are stored. A store of i32 0x01020304 can never be turned into a memset, 4520b57cec5SDimitry Andric // but it can be turned into memset_pattern if the target supports it. 4530b57cec5SDimitry Andric Value *SplatValue = isBytewiseValue(StoredVal, *DL); 4540b57cec5SDimitry Andric 4550b57cec5SDimitry Andric // Note: memset and memset_pattern on unordered-atomic is yet not supported 4560b57cec5SDimitry Andric bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple(); 4570b57cec5SDimitry Andric 4580b57cec5SDimitry Andric // If we're allowed to form a memset, and the stored value would be 4590b57cec5SDimitry Andric // acceptable for memset, use it. 460e8d8bef9SDimitry Andric if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset && 4610b57cec5SDimitry Andric // Verify that the stored value is loop invariant. If not, we can't 4620b57cec5SDimitry Andric // promote the memset. 4630b57cec5SDimitry Andric CurLoop->isLoopInvariant(SplatValue)) { 4640b57cec5SDimitry Andric // It looks like we can use SplatValue. 4650b57cec5SDimitry Andric return LegalStoreKind::Memset; 466fe6060f1SDimitry Andric } 467fe6060f1SDimitry Andric if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset && 4680b57cec5SDimitry Andric // Don't create memset_pattern16s with address spaces. 4690b57cec5SDimitry Andric StorePtr->getType()->getPointerAddressSpace() == 0 && 470fe6060f1SDimitry Andric getMemSetPatternValue(StoredVal, DL)) { 4710b57cec5SDimitry Andric // It looks like we can use PatternValue! 4720b57cec5SDimitry Andric return LegalStoreKind::MemsetPattern; 4730b57cec5SDimitry Andric } 4740b57cec5SDimitry Andric 4750b57cec5SDimitry Andric // Otherwise, see if the store can be turned into a memcpy. 476e8d8bef9SDimitry Andric if (HasMemcpy && !DisableLIRP::Memcpy) { 4770b57cec5SDimitry Andric // Check to see if the stride matches the size of the store. If so, then we 4780b57cec5SDimitry Andric // know that every byte is touched in the loop. 4790b57cec5SDimitry Andric APInt Stride = getStoreStride(StoreEv); 4800b57cec5SDimitry Andric unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType()); 4810b57cec5SDimitry Andric if (StoreSize != Stride && StoreSize != -Stride) 4820b57cec5SDimitry Andric return LegalStoreKind::None; 4830b57cec5SDimitry Andric 4840b57cec5SDimitry Andric // The store must be feeding a non-volatile load. 4850b57cec5SDimitry Andric LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand()); 4860b57cec5SDimitry Andric 4870b57cec5SDimitry Andric // Only allow non-volatile loads 4880b57cec5SDimitry Andric if (!LI || LI->isVolatile()) 4890b57cec5SDimitry Andric return LegalStoreKind::None; 4900b57cec5SDimitry Andric // Only allow simple or unordered-atomic loads 4910b57cec5SDimitry Andric if (!LI->isUnordered()) 4920b57cec5SDimitry Andric return LegalStoreKind::None; 4930b57cec5SDimitry Andric 4940b57cec5SDimitry Andric // See if the pointer expression is an AddRec like {base,+,1} on the current 4950b57cec5SDimitry Andric // loop, which indicates a strided load. If we have something else, it's a 4960b57cec5SDimitry Andric // random load we can't handle. 4970b57cec5SDimitry Andric const SCEVAddRecExpr *LoadEv = 4980b57cec5SDimitry Andric dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand())); 4990b57cec5SDimitry Andric if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine()) 5000b57cec5SDimitry Andric return LegalStoreKind::None; 5010b57cec5SDimitry Andric 5020b57cec5SDimitry Andric // The store and load must share the same stride. 5030b57cec5SDimitry Andric if (StoreEv->getOperand(1) != LoadEv->getOperand(1)) 5040b57cec5SDimitry Andric return LegalStoreKind::None; 5050b57cec5SDimitry Andric 5060b57cec5SDimitry Andric // Success. This store can be converted into a memcpy. 5070b57cec5SDimitry Andric UnorderedAtomic = UnorderedAtomic || LI->isAtomic(); 5080b57cec5SDimitry Andric return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy 5090b57cec5SDimitry Andric : LegalStoreKind::Memcpy; 5100b57cec5SDimitry Andric } 5110b57cec5SDimitry Andric // This store can't be transformed into a memset/memcpy. 5120b57cec5SDimitry Andric return LegalStoreKind::None; 5130b57cec5SDimitry Andric } 5140b57cec5SDimitry Andric 5150b57cec5SDimitry Andric void LoopIdiomRecognize::collectStores(BasicBlock *BB) { 5160b57cec5SDimitry Andric StoreRefsForMemset.clear(); 5170b57cec5SDimitry Andric StoreRefsForMemsetPattern.clear(); 5180b57cec5SDimitry Andric StoreRefsForMemcpy.clear(); 5190b57cec5SDimitry Andric for (Instruction &I : *BB) { 5200b57cec5SDimitry Andric StoreInst *SI = dyn_cast<StoreInst>(&I); 5210b57cec5SDimitry Andric if (!SI) 5220b57cec5SDimitry Andric continue; 5230b57cec5SDimitry Andric 5240b57cec5SDimitry Andric // Make sure this is a strided store with a constant stride. 5250b57cec5SDimitry Andric switch (isLegalStore(SI)) { 5260b57cec5SDimitry Andric case LegalStoreKind::None: 5270b57cec5SDimitry Andric // Nothing to do 5280b57cec5SDimitry Andric break; 5290b57cec5SDimitry Andric case LegalStoreKind::Memset: { 5300b57cec5SDimitry Andric // Find the base pointer. 531e8d8bef9SDimitry Andric Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); 5320b57cec5SDimitry Andric StoreRefsForMemset[Ptr].push_back(SI); 5330b57cec5SDimitry Andric } break; 5340b57cec5SDimitry Andric case LegalStoreKind::MemsetPattern: { 5350b57cec5SDimitry Andric // Find the base pointer. 536e8d8bef9SDimitry Andric Value *Ptr = getUnderlyingObject(SI->getPointerOperand()); 5370b57cec5SDimitry Andric StoreRefsForMemsetPattern[Ptr].push_back(SI); 5380b57cec5SDimitry Andric } break; 5390b57cec5SDimitry Andric case LegalStoreKind::Memcpy: 5400b57cec5SDimitry Andric case LegalStoreKind::UnorderedAtomicMemcpy: 5410b57cec5SDimitry Andric StoreRefsForMemcpy.push_back(SI); 5420b57cec5SDimitry Andric break; 5430b57cec5SDimitry Andric default: 5440b57cec5SDimitry Andric assert(false && "unhandled return value"); 5450b57cec5SDimitry Andric break; 5460b57cec5SDimitry Andric } 5470b57cec5SDimitry Andric } 5480b57cec5SDimitry Andric } 5490b57cec5SDimitry Andric 5500b57cec5SDimitry Andric /// runOnLoopBlock - Process the specified block, which lives in a counted loop 5510b57cec5SDimitry Andric /// with the specified backedge count. This block is known to be in the current 5520b57cec5SDimitry Andric /// loop and not in any subloops. 5530b57cec5SDimitry Andric bool LoopIdiomRecognize::runOnLoopBlock( 5540b57cec5SDimitry Andric BasicBlock *BB, const SCEV *BECount, 5550b57cec5SDimitry Andric SmallVectorImpl<BasicBlock *> &ExitBlocks) { 5560b57cec5SDimitry Andric // We can only promote stores in this block if they are unconditionally 5570b57cec5SDimitry Andric // executed in the loop. For a block to be unconditionally executed, it has 5580b57cec5SDimitry Andric // to dominate all the exit blocks of the loop. Verify this now. 559349cc55cSDimitry Andric for (BasicBlock *ExitBlock : ExitBlocks) 560349cc55cSDimitry Andric if (!DT->dominates(BB, ExitBlock)) 5610b57cec5SDimitry Andric return false; 5620b57cec5SDimitry Andric 5630b57cec5SDimitry Andric bool MadeChange = false; 5640b57cec5SDimitry Andric // Look for store instructions, which may be optimized to memset/memcpy. 5650b57cec5SDimitry Andric collectStores(BB); 5660b57cec5SDimitry Andric 5670b57cec5SDimitry Andric // Look for a single store or sets of stores with a common base, which can be 5680b57cec5SDimitry Andric // optimized into a memset (memset_pattern). The latter most commonly happens 5690b57cec5SDimitry Andric // with structs and handunrolled loops. 5700b57cec5SDimitry Andric for (auto &SL : StoreRefsForMemset) 5710b57cec5SDimitry Andric MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes); 5720b57cec5SDimitry Andric 5730b57cec5SDimitry Andric for (auto &SL : StoreRefsForMemsetPattern) 5740b57cec5SDimitry Andric MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No); 5750b57cec5SDimitry Andric 5760b57cec5SDimitry Andric // Optimize the store into a memcpy, if it feeds an similarly strided load. 5770b57cec5SDimitry Andric for (auto &SI : StoreRefsForMemcpy) 5780b57cec5SDimitry Andric MadeChange |= processLoopStoreOfLoopLoad(SI, BECount); 5790b57cec5SDimitry Andric 580fe6060f1SDimitry Andric MadeChange |= processLoopMemIntrinsic<MemCpyInst>( 581fe6060f1SDimitry Andric BB, &LoopIdiomRecognize::processLoopMemCpy, BECount); 582fe6060f1SDimitry Andric MadeChange |= processLoopMemIntrinsic<MemSetInst>( 583fe6060f1SDimitry Andric BB, &LoopIdiomRecognize::processLoopMemSet, BECount); 5840b57cec5SDimitry Andric 5850b57cec5SDimitry Andric return MadeChange; 5860b57cec5SDimitry Andric } 5870b57cec5SDimitry Andric 5880b57cec5SDimitry Andric /// See if this store(s) can be promoted to a memset. 5890b57cec5SDimitry Andric bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, 5900b57cec5SDimitry Andric const SCEV *BECount, ForMemset For) { 5910b57cec5SDimitry Andric // Try to find consecutive stores that can be transformed into memsets. 5920b57cec5SDimitry Andric SetVector<StoreInst *> Heads, Tails; 5930b57cec5SDimitry Andric SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; 5940b57cec5SDimitry Andric 5950b57cec5SDimitry Andric // Do a quadratic search on all of the given stores and find 5960b57cec5SDimitry Andric // all of the pairs of stores that follow each other. 5970b57cec5SDimitry Andric SmallVector<unsigned, 16> IndexQueue; 5980b57cec5SDimitry Andric for (unsigned i = 0, e = SL.size(); i < e; ++i) { 5990b57cec5SDimitry Andric assert(SL[i]->isSimple() && "Expected only non-volatile stores."); 6000b57cec5SDimitry Andric 6010b57cec5SDimitry Andric Value *FirstStoredVal = SL[i]->getValueOperand(); 6020b57cec5SDimitry Andric Value *FirstStorePtr = SL[i]->getPointerOperand(); 6030b57cec5SDimitry Andric const SCEVAddRecExpr *FirstStoreEv = 6040b57cec5SDimitry Andric cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr)); 6050b57cec5SDimitry Andric APInt FirstStride = getStoreStride(FirstStoreEv); 6060b57cec5SDimitry Andric unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType()); 6070b57cec5SDimitry Andric 6080b57cec5SDimitry Andric // See if we can optimize just this store in isolation. 6090b57cec5SDimitry Andric if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) { 6100b57cec5SDimitry Andric Heads.insert(SL[i]); 6110b57cec5SDimitry Andric continue; 6120b57cec5SDimitry Andric } 6130b57cec5SDimitry Andric 6140b57cec5SDimitry Andric Value *FirstSplatValue = nullptr; 6150b57cec5SDimitry Andric Constant *FirstPatternValue = nullptr; 6160b57cec5SDimitry Andric 6170b57cec5SDimitry Andric if (For == ForMemset::Yes) 6180b57cec5SDimitry Andric FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL); 6190b57cec5SDimitry Andric else 6200b57cec5SDimitry Andric FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL); 6210b57cec5SDimitry Andric 6220b57cec5SDimitry Andric assert((FirstSplatValue || FirstPatternValue) && 6230b57cec5SDimitry Andric "Expected either splat value or pattern value."); 6240b57cec5SDimitry Andric 6250b57cec5SDimitry Andric IndexQueue.clear(); 6260b57cec5SDimitry Andric // If a store has multiple consecutive store candidates, search Stores 6270b57cec5SDimitry Andric // array according to the sequence: from i+1 to e, then from i-1 to 0. 6280b57cec5SDimitry Andric // This is because usually pairing with immediate succeeding or preceding 6290b57cec5SDimitry Andric // candidate create the best chance to find memset opportunity. 6300b57cec5SDimitry Andric unsigned j = 0; 6310b57cec5SDimitry Andric for (j = i + 1; j < e; ++j) 6320b57cec5SDimitry Andric IndexQueue.push_back(j); 6330b57cec5SDimitry Andric for (j = i; j > 0; --j) 6340b57cec5SDimitry Andric IndexQueue.push_back(j - 1); 6350b57cec5SDimitry Andric 6360b57cec5SDimitry Andric for (auto &k : IndexQueue) { 6370b57cec5SDimitry Andric assert(SL[k]->isSimple() && "Expected only non-volatile stores."); 6380b57cec5SDimitry Andric Value *SecondStorePtr = SL[k]->getPointerOperand(); 6390b57cec5SDimitry Andric const SCEVAddRecExpr *SecondStoreEv = 6400b57cec5SDimitry Andric cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr)); 6410b57cec5SDimitry Andric APInt SecondStride = getStoreStride(SecondStoreEv); 6420b57cec5SDimitry Andric 6430b57cec5SDimitry Andric if (FirstStride != SecondStride) 6440b57cec5SDimitry Andric continue; 6450b57cec5SDimitry Andric 6460b57cec5SDimitry Andric Value *SecondStoredVal = SL[k]->getValueOperand(); 6470b57cec5SDimitry Andric Value *SecondSplatValue = nullptr; 6480b57cec5SDimitry Andric Constant *SecondPatternValue = nullptr; 6490b57cec5SDimitry Andric 6500b57cec5SDimitry Andric if (For == ForMemset::Yes) 6510b57cec5SDimitry Andric SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL); 6520b57cec5SDimitry Andric else 6530b57cec5SDimitry Andric SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL); 6540b57cec5SDimitry Andric 6550b57cec5SDimitry Andric assert((SecondSplatValue || SecondPatternValue) && 6560b57cec5SDimitry Andric "Expected either splat value or pattern value."); 6570b57cec5SDimitry Andric 6580b57cec5SDimitry Andric if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) { 6590b57cec5SDimitry Andric if (For == ForMemset::Yes) { 6600b57cec5SDimitry Andric if (isa<UndefValue>(FirstSplatValue)) 6610b57cec5SDimitry Andric FirstSplatValue = SecondSplatValue; 6620b57cec5SDimitry Andric if (FirstSplatValue != SecondSplatValue) 6630b57cec5SDimitry Andric continue; 6640b57cec5SDimitry Andric } else { 6650b57cec5SDimitry Andric if (isa<UndefValue>(FirstPatternValue)) 6660b57cec5SDimitry Andric FirstPatternValue = SecondPatternValue; 6670b57cec5SDimitry Andric if (FirstPatternValue != SecondPatternValue) 6680b57cec5SDimitry Andric continue; 6690b57cec5SDimitry Andric } 6700b57cec5SDimitry Andric Tails.insert(SL[k]); 6710b57cec5SDimitry Andric Heads.insert(SL[i]); 6720b57cec5SDimitry Andric ConsecutiveChain[SL[i]] = SL[k]; 6730b57cec5SDimitry Andric break; 6740b57cec5SDimitry Andric } 6750b57cec5SDimitry Andric } 6760b57cec5SDimitry Andric } 6770b57cec5SDimitry Andric 6780b57cec5SDimitry Andric // We may run into multiple chains that merge into a single chain. We mark the 6790b57cec5SDimitry Andric // stores that we transformed so that we don't visit the same store twice. 6800b57cec5SDimitry Andric SmallPtrSet<Value *, 16> TransformedStores; 6810b57cec5SDimitry Andric bool Changed = false; 6820b57cec5SDimitry Andric 6830b57cec5SDimitry Andric // For stores that start but don't end a link in the chain: 684349cc55cSDimitry Andric for (StoreInst *I : Heads) { 685349cc55cSDimitry Andric if (Tails.count(I)) 6860b57cec5SDimitry Andric continue; 6870b57cec5SDimitry Andric 6880b57cec5SDimitry Andric // We found a store instr that starts a chain. Now follow the chain and try 6890b57cec5SDimitry Andric // to transform it. 6900b57cec5SDimitry Andric SmallPtrSet<Instruction *, 8> AdjacentStores; 6910b57cec5SDimitry Andric StoreInst *HeadStore = I; 6920b57cec5SDimitry Andric unsigned StoreSize = 0; 6930b57cec5SDimitry Andric 6940b57cec5SDimitry Andric // Collect the chain into a list. 6950b57cec5SDimitry Andric while (Tails.count(I) || Heads.count(I)) { 6960b57cec5SDimitry Andric if (TransformedStores.count(I)) 6970b57cec5SDimitry Andric break; 6980b57cec5SDimitry Andric AdjacentStores.insert(I); 6990b57cec5SDimitry Andric 7000b57cec5SDimitry Andric StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType()); 7010b57cec5SDimitry Andric // Move to the next value in the chain. 7020b57cec5SDimitry Andric I = ConsecutiveChain[I]; 7030b57cec5SDimitry Andric } 7040b57cec5SDimitry Andric 7050b57cec5SDimitry Andric Value *StoredVal = HeadStore->getValueOperand(); 7060b57cec5SDimitry Andric Value *StorePtr = HeadStore->getPointerOperand(); 7070b57cec5SDimitry Andric const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); 7080b57cec5SDimitry Andric APInt Stride = getStoreStride(StoreEv); 7090b57cec5SDimitry Andric 7100b57cec5SDimitry Andric // Check to see if the stride matches the size of the stores. If so, then 7110b57cec5SDimitry Andric // we know that every byte is touched in the loop. 7120b57cec5SDimitry Andric if (StoreSize != Stride && StoreSize != -Stride) 7130b57cec5SDimitry Andric continue; 7140b57cec5SDimitry Andric 715349cc55cSDimitry Andric bool IsNegStride = StoreSize == -Stride; 7160b57cec5SDimitry Andric 717349cc55cSDimitry Andric Type *IntIdxTy = DL->getIndexType(StorePtr->getType()); 718349cc55cSDimitry Andric const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize); 719349cc55cSDimitry Andric if (processLoopStridedStore(StorePtr, StoreSizeSCEV, 7200eae32dcSDimitry Andric MaybeAlign(HeadStore->getAlign()), StoredVal, 7210eae32dcSDimitry Andric HeadStore, AdjacentStores, StoreEv, BECount, 7220eae32dcSDimitry Andric IsNegStride)) { 7230b57cec5SDimitry Andric TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end()); 7240b57cec5SDimitry Andric Changed = true; 7250b57cec5SDimitry Andric } 7260b57cec5SDimitry Andric } 7270b57cec5SDimitry Andric 7280b57cec5SDimitry Andric return Changed; 7290b57cec5SDimitry Andric } 7300b57cec5SDimitry Andric 731fe6060f1SDimitry Andric /// processLoopMemIntrinsic - Template function for calling different processor 73281ad6265SDimitry Andric /// functions based on mem intrinsic type. 733fe6060f1SDimitry Andric template <typename MemInst> 734fe6060f1SDimitry Andric bool LoopIdiomRecognize::processLoopMemIntrinsic( 735fe6060f1SDimitry Andric BasicBlock *BB, 736fe6060f1SDimitry Andric bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *), 737fe6060f1SDimitry Andric const SCEV *BECount) { 738fe6060f1SDimitry Andric bool MadeChange = false; 739fe6060f1SDimitry Andric for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 740fe6060f1SDimitry Andric Instruction *Inst = &*I++; 741fe6060f1SDimitry Andric // Look for memory instructions, which may be optimized to a larger one. 742fe6060f1SDimitry Andric if (MemInst *MI = dyn_cast<MemInst>(Inst)) { 743fe6060f1SDimitry Andric WeakTrackingVH InstPtr(&*I); 744fe6060f1SDimitry Andric if (!(this->*Processor)(MI, BECount)) 745fe6060f1SDimitry Andric continue; 746fe6060f1SDimitry Andric MadeChange = true; 747fe6060f1SDimitry Andric 748fe6060f1SDimitry Andric // If processing the instruction invalidated our iterator, start over from 749fe6060f1SDimitry Andric // the top of the block. 750fe6060f1SDimitry Andric if (!InstPtr) 751fe6060f1SDimitry Andric I = BB->begin(); 752fe6060f1SDimitry Andric } 753fe6060f1SDimitry Andric } 754fe6060f1SDimitry Andric return MadeChange; 755fe6060f1SDimitry Andric } 756fe6060f1SDimitry Andric 757fe6060f1SDimitry Andric /// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy 758fe6060f1SDimitry Andric bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI, 759fe6060f1SDimitry Andric const SCEV *BECount) { 760fe6060f1SDimitry Andric // We can only handle non-volatile memcpys with a constant size. 761fe6060f1SDimitry Andric if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength())) 762fe6060f1SDimitry Andric return false; 763fe6060f1SDimitry Andric 764fe6060f1SDimitry Andric // If we're not allowed to hack on memcpy, we fail. 765fe6060f1SDimitry Andric if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy) 766fe6060f1SDimitry Andric return false; 767fe6060f1SDimitry Andric 768fe6060f1SDimitry Andric Value *Dest = MCI->getDest(); 769fe6060f1SDimitry Andric Value *Source = MCI->getSource(); 770fe6060f1SDimitry Andric if (!Dest || !Source) 771fe6060f1SDimitry Andric return false; 772fe6060f1SDimitry Andric 773fe6060f1SDimitry Andric // See if the load and store pointer expressions are AddRec like {base,+,1} on 774fe6060f1SDimitry Andric // the current loop, which indicates a strided load and store. If we have 775fe6060f1SDimitry Andric // something else, it's a random load or store we can't handle. 776fe6060f1SDimitry Andric const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest)); 777fe6060f1SDimitry Andric if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) 778fe6060f1SDimitry Andric return false; 779fe6060f1SDimitry Andric const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source)); 780fe6060f1SDimitry Andric if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine()) 781fe6060f1SDimitry Andric return false; 782fe6060f1SDimitry Andric 783fe6060f1SDimitry Andric // Reject memcpys that are so large that they overflow an unsigned. 784fe6060f1SDimitry Andric uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue(); 785fe6060f1SDimitry Andric if ((SizeInBytes >> 32) != 0) 786fe6060f1SDimitry Andric return false; 787fe6060f1SDimitry Andric 788fe6060f1SDimitry Andric // Check if the stride matches the size of the memcpy. If so, then we know 789fe6060f1SDimitry Andric // that every byte is touched in the loop. 790349cc55cSDimitry Andric const SCEVConstant *ConstStoreStride = 791fe6060f1SDimitry Andric dyn_cast<SCEVConstant>(StoreEv->getOperand(1)); 792349cc55cSDimitry Andric const SCEVConstant *ConstLoadStride = 793fe6060f1SDimitry Andric dyn_cast<SCEVConstant>(LoadEv->getOperand(1)); 794349cc55cSDimitry Andric if (!ConstStoreStride || !ConstLoadStride) 795fe6060f1SDimitry Andric return false; 796fe6060f1SDimitry Andric 797349cc55cSDimitry Andric APInt StoreStrideValue = ConstStoreStride->getAPInt(); 798349cc55cSDimitry Andric APInt LoadStrideValue = ConstLoadStride->getAPInt(); 799fe6060f1SDimitry Andric // Huge stride value - give up 800fe6060f1SDimitry Andric if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64) 801fe6060f1SDimitry Andric return false; 802fe6060f1SDimitry Andric 803fe6060f1SDimitry Andric if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) { 804fe6060f1SDimitry Andric ORE.emit([&]() { 805fe6060f1SDimitry Andric return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI) 806fe6060f1SDimitry Andric << ore::NV("Inst", "memcpy") << " in " 807fe6060f1SDimitry Andric << ore::NV("Function", MCI->getFunction()) 808349cc55cSDimitry Andric << " function will not be hoisted: " 809fe6060f1SDimitry Andric << ore::NV("Reason", "memcpy size is not equal to stride"); 810fe6060f1SDimitry Andric }); 811fe6060f1SDimitry Andric return false; 812fe6060f1SDimitry Andric } 813fe6060f1SDimitry Andric 814fe6060f1SDimitry Andric int64_t StoreStrideInt = StoreStrideValue.getSExtValue(); 815fe6060f1SDimitry Andric int64_t LoadStrideInt = LoadStrideValue.getSExtValue(); 816fe6060f1SDimitry Andric // Check if the load stride matches the store stride. 817fe6060f1SDimitry Andric if (StoreStrideInt != LoadStrideInt) 818fe6060f1SDimitry Andric return false; 819fe6060f1SDimitry Andric 820349cc55cSDimitry Andric return processLoopStoreOfLoopLoad( 821349cc55cSDimitry Andric Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes), 822349cc55cSDimitry Andric MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv, 823349cc55cSDimitry Andric BECount); 824fe6060f1SDimitry Andric } 825fe6060f1SDimitry Andric 8260b57cec5SDimitry Andric /// processLoopMemSet - See if this memset can be promoted to a large memset. 8270b57cec5SDimitry Andric bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, 8280b57cec5SDimitry Andric const SCEV *BECount) { 829349cc55cSDimitry Andric // We can only handle non-volatile memsets. 830349cc55cSDimitry Andric if (MSI->isVolatile()) 8310b57cec5SDimitry Andric return false; 8320b57cec5SDimitry Andric 8330b57cec5SDimitry Andric // If we're not allowed to hack on memset, we fail. 834fe6060f1SDimitry Andric if (!HasMemset || DisableLIRP::Memset) 8350b57cec5SDimitry Andric return false; 8360b57cec5SDimitry Andric 8370b57cec5SDimitry Andric Value *Pointer = MSI->getDest(); 8380b57cec5SDimitry Andric 8390b57cec5SDimitry Andric // See if the pointer expression is an AddRec like {base,+,1} on the current 8400b57cec5SDimitry Andric // loop, which indicates a strided store. If we have something else, it's a 8410b57cec5SDimitry Andric // random store we can't handle. 8420b57cec5SDimitry Andric const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer)); 843349cc55cSDimitry Andric if (!Ev || Ev->getLoop() != CurLoop) 844349cc55cSDimitry Andric return false; 845349cc55cSDimitry Andric if (!Ev->isAffine()) { 846349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << " Pointer is not affine, abort\n"); 847349cc55cSDimitry Andric return false; 848349cc55cSDimitry Andric } 849349cc55cSDimitry Andric 850349cc55cSDimitry Andric const SCEV *PointerStrideSCEV = Ev->getOperand(1); 851349cc55cSDimitry Andric const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength()); 852349cc55cSDimitry Andric if (!PointerStrideSCEV || !MemsetSizeSCEV) 8530b57cec5SDimitry Andric return false; 8540b57cec5SDimitry Andric 855349cc55cSDimitry Andric bool IsNegStride = false; 856349cc55cSDimitry Andric const bool IsConstantSize = isa<ConstantInt>(MSI->getLength()); 857349cc55cSDimitry Andric 858349cc55cSDimitry Andric if (IsConstantSize) { 859349cc55cSDimitry Andric // Memset size is constant. 860349cc55cSDimitry Andric // Check if the pointer stride matches the memset size. If so, then 861349cc55cSDimitry Andric // we know that every byte is touched in the loop. 862349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << " memset size is constant\n"); 8630b57cec5SDimitry Andric uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue(); 8640b57cec5SDimitry Andric const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1)); 8650b57cec5SDimitry Andric if (!ConstStride) 8660b57cec5SDimitry Andric return false; 8670b57cec5SDimitry Andric 8680b57cec5SDimitry Andric APInt Stride = ConstStride->getAPInt(); 8690b57cec5SDimitry Andric if (SizeInBytes != Stride && SizeInBytes != -Stride) 8700b57cec5SDimitry Andric return false; 8710b57cec5SDimitry Andric 872349cc55cSDimitry Andric IsNegStride = SizeInBytes == -Stride; 873349cc55cSDimitry Andric } else { 874349cc55cSDimitry Andric // Memset size is non-constant. 875349cc55cSDimitry Andric // Check if the pointer stride matches the memset size. 876349cc55cSDimitry Andric // To be conservative, the pass would not promote pointers that aren't in 877349cc55cSDimitry Andric // address space zero. Also, the pass only handles memset length and stride 878349cc55cSDimitry Andric // that are invariant for the top level loop. 879349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << " memset size is non-constant\n"); 880349cc55cSDimitry Andric if (Pointer->getType()->getPointerAddressSpace() != 0) { 881349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << " pointer is not in address space zero, " 882349cc55cSDimitry Andric << "abort\n"); 883349cc55cSDimitry Andric return false; 884349cc55cSDimitry Andric } 885349cc55cSDimitry Andric if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) { 886349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << " memset size is not a loop-invariant, " 887349cc55cSDimitry Andric << "abort\n"); 888349cc55cSDimitry Andric return false; 889349cc55cSDimitry Andric } 890349cc55cSDimitry Andric 891349cc55cSDimitry Andric // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV 892349cc55cSDimitry Andric IsNegStride = PointerStrideSCEV->isNonConstantNegative(); 893349cc55cSDimitry Andric const SCEV *PositiveStrideSCEV = 894349cc55cSDimitry Andric IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV) 895349cc55cSDimitry Andric : PointerStrideSCEV; 896349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << " MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n" 897349cc55cSDimitry Andric << " PositiveStrideSCEV: " << *PositiveStrideSCEV 898349cc55cSDimitry Andric << "\n"); 899349cc55cSDimitry Andric 900349cc55cSDimitry Andric if (PositiveStrideSCEV != MemsetSizeSCEV) { 9010eae32dcSDimitry Andric // If an expression is covered by the loop guard, compare again and 9020eae32dcSDimitry Andric // proceed with optimization if equal. 9030eae32dcSDimitry Andric const SCEV *FoldedPositiveStride = 9040eae32dcSDimitry Andric SE->applyLoopGuards(PositiveStrideSCEV, CurLoop); 9050eae32dcSDimitry Andric const SCEV *FoldedMemsetSize = 9060eae32dcSDimitry Andric SE->applyLoopGuards(MemsetSizeSCEV, CurLoop); 9070eae32dcSDimitry Andric 9080eae32dcSDimitry Andric LLVM_DEBUG(dbgs() << " Try to fold SCEV based on loop guard\n" 9090eae32dcSDimitry Andric << " FoldedMemsetSize: " << *FoldedMemsetSize << "\n" 9100eae32dcSDimitry Andric << " FoldedPositiveStride: " << *FoldedPositiveStride 9110eae32dcSDimitry Andric << "\n"); 9120eae32dcSDimitry Andric 9130eae32dcSDimitry Andric if (FoldedPositiveStride != FoldedMemsetSize) { 914349cc55cSDimitry Andric LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n"); 915349cc55cSDimitry Andric return false; 916349cc55cSDimitry Andric } 917349cc55cSDimitry Andric } 9180eae32dcSDimitry Andric } 919349cc55cSDimitry Andric 9200b57cec5SDimitry Andric // Verify that the memset value is loop invariant. If not, we can't promote 9210b57cec5SDimitry Andric // the memset. 9220b57cec5SDimitry Andric Value *SplatValue = MSI->getValue(); 9230b57cec5SDimitry Andric if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue)) 9240b57cec5SDimitry Andric return false; 9250b57cec5SDimitry Andric 9260b57cec5SDimitry Andric SmallPtrSet<Instruction *, 1> MSIs; 9270b57cec5SDimitry Andric MSIs.insert(MSI); 928349cc55cSDimitry Andric return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()), 92981ad6265SDimitry Andric MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev, 93081ad6265SDimitry Andric BECount, IsNegStride, /*IsLoopMemset=*/true); 9310b57cec5SDimitry Andric } 9320b57cec5SDimitry Andric 9330b57cec5SDimitry Andric /// mayLoopAccessLocation - Return true if the specified loop might access the 9340b57cec5SDimitry Andric /// specified pointer location, which is a loop-strided access. The 'Access' 9350b57cec5SDimitry Andric /// argument specifies what the verboten forms of access are (read or write). 9360b57cec5SDimitry Andric static bool 9370b57cec5SDimitry Andric mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, 938349cc55cSDimitry Andric const SCEV *BECount, const SCEV *StoreSizeSCEV, 9390b57cec5SDimitry Andric AliasAnalysis &AA, 940349cc55cSDimitry Andric SmallPtrSetImpl<Instruction *> &IgnoredInsts) { 9410b57cec5SDimitry Andric // Get the location that may be stored across the loop. Since the access is 9420b57cec5SDimitry Andric // strided positively through memory, we say that the modified location starts 9430b57cec5SDimitry Andric // at the pointer and has infinite size. 944e8d8bef9SDimitry Andric LocationSize AccessSize = LocationSize::afterPointer(); 9450b57cec5SDimitry Andric 9460b57cec5SDimitry Andric // If the loop iterates a fixed number of times, we can refine the access size 9470b57cec5SDimitry Andric // to be exactly the size of the memset, which is (BECount+1)*StoreSize 948349cc55cSDimitry Andric const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount); 949349cc55cSDimitry Andric const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV); 9505f757f3fSDimitry Andric if (BECst && ConstSize) { 9515f757f3fSDimitry Andric std::optional<uint64_t> BEInt = BECst->getAPInt().tryZExtValue(); 9525f757f3fSDimitry Andric std::optional<uint64_t> SizeInt = ConstSize->getAPInt().tryZExtValue(); 9535f757f3fSDimitry Andric // FIXME: Should this check for overflow? 9545f757f3fSDimitry Andric if (BEInt && SizeInt) 9555f757f3fSDimitry Andric AccessSize = LocationSize::precise((*BEInt + 1) * *SizeInt); 9565f757f3fSDimitry Andric } 9570b57cec5SDimitry Andric 9580b57cec5SDimitry Andric // TODO: For this to be really effective, we have to dive into the pointer 9590b57cec5SDimitry Andric // operand in the store. Store to &A[i] of 100 will always return may alias 9600b57cec5SDimitry Andric // with store of &A[100], we need to StoreLoc to be "A" with size of 100, 9610b57cec5SDimitry Andric // which will then no-alias a store to &A[100]. 9620b57cec5SDimitry Andric MemoryLocation StoreLoc(Ptr, AccessSize); 9630b57cec5SDimitry Andric 964349cc55cSDimitry Andric for (BasicBlock *B : L->blocks()) 965349cc55cSDimitry Andric for (Instruction &I : *B) 966349cc55cSDimitry Andric if (!IgnoredInsts.contains(&I) && 967bdd1243dSDimitry Andric isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access)) 9680b57cec5SDimitry Andric return true; 9690b57cec5SDimitry Andric return false; 9700b57cec5SDimitry Andric } 9710b57cec5SDimitry Andric 9720b57cec5SDimitry Andric // If we have a negative stride, Start refers to the end of the memory location 9730b57cec5SDimitry Andric // we're trying to memset. Therefore, we need to recompute the base pointer, 9740b57cec5SDimitry Andric // which is just Start - BECount*Size. 9750b57cec5SDimitry Andric static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount, 976349cc55cSDimitry Andric Type *IntPtr, const SCEV *StoreSizeSCEV, 9770b57cec5SDimitry Andric ScalarEvolution *SE) { 9780b57cec5SDimitry Andric const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr); 979349cc55cSDimitry Andric if (!StoreSizeSCEV->isOne()) { 980349cc55cSDimitry Andric // index = back edge count * store size 981349cc55cSDimitry Andric Index = SE->getMulExpr(Index, 982349cc55cSDimitry Andric SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr), 9830b57cec5SDimitry Andric SCEV::FlagNUW); 984349cc55cSDimitry Andric } 985349cc55cSDimitry Andric // base pointer = start - index * store size 9860b57cec5SDimitry Andric return SE->getMinusSCEV(Start, Index); 9870b57cec5SDimitry Andric } 9880b57cec5SDimitry Andric 9890b57cec5SDimitry Andric /// Compute the number of bytes as a SCEV from the backedge taken count. 9900b57cec5SDimitry Andric /// 9910b57cec5SDimitry Andric /// This also maps the SCEV into the provided type and tries to handle the 9920b57cec5SDimitry Andric /// computation in a way that will fold cleanly. 9930b57cec5SDimitry Andric static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr, 994349cc55cSDimitry Andric const SCEV *StoreSizeSCEV, Loop *CurLoop, 9950b57cec5SDimitry Andric const DataLayout *DL, ScalarEvolution *SE) { 99606c3fb27SDimitry Andric const SCEV *TripCountSCEV = 99706c3fb27SDimitry Andric SE->getTripCountFromExitCount(BECount, IntPtr, CurLoop); 998349cc55cSDimitry Andric return SE->getMulExpr(TripCountSCEV, 999349cc55cSDimitry Andric SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr), 10000b57cec5SDimitry Andric SCEV::FlagNUW); 10010b57cec5SDimitry Andric } 10020b57cec5SDimitry Andric 10030b57cec5SDimitry Andric /// processLoopStridedStore - We see a strided store of some value. If we can 10040b57cec5SDimitry Andric /// transform this into a memset or memset_pattern in the loop preheader, do so. 10050b57cec5SDimitry Andric bool LoopIdiomRecognize::processLoopStridedStore( 1006349cc55cSDimitry Andric Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment, 10070b57cec5SDimitry Andric Value *StoredVal, Instruction *TheStore, 10080b57cec5SDimitry Andric SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, 1009349cc55cSDimitry Andric const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) { 101081ad6265SDimitry Andric Module *M = TheStore->getModule(); 10110b57cec5SDimitry Andric Value *SplatValue = isBytewiseValue(StoredVal, *DL); 10120b57cec5SDimitry Andric Constant *PatternValue = nullptr; 10130b57cec5SDimitry Andric 10140b57cec5SDimitry Andric if (!SplatValue) 10150b57cec5SDimitry Andric PatternValue = getMemSetPatternValue(StoredVal, DL); 10160b57cec5SDimitry Andric 10170b57cec5SDimitry Andric assert((SplatValue || PatternValue) && 10180b57cec5SDimitry Andric "Expected either splat value or pattern value."); 10190b57cec5SDimitry Andric 10200b57cec5SDimitry Andric // The trip count of the loop and the base pointer of the addrec SCEV is 10210b57cec5SDimitry Andric // guaranteed to be loop invariant, which means that it should dominate the 10220b57cec5SDimitry Andric // header. This allows us to insert code for it in the preheader. 10230b57cec5SDimitry Andric unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); 10240b57cec5SDimitry Andric BasicBlock *Preheader = CurLoop->getLoopPreheader(); 10250b57cec5SDimitry Andric IRBuilder<> Builder(Preheader->getTerminator()); 10260b57cec5SDimitry Andric SCEVExpander Expander(*SE, *DL, "loop-idiom"); 102704eeddc0SDimitry Andric SCEVExpanderCleaner ExpCleaner(Expander); 10280b57cec5SDimitry Andric 10295f757f3fSDimitry Andric Type *DestInt8PtrTy = Builder.getPtrTy(DestAS); 1030480093f4SDimitry Andric Type *IntIdxTy = DL->getIndexType(DestPtr->getType()); 10310b57cec5SDimitry Andric 1032e8d8bef9SDimitry Andric bool Changed = false; 10330b57cec5SDimitry Andric const SCEV *Start = Ev->getStart(); 10340b57cec5SDimitry Andric // Handle negative strided loops. 1035349cc55cSDimitry Andric if (IsNegStride) 1036349cc55cSDimitry Andric Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE); 10370b57cec5SDimitry Andric 10380b57cec5SDimitry Andric // TODO: ideally we should still be able to generate memset if SCEV expander 10390b57cec5SDimitry Andric // is taught to generate the dependencies at the latest point. 1040fcaf7f86SDimitry Andric if (!Expander.isSafeToExpand(Start)) 1041e8d8bef9SDimitry Andric return Changed; 10420b57cec5SDimitry Andric 10430b57cec5SDimitry Andric // Okay, we have a strided store "p[i]" of a splattable value. We can turn 10440b57cec5SDimitry Andric // this into a memset in the loop preheader now if we want. However, this 10450b57cec5SDimitry Andric // would be unsafe to do if there is anything else in the loop that may read 10460b57cec5SDimitry Andric // or write to the aliased location. Check for any overlap by generating the 10470b57cec5SDimitry Andric // base pointer and checking the region. 10480b57cec5SDimitry Andric Value *BasePtr = 10490b57cec5SDimitry Andric Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator()); 1050e8d8bef9SDimitry Andric 1051e8d8bef9SDimitry Andric // From here on out, conservatively report to the pass manager that we've 1052e8d8bef9SDimitry Andric // changed the IR, even if we later clean up these added instructions. There 1053e8d8bef9SDimitry Andric // may be structural differences e.g. in the order of use lists not accounted 1054e8d8bef9SDimitry Andric // for in just a textual dump of the IR. This is written as a variable, even 1055e8d8bef9SDimitry Andric // though statically all the places this dominates could be replaced with 1056e8d8bef9SDimitry Andric // 'true', with the hope that anyone trying to be clever / "more precise" with 1057e8d8bef9SDimitry Andric // the return value will read this comment, and leave them alone. 1058e8d8bef9SDimitry Andric Changed = true; 1059e8d8bef9SDimitry Andric 10600b57cec5SDimitry Andric if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount, 1061349cc55cSDimitry Andric StoreSizeSCEV, *AA, Stores)) 1062e8d8bef9SDimitry Andric return Changed; 10630b57cec5SDimitry Andric 10640b57cec5SDimitry Andric if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset)) 1065e8d8bef9SDimitry Andric return Changed; 10660b57cec5SDimitry Andric 10670b57cec5SDimitry Andric // Okay, everything looks good, insert the memset. 10680b57cec5SDimitry Andric 10690b57cec5SDimitry Andric const SCEV *NumBytesS = 1070349cc55cSDimitry Andric getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE); 10710b57cec5SDimitry Andric 10720b57cec5SDimitry Andric // TODO: ideally we should still be able to generate memset if SCEV expander 10730b57cec5SDimitry Andric // is taught to generate the dependencies at the latest point. 1074fcaf7f86SDimitry Andric if (!Expander.isSafeToExpand(NumBytesS)) 1075e8d8bef9SDimitry Andric return Changed; 10760b57cec5SDimitry Andric 10770b57cec5SDimitry Andric Value *NumBytes = 1078480093f4SDimitry Andric Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); 10790b57cec5SDimitry Andric 108006c3fb27SDimitry Andric if (!SplatValue && !isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) 108106c3fb27SDimitry Andric return Changed; 108206c3fb27SDimitry Andric 10831fd87a68SDimitry Andric AAMDNodes AATags = TheStore->getAAMetadata(); 108456f451bbSDimitry Andric for (Instruction *Store : Stores) 108556f451bbSDimitry Andric AATags = AATags.merge(Store->getAAMetadata()); 10861fd87a68SDimitry Andric if (auto CI = dyn_cast<ConstantInt>(NumBytes)) 10871fd87a68SDimitry Andric AATags = AATags.extendTo(CI->getZExtValue()); 10881fd87a68SDimitry Andric else 10891fd87a68SDimitry Andric AATags = AATags.extendTo(-1); 10901fd87a68SDimitry Andric 109106c3fb27SDimitry Andric CallInst *NewCall; 109206c3fb27SDimitry Andric if (SplatValue) { 10931fd87a68SDimitry Andric NewCall = Builder.CreateMemSet( 10941fd87a68SDimitry Andric BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment), 10951fd87a68SDimitry Andric /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias); 109606c3fb27SDimitry Andric } else { 109706c3fb27SDimitry Andric assert (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)); 10980b57cec5SDimitry Andric // Everything is emitted in default address space 10990b57cec5SDimitry Andric Type *Int8PtrTy = DestInt8PtrTy; 11000b57cec5SDimitry Andric 11010b57cec5SDimitry Andric StringRef FuncName = "memset_pattern16"; 110281ad6265SDimitry Andric FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16, 110381ad6265SDimitry Andric Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy); 110481ad6265SDimitry Andric inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI); 11050b57cec5SDimitry Andric 11060b57cec5SDimitry Andric // Otherwise we should form a memset_pattern16. PatternValue is known to be 11070b57cec5SDimitry Andric // an constant array of 16-bytes. Plop the value into a mergable global. 11080b57cec5SDimitry Andric GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, 11090b57cec5SDimitry Andric GlobalValue::PrivateLinkage, 11100b57cec5SDimitry Andric PatternValue, ".memset_pattern"); 11110b57cec5SDimitry Andric GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these. 11128bcb0991SDimitry Andric GV->setAlignment(Align(16)); 11135f757f3fSDimitry Andric Value *PatternPtr = GV; 11140b57cec5SDimitry Andric NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); 111506c3fb27SDimitry Andric 111606c3fb27SDimitry Andric // Set the TBAA info if present. 111706c3fb27SDimitry Andric if (AATags.TBAA) 111806c3fb27SDimitry Andric NewCall->setMetadata(LLVMContext::MD_tbaa, AATags.TBAA); 111906c3fb27SDimitry Andric 112006c3fb27SDimitry Andric if (AATags.Scope) 112106c3fb27SDimitry Andric NewCall->setMetadata(LLVMContext::MD_alias_scope, AATags.Scope); 112206c3fb27SDimitry Andric 112306c3fb27SDimitry Andric if (AATags.NoAlias) 112406c3fb27SDimitry Andric NewCall->setMetadata(LLVMContext::MD_noalias, AATags.NoAlias); 112506c3fb27SDimitry Andric } 112681ad6265SDimitry Andric 11275ffd83dbSDimitry Andric NewCall->setDebugLoc(TheStore->getDebugLoc()); 11285ffd83dbSDimitry Andric 11295ffd83dbSDimitry Andric if (MSSAU) { 11305ffd83dbSDimitry Andric MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB( 11315ffd83dbSDimitry Andric NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator); 11325ffd83dbSDimitry Andric MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true); 11335ffd83dbSDimitry Andric } 11340b57cec5SDimitry Andric 11350b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n" 11360b57cec5SDimitry Andric << " from store to: " << *Ev << " at: " << *TheStore 11370b57cec5SDimitry Andric << "\n"); 11380b57cec5SDimitry Andric 11390b57cec5SDimitry Andric ORE.emit([&]() { 1140349cc55cSDimitry Andric OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore", 1141349cc55cSDimitry Andric NewCall->getDebugLoc(), Preheader); 1142349cc55cSDimitry Andric R << "Transformed loop-strided store in " 1143fe6060f1SDimitry Andric << ore::NV("Function", TheStore->getFunction()) 1144fe6060f1SDimitry Andric << " function into a call to " 11450b57cec5SDimitry Andric << ore::NV("NewFunction", NewCall->getCalledFunction()) 1146fe6060f1SDimitry Andric << "() intrinsic"; 1147349cc55cSDimitry Andric if (!Stores.empty()) 1148349cc55cSDimitry Andric R << ore::setExtraArgs(); 1149349cc55cSDimitry Andric for (auto *I : Stores) { 1150349cc55cSDimitry Andric R << ore::NV("FromBlock", I->getParent()->getName()) 1151349cc55cSDimitry Andric << ore::NV("ToBlock", Preheader->getName()); 1152349cc55cSDimitry Andric } 1153349cc55cSDimitry Andric return R; 11540b57cec5SDimitry Andric }); 11550b57cec5SDimitry Andric 11560b57cec5SDimitry Andric // Okay, the memset has been formed. Zap the original store and anything that 11570b57cec5SDimitry Andric // feeds into it. 11585ffd83dbSDimitry Andric for (auto *I : Stores) { 11595ffd83dbSDimitry Andric if (MSSAU) 11605ffd83dbSDimitry Andric MSSAU->removeMemoryAccess(I, true); 11610b57cec5SDimitry Andric deleteDeadInstruction(I); 11625ffd83dbSDimitry Andric } 11635ffd83dbSDimitry Andric if (MSSAU && VerifyMemorySSA) 11645ffd83dbSDimitry Andric MSSAU->getMemorySSA()->verifyMemorySSA(); 11650b57cec5SDimitry Andric ++NumMemSet; 1166e8d8bef9SDimitry Andric ExpCleaner.markResultUsed(); 11670b57cec5SDimitry Andric return true; 11680b57cec5SDimitry Andric } 11690b57cec5SDimitry Andric 11700b57cec5SDimitry Andric /// If the stored value is a strided load in the same loop with the same stride 11710b57cec5SDimitry Andric /// this may be transformable into a memcpy. This kicks in for stuff like 11720b57cec5SDimitry Andric /// for (i) A[i] = B[i]; 11730b57cec5SDimitry Andric bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, 11740b57cec5SDimitry Andric const SCEV *BECount) { 11750b57cec5SDimitry Andric assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores."); 11760b57cec5SDimitry Andric 11770b57cec5SDimitry Andric Value *StorePtr = SI->getPointerOperand(); 11780b57cec5SDimitry Andric const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); 11790b57cec5SDimitry Andric unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType()); 11800b57cec5SDimitry Andric 11810b57cec5SDimitry Andric // The store must be feeding a non-volatile load. 11820b57cec5SDimitry Andric LoadInst *LI = cast<LoadInst>(SI->getValueOperand()); 11830b57cec5SDimitry Andric assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads."); 11840b57cec5SDimitry Andric 11850b57cec5SDimitry Andric // See if the pointer expression is an AddRec like {base,+,1} on the current 11860b57cec5SDimitry Andric // loop, which indicates a strided load. If we have something else, it's a 11870b57cec5SDimitry Andric // random load we can't handle. 1188fe6060f1SDimitry Andric Value *LoadPtr = LI->getPointerOperand(); 1189fe6060f1SDimitry Andric const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr)); 1190349cc55cSDimitry Andric 1191349cc55cSDimitry Andric const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize); 1192349cc55cSDimitry Andric return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV, 1193fe6060f1SDimitry Andric SI->getAlign(), LI->getAlign(), SI, LI, 1194fe6060f1SDimitry Andric StoreEv, LoadEv, BECount); 1195fe6060f1SDimitry Andric } 1196fe6060f1SDimitry Andric 1197bdd1243dSDimitry Andric namespace { 1198349cc55cSDimitry Andric class MemmoveVerifier { 1199349cc55cSDimitry Andric public: 1200349cc55cSDimitry Andric explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr, 1201349cc55cSDimitry Andric const DataLayout &DL) 120281ad6265SDimitry Andric : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset( 1203349cc55cSDimitry Andric LoadBasePtr.stripPointerCasts(), LoadOff, DL)), 1204349cc55cSDimitry Andric BP2(llvm::GetPointerBaseWithConstantOffset( 1205349cc55cSDimitry Andric StoreBasePtr.stripPointerCasts(), StoreOff, DL)), 1206349cc55cSDimitry Andric IsSameObject(BP1 == BP2) {} 1207349cc55cSDimitry Andric 1208349cc55cSDimitry Andric bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride, 1209349cc55cSDimitry Andric const Instruction &TheLoad, 1210349cc55cSDimitry Andric bool IsMemCpy) const { 1211349cc55cSDimitry Andric if (IsMemCpy) { 1212349cc55cSDimitry Andric // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr 1213349cc55cSDimitry Andric // for negative stride. 1214349cc55cSDimitry Andric if ((!IsNegStride && LoadOff <= StoreOff) || 1215349cc55cSDimitry Andric (IsNegStride && LoadOff >= StoreOff)) 1216349cc55cSDimitry Andric return false; 1217349cc55cSDimitry Andric } else { 1218349cc55cSDimitry Andric // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr 1219349cc55cSDimitry Andric // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr. 1220349cc55cSDimitry Andric int64_t LoadSize = 1221bdd1243dSDimitry Andric DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8; 1222349cc55cSDimitry Andric if (BP1 != BP2 || LoadSize != int64_t(StoreSize)) 1223349cc55cSDimitry Andric return false; 1224349cc55cSDimitry Andric if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) || 1225349cc55cSDimitry Andric (IsNegStride && LoadOff + LoadSize > StoreOff)) 1226349cc55cSDimitry Andric return false; 1227349cc55cSDimitry Andric } 1228349cc55cSDimitry Andric return true; 1229349cc55cSDimitry Andric } 1230349cc55cSDimitry Andric 1231349cc55cSDimitry Andric private: 1232349cc55cSDimitry Andric const DataLayout &DL; 123381ad6265SDimitry Andric int64_t LoadOff = 0; 123481ad6265SDimitry Andric int64_t StoreOff = 0; 1235349cc55cSDimitry Andric const Value *BP1; 1236349cc55cSDimitry Andric const Value *BP2; 1237349cc55cSDimitry Andric 1238349cc55cSDimitry Andric public: 1239349cc55cSDimitry Andric const bool IsSameObject; 1240349cc55cSDimitry Andric }; 1241bdd1243dSDimitry Andric } // namespace 1242349cc55cSDimitry Andric 1243fe6060f1SDimitry Andric bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( 1244349cc55cSDimitry Andric Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV, 1245349cc55cSDimitry Andric MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore, 1246349cc55cSDimitry Andric Instruction *TheLoad, const SCEVAddRecExpr *StoreEv, 1247349cc55cSDimitry Andric const SCEVAddRecExpr *LoadEv, const SCEV *BECount) { 1248fe6060f1SDimitry Andric 1249fe6060f1SDimitry Andric // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to 1250fe6060f1SDimitry Andric // conservatively bail here, since otherwise we may have to transform 1251fe6060f1SDimitry Andric // llvm.memcpy.inline into llvm.memcpy which is illegal. 1252fe6060f1SDimitry Andric if (isa<MemCpyInlineInst>(TheStore)) 1253fe6060f1SDimitry Andric return false; 12540b57cec5SDimitry Andric 12550b57cec5SDimitry Andric // The trip count of the loop and the base pointer of the addrec SCEV is 12560b57cec5SDimitry Andric // guaranteed to be loop invariant, which means that it should dominate the 12570b57cec5SDimitry Andric // header. This allows us to insert code for it in the preheader. 12580b57cec5SDimitry Andric BasicBlock *Preheader = CurLoop->getLoopPreheader(); 12590b57cec5SDimitry Andric IRBuilder<> Builder(Preheader->getTerminator()); 12600b57cec5SDimitry Andric SCEVExpander Expander(*SE, *DL, "loop-idiom"); 12610b57cec5SDimitry Andric 126204eeddc0SDimitry Andric SCEVExpanderCleaner ExpCleaner(Expander); 12635ffd83dbSDimitry Andric 1264e8d8bef9SDimitry Andric bool Changed = false; 12650b57cec5SDimitry Andric const SCEV *StrStart = StoreEv->getStart(); 1266fe6060f1SDimitry Andric unsigned StrAS = DestPtr->getType()->getPointerAddressSpace(); 1267480093f4SDimitry Andric Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS)); 12680b57cec5SDimitry Andric 1269fe6060f1SDimitry Andric APInt Stride = getStoreStride(StoreEv); 1270349cc55cSDimitry Andric const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV); 1271349cc55cSDimitry Andric 1272349cc55cSDimitry Andric // TODO: Deal with non-constant size; Currently expect constant store size 1273349cc55cSDimitry Andric assert(ConstStoreSize && "store size is expected to be a constant"); 1274349cc55cSDimitry Andric 1275349cc55cSDimitry Andric int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue(); 1276349cc55cSDimitry Andric bool IsNegStride = StoreSize == -Stride; 1277fe6060f1SDimitry Andric 12780b57cec5SDimitry Andric // Handle negative strided loops. 1279349cc55cSDimitry Andric if (IsNegStride) 1280349cc55cSDimitry Andric StrStart = 1281349cc55cSDimitry Andric getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE); 12820b57cec5SDimitry Andric 12830b57cec5SDimitry Andric // Okay, we have a strided store "p[i]" of a loaded value. We can turn 12840b57cec5SDimitry Andric // this into a memcpy in the loop preheader now if we want. However, this 12850b57cec5SDimitry Andric // would be unsafe to do if there is anything else in the loop that may read 12860b57cec5SDimitry Andric // or write the memory region we're storing to. This includes the load that 12870b57cec5SDimitry Andric // feeds the stores. Check for an alias by generating the base address and 12880b57cec5SDimitry Andric // checking everything. 12890b57cec5SDimitry Andric Value *StoreBasePtr = Expander.expandCodeFor( 12905f757f3fSDimitry Andric StrStart, Builder.getPtrTy(StrAS), Preheader->getTerminator()); 1291e8d8bef9SDimitry Andric 1292e8d8bef9SDimitry Andric // From here on out, conservatively report to the pass manager that we've 1293e8d8bef9SDimitry Andric // changed the IR, even if we later clean up these added instructions. There 1294e8d8bef9SDimitry Andric // may be structural differences e.g. in the order of use lists not accounted 1295e8d8bef9SDimitry Andric // for in just a textual dump of the IR. This is written as a variable, even 1296e8d8bef9SDimitry Andric // though statically all the places this dominates could be replaced with 1297e8d8bef9SDimitry Andric // 'true', with the hope that anyone trying to be clever / "more precise" with 1298e8d8bef9SDimitry Andric // the return value will read this comment, and leave them alone. 1299e8d8bef9SDimitry Andric Changed = true; 13000b57cec5SDimitry Andric 1301349cc55cSDimitry Andric SmallPtrSet<Instruction *, 2> IgnoredInsts; 1302349cc55cSDimitry Andric IgnoredInsts.insert(TheStore); 1303fe6060f1SDimitry Andric 1304fe6060f1SDimitry Andric bool IsMemCpy = isa<MemCpyInst>(TheStore); 1305fe6060f1SDimitry Andric const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store"; 1306fe6060f1SDimitry Andric 1307349cc55cSDimitry Andric bool LoopAccessStore = 1308fe6060f1SDimitry Andric mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount, 1309349cc55cSDimitry Andric StoreSizeSCEV, *AA, IgnoredInsts); 1310349cc55cSDimitry Andric if (LoopAccessStore) { 131169ade1e0SDimitry Andric // For memmove case it's not enough to guarantee that loop doesn't access 131269ade1e0SDimitry Andric // TheStore and TheLoad. Additionally we need to make sure that TheStore is 131369ade1e0SDimitry Andric // the only user of TheLoad. 131469ade1e0SDimitry Andric if (!TheLoad->hasOneUse()) 131569ade1e0SDimitry Andric return Changed; 1316349cc55cSDimitry Andric IgnoredInsts.insert(TheLoad); 1317fe6060f1SDimitry Andric if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, 1318349cc55cSDimitry Andric BECount, StoreSizeSCEV, *AA, IgnoredInsts)) { 1319fe6060f1SDimitry Andric ORE.emit([&]() { 1320fe6060f1SDimitry Andric return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore", 1321fe6060f1SDimitry Andric TheStore) 1322fe6060f1SDimitry Andric << ore::NV("Inst", InstRemark) << " in " 1323fe6060f1SDimitry Andric << ore::NV("Function", TheStore->getFunction()) 1324fe6060f1SDimitry Andric << " function will not be hoisted: " 1325fe6060f1SDimitry Andric << ore::NV("Reason", "The loop may access store location"); 1326fe6060f1SDimitry Andric }); 1327e8d8bef9SDimitry Andric return Changed; 1328fe6060f1SDimitry Andric } 1329349cc55cSDimitry Andric IgnoredInsts.erase(TheLoad); 1330fe6060f1SDimitry Andric } 13310b57cec5SDimitry Andric 13320b57cec5SDimitry Andric const SCEV *LdStart = LoadEv->getStart(); 1333fe6060f1SDimitry Andric unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace(); 13340b57cec5SDimitry Andric 13350b57cec5SDimitry Andric // Handle negative strided loops. 1336349cc55cSDimitry Andric if (IsNegStride) 1337349cc55cSDimitry Andric LdStart = 1338349cc55cSDimitry Andric getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE); 13390b57cec5SDimitry Andric 13400b57cec5SDimitry Andric // For a memcpy, we have to make sure that the input array is not being 13410b57cec5SDimitry Andric // mutated by the loop. 13425f757f3fSDimitry Andric Value *LoadBasePtr = Expander.expandCodeFor(LdStart, Builder.getPtrTy(LdAS), 13435f757f3fSDimitry Andric Preheader->getTerminator()); 13440b57cec5SDimitry Andric 1345fe6060f1SDimitry Andric // If the store is a memcpy instruction, we must check if it will write to 1346fe6060f1SDimitry Andric // the load memory locations. So remove it from the ignored stores. 1347349cc55cSDimitry Andric MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL); 134856f451bbSDimitry Andric if (IsMemCpy && !Verifier.IsSameObject) 134956f451bbSDimitry Andric IgnoredInsts.erase(TheStore); 13500b57cec5SDimitry Andric if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount, 1351349cc55cSDimitry Andric StoreSizeSCEV, *AA, IgnoredInsts)) { 1352fe6060f1SDimitry Andric ORE.emit([&]() { 135356f451bbSDimitry Andric return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad) 1354fe6060f1SDimitry Andric << ore::NV("Inst", InstRemark) << " in " 1355fe6060f1SDimitry Andric << ore::NV("Function", TheStore->getFunction()) 1356fe6060f1SDimitry Andric << " function will not be hoisted: " 1357fe6060f1SDimitry Andric << ore::NV("Reason", "The loop may access load location"); 1358fe6060f1SDimitry Andric }); 1359e8d8bef9SDimitry Andric return Changed; 1360fe6060f1SDimitry Andric } 13610b57cec5SDimitry Andric 1362349cc55cSDimitry Andric bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore; 1363349cc55cSDimitry Andric if (UseMemMove) 1364349cc55cSDimitry Andric if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad, 1365349cc55cSDimitry Andric IsMemCpy)) 1366349cc55cSDimitry Andric return Changed; 1367349cc55cSDimitry Andric 13680b57cec5SDimitry Andric if (avoidLIRForMultiBlockLoop()) 1369e8d8bef9SDimitry Andric return Changed; 13700b57cec5SDimitry Andric 13710b57cec5SDimitry Andric // Okay, everything is safe, we can transform this! 13720b57cec5SDimitry Andric 13730b57cec5SDimitry Andric const SCEV *NumBytesS = 1374349cc55cSDimitry Andric getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE); 13750b57cec5SDimitry Andric 13760b57cec5SDimitry Andric Value *NumBytes = 1377480093f4SDimitry Andric Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); 13780b57cec5SDimitry Andric 13791fd87a68SDimitry Andric AAMDNodes AATags = TheLoad->getAAMetadata(); 13801fd87a68SDimitry Andric AAMDNodes StoreAATags = TheStore->getAAMetadata(); 13811fd87a68SDimitry Andric AATags = AATags.merge(StoreAATags); 13821fd87a68SDimitry Andric if (auto CI = dyn_cast<ConstantInt>(NumBytes)) 13831fd87a68SDimitry Andric AATags = AATags.extendTo(CI->getZExtValue()); 13841fd87a68SDimitry Andric else 13851fd87a68SDimitry Andric AATags = AATags.extendTo(-1); 13861fd87a68SDimitry Andric 13870b57cec5SDimitry Andric CallInst *NewCall = nullptr; 13880b57cec5SDimitry Andric // Check whether to generate an unordered atomic memcpy: 13890b57cec5SDimitry Andric // If the load or store are atomic, then they must necessarily be unordered 13900b57cec5SDimitry Andric // by previous checks. 1391fe6060f1SDimitry Andric if (!TheStore->isAtomic() && !TheLoad->isAtomic()) { 1392fe6060f1SDimitry Andric if (UseMemMove) 13931fd87a68SDimitry Andric NewCall = Builder.CreateMemMove( 13941fd87a68SDimitry Andric StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes, 13951fd87a68SDimitry Andric /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias); 1396fe6060f1SDimitry Andric else 13971fd87a68SDimitry Andric NewCall = 13981fd87a68SDimitry Andric Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, 13991fd87a68SDimitry Andric NumBytes, /*isVolatile=*/false, AATags.TBAA, 14001fd87a68SDimitry Andric AATags.TBAAStruct, AATags.Scope, AATags.NoAlias); 1401fe6060f1SDimitry Andric } else { 1402fe6060f1SDimitry Andric // For now don't support unordered atomic memmove. 1403fe6060f1SDimitry Andric if (UseMemMove) 1404fe6060f1SDimitry Andric return Changed; 14050b57cec5SDimitry Andric // We cannot allow unaligned ops for unordered load/store, so reject 14060b57cec5SDimitry Andric // anything where the alignment isn't at least the element size. 140781ad6265SDimitry Andric assert((StoreAlign && LoadAlign) && 1408fe6060f1SDimitry Andric "Expect unordered load/store to have align."); 1409bdd1243dSDimitry Andric if (*StoreAlign < StoreSize || *LoadAlign < StoreSize) 1410e8d8bef9SDimitry Andric return Changed; 14110b57cec5SDimitry Andric 14120b57cec5SDimitry Andric // If the element.atomic memcpy is not lowered into explicit 14130b57cec5SDimitry Andric // loads/stores later, then it will be lowered into an element-size 14140b57cec5SDimitry Andric // specific lib call. If the lib call doesn't exist for our store size, then 14150b57cec5SDimitry Andric // we shouldn't generate the memcpy. 14160b57cec5SDimitry Andric if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize()) 1417e8d8bef9SDimitry Andric return Changed; 14180b57cec5SDimitry Andric 14190b57cec5SDimitry Andric // Create the call. 14200b57cec5SDimitry Andric // Note that unordered atomic loads/stores are *required* by the spec to 14210b57cec5SDimitry Andric // have an alignment but non-atomic loads/stores may not. 14220b57cec5SDimitry Andric NewCall = Builder.CreateElementUnorderedAtomicMemCpy( 1423bdd1243dSDimitry Andric StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize, 1424bdd1243dSDimitry Andric AATags.TBAA, AATags.TBAAStruct, AATags.Scope, AATags.NoAlias); 14250b57cec5SDimitry Andric } 1426fe6060f1SDimitry Andric NewCall->setDebugLoc(TheStore->getDebugLoc()); 14270b57cec5SDimitry Andric 14285ffd83dbSDimitry Andric if (MSSAU) { 14295ffd83dbSDimitry Andric MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB( 14305ffd83dbSDimitry Andric NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator); 14315ffd83dbSDimitry Andric MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true); 14325ffd83dbSDimitry Andric } 14335ffd83dbSDimitry Andric 1434fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n" 1435fe6060f1SDimitry Andric << " from load ptr=" << *LoadEv << " at: " << *TheLoad 1436fe6060f1SDimitry Andric << "\n" 1437fe6060f1SDimitry Andric << " from store ptr=" << *StoreEv << " at: " << *TheStore 14380b57cec5SDimitry Andric << "\n"); 14390b57cec5SDimitry Andric 14400b57cec5SDimitry Andric ORE.emit([&]() { 14410b57cec5SDimitry Andric return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad", 14420b57cec5SDimitry Andric NewCall->getDebugLoc(), Preheader) 14430b57cec5SDimitry Andric << "Formed a call to " 14440b57cec5SDimitry Andric << ore::NV("NewFunction", NewCall->getCalledFunction()) 1445fe6060f1SDimitry Andric << "() intrinsic from " << ore::NV("Inst", InstRemark) 1446fe6060f1SDimitry Andric << " instruction in " << ore::NV("Function", TheStore->getFunction()) 1447349cc55cSDimitry Andric << " function" 1448349cc55cSDimitry Andric << ore::setExtraArgs() 1449349cc55cSDimitry Andric << ore::NV("FromBlock", TheStore->getParent()->getName()) 1450349cc55cSDimitry Andric << ore::NV("ToBlock", Preheader->getName()); 14510b57cec5SDimitry Andric }); 14520b57cec5SDimitry Andric 1453349cc55cSDimitry Andric // Okay, a new call to memcpy/memmove has been formed. Zap the original store 1454349cc55cSDimitry Andric // and anything that feeds into it. 14555ffd83dbSDimitry Andric if (MSSAU) 1456fe6060f1SDimitry Andric MSSAU->removeMemoryAccess(TheStore, true); 1457fe6060f1SDimitry Andric deleteDeadInstruction(TheStore); 14585ffd83dbSDimitry Andric if (MSSAU && VerifyMemorySSA) 14595ffd83dbSDimitry Andric MSSAU->getMemorySSA()->verifyMemorySSA(); 1460fe6060f1SDimitry Andric if (UseMemMove) 1461fe6060f1SDimitry Andric ++NumMemMove; 1462fe6060f1SDimitry Andric else 14630b57cec5SDimitry Andric ++NumMemCpy; 1464e8d8bef9SDimitry Andric ExpCleaner.markResultUsed(); 14650b57cec5SDimitry Andric return true; 14660b57cec5SDimitry Andric } 14670b57cec5SDimitry Andric 14680b57cec5SDimitry Andric // When compiling for codesize we avoid idiom recognition for a multi-block loop 14690b57cec5SDimitry Andric // unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop. 14700b57cec5SDimitry Andric // 14710b57cec5SDimitry Andric bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset, 14720b57cec5SDimitry Andric bool IsLoopMemset) { 14730b57cec5SDimitry Andric if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) { 1474e8d8bef9SDimitry Andric if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) { 14750b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName() 14760b57cec5SDimitry Andric << " : LIR " << (IsMemset ? "Memset" : "Memcpy") 14770b57cec5SDimitry Andric << " avoided: multi-block top-level loop\n"); 14780b57cec5SDimitry Andric return true; 14790b57cec5SDimitry Andric } 14800b57cec5SDimitry Andric } 14810b57cec5SDimitry Andric 14820b57cec5SDimitry Andric return false; 14830b57cec5SDimitry Andric } 14840b57cec5SDimitry Andric 14850b57cec5SDimitry Andric bool LoopIdiomRecognize::runOnNoncountableLoop() { 14860b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" 14870b57cec5SDimitry Andric << CurLoop->getHeader()->getParent()->getName() 14880b57cec5SDimitry Andric << "] Noncountable Loop %" 14890b57cec5SDimitry Andric << CurLoop->getHeader()->getName() << "\n"); 14900b57cec5SDimitry Andric 1491e8d8bef9SDimitry Andric return recognizePopcount() || recognizeAndInsertFFS() || 1492*0fca6ea1SDimitry Andric recognizeShiftUntilBitTest() || recognizeShiftUntilZero() || 1493*0fca6ea1SDimitry Andric recognizeShiftUntilLessThan(); 14940b57cec5SDimitry Andric } 14950b57cec5SDimitry Andric 14960b57cec5SDimitry Andric /// Check if the given conditional branch is based on the comparison between 14970b57cec5SDimitry Andric /// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is 14980b57cec5SDimitry Andric /// true), the control yields to the loop entry. If the branch matches the 14990b57cec5SDimitry Andric /// behavior, the variable involved in the comparison is returned. This function 15000b57cec5SDimitry Andric /// will be called to see if the precondition and postcondition of the loop are 15010b57cec5SDimitry Andric /// in desirable form. 15020b57cec5SDimitry Andric static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry, 15030b57cec5SDimitry Andric bool JmpOnZero = false) { 15040b57cec5SDimitry Andric if (!BI || !BI->isConditional()) 15050b57cec5SDimitry Andric return nullptr; 15060b57cec5SDimitry Andric 15070b57cec5SDimitry Andric ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); 15080b57cec5SDimitry Andric if (!Cond) 15090b57cec5SDimitry Andric return nullptr; 15100b57cec5SDimitry Andric 15110b57cec5SDimitry Andric ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1)); 15120b57cec5SDimitry Andric if (!CmpZero || !CmpZero->isZero()) 15130b57cec5SDimitry Andric return nullptr; 15140b57cec5SDimitry Andric 15150b57cec5SDimitry Andric BasicBlock *TrueSucc = BI->getSuccessor(0); 15160b57cec5SDimitry Andric BasicBlock *FalseSucc = BI->getSuccessor(1); 15170b57cec5SDimitry Andric if (JmpOnZero) 15180b57cec5SDimitry Andric std::swap(TrueSucc, FalseSucc); 15190b57cec5SDimitry Andric 15200b57cec5SDimitry Andric ICmpInst::Predicate Pred = Cond->getPredicate(); 15210b57cec5SDimitry Andric if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) || 15220b57cec5SDimitry Andric (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry)) 15230b57cec5SDimitry Andric return Cond->getOperand(0); 15240b57cec5SDimitry Andric 15250b57cec5SDimitry Andric return nullptr; 15260b57cec5SDimitry Andric } 15270b57cec5SDimitry Andric 1528*0fca6ea1SDimitry Andric /// Check if the given conditional branch is based on an unsigned less-than 1529*0fca6ea1SDimitry Andric /// comparison between a variable and a constant, and if the comparison is false 1530*0fca6ea1SDimitry Andric /// the control yields to the loop entry. If the branch matches the behaviour, 1531*0fca6ea1SDimitry Andric /// the variable involved in the comparison is returned. 1532*0fca6ea1SDimitry Andric static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry, 1533*0fca6ea1SDimitry Andric APInt &Threshold) { 1534*0fca6ea1SDimitry Andric if (!BI || !BI->isConditional()) 1535*0fca6ea1SDimitry Andric return nullptr; 1536*0fca6ea1SDimitry Andric 1537*0fca6ea1SDimitry Andric ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); 1538*0fca6ea1SDimitry Andric if (!Cond) 1539*0fca6ea1SDimitry Andric return nullptr; 1540*0fca6ea1SDimitry Andric 1541*0fca6ea1SDimitry Andric ConstantInt *CmpConst = dyn_cast<ConstantInt>(Cond->getOperand(1)); 1542*0fca6ea1SDimitry Andric if (!CmpConst) 1543*0fca6ea1SDimitry Andric return nullptr; 1544*0fca6ea1SDimitry Andric 1545*0fca6ea1SDimitry Andric BasicBlock *FalseSucc = BI->getSuccessor(1); 1546*0fca6ea1SDimitry Andric ICmpInst::Predicate Pred = Cond->getPredicate(); 1547*0fca6ea1SDimitry Andric 1548*0fca6ea1SDimitry Andric if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) { 1549*0fca6ea1SDimitry Andric Threshold = CmpConst->getValue(); 1550*0fca6ea1SDimitry Andric return Cond->getOperand(0); 1551*0fca6ea1SDimitry Andric } 1552*0fca6ea1SDimitry Andric 1553*0fca6ea1SDimitry Andric return nullptr; 1554*0fca6ea1SDimitry Andric } 1555*0fca6ea1SDimitry Andric 15560b57cec5SDimitry Andric // Check if the recurrence variable `VarX` is in the right form to create 15570b57cec5SDimitry Andric // the idiom. Returns the value coerced to a PHINode if so. 15580b57cec5SDimitry Andric static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX, 15590b57cec5SDimitry Andric BasicBlock *LoopEntry) { 15600b57cec5SDimitry Andric auto *PhiX = dyn_cast<PHINode>(VarX); 15610b57cec5SDimitry Andric if (PhiX && PhiX->getParent() == LoopEntry && 15620b57cec5SDimitry Andric (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX)) 15630b57cec5SDimitry Andric return PhiX; 15640b57cec5SDimitry Andric return nullptr; 15650b57cec5SDimitry Andric } 15660b57cec5SDimitry Andric 1567*0fca6ea1SDimitry Andric /// Return true if the idiom is detected in the loop. 1568*0fca6ea1SDimitry Andric /// 1569*0fca6ea1SDimitry Andric /// Additionally: 1570*0fca6ea1SDimitry Andric /// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ) 1571*0fca6ea1SDimitry Andric /// or nullptr if there is no such. 1572*0fca6ea1SDimitry Andric /// 2) \p CntPhi is set to the corresponding phi node 1573*0fca6ea1SDimitry Andric /// or nullptr if there is no such. 1574*0fca6ea1SDimitry Andric /// 3) \p InitX is set to the value whose CTLZ could be used. 1575*0fca6ea1SDimitry Andric /// 4) \p DefX is set to the instruction calculating Loop exit condition. 1576*0fca6ea1SDimitry Andric /// 5) \p Threshold is set to the constant involved in the unsigned less-than 1577*0fca6ea1SDimitry Andric /// comparison. 1578*0fca6ea1SDimitry Andric /// 1579*0fca6ea1SDimitry Andric /// The core idiom we are trying to detect is: 1580*0fca6ea1SDimitry Andric /// \code 1581*0fca6ea1SDimitry Andric /// if (x0 < 2) 1582*0fca6ea1SDimitry Andric /// goto loop-exit // the precondition of the loop 1583*0fca6ea1SDimitry Andric /// cnt0 = init-val 1584*0fca6ea1SDimitry Andric /// do { 1585*0fca6ea1SDimitry Andric /// x = phi (x0, x.next); //PhiX 1586*0fca6ea1SDimitry Andric /// cnt = phi (cnt0, cnt.next) 1587*0fca6ea1SDimitry Andric /// 1588*0fca6ea1SDimitry Andric /// cnt.next = cnt + 1; 1589*0fca6ea1SDimitry Andric /// ... 1590*0fca6ea1SDimitry Andric /// x.next = x >> 1; // DefX 1591*0fca6ea1SDimitry Andric /// } while (x >= 4) 1592*0fca6ea1SDimitry Andric /// loop-exit: 1593*0fca6ea1SDimitry Andric /// \endcode 1594*0fca6ea1SDimitry Andric static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL, 1595*0fca6ea1SDimitry Andric Intrinsic::ID &IntrinID, 1596*0fca6ea1SDimitry Andric Value *&InitX, Instruction *&CntInst, 1597*0fca6ea1SDimitry Andric PHINode *&CntPhi, Instruction *&DefX, 1598*0fca6ea1SDimitry Andric APInt &Threshold) { 1599*0fca6ea1SDimitry Andric BasicBlock *LoopEntry; 1600*0fca6ea1SDimitry Andric 1601*0fca6ea1SDimitry Andric DefX = nullptr; 1602*0fca6ea1SDimitry Andric CntInst = nullptr; 1603*0fca6ea1SDimitry Andric CntPhi = nullptr; 1604*0fca6ea1SDimitry Andric LoopEntry = *(CurLoop->block_begin()); 1605*0fca6ea1SDimitry Andric 1606*0fca6ea1SDimitry Andric // step 1: Check if the loop-back branch is in desirable form. 1607*0fca6ea1SDimitry Andric if (Value *T = matchShiftULTCondition( 1608*0fca6ea1SDimitry Andric dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry, 1609*0fca6ea1SDimitry Andric Threshold)) 1610*0fca6ea1SDimitry Andric DefX = dyn_cast<Instruction>(T); 1611*0fca6ea1SDimitry Andric else 1612*0fca6ea1SDimitry Andric return false; 1613*0fca6ea1SDimitry Andric 1614*0fca6ea1SDimitry Andric // step 2: Check the recurrence of variable X 1615*0fca6ea1SDimitry Andric if (!DefX || !isa<PHINode>(DefX)) 1616*0fca6ea1SDimitry Andric return false; 1617*0fca6ea1SDimitry Andric 1618*0fca6ea1SDimitry Andric PHINode *VarPhi = cast<PHINode>(DefX); 1619*0fca6ea1SDimitry Andric int Idx = VarPhi->getBasicBlockIndex(LoopEntry); 1620*0fca6ea1SDimitry Andric if (Idx == -1) 1621*0fca6ea1SDimitry Andric return false; 1622*0fca6ea1SDimitry Andric 1623*0fca6ea1SDimitry Andric DefX = dyn_cast<Instruction>(VarPhi->getIncomingValue(Idx)); 1624*0fca6ea1SDimitry Andric if (!DefX || DefX->getNumOperands() == 0 || DefX->getOperand(0) != VarPhi) 1625*0fca6ea1SDimitry Andric return false; 1626*0fca6ea1SDimitry Andric 1627*0fca6ea1SDimitry Andric // step 3: detect instructions corresponding to "x.next = x >> 1" 1628*0fca6ea1SDimitry Andric if (DefX->getOpcode() != Instruction::LShr) 1629*0fca6ea1SDimitry Andric return false; 1630*0fca6ea1SDimitry Andric 1631*0fca6ea1SDimitry Andric IntrinID = Intrinsic::ctlz; 1632*0fca6ea1SDimitry Andric ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)); 1633*0fca6ea1SDimitry Andric if (!Shft || !Shft->isOne()) 1634*0fca6ea1SDimitry Andric return false; 1635*0fca6ea1SDimitry Andric 1636*0fca6ea1SDimitry Andric InitX = VarPhi->getIncomingValueForBlock(CurLoop->getLoopPreheader()); 1637*0fca6ea1SDimitry Andric 1638*0fca6ea1SDimitry Andric // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1 1639*0fca6ea1SDimitry Andric // or cnt.next = cnt + -1. 1640*0fca6ea1SDimitry Andric // TODO: We can skip the step. If loop trip count is known (CTLZ), 1641*0fca6ea1SDimitry Andric // then all uses of "cnt.next" could be optimized to the trip count 1642*0fca6ea1SDimitry Andric // plus "cnt0". Currently it is not optimized. 1643*0fca6ea1SDimitry Andric // This step could be used to detect POPCNT instruction: 1644*0fca6ea1SDimitry Andric // cnt.next = cnt + (x.next & 1) 1645*0fca6ea1SDimitry Andric for (Instruction &Inst : llvm::make_range( 1646*0fca6ea1SDimitry Andric LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) { 1647*0fca6ea1SDimitry Andric if (Inst.getOpcode() != Instruction::Add) 1648*0fca6ea1SDimitry Andric continue; 1649*0fca6ea1SDimitry Andric 1650*0fca6ea1SDimitry Andric ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1)); 1651*0fca6ea1SDimitry Andric if (!Inc || (!Inc->isOne() && !Inc->isMinusOne())) 1652*0fca6ea1SDimitry Andric continue; 1653*0fca6ea1SDimitry Andric 1654*0fca6ea1SDimitry Andric PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry); 1655*0fca6ea1SDimitry Andric if (!Phi) 1656*0fca6ea1SDimitry Andric continue; 1657*0fca6ea1SDimitry Andric 1658*0fca6ea1SDimitry Andric CntInst = &Inst; 1659*0fca6ea1SDimitry Andric CntPhi = Phi; 1660*0fca6ea1SDimitry Andric break; 1661*0fca6ea1SDimitry Andric } 1662*0fca6ea1SDimitry Andric if (!CntInst) 1663*0fca6ea1SDimitry Andric return false; 1664*0fca6ea1SDimitry Andric 1665*0fca6ea1SDimitry Andric return true; 1666*0fca6ea1SDimitry Andric } 1667*0fca6ea1SDimitry Andric 16680b57cec5SDimitry Andric /// Return true iff the idiom is detected in the loop. 16690b57cec5SDimitry Andric /// 16700b57cec5SDimitry Andric /// Additionally: 16710b57cec5SDimitry Andric /// 1) \p CntInst is set to the instruction counting the population bit. 16720b57cec5SDimitry Andric /// 2) \p CntPhi is set to the corresponding phi node. 16730b57cec5SDimitry Andric /// 3) \p Var is set to the value whose population bits are being counted. 16740b57cec5SDimitry Andric /// 16750b57cec5SDimitry Andric /// The core idiom we are trying to detect is: 16760b57cec5SDimitry Andric /// \code 16770b57cec5SDimitry Andric /// if (x0 != 0) 16780b57cec5SDimitry Andric /// goto loop-exit // the precondition of the loop 16790b57cec5SDimitry Andric /// cnt0 = init-val; 16800b57cec5SDimitry Andric /// do { 16810b57cec5SDimitry Andric /// x1 = phi (x0, x2); 16820b57cec5SDimitry Andric /// cnt1 = phi(cnt0, cnt2); 16830b57cec5SDimitry Andric /// 16840b57cec5SDimitry Andric /// cnt2 = cnt1 + 1; 16850b57cec5SDimitry Andric /// ... 16860b57cec5SDimitry Andric /// x2 = x1 & (x1 - 1); 16870b57cec5SDimitry Andric /// ... 16880b57cec5SDimitry Andric /// } while(x != 0); 16890b57cec5SDimitry Andric /// 16900b57cec5SDimitry Andric /// loop-exit: 16910b57cec5SDimitry Andric /// \endcode 16920b57cec5SDimitry Andric static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, 16930b57cec5SDimitry Andric Instruction *&CntInst, PHINode *&CntPhi, 16940b57cec5SDimitry Andric Value *&Var) { 16950b57cec5SDimitry Andric // step 1: Check to see if the look-back branch match this pattern: 16960b57cec5SDimitry Andric // "if (a!=0) goto loop-entry". 16970b57cec5SDimitry Andric BasicBlock *LoopEntry; 16980b57cec5SDimitry Andric Instruction *DefX2, *CountInst; 16990b57cec5SDimitry Andric Value *VarX1, *VarX0; 17000b57cec5SDimitry Andric PHINode *PhiX, *CountPhi; 17010b57cec5SDimitry Andric 17020b57cec5SDimitry Andric DefX2 = CountInst = nullptr; 17030b57cec5SDimitry Andric VarX1 = VarX0 = nullptr; 17040b57cec5SDimitry Andric PhiX = CountPhi = nullptr; 17050b57cec5SDimitry Andric LoopEntry = *(CurLoop->block_begin()); 17060b57cec5SDimitry Andric 17070b57cec5SDimitry Andric // step 1: Check if the loop-back branch is in desirable form. 17080b57cec5SDimitry Andric { 17090b57cec5SDimitry Andric if (Value *T = matchCondition( 17100b57cec5SDimitry Andric dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry)) 17110b57cec5SDimitry Andric DefX2 = dyn_cast<Instruction>(T); 17120b57cec5SDimitry Andric else 17130b57cec5SDimitry Andric return false; 17140b57cec5SDimitry Andric } 17150b57cec5SDimitry Andric 17160b57cec5SDimitry Andric // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)" 17170b57cec5SDimitry Andric { 17180b57cec5SDimitry Andric if (!DefX2 || DefX2->getOpcode() != Instruction::And) 17190b57cec5SDimitry Andric return false; 17200b57cec5SDimitry Andric 17210b57cec5SDimitry Andric BinaryOperator *SubOneOp; 17220b57cec5SDimitry Andric 17230b57cec5SDimitry Andric if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0)))) 17240b57cec5SDimitry Andric VarX1 = DefX2->getOperand(1); 17250b57cec5SDimitry Andric else { 17260b57cec5SDimitry Andric VarX1 = DefX2->getOperand(0); 17270b57cec5SDimitry Andric SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1)); 17280b57cec5SDimitry Andric } 17290b57cec5SDimitry Andric if (!SubOneOp || SubOneOp->getOperand(0) != VarX1) 17300b57cec5SDimitry Andric return false; 17310b57cec5SDimitry Andric 17320b57cec5SDimitry Andric ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1)); 17330b57cec5SDimitry Andric if (!Dec || 17340b57cec5SDimitry Andric !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) || 17350b57cec5SDimitry Andric (SubOneOp->getOpcode() == Instruction::Add && 17360b57cec5SDimitry Andric Dec->isMinusOne()))) { 17370b57cec5SDimitry Andric return false; 17380b57cec5SDimitry Andric } 17390b57cec5SDimitry Andric } 17400b57cec5SDimitry Andric 17410b57cec5SDimitry Andric // step 3: Check the recurrence of variable X 17420b57cec5SDimitry Andric PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry); 17430b57cec5SDimitry Andric if (!PhiX) 17440b57cec5SDimitry Andric return false; 17450b57cec5SDimitry Andric 17460b57cec5SDimitry Andric // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 17470b57cec5SDimitry Andric { 17480b57cec5SDimitry Andric CountInst = nullptr; 1749349cc55cSDimitry Andric for (Instruction &Inst : llvm::make_range( 1750349cc55cSDimitry Andric LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) { 1751349cc55cSDimitry Andric if (Inst.getOpcode() != Instruction::Add) 17520b57cec5SDimitry Andric continue; 17530b57cec5SDimitry Andric 1754349cc55cSDimitry Andric ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1)); 17550b57cec5SDimitry Andric if (!Inc || !Inc->isOne()) 17560b57cec5SDimitry Andric continue; 17570b57cec5SDimitry Andric 1758349cc55cSDimitry Andric PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry); 17590b57cec5SDimitry Andric if (!Phi) 17600b57cec5SDimitry Andric continue; 17610b57cec5SDimitry Andric 17620b57cec5SDimitry Andric // Check if the result of the instruction is live of the loop. 17630b57cec5SDimitry Andric bool LiveOutLoop = false; 1764349cc55cSDimitry Andric for (User *U : Inst.users()) { 17650b57cec5SDimitry Andric if ((cast<Instruction>(U))->getParent() != LoopEntry) { 17660b57cec5SDimitry Andric LiveOutLoop = true; 17670b57cec5SDimitry Andric break; 17680b57cec5SDimitry Andric } 17690b57cec5SDimitry Andric } 17700b57cec5SDimitry Andric 17710b57cec5SDimitry Andric if (LiveOutLoop) { 1772349cc55cSDimitry Andric CountInst = &Inst; 17730b57cec5SDimitry Andric CountPhi = Phi; 17740b57cec5SDimitry Andric break; 17750b57cec5SDimitry Andric } 17760b57cec5SDimitry Andric } 17770b57cec5SDimitry Andric 17780b57cec5SDimitry Andric if (!CountInst) 17790b57cec5SDimitry Andric return false; 17800b57cec5SDimitry Andric } 17810b57cec5SDimitry Andric 17820b57cec5SDimitry Andric // step 5: check if the precondition is in this form: 17830b57cec5SDimitry Andric // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;" 17840b57cec5SDimitry Andric { 17850b57cec5SDimitry Andric auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator()); 17860b57cec5SDimitry Andric Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader()); 17870b57cec5SDimitry Andric if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)) 17880b57cec5SDimitry Andric return false; 17890b57cec5SDimitry Andric 17900b57cec5SDimitry Andric CntInst = CountInst; 17910b57cec5SDimitry Andric CntPhi = CountPhi; 17920b57cec5SDimitry Andric Var = T; 17930b57cec5SDimitry Andric } 17940b57cec5SDimitry Andric 17950b57cec5SDimitry Andric return true; 17960b57cec5SDimitry Andric } 17970b57cec5SDimitry Andric 17980b57cec5SDimitry Andric /// Return true if the idiom is detected in the loop. 17990b57cec5SDimitry Andric /// 18000b57cec5SDimitry Andric /// Additionally: 18010b57cec5SDimitry Andric /// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ) 18020b57cec5SDimitry Andric /// or nullptr if there is no such. 18030b57cec5SDimitry Andric /// 2) \p CntPhi is set to the corresponding phi node 18040b57cec5SDimitry Andric /// or nullptr if there is no such. 18050b57cec5SDimitry Andric /// 3) \p Var is set to the value whose CTLZ could be used. 18060b57cec5SDimitry Andric /// 4) \p DefX is set to the instruction calculating Loop exit condition. 18070b57cec5SDimitry Andric /// 18080b57cec5SDimitry Andric /// The core idiom we are trying to detect is: 18090b57cec5SDimitry Andric /// \code 18100b57cec5SDimitry Andric /// if (x0 == 0) 18110b57cec5SDimitry Andric /// goto loop-exit // the precondition of the loop 18120b57cec5SDimitry Andric /// cnt0 = init-val; 18130b57cec5SDimitry Andric /// do { 18140b57cec5SDimitry Andric /// x = phi (x0, x.next); //PhiX 18150b57cec5SDimitry Andric /// cnt = phi(cnt0, cnt.next); 18160b57cec5SDimitry Andric /// 18170b57cec5SDimitry Andric /// cnt.next = cnt + 1; 18180b57cec5SDimitry Andric /// ... 18190b57cec5SDimitry Andric /// x.next = x >> 1; // DefX 18200b57cec5SDimitry Andric /// ... 18210b57cec5SDimitry Andric /// } while(x.next != 0); 18220b57cec5SDimitry Andric /// 18230b57cec5SDimitry Andric /// loop-exit: 18240b57cec5SDimitry Andric /// \endcode 18250b57cec5SDimitry Andric static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, 18260b57cec5SDimitry Andric Intrinsic::ID &IntrinID, Value *&InitX, 18270b57cec5SDimitry Andric Instruction *&CntInst, PHINode *&CntPhi, 18280b57cec5SDimitry Andric Instruction *&DefX) { 18290b57cec5SDimitry Andric BasicBlock *LoopEntry; 18300b57cec5SDimitry Andric Value *VarX = nullptr; 18310b57cec5SDimitry Andric 18320b57cec5SDimitry Andric DefX = nullptr; 18330b57cec5SDimitry Andric CntInst = nullptr; 18340b57cec5SDimitry Andric CntPhi = nullptr; 18350b57cec5SDimitry Andric LoopEntry = *(CurLoop->block_begin()); 18360b57cec5SDimitry Andric 18370b57cec5SDimitry Andric // step 1: Check if the loop-back branch is in desirable form. 18380b57cec5SDimitry Andric if (Value *T = matchCondition( 18390b57cec5SDimitry Andric dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry)) 18400b57cec5SDimitry Andric DefX = dyn_cast<Instruction>(T); 18410b57cec5SDimitry Andric else 18420b57cec5SDimitry Andric return false; 18430b57cec5SDimitry Andric 18440b57cec5SDimitry Andric // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1" 18450b57cec5SDimitry Andric if (!DefX || !DefX->isShift()) 18460b57cec5SDimitry Andric return false; 18470b57cec5SDimitry Andric IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz : 18480b57cec5SDimitry Andric Intrinsic::ctlz; 18490b57cec5SDimitry Andric ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1)); 18500b57cec5SDimitry Andric if (!Shft || !Shft->isOne()) 18510b57cec5SDimitry Andric return false; 18520b57cec5SDimitry Andric VarX = DefX->getOperand(0); 18530b57cec5SDimitry Andric 18540b57cec5SDimitry Andric // step 3: Check the recurrence of variable X 18550b57cec5SDimitry Andric PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry); 18560b57cec5SDimitry Andric if (!PhiX) 18570b57cec5SDimitry Andric return false; 18580b57cec5SDimitry Andric 18590b57cec5SDimitry Andric InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader()); 18600b57cec5SDimitry Andric 18610b57cec5SDimitry Andric // Make sure the initial value can't be negative otherwise the ashr in the 18620b57cec5SDimitry Andric // loop might never reach zero which would make the loop infinite. 18630b57cec5SDimitry Andric if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL)) 18640b57cec5SDimitry Andric return false; 18650b57cec5SDimitry Andric 18660b57cec5SDimitry Andric // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1 1867e8d8bef9SDimitry Andric // or cnt.next = cnt + -1. 18680b57cec5SDimitry Andric // TODO: We can skip the step. If loop trip count is known (CTLZ), 18690b57cec5SDimitry Andric // then all uses of "cnt.next" could be optimized to the trip count 18700b57cec5SDimitry Andric // plus "cnt0". Currently it is not optimized. 18710b57cec5SDimitry Andric // This step could be used to detect POPCNT instruction: 18720b57cec5SDimitry Andric // cnt.next = cnt + (x.next & 1) 1873349cc55cSDimitry Andric for (Instruction &Inst : llvm::make_range( 1874349cc55cSDimitry Andric LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) { 1875349cc55cSDimitry Andric if (Inst.getOpcode() != Instruction::Add) 18760b57cec5SDimitry Andric continue; 18770b57cec5SDimitry Andric 1878349cc55cSDimitry Andric ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1)); 1879e8d8bef9SDimitry Andric if (!Inc || (!Inc->isOne() && !Inc->isMinusOne())) 18800b57cec5SDimitry Andric continue; 18810b57cec5SDimitry Andric 1882349cc55cSDimitry Andric PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry); 18830b57cec5SDimitry Andric if (!Phi) 18840b57cec5SDimitry Andric continue; 18850b57cec5SDimitry Andric 1886349cc55cSDimitry Andric CntInst = &Inst; 18870b57cec5SDimitry Andric CntPhi = Phi; 18880b57cec5SDimitry Andric break; 18890b57cec5SDimitry Andric } 18900b57cec5SDimitry Andric if (!CntInst) 18910b57cec5SDimitry Andric return false; 18920b57cec5SDimitry Andric 18930b57cec5SDimitry Andric return true; 18940b57cec5SDimitry Andric } 18950b57cec5SDimitry Andric 1896*0fca6ea1SDimitry Andric // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always 1897*0fca6ea1SDimitry Andric // profitable if we delete the loop. 1898*0fca6ea1SDimitry Andric bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID, 1899*0fca6ea1SDimitry Andric Value *InitX, bool ZeroCheck, 1900*0fca6ea1SDimitry Andric size_t CanonicalSize) { 1901*0fca6ea1SDimitry Andric const Value *Args[] = {InitX, 1902*0fca6ea1SDimitry Andric ConstantInt::getBool(InitX->getContext(), ZeroCheck)}; 1903*0fca6ea1SDimitry Andric 1904*0fca6ea1SDimitry Andric // @llvm.dbg doesn't count as they have no semantic effect. 1905*0fca6ea1SDimitry Andric auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug(); 1906*0fca6ea1SDimitry Andric uint32_t HeaderSize = 1907*0fca6ea1SDimitry Andric std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end()); 1908*0fca6ea1SDimitry Andric 1909*0fca6ea1SDimitry Andric IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args); 1910*0fca6ea1SDimitry Andric InstructionCost Cost = TTI->getIntrinsicInstrCost( 1911*0fca6ea1SDimitry Andric Attrs, TargetTransformInfo::TCK_SizeAndLatency); 1912*0fca6ea1SDimitry Andric if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic) 19130b57cec5SDimitry Andric return false; 19140b57cec5SDimitry Andric 1915*0fca6ea1SDimitry Andric return true; 1916*0fca6ea1SDimitry Andric } 19170b57cec5SDimitry Andric 1918*0fca6ea1SDimitry Andric /// Convert CTLZ / CTTZ idiom loop into countable loop. 1919*0fca6ea1SDimitry Andric /// If CTLZ / CTTZ inserted as a new trip count returns true; otherwise, 1920*0fca6ea1SDimitry Andric /// returns false. 1921*0fca6ea1SDimitry Andric bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID, 1922*0fca6ea1SDimitry Andric Value *InitX, Instruction *DefX, 1923*0fca6ea1SDimitry Andric PHINode *CntPhi, 1924*0fca6ea1SDimitry Andric Instruction *CntInst) { 19250b57cec5SDimitry Andric bool IsCntPhiUsedOutsideLoop = false; 19260b57cec5SDimitry Andric for (User *U : CntPhi->users()) 19270b57cec5SDimitry Andric if (!CurLoop->contains(cast<Instruction>(U))) { 19280b57cec5SDimitry Andric IsCntPhiUsedOutsideLoop = true; 19290b57cec5SDimitry Andric break; 19300b57cec5SDimitry Andric } 19310b57cec5SDimitry Andric bool IsCntInstUsedOutsideLoop = false; 19320b57cec5SDimitry Andric for (User *U : CntInst->users()) 19330b57cec5SDimitry Andric if (!CurLoop->contains(cast<Instruction>(U))) { 19340b57cec5SDimitry Andric IsCntInstUsedOutsideLoop = true; 19350b57cec5SDimitry Andric break; 19360b57cec5SDimitry Andric } 19370b57cec5SDimitry Andric // If both CntInst and CntPhi are used outside the loop the profitability 19380b57cec5SDimitry Andric // is questionable. 19390b57cec5SDimitry Andric if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop) 19400b57cec5SDimitry Andric return false; 19410b57cec5SDimitry Andric 19420b57cec5SDimitry Andric // For some CPUs result of CTLZ(X) intrinsic is undefined 19430b57cec5SDimitry Andric // when X is 0. If we can not guarantee X != 0, we need to check this 19440b57cec5SDimitry Andric // when expand. 19450b57cec5SDimitry Andric bool ZeroCheck = false; 19460b57cec5SDimitry Andric // It is safe to assume Preheader exist as it was checked in 19470b57cec5SDimitry Andric // parent function RunOnLoop. 19480b57cec5SDimitry Andric BasicBlock *PH = CurLoop->getLoopPreheader(); 19490b57cec5SDimitry Andric 19500b57cec5SDimitry Andric // If we are using the count instruction outside the loop, make sure we 19510b57cec5SDimitry Andric // have a zero check as a precondition. Without the check the loop would run 19520b57cec5SDimitry Andric // one iteration for before any check of the input value. This means 0 and 1 19530b57cec5SDimitry Andric // would have identical behavior in the original loop and thus 19540b57cec5SDimitry Andric if (!IsCntPhiUsedOutsideLoop) { 19550b57cec5SDimitry Andric auto *PreCondBB = PH->getSinglePredecessor(); 19560b57cec5SDimitry Andric if (!PreCondBB) 19570b57cec5SDimitry Andric return false; 19580b57cec5SDimitry Andric auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator()); 19590b57cec5SDimitry Andric if (!PreCondBI) 19600b57cec5SDimitry Andric return false; 19610b57cec5SDimitry Andric if (matchCondition(PreCondBI, PH) != InitX) 19620b57cec5SDimitry Andric return false; 19630b57cec5SDimitry Andric ZeroCheck = true; 19640b57cec5SDimitry Andric } 19650b57cec5SDimitry Andric 1966*0fca6ea1SDimitry Andric // FFS idiom loop has only 6 instructions: 19670b57cec5SDimitry Andric // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ] 19680b57cec5SDimitry Andric // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ] 19690b57cec5SDimitry Andric // %shr = ashr %n.addr.0, 1 19700b57cec5SDimitry Andric // %tobool = icmp eq %shr, 0 19710b57cec5SDimitry Andric // %inc = add nsw %i.0, 1 19720b57cec5SDimitry Andric // br i1 %tobool 1973*0fca6ea1SDimitry Andric size_t IdiomCanonicalSize = 6; 1974*0fca6ea1SDimitry Andric if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize)) 19750b57cec5SDimitry Andric return false; 19760b57cec5SDimitry Andric 19770b57cec5SDimitry Andric transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX, 19780b57cec5SDimitry Andric DefX->getDebugLoc(), ZeroCheck, 19790b57cec5SDimitry Andric IsCntPhiUsedOutsideLoop); 19800b57cec5SDimitry Andric return true; 19810b57cec5SDimitry Andric } 19820b57cec5SDimitry Andric 1983*0fca6ea1SDimitry Andric /// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop 1984*0fca6ea1SDimitry Andric /// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new 1985*0fca6ea1SDimitry Andric /// trip count returns true; otherwise, returns false. 1986*0fca6ea1SDimitry Andric bool LoopIdiomRecognize::recognizeAndInsertFFS() { 1987*0fca6ea1SDimitry Andric // Give up if the loop has multiple blocks or multiple backedges. 1988*0fca6ea1SDimitry Andric if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) 1989*0fca6ea1SDimitry Andric return false; 1990*0fca6ea1SDimitry Andric 1991*0fca6ea1SDimitry Andric Intrinsic::ID IntrinID; 1992*0fca6ea1SDimitry Andric Value *InitX; 1993*0fca6ea1SDimitry Andric Instruction *DefX = nullptr; 1994*0fca6ea1SDimitry Andric PHINode *CntPhi = nullptr; 1995*0fca6ea1SDimitry Andric Instruction *CntInst = nullptr; 1996*0fca6ea1SDimitry Andric 1997*0fca6ea1SDimitry Andric if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, CntPhi, 1998*0fca6ea1SDimitry Andric DefX)) 1999*0fca6ea1SDimitry Andric return false; 2000*0fca6ea1SDimitry Andric 2001*0fca6ea1SDimitry Andric return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst); 2002*0fca6ea1SDimitry Andric } 2003*0fca6ea1SDimitry Andric 2004*0fca6ea1SDimitry Andric bool LoopIdiomRecognize::recognizeShiftUntilLessThan() { 2005*0fca6ea1SDimitry Andric // Give up if the loop has multiple blocks or multiple backedges. 2006*0fca6ea1SDimitry Andric if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) 2007*0fca6ea1SDimitry Andric return false; 2008*0fca6ea1SDimitry Andric 2009*0fca6ea1SDimitry Andric Intrinsic::ID IntrinID; 2010*0fca6ea1SDimitry Andric Value *InitX; 2011*0fca6ea1SDimitry Andric Instruction *DefX = nullptr; 2012*0fca6ea1SDimitry Andric PHINode *CntPhi = nullptr; 2013*0fca6ea1SDimitry Andric Instruction *CntInst = nullptr; 2014*0fca6ea1SDimitry Andric 2015*0fca6ea1SDimitry Andric APInt LoopThreshold; 2016*0fca6ea1SDimitry Andric if (!detectShiftUntilLessThanIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, 2017*0fca6ea1SDimitry Andric CntPhi, DefX, LoopThreshold)) 2018*0fca6ea1SDimitry Andric return false; 2019*0fca6ea1SDimitry Andric 2020*0fca6ea1SDimitry Andric if (LoopThreshold == 2) { 2021*0fca6ea1SDimitry Andric // Treat as regular FFS. 2022*0fca6ea1SDimitry Andric return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst); 2023*0fca6ea1SDimitry Andric } 2024*0fca6ea1SDimitry Andric 2025*0fca6ea1SDimitry Andric // Look for Floor Log2 Idiom. 2026*0fca6ea1SDimitry Andric if (LoopThreshold != 4) 2027*0fca6ea1SDimitry Andric return false; 2028*0fca6ea1SDimitry Andric 2029*0fca6ea1SDimitry Andric // Abort if CntPhi is used outside of the loop. 2030*0fca6ea1SDimitry Andric for (User *U : CntPhi->users()) 2031*0fca6ea1SDimitry Andric if (!CurLoop->contains(cast<Instruction>(U))) 2032*0fca6ea1SDimitry Andric return false; 2033*0fca6ea1SDimitry Andric 2034*0fca6ea1SDimitry Andric // It is safe to assume Preheader exist as it was checked in 2035*0fca6ea1SDimitry Andric // parent function RunOnLoop. 2036*0fca6ea1SDimitry Andric BasicBlock *PH = CurLoop->getLoopPreheader(); 2037*0fca6ea1SDimitry Andric auto *PreCondBB = PH->getSinglePredecessor(); 2038*0fca6ea1SDimitry Andric if (!PreCondBB) 2039*0fca6ea1SDimitry Andric return false; 2040*0fca6ea1SDimitry Andric auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator()); 2041*0fca6ea1SDimitry Andric if (!PreCondBI) 2042*0fca6ea1SDimitry Andric return false; 2043*0fca6ea1SDimitry Andric 2044*0fca6ea1SDimitry Andric APInt PreLoopThreshold; 2045*0fca6ea1SDimitry Andric if (matchShiftULTCondition(PreCondBI, PH, PreLoopThreshold) != InitX || 2046*0fca6ea1SDimitry Andric PreLoopThreshold != 2) 2047*0fca6ea1SDimitry Andric return false; 2048*0fca6ea1SDimitry Andric 2049*0fca6ea1SDimitry Andric bool ZeroCheck = true; 2050*0fca6ea1SDimitry Andric 2051*0fca6ea1SDimitry Andric // the loop has only 6 instructions: 2052*0fca6ea1SDimitry Andric // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ] 2053*0fca6ea1SDimitry Andric // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ] 2054*0fca6ea1SDimitry Andric // %shr = ashr %n.addr.0, 1 2055*0fca6ea1SDimitry Andric // %tobool = icmp ult %n.addr.0, C 2056*0fca6ea1SDimitry Andric // %inc = add nsw %i.0, 1 2057*0fca6ea1SDimitry Andric // br i1 %tobool 2058*0fca6ea1SDimitry Andric size_t IdiomCanonicalSize = 6; 2059*0fca6ea1SDimitry Andric if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize)) 2060*0fca6ea1SDimitry Andric return false; 2061*0fca6ea1SDimitry Andric 2062*0fca6ea1SDimitry Andric // log2(x) = w − 1 − clz(x) 2063*0fca6ea1SDimitry Andric transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX, 2064*0fca6ea1SDimitry Andric DefX->getDebugLoc(), ZeroCheck, 2065*0fca6ea1SDimitry Andric /*IsCntPhiUsedOutsideLoop=*/false, 2066*0fca6ea1SDimitry Andric /*InsertSub=*/true); 2067*0fca6ea1SDimitry Andric return true; 2068*0fca6ea1SDimitry Andric } 2069*0fca6ea1SDimitry Andric 20700b57cec5SDimitry Andric /// Recognizes a population count idiom in a non-countable loop. 20710b57cec5SDimitry Andric /// 20720b57cec5SDimitry Andric /// If detected, transforms the relevant code to issue the popcount intrinsic 20730b57cec5SDimitry Andric /// function call, and returns true; otherwise, returns false. 20740b57cec5SDimitry Andric bool LoopIdiomRecognize::recognizePopcount() { 20750b57cec5SDimitry Andric if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) 20760b57cec5SDimitry Andric return false; 20770b57cec5SDimitry Andric 20780b57cec5SDimitry Andric // Counting population are usually conducted by few arithmetic instructions. 20790b57cec5SDimitry Andric // Such instructions can be easily "absorbed" by vacant slots in a 20800b57cec5SDimitry Andric // non-compact loop. Therefore, recognizing popcount idiom only makes sense 20810b57cec5SDimitry Andric // in a compact loop. 20820b57cec5SDimitry Andric 20830b57cec5SDimitry Andric // Give up if the loop has multiple blocks or multiple backedges. 20840b57cec5SDimitry Andric if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) 20850b57cec5SDimitry Andric return false; 20860b57cec5SDimitry Andric 20870b57cec5SDimitry Andric BasicBlock *LoopBody = *(CurLoop->block_begin()); 20880b57cec5SDimitry Andric if (LoopBody->size() >= 20) { 20890b57cec5SDimitry Andric // The loop is too big, bail out. 20900b57cec5SDimitry Andric return false; 20910b57cec5SDimitry Andric } 20920b57cec5SDimitry Andric 20930b57cec5SDimitry Andric // It should have a preheader containing nothing but an unconditional branch. 20940b57cec5SDimitry Andric BasicBlock *PH = CurLoop->getLoopPreheader(); 20950b57cec5SDimitry Andric if (!PH || &PH->front() != PH->getTerminator()) 20960b57cec5SDimitry Andric return false; 20970b57cec5SDimitry Andric auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator()); 20980b57cec5SDimitry Andric if (!EntryBI || EntryBI->isConditional()) 20990b57cec5SDimitry Andric return false; 21000b57cec5SDimitry Andric 21010b57cec5SDimitry Andric // It should have a precondition block where the generated popcount intrinsic 21020b57cec5SDimitry Andric // function can be inserted. 21030b57cec5SDimitry Andric auto *PreCondBB = PH->getSinglePredecessor(); 21040b57cec5SDimitry Andric if (!PreCondBB) 21050b57cec5SDimitry Andric return false; 21060b57cec5SDimitry Andric auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator()); 21070b57cec5SDimitry Andric if (!PreCondBI || PreCondBI->isUnconditional()) 21080b57cec5SDimitry Andric return false; 21090b57cec5SDimitry Andric 21100b57cec5SDimitry Andric Instruction *CntInst; 21110b57cec5SDimitry Andric PHINode *CntPhi; 21120b57cec5SDimitry Andric Value *Val; 21130b57cec5SDimitry Andric if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val)) 21140b57cec5SDimitry Andric return false; 21150b57cec5SDimitry Andric 21160b57cec5SDimitry Andric transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val); 21170b57cec5SDimitry Andric return true; 21180b57cec5SDimitry Andric } 21190b57cec5SDimitry Andric 21200b57cec5SDimitry Andric static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, 21210b57cec5SDimitry Andric const DebugLoc &DL) { 21220b57cec5SDimitry Andric Value *Ops[] = {Val}; 21230b57cec5SDimitry Andric Type *Tys[] = {Val->getType()}; 21240b57cec5SDimitry Andric 21250b57cec5SDimitry Andric Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); 21260b57cec5SDimitry Andric Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); 21270b57cec5SDimitry Andric CallInst *CI = IRBuilder.CreateCall(Func, Ops); 21280b57cec5SDimitry Andric CI->setDebugLoc(DL); 21290b57cec5SDimitry Andric 21300b57cec5SDimitry Andric return CI; 21310b57cec5SDimitry Andric } 21320b57cec5SDimitry Andric 21330b57cec5SDimitry Andric static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, 21340b57cec5SDimitry Andric const DebugLoc &DL, bool ZeroCheck, 21350b57cec5SDimitry Andric Intrinsic::ID IID) { 2136fe6060f1SDimitry Andric Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)}; 21370b57cec5SDimitry Andric Type *Tys[] = {Val->getType()}; 21380b57cec5SDimitry Andric 21390b57cec5SDimitry Andric Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); 21400b57cec5SDimitry Andric Function *Func = Intrinsic::getDeclaration(M, IID, Tys); 21410b57cec5SDimitry Andric CallInst *CI = IRBuilder.CreateCall(Func, Ops); 21420b57cec5SDimitry Andric CI->setDebugLoc(DL); 21430b57cec5SDimitry Andric 21440b57cec5SDimitry Andric return CI; 21450b57cec5SDimitry Andric } 21460b57cec5SDimitry Andric 21470b57cec5SDimitry Andric /// Transform the following loop (Using CTLZ, CTTZ is similar): 21480b57cec5SDimitry Andric /// loop: 21490b57cec5SDimitry Andric /// CntPhi = PHI [Cnt0, CntInst] 21500b57cec5SDimitry Andric /// PhiX = PHI [InitX, DefX] 21510b57cec5SDimitry Andric /// CntInst = CntPhi + 1 21520b57cec5SDimitry Andric /// DefX = PhiX >> 1 21530b57cec5SDimitry Andric /// LOOP_BODY 21540b57cec5SDimitry Andric /// Br: loop if (DefX != 0) 21550b57cec5SDimitry Andric /// Use(CntPhi) or Use(CntInst) 21560b57cec5SDimitry Andric /// 21570b57cec5SDimitry Andric /// Into: 21580b57cec5SDimitry Andric /// If CntPhi used outside the loop: 21590b57cec5SDimitry Andric /// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1) 21600b57cec5SDimitry Andric /// Count = CountPrev + 1 21610b57cec5SDimitry Andric /// else 21620b57cec5SDimitry Andric /// Count = BitWidth(InitX) - CTLZ(InitX) 21630b57cec5SDimitry Andric /// loop: 21640b57cec5SDimitry Andric /// CntPhi = PHI [Cnt0, CntInst] 21650b57cec5SDimitry Andric /// PhiX = PHI [InitX, DefX] 21660b57cec5SDimitry Andric /// PhiCount = PHI [Count, Dec] 21670b57cec5SDimitry Andric /// CntInst = CntPhi + 1 21680b57cec5SDimitry Andric /// DefX = PhiX >> 1 21690b57cec5SDimitry Andric /// Dec = PhiCount - 1 21700b57cec5SDimitry Andric /// LOOP_BODY 21710b57cec5SDimitry Andric /// Br: loop if (Dec != 0) 21720b57cec5SDimitry Andric /// Use(CountPrev + Cnt0) // Use(CntPhi) 21730b57cec5SDimitry Andric /// or 21740b57cec5SDimitry Andric /// Use(Count + Cnt0) // Use(CntInst) 21750b57cec5SDimitry Andric /// 21760b57cec5SDimitry Andric /// If LOOP_BODY is empty the loop will be deleted. 21770b57cec5SDimitry Andric /// If CntInst and DefX are not used in LOOP_BODY they will be removed. 21780b57cec5SDimitry Andric void LoopIdiomRecognize::transformLoopToCountable( 21790b57cec5SDimitry Andric Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst, 21800b57cec5SDimitry Andric PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL, 2181*0fca6ea1SDimitry Andric bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) { 21820b57cec5SDimitry Andric BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator()); 21830b57cec5SDimitry Andric 21840b57cec5SDimitry Andric // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block 21850b57cec5SDimitry Andric IRBuilder<> Builder(PreheaderBr); 21860b57cec5SDimitry Andric Builder.SetCurrentDebugLocation(DL); 21870b57cec5SDimitry Andric 2188fe6060f1SDimitry Andric // If there are no uses of CntPhi crate: 21890b57cec5SDimitry Andric // Count = BitWidth - CTLZ(InitX); 2190e8d8bef9SDimitry Andric // NewCount = Count; 21910b57cec5SDimitry Andric // If there are uses of CntPhi create: 2192e8d8bef9SDimitry Andric // NewCount = BitWidth - CTLZ(InitX >> 1); 2193e8d8bef9SDimitry Andric // Count = NewCount + 1; 2194e8d8bef9SDimitry Andric Value *InitXNext; 21950b57cec5SDimitry Andric if (IsCntPhiUsedOutsideLoop) { 21960b57cec5SDimitry Andric if (DefX->getOpcode() == Instruction::AShr) 2197fe6060f1SDimitry Andric InitXNext = Builder.CreateAShr(InitX, 1); 21980b57cec5SDimitry Andric else if (DefX->getOpcode() == Instruction::LShr) 2199fe6060f1SDimitry Andric InitXNext = Builder.CreateLShr(InitX, 1); 22000b57cec5SDimitry Andric else if (DefX->getOpcode() == Instruction::Shl) // cttz 2201fe6060f1SDimitry Andric InitXNext = Builder.CreateShl(InitX, 1); 22020b57cec5SDimitry Andric else 22030b57cec5SDimitry Andric llvm_unreachable("Unexpected opcode!"); 22040b57cec5SDimitry Andric } else 22050b57cec5SDimitry Andric InitXNext = InitX; 2206fe6060f1SDimitry Andric Value *Count = 2207fe6060f1SDimitry Andric createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID); 2208fe6060f1SDimitry Andric Type *CountTy = Count->getType(); 2209fe6060f1SDimitry Andric Count = Builder.CreateSub( 2210fe6060f1SDimitry Andric ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count); 2211*0fca6ea1SDimitry Andric if (InsertSub) 2212*0fca6ea1SDimitry Andric Count = Builder.CreateSub(Count, ConstantInt::get(CountTy, 1)); 2213e8d8bef9SDimitry Andric Value *NewCount = Count; 2214fe6060f1SDimitry Andric if (IsCntPhiUsedOutsideLoop) 2215fe6060f1SDimitry Andric Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1)); 22160b57cec5SDimitry Andric 2217fe6060f1SDimitry Andric NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType()); 22180b57cec5SDimitry Andric 22190b57cec5SDimitry Andric Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader); 2220e8d8bef9SDimitry Andric if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) { 2221e8d8bef9SDimitry Andric // If the counter was being incremented in the loop, add NewCount to the 2222e8d8bef9SDimitry Andric // counter's initial value, but only if the initial value is not zero. 22230b57cec5SDimitry Andric ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); 22240b57cec5SDimitry Andric if (!InitConst || !InitConst->isZero()) 22250b57cec5SDimitry Andric NewCount = Builder.CreateAdd(NewCount, CntInitVal); 2226e8d8bef9SDimitry Andric } else { 2227e8d8bef9SDimitry Andric // If the count was being decremented in the loop, subtract NewCount from 2228e8d8bef9SDimitry Andric // the counter's initial value. 2229e8d8bef9SDimitry Andric NewCount = Builder.CreateSub(CntInitVal, NewCount); 2230e8d8bef9SDimitry Andric } 22310b57cec5SDimitry Andric 22320b57cec5SDimitry Andric // Step 2: Insert new IV and loop condition: 22330b57cec5SDimitry Andric // loop: 22340b57cec5SDimitry Andric // ... 22350b57cec5SDimitry Andric // PhiCount = PHI [Count, Dec] 22360b57cec5SDimitry Andric // ... 22370b57cec5SDimitry Andric // Dec = PhiCount - 1 22380b57cec5SDimitry Andric // ... 22390b57cec5SDimitry Andric // Br: loop if (Dec != 0) 22400b57cec5SDimitry Andric BasicBlock *Body = *(CurLoop->block_begin()); 22410b57cec5SDimitry Andric auto *LbBr = cast<BranchInst>(Body->getTerminator()); 22420b57cec5SDimitry Andric ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); 22430b57cec5SDimitry Andric 22445f757f3fSDimitry Andric PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi"); 22455f757f3fSDimitry Andric TcPhi->insertBefore(Body->begin()); 22460b57cec5SDimitry Andric 22470b57cec5SDimitry Andric Builder.SetInsertPoint(LbCond); 2248fe6060f1SDimitry Andric Instruction *TcDec = cast<Instruction>(Builder.CreateSub( 2249fe6060f1SDimitry Andric TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true)); 22500b57cec5SDimitry Andric 22510b57cec5SDimitry Andric TcPhi->addIncoming(Count, Preheader); 22520b57cec5SDimitry Andric TcPhi->addIncoming(TcDec, Body); 22530b57cec5SDimitry Andric 22540b57cec5SDimitry Andric CmpInst::Predicate Pred = 22550b57cec5SDimitry Andric (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ; 22560b57cec5SDimitry Andric LbCond->setPredicate(Pred); 22570b57cec5SDimitry Andric LbCond->setOperand(0, TcDec); 2258fe6060f1SDimitry Andric LbCond->setOperand(1, ConstantInt::get(CountTy, 0)); 22590b57cec5SDimitry Andric 22600b57cec5SDimitry Andric // Step 3: All the references to the original counter outside 22610b57cec5SDimitry Andric // the loop are replaced with the NewCount 22620b57cec5SDimitry Andric if (IsCntPhiUsedOutsideLoop) 22630b57cec5SDimitry Andric CntPhi->replaceUsesOutsideBlock(NewCount, Body); 22640b57cec5SDimitry Andric else 22650b57cec5SDimitry Andric CntInst->replaceUsesOutsideBlock(NewCount, Body); 22660b57cec5SDimitry Andric 22670b57cec5SDimitry Andric // step 4: Forget the "non-computable" trip-count SCEV associated with the 22680b57cec5SDimitry Andric // loop. The loop would otherwise not be deleted even if it becomes empty. 22690b57cec5SDimitry Andric SE->forgetLoop(CurLoop); 22700b57cec5SDimitry Andric } 22710b57cec5SDimitry Andric 22720b57cec5SDimitry Andric void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, 22730b57cec5SDimitry Andric Instruction *CntInst, 22740b57cec5SDimitry Andric PHINode *CntPhi, Value *Var) { 22750b57cec5SDimitry Andric BasicBlock *PreHead = CurLoop->getLoopPreheader(); 22760b57cec5SDimitry Andric auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator()); 22770b57cec5SDimitry Andric const DebugLoc &DL = CntInst->getDebugLoc(); 22780b57cec5SDimitry Andric 22790b57cec5SDimitry Andric // Assuming before transformation, the loop is following: 22800b57cec5SDimitry Andric // if (x) // the precondition 22810b57cec5SDimitry Andric // do { cnt++; x &= x - 1; } while(x); 22820b57cec5SDimitry Andric 22830b57cec5SDimitry Andric // Step 1: Insert the ctpop instruction at the end of the precondition block 22840b57cec5SDimitry Andric IRBuilder<> Builder(PreCondBr); 22850b57cec5SDimitry Andric Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; 22860b57cec5SDimitry Andric { 22870b57cec5SDimitry Andric PopCnt = createPopcntIntrinsic(Builder, Var, DL); 22880b57cec5SDimitry Andric NewCount = PopCntZext = 22890b57cec5SDimitry Andric Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType())); 22900b57cec5SDimitry Andric 22910b57cec5SDimitry Andric if (NewCount != PopCnt) 22920b57cec5SDimitry Andric (cast<Instruction>(NewCount))->setDebugLoc(DL); 22930b57cec5SDimitry Andric 22940b57cec5SDimitry Andric // TripCnt is exactly the number of iterations the loop has 22950b57cec5SDimitry Andric TripCnt = NewCount; 22960b57cec5SDimitry Andric 22970b57cec5SDimitry Andric // If the population counter's initial value is not zero, insert Add Inst. 22980b57cec5SDimitry Andric Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); 22990b57cec5SDimitry Andric ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); 23000b57cec5SDimitry Andric if (!InitConst || !InitConst->isZero()) { 23010b57cec5SDimitry Andric NewCount = Builder.CreateAdd(NewCount, CntInitVal); 23020b57cec5SDimitry Andric (cast<Instruction>(NewCount))->setDebugLoc(DL); 23030b57cec5SDimitry Andric } 23040b57cec5SDimitry Andric } 23050b57cec5SDimitry Andric 23060b57cec5SDimitry Andric // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to 23070b57cec5SDimitry Andric // "if (NewCount == 0) loop-exit". Without this change, the intrinsic 23080b57cec5SDimitry Andric // function would be partial dead code, and downstream passes will drag 23090b57cec5SDimitry Andric // it back from the precondition block to the preheader. 23100b57cec5SDimitry Andric { 23110b57cec5SDimitry Andric ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition()); 23120b57cec5SDimitry Andric 23130b57cec5SDimitry Andric Value *Opnd0 = PopCntZext; 23140b57cec5SDimitry Andric Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0); 23150b57cec5SDimitry Andric if (PreCond->getOperand(0) != Var) 23160b57cec5SDimitry Andric std::swap(Opnd0, Opnd1); 23170b57cec5SDimitry Andric 23180b57cec5SDimitry Andric ICmpInst *NewPreCond = cast<ICmpInst>( 23190b57cec5SDimitry Andric Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); 23200b57cec5SDimitry Andric PreCondBr->setCondition(NewPreCond); 23210b57cec5SDimitry Andric 23220b57cec5SDimitry Andric RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI); 23230b57cec5SDimitry Andric } 23240b57cec5SDimitry Andric 23250b57cec5SDimitry Andric // Step 3: Note that the population count is exactly the trip count of the 23260b57cec5SDimitry Andric // loop in question, which enable us to convert the loop from noncountable 23270b57cec5SDimitry Andric // loop into a countable one. The benefit is twofold: 23280b57cec5SDimitry Andric // 23290b57cec5SDimitry Andric // - If the loop only counts population, the entire loop becomes dead after 23300b57cec5SDimitry Andric // the transformation. It is a lot easier to prove a countable loop dead 23310b57cec5SDimitry Andric // than to prove a noncountable one. (In some C dialects, an infinite loop 23320b57cec5SDimitry Andric // isn't dead even if it computes nothing useful. In general, DCE needs 23330b57cec5SDimitry Andric // to prove a noncountable loop finite before safely delete it.) 23340b57cec5SDimitry Andric // 23350b57cec5SDimitry Andric // - If the loop also performs something else, it remains alive. 23360b57cec5SDimitry Andric // Since it is transformed to countable form, it can be aggressively 23370b57cec5SDimitry Andric // optimized by some optimizations which are in general not applicable 23380b57cec5SDimitry Andric // to a noncountable loop. 23390b57cec5SDimitry Andric // 23400b57cec5SDimitry Andric // After this step, this loop (conceptually) would look like following: 23410b57cec5SDimitry Andric // newcnt = __builtin_ctpop(x); 23420b57cec5SDimitry Andric // t = newcnt; 23430b57cec5SDimitry Andric // if (x) 23440b57cec5SDimitry Andric // do { cnt++; x &= x-1; t--) } while (t > 0); 23450b57cec5SDimitry Andric BasicBlock *Body = *(CurLoop->block_begin()); 23460b57cec5SDimitry Andric { 23470b57cec5SDimitry Andric auto *LbBr = cast<BranchInst>(Body->getTerminator()); 23480b57cec5SDimitry Andric ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); 23490b57cec5SDimitry Andric Type *Ty = TripCnt->getType(); 23500b57cec5SDimitry Andric 23515f757f3fSDimitry Andric PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi"); 23525f757f3fSDimitry Andric TcPhi->insertBefore(Body->begin()); 23530b57cec5SDimitry Andric 23540b57cec5SDimitry Andric Builder.SetInsertPoint(LbCond); 23550b57cec5SDimitry Andric Instruction *TcDec = cast<Instruction>( 23560b57cec5SDimitry Andric Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1), 23570b57cec5SDimitry Andric "tcdec", false, true)); 23580b57cec5SDimitry Andric 23590b57cec5SDimitry Andric TcPhi->addIncoming(TripCnt, PreHead); 23600b57cec5SDimitry Andric TcPhi->addIncoming(TcDec, Body); 23610b57cec5SDimitry Andric 23620b57cec5SDimitry Andric CmpInst::Predicate Pred = 23630b57cec5SDimitry Andric (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; 23640b57cec5SDimitry Andric LbCond->setPredicate(Pred); 23650b57cec5SDimitry Andric LbCond->setOperand(0, TcDec); 23660b57cec5SDimitry Andric LbCond->setOperand(1, ConstantInt::get(Ty, 0)); 23670b57cec5SDimitry Andric } 23680b57cec5SDimitry Andric 23690b57cec5SDimitry Andric // Step 4: All the references to the original population counter outside 23700b57cec5SDimitry Andric // the loop are replaced with the NewCount -- the value returned from 23710b57cec5SDimitry Andric // __builtin_ctpop(). 23720b57cec5SDimitry Andric CntInst->replaceUsesOutsideBlock(NewCount, Body); 23730b57cec5SDimitry Andric 23740b57cec5SDimitry Andric // step 5: Forget the "non-computable" trip-count SCEV associated with the 23750b57cec5SDimitry Andric // loop. The loop would otherwise not be deleted even if it becomes empty. 23760b57cec5SDimitry Andric SE->forgetLoop(CurLoop); 23770b57cec5SDimitry Andric } 2378e8d8bef9SDimitry Andric 2379e8d8bef9SDimitry Andric /// Match loop-invariant value. 2380e8d8bef9SDimitry Andric template <typename SubPattern_t> struct match_LoopInvariant { 2381e8d8bef9SDimitry Andric SubPattern_t SubPattern; 2382e8d8bef9SDimitry Andric const Loop *L; 2383e8d8bef9SDimitry Andric 2384e8d8bef9SDimitry Andric match_LoopInvariant(const SubPattern_t &SP, const Loop *L) 2385e8d8bef9SDimitry Andric : SubPattern(SP), L(L) {} 2386e8d8bef9SDimitry Andric 2387e8d8bef9SDimitry Andric template <typename ITy> bool match(ITy *V) { 2388e8d8bef9SDimitry Andric return L->isLoopInvariant(V) && SubPattern.match(V); 2389e8d8bef9SDimitry Andric } 2390e8d8bef9SDimitry Andric }; 2391e8d8bef9SDimitry Andric 2392e8d8bef9SDimitry Andric /// Matches if the value is loop-invariant. 2393e8d8bef9SDimitry Andric template <typename Ty> 2394e8d8bef9SDimitry Andric inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) { 2395e8d8bef9SDimitry Andric return match_LoopInvariant<Ty>(M, L); 2396e8d8bef9SDimitry Andric } 2397e8d8bef9SDimitry Andric 2398e8d8bef9SDimitry Andric /// Return true if the idiom is detected in the loop. 2399e8d8bef9SDimitry Andric /// 2400e8d8bef9SDimitry Andric /// The core idiom we are trying to detect is: 2401e8d8bef9SDimitry Andric /// \code 2402e8d8bef9SDimitry Andric /// entry: 2403e8d8bef9SDimitry Andric /// <...> 2404e8d8bef9SDimitry Andric /// %bitmask = shl i32 1, %bitpos 2405e8d8bef9SDimitry Andric /// br label %loop 2406e8d8bef9SDimitry Andric /// 2407e8d8bef9SDimitry Andric /// loop: 2408e8d8bef9SDimitry Andric /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ] 2409e8d8bef9SDimitry Andric /// %x.curr.bitmasked = and i32 %x.curr, %bitmask 2410e8d8bef9SDimitry Andric /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0 2411e8d8bef9SDimitry Andric /// %x.next = shl i32 %x.curr, 1 2412e8d8bef9SDimitry Andric /// <...> 2413e8d8bef9SDimitry Andric /// br i1 %x.curr.isbitunset, label %loop, label %end 2414e8d8bef9SDimitry Andric /// 2415e8d8bef9SDimitry Andric /// end: 2416e8d8bef9SDimitry Andric /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...> 2417e8d8bef9SDimitry Andric /// %x.next.res = phi i32 [ %x.next, %loop ] <...> 2418e8d8bef9SDimitry Andric /// <...> 2419e8d8bef9SDimitry Andric /// \endcode 2420e8d8bef9SDimitry Andric static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX, 2421e8d8bef9SDimitry Andric Value *&BitMask, Value *&BitPos, 2422e8d8bef9SDimitry Andric Value *&CurrX, Instruction *&NextX) { 2423e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE 2424e8d8bef9SDimitry Andric " Performing shift-until-bittest idiom detection.\n"); 2425e8d8bef9SDimitry Andric 2426e8d8bef9SDimitry Andric // Give up if the loop has multiple blocks or multiple backedges. 2427e8d8bef9SDimitry Andric if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) { 2428e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n"); 2429e8d8bef9SDimitry Andric return false; 2430e8d8bef9SDimitry Andric } 2431e8d8bef9SDimitry Andric 2432e8d8bef9SDimitry Andric BasicBlock *LoopHeaderBB = CurLoop->getHeader(); 2433e8d8bef9SDimitry Andric BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader(); 2434e8d8bef9SDimitry Andric assert(LoopPreheaderBB && "There is always a loop preheader."); 2435e8d8bef9SDimitry Andric 2436e8d8bef9SDimitry Andric using namespace PatternMatch; 2437e8d8bef9SDimitry Andric 2438e8d8bef9SDimitry Andric // Step 1: Check if the loop backedge is in desirable form. 2439e8d8bef9SDimitry Andric 2440e8d8bef9SDimitry Andric ICmpInst::Predicate Pred; 2441e8d8bef9SDimitry Andric Value *CmpLHS, *CmpRHS; 2442e8d8bef9SDimitry Andric BasicBlock *TrueBB, *FalseBB; 2443e8d8bef9SDimitry Andric if (!match(LoopHeaderBB->getTerminator(), 2444e8d8bef9SDimitry Andric m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)), 2445e8d8bef9SDimitry Andric m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) { 2446e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n"); 2447e8d8bef9SDimitry Andric return false; 2448e8d8bef9SDimitry Andric } 2449e8d8bef9SDimitry Andric 2450e8d8bef9SDimitry Andric // Step 2: Check if the backedge's condition is in desirable form. 2451e8d8bef9SDimitry Andric 2452e8d8bef9SDimitry Andric auto MatchVariableBitMask = [&]() { 2453e8d8bef9SDimitry Andric return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) && 2454e8d8bef9SDimitry Andric match(CmpLHS, 2455e8d8bef9SDimitry Andric m_c_And(m_Value(CurrX), 2456e8d8bef9SDimitry Andric m_CombineAnd( 2457e8d8bef9SDimitry Andric m_Value(BitMask), 2458e8d8bef9SDimitry Andric m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)), 2459e8d8bef9SDimitry Andric CurLoop)))); 2460e8d8bef9SDimitry Andric }; 2461e8d8bef9SDimitry Andric auto MatchConstantBitMask = [&]() { 2462e8d8bef9SDimitry Andric return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) && 2463e8d8bef9SDimitry Andric match(CmpLHS, m_And(m_Value(CurrX), 2464e8d8bef9SDimitry Andric m_CombineAnd(m_Value(BitMask), m_Power2()))) && 2465e8d8bef9SDimitry Andric (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask))); 2466e8d8bef9SDimitry Andric }; 2467e8d8bef9SDimitry Andric auto MatchDecomposableConstantBitMask = [&]() { 2468e8d8bef9SDimitry Andric APInt Mask; 2469e8d8bef9SDimitry Andric return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) && 2470e8d8bef9SDimitry Andric ICmpInst::isEquality(Pred) && Mask.isPowerOf2() && 2471e8d8bef9SDimitry Andric (BitMask = ConstantInt::get(CurrX->getType(), Mask)) && 2472e8d8bef9SDimitry Andric (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2())); 2473e8d8bef9SDimitry Andric }; 2474e8d8bef9SDimitry Andric 2475e8d8bef9SDimitry Andric if (!MatchVariableBitMask() && !MatchConstantBitMask() && 2476e8d8bef9SDimitry Andric !MatchDecomposableConstantBitMask()) { 2477e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n"); 2478e8d8bef9SDimitry Andric return false; 2479e8d8bef9SDimitry Andric } 2480e8d8bef9SDimitry Andric 2481e8d8bef9SDimitry Andric // Step 3: Check if the recurrence is in desirable form. 2482e8d8bef9SDimitry Andric auto *CurrXPN = dyn_cast<PHINode>(CurrX); 2483e8d8bef9SDimitry Andric if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) { 2484e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n"); 2485e8d8bef9SDimitry Andric return false; 2486e8d8bef9SDimitry Andric } 2487e8d8bef9SDimitry Andric 2488e8d8bef9SDimitry Andric BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB); 2489e8d8bef9SDimitry Andric NextX = 2490e8d8bef9SDimitry Andric dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB)); 2491e8d8bef9SDimitry Andric 2492fe6060f1SDimitry Andric assert(CurLoop->isLoopInvariant(BaseX) && 2493fe6060f1SDimitry Andric "Expected BaseX to be avaliable in the preheader!"); 2494fe6060f1SDimitry Andric 2495e8d8bef9SDimitry Andric if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) { 2496e8d8bef9SDimitry Andric // FIXME: support right-shift? 2497e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n"); 2498e8d8bef9SDimitry Andric return false; 2499e8d8bef9SDimitry Andric } 2500e8d8bef9SDimitry Andric 2501e8d8bef9SDimitry Andric // Step 4: Check if the backedge's destinations are in desirable form. 2502e8d8bef9SDimitry Andric 2503e8d8bef9SDimitry Andric assert(ICmpInst::isEquality(Pred) && 2504e8d8bef9SDimitry Andric "Should only get equality predicates here."); 2505e8d8bef9SDimitry Andric 2506e8d8bef9SDimitry Andric // cmp-br is commutative, so canonicalize to a single variant. 2507e8d8bef9SDimitry Andric if (Pred != ICmpInst::Predicate::ICMP_EQ) { 2508e8d8bef9SDimitry Andric Pred = ICmpInst::getInversePredicate(Pred); 2509e8d8bef9SDimitry Andric std::swap(TrueBB, FalseBB); 2510e8d8bef9SDimitry Andric } 2511e8d8bef9SDimitry Andric 2512e8d8bef9SDimitry Andric // We expect to exit loop when comparison yields false, 2513e8d8bef9SDimitry Andric // so when it yields true we should branch back to loop header. 2514e8d8bef9SDimitry Andric if (TrueBB != LoopHeaderBB) { 2515e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n"); 2516e8d8bef9SDimitry Andric return false; 2517e8d8bef9SDimitry Andric } 2518e8d8bef9SDimitry Andric 2519e8d8bef9SDimitry Andric // Okay, idiom checks out. 2520e8d8bef9SDimitry Andric return true; 2521e8d8bef9SDimitry Andric } 2522e8d8bef9SDimitry Andric 2523e8d8bef9SDimitry Andric /// Look for the following loop: 2524e8d8bef9SDimitry Andric /// \code 2525e8d8bef9SDimitry Andric /// entry: 2526e8d8bef9SDimitry Andric /// <...> 2527e8d8bef9SDimitry Andric /// %bitmask = shl i32 1, %bitpos 2528e8d8bef9SDimitry Andric /// br label %loop 2529e8d8bef9SDimitry Andric /// 2530e8d8bef9SDimitry Andric /// loop: 2531e8d8bef9SDimitry Andric /// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ] 2532e8d8bef9SDimitry Andric /// %x.curr.bitmasked = and i32 %x.curr, %bitmask 2533e8d8bef9SDimitry Andric /// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0 2534e8d8bef9SDimitry Andric /// %x.next = shl i32 %x.curr, 1 2535e8d8bef9SDimitry Andric /// <...> 2536e8d8bef9SDimitry Andric /// br i1 %x.curr.isbitunset, label %loop, label %end 2537e8d8bef9SDimitry Andric /// 2538e8d8bef9SDimitry Andric /// end: 2539e8d8bef9SDimitry Andric /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...> 2540e8d8bef9SDimitry Andric /// %x.next.res = phi i32 [ %x.next, %loop ] <...> 2541e8d8bef9SDimitry Andric /// <...> 2542e8d8bef9SDimitry Andric /// \endcode 2543e8d8bef9SDimitry Andric /// 2544e8d8bef9SDimitry Andric /// And transform it into: 2545e8d8bef9SDimitry Andric /// \code 2546e8d8bef9SDimitry Andric /// entry: 2547e8d8bef9SDimitry Andric /// %bitmask = shl i32 1, %bitpos 2548e8d8bef9SDimitry Andric /// %lowbitmask = add i32 %bitmask, -1 2549e8d8bef9SDimitry Andric /// %mask = or i32 %lowbitmask, %bitmask 2550e8d8bef9SDimitry Andric /// %x.masked = and i32 %x, %mask 2551e8d8bef9SDimitry Andric /// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked, 2552e8d8bef9SDimitry Andric /// i1 true) 2553e8d8bef9SDimitry Andric /// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros 2554e8d8bef9SDimitry Andric /// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1 2555e8d8bef9SDimitry Andric /// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos 2556e8d8bef9SDimitry Andric /// %tripcount = add i32 %backedgetakencount, 1 2557e8d8bef9SDimitry Andric /// %x.curr = shl i32 %x, %backedgetakencount 2558e8d8bef9SDimitry Andric /// %x.next = shl i32 %x, %tripcount 2559e8d8bef9SDimitry Andric /// br label %loop 2560e8d8bef9SDimitry Andric /// 2561e8d8bef9SDimitry Andric /// loop: 2562e8d8bef9SDimitry Andric /// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ] 2563e8d8bef9SDimitry Andric /// %loop.iv.next = add nuw i32 %loop.iv, 1 2564e8d8bef9SDimitry Andric /// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount 2565e8d8bef9SDimitry Andric /// <...> 2566e8d8bef9SDimitry Andric /// br i1 %loop.ivcheck, label %end, label %loop 2567e8d8bef9SDimitry Andric /// 2568e8d8bef9SDimitry Andric /// end: 2569e8d8bef9SDimitry Andric /// %x.curr.res = phi i32 [ %x.curr, %loop ] <...> 2570e8d8bef9SDimitry Andric /// %x.next.res = phi i32 [ %x.next, %loop ] <...> 2571e8d8bef9SDimitry Andric /// <...> 2572e8d8bef9SDimitry Andric /// \endcode 2573e8d8bef9SDimitry Andric bool LoopIdiomRecognize::recognizeShiftUntilBitTest() { 2574e8d8bef9SDimitry Andric bool MadeChange = false; 2575e8d8bef9SDimitry Andric 2576e8d8bef9SDimitry Andric Value *X, *BitMask, *BitPos, *XCurr; 2577e8d8bef9SDimitry Andric Instruction *XNext; 2578e8d8bef9SDimitry Andric if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr, 2579e8d8bef9SDimitry Andric XNext)) { 2580e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE 2581e8d8bef9SDimitry Andric " shift-until-bittest idiom detection failed.\n"); 2582e8d8bef9SDimitry Andric return MadeChange; 2583e8d8bef9SDimitry Andric } 2584e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n"); 2585e8d8bef9SDimitry Andric 2586e8d8bef9SDimitry Andric // Ok, it is the idiom we were looking for, we *could* transform this loop, 2587e8d8bef9SDimitry Andric // but is it profitable to transform? 2588e8d8bef9SDimitry Andric 2589e8d8bef9SDimitry Andric BasicBlock *LoopHeaderBB = CurLoop->getHeader(); 2590e8d8bef9SDimitry Andric BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader(); 2591e8d8bef9SDimitry Andric assert(LoopPreheaderBB && "There is always a loop preheader."); 2592e8d8bef9SDimitry Andric 2593e8d8bef9SDimitry Andric BasicBlock *SuccessorBB = CurLoop->getExitBlock(); 2594fe6060f1SDimitry Andric assert(SuccessorBB && "There is only a single successor."); 2595e8d8bef9SDimitry Andric 2596e8d8bef9SDimitry Andric IRBuilder<> Builder(LoopPreheaderBB->getTerminator()); 2597e8d8bef9SDimitry Andric Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc()); 2598e8d8bef9SDimitry Andric 2599e8d8bef9SDimitry Andric Intrinsic::ID IntrID = Intrinsic::ctlz; 2600e8d8bef9SDimitry Andric Type *Ty = X->getType(); 2601fe6060f1SDimitry Andric unsigned Bitwidth = Ty->getScalarSizeInBits(); 2602e8d8bef9SDimitry Andric 2603e8d8bef9SDimitry Andric TargetTransformInfo::TargetCostKind CostKind = 2604e8d8bef9SDimitry Andric TargetTransformInfo::TCK_SizeAndLatency; 2605e8d8bef9SDimitry Andric 2606e8d8bef9SDimitry Andric // The rewrite is considered to be unprofitable iff and only iff the 2607e8d8bef9SDimitry Andric // intrinsic/shift we'll use are not cheap. Note that we are okay with *just* 2608e8d8bef9SDimitry Andric // making the loop countable, even if nothing else changes. 2609e8d8bef9SDimitry Andric IntrinsicCostAttributes Attrs( 261006c3fb27SDimitry Andric IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getTrue()}); 2611fe6060f1SDimitry Andric InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind); 2612e8d8bef9SDimitry Andric if (Cost > TargetTransformInfo::TCC_Basic) { 2613e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE 2614e8d8bef9SDimitry Andric " Intrinsic is too costly, not beneficial\n"); 2615e8d8bef9SDimitry Andric return MadeChange; 2616e8d8bef9SDimitry Andric } 2617e8d8bef9SDimitry Andric if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) > 2618e8d8bef9SDimitry Andric TargetTransformInfo::TCC_Basic) { 2619e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n"); 2620e8d8bef9SDimitry Andric return MadeChange; 2621e8d8bef9SDimitry Andric } 2622e8d8bef9SDimitry Andric 2623e8d8bef9SDimitry Andric // Ok, transform appears worthwhile. 2624e8d8bef9SDimitry Andric MadeChange = true; 2625e8d8bef9SDimitry Andric 262606c3fb27SDimitry Andric if (!isGuaranteedNotToBeUndefOrPoison(BitPos)) { 262706c3fb27SDimitry Andric // BitMask may be computed from BitPos, Freeze BitPos so we can increase 262806c3fb27SDimitry Andric // it's use count. 2629*0fca6ea1SDimitry Andric std::optional<BasicBlock::iterator> InsertPt = std::nullopt; 263006c3fb27SDimitry Andric if (auto *BitPosI = dyn_cast<Instruction>(BitPos)) 2631*0fca6ea1SDimitry Andric InsertPt = BitPosI->getInsertionPointAfterDef(); 263206c3fb27SDimitry Andric else 2633*0fca6ea1SDimitry Andric InsertPt = DT->getRoot()->getFirstNonPHIOrDbgOrAlloca(); 263406c3fb27SDimitry Andric if (!InsertPt) 263506c3fb27SDimitry Andric return false; 263606c3fb27SDimitry Andric FreezeInst *BitPosFrozen = 2637*0fca6ea1SDimitry Andric new FreezeInst(BitPos, BitPos->getName() + ".fr", *InsertPt); 263806c3fb27SDimitry Andric BitPos->replaceUsesWithIf(BitPosFrozen, [BitPosFrozen](Use &U) { 263906c3fb27SDimitry Andric return U.getUser() != BitPosFrozen; 264006c3fb27SDimitry Andric }); 264106c3fb27SDimitry Andric BitPos = BitPosFrozen; 264206c3fb27SDimitry Andric } 264306c3fb27SDimitry Andric 2644e8d8bef9SDimitry Andric // Step 1: Compute the loop trip count. 2645e8d8bef9SDimitry Andric 2646e8d8bef9SDimitry Andric Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty), 2647e8d8bef9SDimitry Andric BitPos->getName() + ".lowbitmask"); 2648e8d8bef9SDimitry Andric Value *Mask = 2649e8d8bef9SDimitry Andric Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask"); 2650e8d8bef9SDimitry Andric Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked"); 2651e8d8bef9SDimitry Andric CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic( 265206c3fb27SDimitry Andric IntrID, Ty, {XMasked, /*is_zero_poison=*/Builder.getTrue()}, 2653e8d8bef9SDimitry Andric /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros"); 2654e8d8bef9SDimitry Andric Value *XMaskedNumActiveBits = Builder.CreateSub( 2655e8d8bef9SDimitry Andric ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros, 2656fe6060f1SDimitry Andric XMasked->getName() + ".numactivebits", /*HasNUW=*/true, 2657fe6060f1SDimitry Andric /*HasNSW=*/Bitwidth != 2); 2658e8d8bef9SDimitry Andric Value *XMaskedLeadingOnePos = 2659e8d8bef9SDimitry Andric Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty), 2660fe6060f1SDimitry Andric XMasked->getName() + ".leadingonepos", /*HasNUW=*/false, 2661fe6060f1SDimitry Andric /*HasNSW=*/Bitwidth > 2); 2662e8d8bef9SDimitry Andric 2663e8d8bef9SDimitry Andric Value *LoopBackedgeTakenCount = Builder.CreateSub( 2664fe6060f1SDimitry Andric BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount", 2665fe6060f1SDimitry Andric /*HasNUW=*/true, /*HasNSW=*/true); 2666e8d8bef9SDimitry Andric // We know loop's backedge-taken count, but what's loop's trip count? 2667e8d8bef9SDimitry Andric // Note that while NUW is always safe, while NSW is only for bitwidths != 2. 2668e8d8bef9SDimitry Andric Value *LoopTripCount = 2669fe6060f1SDimitry Andric Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1), 2670fe6060f1SDimitry Andric CurLoop->getName() + ".tripcount", /*HasNUW=*/true, 2671fe6060f1SDimitry Andric /*HasNSW=*/Bitwidth != 2); 2672e8d8bef9SDimitry Andric 2673e8d8bef9SDimitry Andric // Step 2: Compute the recurrence's final value without a loop. 2674e8d8bef9SDimitry Andric 2675e8d8bef9SDimitry Andric // NewX is always safe to compute, because `LoopBackedgeTakenCount` 2676e8d8bef9SDimitry Andric // will always be smaller than `bitwidth(X)`, i.e. we never get poison. 2677e8d8bef9SDimitry Andric Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount); 2678e8d8bef9SDimitry Andric NewX->takeName(XCurr); 2679e8d8bef9SDimitry Andric if (auto *I = dyn_cast<Instruction>(NewX)) 2680e8d8bef9SDimitry Andric I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true); 2681e8d8bef9SDimitry Andric 2682e8d8bef9SDimitry Andric Value *NewXNext; 2683e8d8bef9SDimitry Andric // Rewriting XNext is more complicated, however, because `X << LoopTripCount` 2684e8d8bef9SDimitry Andric // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen 2685e8d8bef9SDimitry Andric // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know 2686e8d8bef9SDimitry Andric // that isn't the case, we'll need to emit an alternative, safe IR. 2687e8d8bef9SDimitry Andric if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() || 2688e8d8bef9SDimitry Andric PatternMatch::match( 2689e8d8bef9SDimitry Andric BitPos, PatternMatch::m_SpecificInt_ICMP( 2690e8d8bef9SDimitry Andric ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(), 2691e8d8bef9SDimitry Andric Ty->getScalarSizeInBits() - 1)))) 2692e8d8bef9SDimitry Andric NewXNext = Builder.CreateShl(X, LoopTripCount); 2693e8d8bef9SDimitry Andric else { 2694e8d8bef9SDimitry Andric // Otherwise, just additionally shift by one. It's the smallest solution, 2695e8d8bef9SDimitry Andric // alternatively, we could check that NewX is INT_MIN (or BitPos is ) 2696e8d8bef9SDimitry Andric // and select 0 instead. 2697e8d8bef9SDimitry Andric NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1)); 2698e8d8bef9SDimitry Andric } 2699e8d8bef9SDimitry Andric 2700e8d8bef9SDimitry Andric NewXNext->takeName(XNext); 2701e8d8bef9SDimitry Andric if (auto *I = dyn_cast<Instruction>(NewXNext)) 2702e8d8bef9SDimitry Andric I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true); 2703e8d8bef9SDimitry Andric 2704e8d8bef9SDimitry Andric // Step 3: Adjust the successor basic block to recieve the computed 2705e8d8bef9SDimitry Andric // recurrence's final value instead of the recurrence itself. 2706e8d8bef9SDimitry Andric 2707e8d8bef9SDimitry Andric XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB); 2708e8d8bef9SDimitry Andric XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB); 2709e8d8bef9SDimitry Andric 2710e8d8bef9SDimitry Andric // Step 4: Rewrite the loop into a countable form, with canonical IV. 2711e8d8bef9SDimitry Andric 2712e8d8bef9SDimitry Andric // The new canonical induction variable. 27135f757f3fSDimitry Andric Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin()); 2714e8d8bef9SDimitry Andric auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv"); 2715e8d8bef9SDimitry Andric 2716e8d8bef9SDimitry Andric // The induction itself. 2717e8d8bef9SDimitry Andric // Note that while NUW is always safe, while NSW is only for bitwidths != 2. 2718e8d8bef9SDimitry Andric Builder.SetInsertPoint(LoopHeaderBB->getTerminator()); 2719fe6060f1SDimitry Andric auto *IVNext = 2720fe6060f1SDimitry Andric Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next", 2721fe6060f1SDimitry Andric /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2); 2722e8d8bef9SDimitry Andric 2723e8d8bef9SDimitry Andric // The loop trip count check. 2724e8d8bef9SDimitry Andric auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount, 2725e8d8bef9SDimitry Andric CurLoop->getName() + ".ivcheck"); 2726e8d8bef9SDimitry Andric Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB); 2727e8d8bef9SDimitry Andric LoopHeaderBB->getTerminator()->eraseFromParent(); 2728e8d8bef9SDimitry Andric 2729e8d8bef9SDimitry Andric // Populate the IV PHI. 2730e8d8bef9SDimitry Andric IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB); 2731e8d8bef9SDimitry Andric IV->addIncoming(IVNext, LoopHeaderBB); 2732e8d8bef9SDimitry Andric 2733e8d8bef9SDimitry Andric // Step 5: Forget the "non-computable" trip-count SCEV associated with the 2734e8d8bef9SDimitry Andric // loop. The loop would otherwise not be deleted even if it becomes empty. 2735e8d8bef9SDimitry Andric 2736e8d8bef9SDimitry Andric SE->forgetLoop(CurLoop); 2737e8d8bef9SDimitry Andric 2738e8d8bef9SDimitry Andric // Other passes will take care of actually deleting the loop if possible. 2739e8d8bef9SDimitry Andric 2740e8d8bef9SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n"); 2741e8d8bef9SDimitry Andric 2742e8d8bef9SDimitry Andric ++NumShiftUntilBitTest; 2743e8d8bef9SDimitry Andric return MadeChange; 2744e8d8bef9SDimitry Andric } 2745fe6060f1SDimitry Andric 2746fe6060f1SDimitry Andric /// Return true if the idiom is detected in the loop. 2747fe6060f1SDimitry Andric /// 2748fe6060f1SDimitry Andric /// The core idiom we are trying to detect is: 2749fe6060f1SDimitry Andric /// \code 2750fe6060f1SDimitry Andric /// entry: 2751fe6060f1SDimitry Andric /// <...> 2752fe6060f1SDimitry Andric /// %start = <...> 2753fe6060f1SDimitry Andric /// %extraoffset = <...> 2754fe6060f1SDimitry Andric /// <...> 2755fe6060f1SDimitry Andric /// br label %for.cond 2756fe6060f1SDimitry Andric /// 2757fe6060f1SDimitry Andric /// loop: 2758fe6060f1SDimitry Andric /// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ] 2759fe6060f1SDimitry Andric /// %nbits = add nsw i8 %iv, %extraoffset 2760fe6060f1SDimitry Andric /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits 2761fe6060f1SDimitry Andric /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0 2762fe6060f1SDimitry Andric /// %iv.next = add i8 %iv, 1 2763fe6060f1SDimitry Andric /// <...> 2764fe6060f1SDimitry Andric /// br i1 %val.shifted.iszero, label %end, label %loop 2765fe6060f1SDimitry Andric /// 2766fe6060f1SDimitry Andric /// end: 2767fe6060f1SDimitry Andric /// %iv.res = phi i8 [ %iv, %loop ] <...> 2768fe6060f1SDimitry Andric /// %nbits.res = phi i8 [ %nbits, %loop ] <...> 2769fe6060f1SDimitry Andric /// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...> 2770fe6060f1SDimitry Andric /// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...> 2771fe6060f1SDimitry Andric /// %iv.next.res = phi i8 [ %iv.next, %loop ] <...> 2772fe6060f1SDimitry Andric /// <...> 2773fe6060f1SDimitry Andric /// \endcode 2774fe6060f1SDimitry Andric static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE, 2775fe6060f1SDimitry Andric Instruction *&ValShiftedIsZero, 2776fe6060f1SDimitry Andric Intrinsic::ID &IntrinID, Instruction *&IV, 2777fe6060f1SDimitry Andric Value *&Start, Value *&Val, 2778fe6060f1SDimitry Andric const SCEV *&ExtraOffsetExpr, 2779fe6060f1SDimitry Andric bool &InvertedCond) { 2780fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE 2781fe6060f1SDimitry Andric " Performing shift-until-zero idiom detection.\n"); 2782fe6060f1SDimitry Andric 2783fe6060f1SDimitry Andric // Give up if the loop has multiple blocks or multiple backedges. 2784fe6060f1SDimitry Andric if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) { 2785fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n"); 2786fe6060f1SDimitry Andric return false; 2787fe6060f1SDimitry Andric } 2788fe6060f1SDimitry Andric 2789fe6060f1SDimitry Andric Instruction *ValShifted, *NBits, *IVNext; 2790fe6060f1SDimitry Andric Value *ExtraOffset; 2791fe6060f1SDimitry Andric 2792fe6060f1SDimitry Andric BasicBlock *LoopHeaderBB = CurLoop->getHeader(); 2793fe6060f1SDimitry Andric BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader(); 2794fe6060f1SDimitry Andric assert(LoopPreheaderBB && "There is always a loop preheader."); 2795fe6060f1SDimitry Andric 2796fe6060f1SDimitry Andric using namespace PatternMatch; 2797fe6060f1SDimitry Andric 2798fe6060f1SDimitry Andric // Step 1: Check if the loop backedge, condition is in desirable form. 2799fe6060f1SDimitry Andric 2800fe6060f1SDimitry Andric ICmpInst::Predicate Pred; 2801fe6060f1SDimitry Andric BasicBlock *TrueBB, *FalseBB; 2802fe6060f1SDimitry Andric if (!match(LoopHeaderBB->getTerminator(), 2803fe6060f1SDimitry Andric m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB), 2804fe6060f1SDimitry Andric m_BasicBlock(FalseBB))) || 2805fe6060f1SDimitry Andric !match(ValShiftedIsZero, 2806fe6060f1SDimitry Andric m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) || 2807fe6060f1SDimitry Andric !ICmpInst::isEquality(Pred)) { 2808fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n"); 2809fe6060f1SDimitry Andric return false; 2810fe6060f1SDimitry Andric } 2811fe6060f1SDimitry Andric 2812fe6060f1SDimitry Andric // Step 2: Check if the comparison's operand is in desirable form. 2813fe6060f1SDimitry Andric // FIXME: Val could be a one-input PHI node, which we should look past. 2814fe6060f1SDimitry Andric if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop), 2815fe6060f1SDimitry Andric m_Instruction(NBits)))) { 2816fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparisons value computation.\n"); 2817fe6060f1SDimitry Andric return false; 2818fe6060f1SDimitry Andric } 2819fe6060f1SDimitry Andric IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz 2820fe6060f1SDimitry Andric : Intrinsic::ctlz; 2821fe6060f1SDimitry Andric 2822fe6060f1SDimitry Andric // Step 3: Check if the shift amount is in desirable form. 2823fe6060f1SDimitry Andric 2824fe6060f1SDimitry Andric if (match(NBits, m_c_Add(m_Instruction(IV), 2825fe6060f1SDimitry Andric m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) && 2826fe6060f1SDimitry Andric (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap())) 2827fe6060f1SDimitry Andric ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset)); 2828fe6060f1SDimitry Andric else if (match(NBits, 2829fe6060f1SDimitry Andric m_Sub(m_Instruction(IV), 2830fe6060f1SDimitry Andric m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) && 2831fe6060f1SDimitry Andric NBits->hasNoSignedWrap()) 2832fe6060f1SDimitry Andric ExtraOffsetExpr = SE->getSCEV(ExtraOffset); 2833fe6060f1SDimitry Andric else { 2834fe6060f1SDimitry Andric IV = NBits; 2835fe6060f1SDimitry Andric ExtraOffsetExpr = SE->getZero(NBits->getType()); 2836fe6060f1SDimitry Andric } 2837fe6060f1SDimitry Andric 2838fe6060f1SDimitry Andric // Step 4: Check if the recurrence is in desirable form. 2839fe6060f1SDimitry Andric auto *IVPN = dyn_cast<PHINode>(IV); 2840fe6060f1SDimitry Andric if (!IVPN || IVPN->getParent() != LoopHeaderBB) { 2841fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n"); 2842fe6060f1SDimitry Andric return false; 2843fe6060f1SDimitry Andric } 2844fe6060f1SDimitry Andric 2845fe6060f1SDimitry Andric Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB); 2846fe6060f1SDimitry Andric IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB)); 2847fe6060f1SDimitry Andric 2848fe6060f1SDimitry Andric if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) { 2849fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n"); 2850fe6060f1SDimitry Andric return false; 2851fe6060f1SDimitry Andric } 2852fe6060f1SDimitry Andric 2853fe6060f1SDimitry Andric // Step 4: Check if the backedge's destinations are in desirable form. 2854fe6060f1SDimitry Andric 2855fe6060f1SDimitry Andric assert(ICmpInst::isEquality(Pred) && 2856fe6060f1SDimitry Andric "Should only get equality predicates here."); 2857fe6060f1SDimitry Andric 2858fe6060f1SDimitry Andric // cmp-br is commutative, so canonicalize to a single variant. 2859fe6060f1SDimitry Andric InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ; 2860fe6060f1SDimitry Andric if (InvertedCond) { 2861fe6060f1SDimitry Andric Pred = ICmpInst::getInversePredicate(Pred); 2862fe6060f1SDimitry Andric std::swap(TrueBB, FalseBB); 2863fe6060f1SDimitry Andric } 2864fe6060f1SDimitry Andric 2865fe6060f1SDimitry Andric // We expect to exit loop when comparison yields true, 2866fe6060f1SDimitry Andric // so when it yields false we should branch back to loop header. 2867fe6060f1SDimitry Andric if (FalseBB != LoopHeaderBB) { 2868fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n"); 2869fe6060f1SDimitry Andric return false; 2870fe6060f1SDimitry Andric } 2871fe6060f1SDimitry Andric 2872fe6060f1SDimitry Andric // The new, countable, loop will certainly only run a known number of 2873fe6060f1SDimitry Andric // iterations, It won't be infinite. But the old loop might be infinite 2874fe6060f1SDimitry Andric // under certain conditions. For logical shifts, the value will become zero 2875fe6060f1SDimitry Andric // after at most bitwidth(%Val) loop iterations. However, for arithmetic 2876fe6060f1SDimitry Andric // right-shift, iff the sign bit was set, the value will never become zero, 2877fe6060f1SDimitry Andric // and the loop may never finish. 2878fe6060f1SDimitry Andric if (ValShifted->getOpcode() == Instruction::AShr && 2879fe6060f1SDimitry Andric !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) { 2880fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n"); 2881fe6060f1SDimitry Andric return false; 2882fe6060f1SDimitry Andric } 2883fe6060f1SDimitry Andric 2884fe6060f1SDimitry Andric // Okay, idiom checks out. 2885fe6060f1SDimitry Andric return true; 2886fe6060f1SDimitry Andric } 2887fe6060f1SDimitry Andric 2888fe6060f1SDimitry Andric /// Look for the following loop: 2889fe6060f1SDimitry Andric /// \code 2890fe6060f1SDimitry Andric /// entry: 2891fe6060f1SDimitry Andric /// <...> 2892fe6060f1SDimitry Andric /// %start = <...> 2893fe6060f1SDimitry Andric /// %extraoffset = <...> 2894fe6060f1SDimitry Andric /// <...> 2895fe6060f1SDimitry Andric /// br label %for.cond 2896fe6060f1SDimitry Andric /// 2897fe6060f1SDimitry Andric /// loop: 2898fe6060f1SDimitry Andric /// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ] 2899fe6060f1SDimitry Andric /// %nbits = add nsw i8 %iv, %extraoffset 2900fe6060f1SDimitry Andric /// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits 2901fe6060f1SDimitry Andric /// %val.shifted.iszero = icmp eq i8 %val.shifted, 0 2902fe6060f1SDimitry Andric /// %iv.next = add i8 %iv, 1 2903fe6060f1SDimitry Andric /// <...> 2904fe6060f1SDimitry Andric /// br i1 %val.shifted.iszero, label %end, label %loop 2905fe6060f1SDimitry Andric /// 2906fe6060f1SDimitry Andric /// end: 2907fe6060f1SDimitry Andric /// %iv.res = phi i8 [ %iv, %loop ] <...> 2908fe6060f1SDimitry Andric /// %nbits.res = phi i8 [ %nbits, %loop ] <...> 2909fe6060f1SDimitry Andric /// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...> 2910fe6060f1SDimitry Andric /// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...> 2911fe6060f1SDimitry Andric /// %iv.next.res = phi i8 [ %iv.next, %loop ] <...> 2912fe6060f1SDimitry Andric /// <...> 2913fe6060f1SDimitry Andric /// \endcode 2914fe6060f1SDimitry Andric /// 2915fe6060f1SDimitry Andric /// And transform it into: 2916fe6060f1SDimitry Andric /// \code 2917fe6060f1SDimitry Andric /// entry: 2918fe6060f1SDimitry Andric /// <...> 2919fe6060f1SDimitry Andric /// %start = <...> 2920fe6060f1SDimitry Andric /// %extraoffset = <...> 2921fe6060f1SDimitry Andric /// <...> 2922fe6060f1SDimitry Andric /// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0) 2923fe6060f1SDimitry Andric /// %val.numactivebits = sub i8 8, %val.numleadingzeros 2924fe6060f1SDimitry Andric /// %extraoffset.neg = sub i8 0, %extraoffset 2925fe6060f1SDimitry Andric /// %tmp = add i8 %val.numactivebits, %extraoffset.neg 2926fe6060f1SDimitry Andric /// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start) 2927fe6060f1SDimitry Andric /// %loop.tripcount = sub i8 %iv.final, %start 2928fe6060f1SDimitry Andric /// br label %loop 2929fe6060f1SDimitry Andric /// 2930fe6060f1SDimitry Andric /// loop: 2931fe6060f1SDimitry Andric /// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ] 2932fe6060f1SDimitry Andric /// %loop.iv.next = add i8 %loop.iv, 1 2933fe6060f1SDimitry Andric /// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount 2934fe6060f1SDimitry Andric /// %iv = add i8 %loop.iv, %start 2935fe6060f1SDimitry Andric /// <...> 2936fe6060f1SDimitry Andric /// br i1 %loop.ivcheck, label %end, label %loop 2937fe6060f1SDimitry Andric /// 2938fe6060f1SDimitry Andric /// end: 2939fe6060f1SDimitry Andric /// %iv.res = phi i8 [ %iv.final, %loop ] <...> 2940fe6060f1SDimitry Andric /// <...> 2941fe6060f1SDimitry Andric /// \endcode 2942fe6060f1SDimitry Andric bool LoopIdiomRecognize::recognizeShiftUntilZero() { 2943fe6060f1SDimitry Andric bool MadeChange = false; 2944fe6060f1SDimitry Andric 2945fe6060f1SDimitry Andric Instruction *ValShiftedIsZero; 2946fe6060f1SDimitry Andric Intrinsic::ID IntrID; 2947fe6060f1SDimitry Andric Instruction *IV; 2948fe6060f1SDimitry Andric Value *Start, *Val; 2949fe6060f1SDimitry Andric const SCEV *ExtraOffsetExpr; 2950fe6060f1SDimitry Andric bool InvertedCond; 2951fe6060f1SDimitry Andric if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV, 2952fe6060f1SDimitry Andric Start, Val, ExtraOffsetExpr, InvertedCond)) { 2953fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE 2954fe6060f1SDimitry Andric " shift-until-zero idiom detection failed.\n"); 2955fe6060f1SDimitry Andric return MadeChange; 2956fe6060f1SDimitry Andric } 2957fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n"); 2958fe6060f1SDimitry Andric 2959fe6060f1SDimitry Andric // Ok, it is the idiom we were looking for, we *could* transform this loop, 2960fe6060f1SDimitry Andric // but is it profitable to transform? 2961fe6060f1SDimitry Andric 2962fe6060f1SDimitry Andric BasicBlock *LoopHeaderBB = CurLoop->getHeader(); 2963fe6060f1SDimitry Andric BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader(); 2964fe6060f1SDimitry Andric assert(LoopPreheaderBB && "There is always a loop preheader."); 2965fe6060f1SDimitry Andric 2966fe6060f1SDimitry Andric BasicBlock *SuccessorBB = CurLoop->getExitBlock(); 2967fe6060f1SDimitry Andric assert(SuccessorBB && "There is only a single successor."); 2968fe6060f1SDimitry Andric 2969fe6060f1SDimitry Andric IRBuilder<> Builder(LoopPreheaderBB->getTerminator()); 2970fe6060f1SDimitry Andric Builder.SetCurrentDebugLocation(IV->getDebugLoc()); 2971fe6060f1SDimitry Andric 2972fe6060f1SDimitry Andric Type *Ty = Val->getType(); 2973fe6060f1SDimitry Andric unsigned Bitwidth = Ty->getScalarSizeInBits(); 2974fe6060f1SDimitry Andric 2975fe6060f1SDimitry Andric TargetTransformInfo::TargetCostKind CostKind = 2976fe6060f1SDimitry Andric TargetTransformInfo::TCK_SizeAndLatency; 2977fe6060f1SDimitry Andric 2978fe6060f1SDimitry Andric // The rewrite is considered to be unprofitable iff and only iff the 2979fe6060f1SDimitry Andric // intrinsic we'll use are not cheap. Note that we are okay with *just* 2980fe6060f1SDimitry Andric // making the loop countable, even if nothing else changes. 2981fe6060f1SDimitry Andric IntrinsicCostAttributes Attrs( 298206c3fb27SDimitry Andric IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()}); 2983fe6060f1SDimitry Andric InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind); 2984fe6060f1SDimitry Andric if (Cost > TargetTransformInfo::TCC_Basic) { 2985fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE 2986fe6060f1SDimitry Andric " Intrinsic is too costly, not beneficial\n"); 2987fe6060f1SDimitry Andric return MadeChange; 2988fe6060f1SDimitry Andric } 2989fe6060f1SDimitry Andric 2990fe6060f1SDimitry Andric // Ok, transform appears worthwhile. 2991fe6060f1SDimitry Andric MadeChange = true; 2992fe6060f1SDimitry Andric 2993fe6060f1SDimitry Andric bool OffsetIsZero = false; 2994fe6060f1SDimitry Andric if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr)) 2995fe6060f1SDimitry Andric OffsetIsZero = ExtraOffsetExprC->isZero(); 2996fe6060f1SDimitry Andric 2997fe6060f1SDimitry Andric // Step 1: Compute the loop's final IV value / trip count. 2998fe6060f1SDimitry Andric 2999fe6060f1SDimitry Andric CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic( 300006c3fb27SDimitry Andric IntrID, Ty, {Val, /*is_zero_poison=*/Builder.getFalse()}, 3001fe6060f1SDimitry Andric /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros"); 3002fe6060f1SDimitry Andric Value *ValNumActiveBits = Builder.CreateSub( 3003fe6060f1SDimitry Andric ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros, 3004fe6060f1SDimitry Andric Val->getName() + ".numactivebits", /*HasNUW=*/true, 3005fe6060f1SDimitry Andric /*HasNSW=*/Bitwidth != 2); 3006fe6060f1SDimitry Andric 3007fe6060f1SDimitry Andric SCEVExpander Expander(*SE, *DL, "loop-idiom"); 3008fe6060f1SDimitry Andric Expander.setInsertPoint(&*Builder.GetInsertPoint()); 3009fe6060f1SDimitry Andric Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr); 3010fe6060f1SDimitry Andric 3011fe6060f1SDimitry Andric Value *ValNumActiveBitsOffset = Builder.CreateAdd( 3012fe6060f1SDimitry Andric ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset", 3013fe6060f1SDimitry Andric /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true); 3014fe6060f1SDimitry Andric Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty}, 3015fe6060f1SDimitry Andric {ValNumActiveBitsOffset, Start}, 3016fe6060f1SDimitry Andric /*FMFSource=*/nullptr, "iv.final"); 3017fe6060f1SDimitry Andric 3018fe6060f1SDimitry Andric auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub( 3019fe6060f1SDimitry Andric IVFinal, Start, CurLoop->getName() + ".backedgetakencount", 3020fe6060f1SDimitry Andric /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true)); 3021fe6060f1SDimitry Andric // FIXME: or when the offset was `add nuw` 3022fe6060f1SDimitry Andric 3023fe6060f1SDimitry Andric // We know loop's backedge-taken count, but what's loop's trip count? 3024fe6060f1SDimitry Andric Value *LoopTripCount = 3025fe6060f1SDimitry Andric Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1), 3026fe6060f1SDimitry Andric CurLoop->getName() + ".tripcount", /*HasNUW=*/true, 3027fe6060f1SDimitry Andric /*HasNSW=*/Bitwidth != 2); 3028fe6060f1SDimitry Andric 3029fe6060f1SDimitry Andric // Step 2: Adjust the successor basic block to recieve the original 3030fe6060f1SDimitry Andric // induction variable's final value instead of the orig. IV itself. 3031fe6060f1SDimitry Andric 3032fe6060f1SDimitry Andric IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB); 3033fe6060f1SDimitry Andric 3034fe6060f1SDimitry Andric // Step 3: Rewrite the loop into a countable form, with canonical IV. 3035fe6060f1SDimitry Andric 3036fe6060f1SDimitry Andric // The new canonical induction variable. 30375f757f3fSDimitry Andric Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin()); 3038fe6060f1SDimitry Andric auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv"); 3039fe6060f1SDimitry Andric 3040fe6060f1SDimitry Andric // The induction itself. 30415f757f3fSDimitry Andric Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->getFirstNonPHIIt()); 3042fe6060f1SDimitry Andric auto *CIVNext = 3043fe6060f1SDimitry Andric Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next", 3044fe6060f1SDimitry Andric /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2); 3045fe6060f1SDimitry Andric 3046fe6060f1SDimitry Andric // The loop trip count check. 3047fe6060f1SDimitry Andric auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount, 3048fe6060f1SDimitry Andric CurLoop->getName() + ".ivcheck"); 3049fe6060f1SDimitry Andric auto *NewIVCheck = CIVCheck; 3050fe6060f1SDimitry Andric if (InvertedCond) { 3051fe6060f1SDimitry Andric NewIVCheck = Builder.CreateNot(CIVCheck); 3052fe6060f1SDimitry Andric NewIVCheck->takeName(ValShiftedIsZero); 3053fe6060f1SDimitry Andric } 3054fe6060f1SDimitry Andric 3055fe6060f1SDimitry Andric // The original IV, but rebased to be an offset to the CIV. 3056fe6060f1SDimitry Andric auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false, 3057fe6060f1SDimitry Andric /*HasNSW=*/true); // FIXME: what about NUW? 3058fe6060f1SDimitry Andric IVDePHId->takeName(IV); 3059fe6060f1SDimitry Andric 3060fe6060f1SDimitry Andric // The loop terminator. 3061fe6060f1SDimitry Andric Builder.SetInsertPoint(LoopHeaderBB->getTerminator()); 3062fe6060f1SDimitry Andric Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB); 3063fe6060f1SDimitry Andric LoopHeaderBB->getTerminator()->eraseFromParent(); 3064fe6060f1SDimitry Andric 3065fe6060f1SDimitry Andric // Populate the IV PHI. 3066fe6060f1SDimitry Andric CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB); 3067fe6060f1SDimitry Andric CIV->addIncoming(CIVNext, LoopHeaderBB); 3068fe6060f1SDimitry Andric 3069fe6060f1SDimitry Andric // Step 4: Forget the "non-computable" trip-count SCEV associated with the 3070fe6060f1SDimitry Andric // loop. The loop would otherwise not be deleted even if it becomes empty. 3071fe6060f1SDimitry Andric 3072fe6060f1SDimitry Andric SE->forgetLoop(CurLoop); 3073fe6060f1SDimitry Andric 3074fe6060f1SDimitry Andric // Step 5: Try to cleanup the loop's body somewhat. 3075fe6060f1SDimitry Andric IV->replaceAllUsesWith(IVDePHId); 3076fe6060f1SDimitry Andric IV->eraseFromParent(); 3077fe6060f1SDimitry Andric 3078fe6060f1SDimitry Andric ValShiftedIsZero->replaceAllUsesWith(NewIVCheck); 3079fe6060f1SDimitry Andric ValShiftedIsZero->eraseFromParent(); 3080fe6060f1SDimitry Andric 3081fe6060f1SDimitry Andric // Other passes will take care of actually deleting the loop if possible. 3082fe6060f1SDimitry Andric 3083fe6060f1SDimitry Andric LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n"); 3084fe6060f1SDimitry Andric 3085fe6060f1SDimitry Andric ++NumShiftUntilZero; 3086fe6060f1SDimitry Andric return MadeChange; 3087fe6060f1SDimitry Andric } 3088