//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
/// purpose of this pass is to do some IR pattern matching to create ACLE
/// DSP intrinsics, which map onto these 32-bit SIMD operations.
/// This pass runs only when unaligned accesses are supported/enabled.
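///
/// As an illustrative sketch (the variable names here are hypothetical and
/// only show the shape of a candidate), the canonical target is a 16-bit
/// dot-product reduction such as:
///
/// \code
///   int32_t acc = 0;
///   for (unsigned i = 0; i < N; i += 2)
///     acc += a[i] * b[i] + a[i + 1] * b[i + 1];
/// \endcode
///
/// Each pair of adjacent, sign-extended i16 loads feeding the multiplies can
/// then be widened into a single i32 load, and the two multiply-accumulates
/// folded into one SMLAD.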
//
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMSubtarget.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "arm-parallel-dsp"

STATISTIC(NumSMLAD, "Number of smlad instructions generated");

static cl::opt<bool>
DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
                   cl::desc("Disable the ARM Parallel DSP pass"));

static cl::opt<unsigned>
NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16),
             cl::desc("Limit the number of loads analysed"));

namespace {
  struct MulCandidate;
  class Reduction;

  using MulCandList = SmallVector<std::unique_ptr<MulCandidate>, 8>;
  using MemInstList = SmallVectorImpl<LoadInst*>;
  using MulPairList = SmallVector<std::pair<MulCandidate*, MulCandidate*>, 8>;

  // 'MulCandidate' holds the multiplication instructions that are candidates
  // for parallel execution.
  struct MulCandidate {
    Instruction *Root;
    Value* LHS;
    Value* RHS;
    bool Exchange = false;
    bool Paired = false;
    SmallVector<LoadInst*, 2> VecLd; // Container for loads to widen.

    MulCandidate(Instruction *I, Value *lhs, Value *rhs) :
      Root(I), LHS(lhs), RHS(rhs) { }

    bool HasTwoLoadInputs() const {
      return isa<LoadInst>(LHS) && isa<LoadInst>(RHS);
    }

    LoadInst *getBaseLoad() const {
      return VecLd.front();
    }
  };

  /// Represent a sequence of multiply-accumulate operations with the aim to
  /// perform the multiplications in parallel.
  class Reduction {
    Instruction *Root = nullptr;
    Value *Acc = nullptr;
    MulCandList Muls;
    MulPairList MulPairs;
    SetVector<Instruction*> Adds;

  public:
    Reduction() = delete;

    Reduction (Instruction *Add) : Root(Add) { }

    /// Record an Add instruction that is a part of this reduction.
    void InsertAdd(Instruction *I) { Adds.insert(I); }

    /// Create MulCandidates, each rooted at a Mul instruction that is a part
    /// of this reduction.
    void InsertMuls() {
      auto GetMulOperand = [](Value *V) -> Instruction* {
        if (auto *SExt = dyn_cast<SExtInst>(V)) {
          if (auto *I = dyn_cast<Instruction>(SExt->getOperand(0)))
            if (I->getOpcode() == Instruction::Mul)
              return I;
        } else if (auto *I = dyn_cast<Instruction>(V)) {
          if (I->getOpcode() == Instruction::Mul)
            return I;
        }
        return nullptr;
      };

      auto InsertMul = [this](Instruction *I) {
        Value *LHS = cast<Instruction>(I->getOperand(0))->getOperand(0);
        Value *RHS = cast<Instruction>(I->getOperand(1))->getOperand(0);
        Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS));
      };

      for (auto *Add : Adds) {
        if (Add == Acc)
          continue;
        if (auto *Mul = GetMulOperand(Add->getOperand(0)))
          InsertMul(Mul);
        if (auto *Mul = GetMulOperand(Add->getOperand(1)))
          InsertMul(Mul);
      }
    }

    /// Add the incoming accumulator value, returns true if a value had not
    /// already been added. Returning false signals to the user that this
    /// reduction already has a value to initialise the accumulator.
    bool InsertAcc(Value *V) {
      if (Acc)
        return false;
      Acc = V;
      return true;
    }

    /// Pair two MulCandidates, rooted at muls, that can be executed as a
    /// single parallel operation.
    void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1,
                    bool Exchange = false) {
      LLVM_DEBUG(dbgs() << "Pairing:\n"
                 << *Mul0->Root << "\n"
                 << *Mul1->Root << "\n");
      Mul0->Paired = true;
      Mul1->Paired = true;
      if (Exchange)
        Mul1->Exchange = true;
      MulPairs.push_back(std::make_pair(Mul0, Mul1));
    }

    /// Return the add instruction which is the root of the reduction.
    Instruction *getRoot() { return Root; }

    bool is64Bit() const { return Root->getType()->isIntegerTy(64); }

    Type *getType() const { return Root->getType(); }

    /// Return the incoming value to be accumulated. This may be null.
    Value *getAccumulator() { return Acc; }

    /// Return the set of adds that comprise the reduction.
    SetVector<Instruction*> &getAdds() { return Adds; }

    /// Return the MulCandidates, each rooted at a mul instruction, that
    /// comprise the reduction.
    MulCandList &getMuls() { return Muls; }

    /// Return the MulCandidates, rooted at mul instructions, that have been
    /// paired for parallel execution.
    MulPairList &getMulPairs() { return MulPairs; }

    /// To finalise, replace the uses of the root with the intrinsic call.
    void UpdateRoot(Instruction *SMLAD) {
      Root->replaceAllUsesWith(SMLAD);
    }

    void dump() {
      LLVM_DEBUG(dbgs() << "Reduction:\n";
        for (auto *Add : Adds)
          LLVM_DEBUG(dbgs() << *Add << "\n");
        for (auto &Mul : Muls)
          LLVM_DEBUG(dbgs() << *Mul->Root << "\n"
                     << " " << *Mul->LHS << "\n"
                     << " " << *Mul->RHS << "\n");
        LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n")
      );
    }
  };

  class WidenedLoad {
    LoadInst *NewLd = nullptr;
    SmallVector<LoadInst*, 4> Loads;

  public:
    WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
      : NewLd(Wide) {
      append_range(Loads, Lds);
    }
    LoadInst *getLoad() {
      return NewLd;
    }
  };

  class ARMParallelDSP : public FunctionPass {
    ScalarEvolution *SE;
    AliasAnalysis *AA;
    TargetLibraryInfo *TLI;
    DominatorTree *DT;
    const DataLayout *DL;
    Module *M;
    std::map<LoadInst*, LoadInst*> LoadPairs;
    SmallPtrSet<LoadInst*, 4> OffsetLoads;
    std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;

    template<unsigned>
    bool IsNarrowSequence(Value *V);
    bool Search(Value *V, BasicBlock *BB, Reduction &R);
    bool RecordMemoryOps(BasicBlock *BB);
    void InsertParallelMACs(Reduction &Reduction);
    bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
    LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy);
    bool CreateParallelPairs(Reduction &R);

    /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
    /// Dual, which performs two signed 16x16-bit multiplications. It adds the
    /// products to a 32-bit accumulate operand. Optionally, the instruction can
    /// exchange the halfwords of the second operand before performing the
    /// arithmetic.
    bool MatchSMLAD(Function &F);

  public:
    static char ID;

    ARMParallelDSP() : FunctionPass(ID) { }

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      FunctionPass::getAnalysisUsage(AU);
      AU.addRequired<AssumptionCacheTracker>();
      AU.addRequired<ScalarEvolutionWrapperPass>();
      AU.addRequired<AAResultsWrapperPass>();
      AU.addRequired<TargetLibraryInfoWrapperPass>();
      AU.addRequired<DominatorTreeWrapperPass>();
      AU.addRequired<TargetPassConfig>();
      AU.addPreserved<ScalarEvolutionWrapperPass>();
      AU.addPreserved<GlobalsAAWrapperPass>();
      AU.setPreservesCFG();
    }

    bool runOnFunction(Function &F) override {
      if (DisableParallelDSP)
        return false;
      if (skipFunction(F))
        return false;

      SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
      AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
      TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
      auto &TPC = getAnalysis<TargetPassConfig>();

      M = F.getParent();
      DL = &M->getDataLayout();

      auto &TM = TPC.getTM<TargetMachine>();
      auto *ST = &TM.getSubtarget<ARMSubtarget>(F);

      if (!ST->allowsUnalignedMem()) {
        LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
                             "running pass ARMParallelDSP\n");
        return false;
      }

      if (!ST->hasDSP()) {
        LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
                             "ARMParallelDSP\n");
        return false;
      }

      if (!ST->isLittle()) {
        LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
                          << "ARMParallelDSP\n");
        return false;
      }

      LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
      LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");

      bool Changes = MatchSMLAD(F);
      return Changes;
    }
  };
}

bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
                                        MemInstList &VecMem) {
  if (!Ld0 || !Ld1)
    return false;

  if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
    return false;

  LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n";
    dbgs() << "Ld0:"; Ld0->dump();
    dbgs() << "Ld1:"; Ld1->dump();
  );

  VecMem.clear();
  VecMem.push_back(Ld0);
  VecMem.push_back(Ld1);
  return true;
}

// MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
// instructions, which is set to 16. So here we should collect all i8 and i16
// narrow operations.
// TODO: we currently only collect i16, and will support i8 later, so that's
// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
template<unsigned MaxBitWidth>
bool ARMParallelDSP::IsNarrowSequence(Value *V) {
  if (auto *SExt = dyn_cast<SExtInst>(V)) {
    if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
      return false;

    if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0))) {
      // Check that this load could be paired.
      return LoadPairs.count(Ld) || OffsetLoads.count(Ld);
    }
  }
  return false;
}

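// As a rough sketch (value names here are purely illustrative), the narrow
// sequences accepted above, and the load pairs recorded below, have the
// following shape in the IR:
//
//   %ld0 = load i16, ptr %addr        ; base load
//   %ld1 = load i16, ptr %addr.2      ; offset load, 2 bytes after %ld0
//   %s0  = sext i16 %ld0 to i32
//   %s1  = sext i16 %ld1 to i32
//
// Both loads are simple, each has a single sign-extending user, and they are
// consecutive in memory, so they can be recorded as a (base, offset) pair and
// later merged into one i32 load by CreateWideLoad.
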
/// Iterate through the block and record base, offset pairs of loads which can
/// be widened into a single load.
bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
  SmallVector<LoadInst*, 8> Loads;
  SmallVector<Instruction*, 8> Writes;
  LoadPairs.clear();
  WideLoads.clear();

  // Collect loads and instructions that may write to memory. For now we only
  // record loads which are simple, sign-extended and have a single user.
  // TODO: Allow zero-extended loads.
  for (auto &I : *BB) {
    if (I.mayWriteToMemory())
      Writes.push_back(&I);
    auto *Ld = dyn_cast<LoadInst>(&I);
    if (!Ld || !Ld->isSimple() ||
        !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
      continue;
    Loads.push_back(Ld);
  }

  if (Loads.empty() || Loads.size() > NumLoadLimit)
    return false;

  using InstSet = std::set<Instruction*>;
  using DepMap = std::map<Instruction*, InstSet>;
  DepMap RAWDeps;

  // Record any writes that may alias a load.
  const auto Size = LocationSize::beforeOrAfterPointer();
  for (auto *Write : Writes) {
    for (auto *Read : Loads) {
      MemoryLocation ReadLoc =
        MemoryLocation(Read->getPointerOperand(), Size);

      if (!isModOrRefSet(AA->getModRefInfo(Write, ReadLoc)))
        continue;
      if (Write->comesBefore(Read))
        RAWDeps[Read].insert(Write);
    }
  }

  // Check that there is no write between the two loads which would prevent
  // them from being safely merged.
  auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
    bool BaseFirst = Base->comesBefore(Offset);
    LoadInst *Dominator = BaseFirst ? Base : Offset;
    LoadInst *Dominated = BaseFirst ? Offset : Base;

    if (RAWDeps.count(Dominated)) {
      InstSet &WritesBefore = RAWDeps[Dominated];

      for (auto *Before : WritesBefore) {
        // We can't move the second load backward, past a write, to merge
        // with the first load.
        if (Dominator->comesBefore(Before))
          return false;
      }
    }
    return true;
  };

  // Record base, offset load pairs.
  for (auto *Base : Loads) {
    for (auto *Offset : Loads) {
      if (Base == Offset || OffsetLoads.count(Offset))
        continue;

      if (isConsecutiveAccess(Base, Offset, *DL, *SE) &&
          SafeToPair(Base, Offset)) {
        LoadPairs[Base] = Offset;
        OffsetLoads.insert(Offset);
        break;
      }
    }
  }

  LLVM_DEBUG(if (!LoadPairs.empty()) {
               dbgs() << "Consecutive load pairs:\n";
               for (auto &MapIt : LoadPairs) {
                 LLVM_DEBUG(dbgs() << *MapIt.first << ", "
                            << *MapIt.second << "\n");
               }
             });
  return LoadPairs.size() > 1;
}

// Search recursively back through the operands to find a tree of values that
// form a multiply-accumulate chain. The search records the Add and Mul
// instructions that form the reduction and allows us to find a single value
// to be used as the initial input to the accumulator.
bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) {
  // If we find a non-instruction, try to use it as the initial accumulator
  // value. This may have already been found during the search in which case
  // this function will return false, signaling a search fail.
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return R.InsertAcc(V);

  if (I->getParent() != BB)
    return false;

  switch (I->getOpcode()) {
  default:
    break;
  case Instruction::PHI:
    // Could be the accumulator value.
    return R.InsertAcc(V);
  case Instruction::Add: {
    // Adds should be adding together two muls, or another add and a mul to
    // be within the mac chain. One of the operands may also be the
    // accumulator value at which point we should stop searching.
    R.InsertAdd(I);
    Value *LHS = I->getOperand(0);
    Value *RHS = I->getOperand(1);
    bool ValidLHS = Search(LHS, BB, R);
    bool ValidRHS = Search(RHS, BB, R);

    if (ValidLHS && ValidRHS)
      return true;

    // Ensure we don't add the root as the incoming accumulator.
    if (R.getRoot() == I)
      return false;

    return R.InsertAcc(I);
  }
  case Instruction::Mul: {
    Value *MulOp0 = I->getOperand(0);
    Value *MulOp1 = I->getOperand(1);
    return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1);
  }
  case Instruction::SExt:
    return Search(I->getOperand(0), BB, R);
  }
  return false;
}

// The pass needs to identify integer add/sub reductions of 16-bit vector
// multiplications.
// To use SMLAD:
// 1) we first need to find an integer add, then look for this pattern:
//
// acc0 = ...
// ld0 = load i16
// sext0 = sext i16 %ld0 to i32
// ld1 = load i16
// sext1 = sext i16 %ld1 to i32
// mul0 = mul %sext0, %sext1
// ld2 = load i16
// sext2 = sext i16 %ld2 to i32
// ld3 = load i16
// sext3 = sext i16 %ld3 to i32
// mul1 = mul i32 %sext2, %sext3
// add0 = add i32 %mul0, %acc0
// acc1 = add i32 %add0, %mul1
//
// Which can be selected to:
//
// ldr r0
// ldr r1
// smlad r2, r0, r1, r2
//
// If constants are used instead of loads, these will need to be hoisted
// out and into a register.
//
// If loop invariants are used instead of loads, these need to be packed
// before the loop begins.
//
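// At the IR level, assuming ld0/ld2 and ld1/ld3 turn out to be the two
// consecutive load pairs, the reduction above is rewritten into roughly the
// following (value names are illustrative only):
//
// wide0 = load i32        ; combines ld0 and ld2
// wide1 = load i32        ; combines ld1 and ld3
// acc1 = call i32 @llvm.arm.smlad(i32 %wide0, i32 %wide1, i32 %acc0)
//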
bool ARMParallelDSP::MatchSMLAD(Function &F) {
  bool Changed = false;

  for (auto &BB : F) {
    SmallPtrSet<Instruction*, 4> AllAdds;
    if (!RecordMemoryOps(&BB))
      continue;

    for (Instruction &I : reverse(BB)) {
      if (I.getOpcode() != Instruction::Add)
        continue;

      if (AllAdds.count(&I))
        continue;

      const auto *Ty = I.getType();
      if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
        continue;

      Reduction R(&I);
      if (!Search(&I, &BB, R))
        continue;

      R.InsertMuls();
      LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump());

      if (!CreateParallelPairs(R))
        continue;

      InsertParallelMACs(R);
      Changed = true;
      AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
      LLVM_DEBUG(dbgs() << "BB after inserting parallel MACs:\n" << BB);
    }
  }

  return Changed;
}

bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {

  // Not enough mul operations to make a pair.
  if (R.getMuls().size() < 2)
    return false;

  // Check that the muls operate directly upon sign extended loads.
  for (auto &MulCand : R.getMuls()) {
    if (!MulCand->HasTwoLoadInputs())
      return false;
  }

  auto CanPair = [&](Reduction &R, MulCandidate *PMul0, MulCandidate *PMul1) {
    // The first elements of each vector should be loads with sexts. If we
    // find that they are two pairs of consecutive loads, then these can be
    // transformed into two wider loads and the users can be replaced with
    // DSP intrinsics.
    auto Ld0 = static_cast<LoadInst*>(PMul0->LHS);
    auto Ld1 = static_cast<LoadInst*>(PMul1->LHS);
    auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
    auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);

    // Check that each mul is operating on two different loads.
    if (Ld0 == Ld2 || Ld1 == Ld3)
      return false;

    if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
      if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
        R.AddMulPair(PMul0, PMul1);
        return true;
      } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
        LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
        R.AddMulPair(PMul0, PMul1, true);
        return true;
      }
    } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
               AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
      LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
      LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n");
      LLVM_DEBUG(dbgs() << " and swapping muls\n");
      // Only the second operand can be exchanged, so swap the muls.
      R.AddMulPair(PMul1, PMul0, true);
      return true;
    }
    return false;
  };

  MulCandList &Muls = R.getMuls();
  const unsigned Elems = Muls.size();
  for (unsigned i = 0; i < Elems; ++i) {
    MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get());
    if (PMul0->Paired)
      continue;

    for (unsigned j = 0; j < Elems; ++j) {
      if (i == j)
        continue;

      MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get());
      if (PMul1->Paired)
        continue;

      const Instruction *Mul0 = PMul0->Root;
      const Instruction *Mul1 = PMul1->Root;
      if (Mul0 == Mul1)
        continue;

      assert(PMul0 != PMul1 && "expected different chains");

      if (CanPair(R, PMul0, PMul1))
        break;
    }
  }
  return !R.getMulPairs().empty();
}

void ARMParallelDSP::InsertParallelMACs(Reduction &R) {

  auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1,
                         Value *Acc, bool Exchange,
                         Instruction *InsertAfter) {
    // Replace the reduction chain with an intrinsic call.

    Value* Args[] = { WideLd0, WideLd1, Acc };
    Function *SMLAD = nullptr;
    if (Exchange)
      SMLAD = Acc->getType()->isIntegerTy(32) ?
        Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) :
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx);
    else
      SMLAD = Acc->getType()->isIntegerTy(32) ?
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);

    IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
                                BasicBlock::iterator(InsertAfter));
    Instruction *Call = Builder.CreateCall(SMLAD, Args);
    NumSMLAD++;
    return Call;
  };

  // Return the instruction after the dominated instruction.
  auto GetInsertPoint = [this](Value *A, Value *B) {
    assert((isa<Instruction>(A) || isa<Instruction>(B)) &&
           "expected at least one instruction");

    Value *V = nullptr;
    if (!isa<Instruction>(A))
      V = B;
    else if (!isa<Instruction>(B))
      V = A;
    else
      V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A;

    return &*++BasicBlock::iterator(cast<Instruction>(V));
  };

  Value *Acc = R.getAccumulator();

  // For any muls that were discovered but not paired, accumulate their values
  // as before.
  IRBuilder<NoFolder> Builder(R.getRoot()->getParent());
  MulCandList &MulCands = R.getMuls();
  for (auto &MulCand : MulCands) {
    if (MulCand->Paired)
      continue;

    Instruction *Mul = cast<Instruction>(MulCand->Root);
    LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");

    if (R.getType() != Mul->getType()) {
      assert(R.is64Bit() && "expected 64-bit result");
      Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul));
      Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType()));
    }

    if (!Acc) {
      Acc = Mul;
      continue;
    }

    // If Acc is the original incoming value to the reduction, it could be a
    // phi. But the phi will dominate Mul, meaning that Mul will be the
    // insertion point.
    Builder.SetInsertPoint(GetInsertPoint(Mul, Acc));
    Acc = Builder.CreateAdd(Mul, Acc);
  }

  if (!Acc) {
    Acc = R.is64Bit() ?
      ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) :
      ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
  } else if (Acc->getType() != R.getType()) {
    Builder.SetInsertPoint(R.getRoot());
    Acc = Builder.CreateSExt(Acc, R.getType());
  }

  // Roughly sort the mul pairs in their program order.
  llvm::sort(R.getMulPairs(), [](auto &PairA, auto &PairB) {
    const Instruction *A = PairA.first->Root;
    const Instruction *B = PairB.first->Root;
    return A->comesBefore(B);
  });

  IntegerType *Ty = IntegerType::get(M->getContext(), 32);
  for (auto &Pair : R.getMulPairs()) {
    MulCandidate *LHSMul = Pair.first;
    MulCandidate *RHSMul = Pair.second;
    LoadInst *BaseLHS = LHSMul->getBaseLoad();
    LoadInst *BaseRHS = RHSMul->getBaseLoad();
    LoadInst *WideLHS = WideLoads.count(BaseLHS) ?
      WideLoads[BaseLHS]->getLoad() : CreateWideLoad(LHSMul->VecLd, Ty);
    LoadInst *WideRHS = WideLoads.count(BaseRHS) ?
      WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty);

    Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS);
    InsertAfter = GetInsertPoint(InsertAfter, Acc);
    Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
  }
  R.UpdateRoot(cast<Instruction>(Acc));
}
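
// For reference, the intrinsics selected above are expected to have the
// following shapes: the 32-bit accumulating forms take and return i32, while
// the long (64-bit) forms take and return an i64 accumulator:
//
//   declare i32 @llvm.arm.smlad(i32, i32, i32)
//   declare i32 @llvm.arm.smladx(i32, i32, i32)
//   declare i64 @llvm.arm.smlald(i32, i32, i64)
//   declare i64 @llvm.arm.smlaldx(i32, i32, i64)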

LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
                                         IntegerType *LoadTy) {
  assert(Loads.size() == 2 && "currently only support widening two loads");

  LoadInst *Base = Loads[0];
  LoadInst *Offset = Loads[1];

  Instruction *BaseSExt = dyn_cast<SExtInst>(Base->user_back());
  Instruction *OffsetSExt = dyn_cast<SExtInst>(Offset->user_back());

  assert((BaseSExt && OffsetSExt)
         && "Loads should have a single, extending, user");

  std::function<void(Value*, Value*)> MoveBefore =
    [&](Value *A, Value *B) -> void {
      if (!isa<Instruction>(A) || !isa<Instruction>(B))
        return;

      auto *Source = cast<Instruction>(A);
      auto *Sink = cast<Instruction>(B);

      if (DT->dominates(Source, Sink) ||
          Source->getParent() != Sink->getParent() ||
          isa<PHINode>(Source) || isa<PHINode>(Sink))
        return;

      Source->moveBefore(Sink);
      for (auto &Op : Source->operands())
        MoveBefore(Op, Source);
    };

  // Insert the load at the point of the original dominating load.
  LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
  IRBuilder<NoFolder> IRB(DomLoad->getParent(),
                          ++BasicBlock::iterator(DomLoad));

  // Create the wide load, while making sure to maintain the original alignment
  // as this prevents ldrd from being generated when it could be illegal due to
  // memory alignment.
  Value *VecPtr = Base->getPointerOperand();
  LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr, Base->getAlign());

  // Make sure everything is in the correct order in the basic block.
  MoveBefore(Base->getPointerOperand(), VecPtr);
  MoveBefore(VecPtr, WideLoad);

  // From the wide load, create two values that equal the original two loads.
  // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
  // TODO: Support big-endian as well.
  Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
  Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
  BaseSExt->replaceAllUsesWith(NewBaseSExt);

  IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
  Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
  Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
  Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
  Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
  OffsetSExt->replaceAllUsesWith(NewOffsetSExt);

  LLVM_DEBUG(dbgs() << "From Base and Offset:\n"
             << *Base << "\n" << *Offset << "\n"
             << "Created Wide Load:\n"
             << *WideLoad << "\n"
             << *Bottom << "\n"
             << *NewBaseSExt << "\n"
             << *Top << "\n"
             << *Trunc << "\n"
             << *NewOffsetSExt << "\n");
  WideLoads.emplace(std::make_pair(Base,
                                   std::make_unique<WidenedLoad>(Loads, WideLoad)));
  return WideLoad;
}

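// As a sketch of the rewrite performed above, assuming little-endian and two
// adjacent i16 loads (value names are illustrative only):
//
//   %base   = load i16, ptr %p       ; Loads[0]
//   %offset = load i16, ptr %p.2     ; Loads[1], 2 bytes after %base
//   %s0 = sext i16 %base to i32
//   %s1 = sext i16 %offset to i32
//
// becomes:
//
//   %wide   = load i32, ptr %p
//   %bottom = trunc i32 %wide to i16
//   %s0     = sext i16 %bottom to i32
//   %shr    = lshr i32 %wide, 16
//   %top    = trunc i32 %shr to i16
//   %s1     = sext i16 %top to i32
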
"arm-parallel-dsp", 8128bcb0991SDimitry Andric "Transform functions to use DSP intrinsics", false, false) 813