xref: /freebsd-src/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
18bcb0991SDimitry Andric //===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric /// \file
100b57cec5SDimitry Andric /// Armv6 introduced instructions to perform 32-bit SIMD operations. The
110b57cec5SDimitry Andric /// purpose of this pass is do some IR pattern matching to create ACLE
120b57cec5SDimitry Andric /// DSP intrinsics, which map on these 32-bit SIMD operations.
130b57cec5SDimitry Andric /// This pass runs only when unaligned accesses is supported/enabled.
140b57cec5SDimitry Andric //
150b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
160b57cec5SDimitry Andric 
17480093f4SDimitry Andric #include "ARM.h"
18480093f4SDimitry Andric #include "ARMSubtarget.h"
190b57cec5SDimitry Andric #include "llvm/ADT/SmallPtrSet.h"
20480093f4SDimitry Andric #include "llvm/ADT/Statistic.h"
210b57cec5SDimitry Andric #include "llvm/Analysis/AliasAnalysis.h"
225ffd83dbSDimitry Andric #include "llvm/Analysis/AssumptionCache.h"
235ffd83dbSDimitry Andric #include "llvm/Analysis/GlobalsModRef.h"
240b57cec5SDimitry Andric #include "llvm/Analysis/LoopAccessAnalysis.h"
25e8d8bef9SDimitry Andric #include "llvm/Analysis/TargetLibraryInfo.h"
26480093f4SDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h"
27*0fca6ea1SDimitry Andric #include "llvm/IR/IRBuilder.h"
280b57cec5SDimitry Andric #include "llvm/IR/Instructions.h"
29480093f4SDimitry Andric #include "llvm/IR/IntrinsicsARM.h"
30*0fca6ea1SDimitry Andric #include "llvm/IR/Module.h"
310b57cec5SDimitry Andric #include "llvm/IR/NoFolder.h"
32480093f4SDimitry Andric #include "llvm/IR/PatternMatch.h"
330b57cec5SDimitry Andric #include "llvm/Pass.h"
340b57cec5SDimitry Andric #include "llvm/PassRegistry.h"
350b57cec5SDimitry Andric #include "llvm/Support/Debug.h"
36480093f4SDimitry Andric #include "llvm/Transforms/Scalar.h"
37480093f4SDimitry Andric #include "llvm/Transforms/Utils/BasicBlockUtils.h"
380b57cec5SDimitry Andric 
390b57cec5SDimitry Andric using namespace llvm;
400b57cec5SDimitry Andric using namespace PatternMatch;
410b57cec5SDimitry Andric 
420b57cec5SDimitry Andric #define DEBUG_TYPE "arm-parallel-dsp"
430b57cec5SDimitry Andric 
440b57cec5SDimitry Andric STATISTIC(NumSMLAD , "Number of smlad instructions generated");
450b57cec5SDimitry Andric 
460b57cec5SDimitry Andric static cl::opt<bool>
470b57cec5SDimitry Andric DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
480b57cec5SDimitry Andric                    cl::desc("Disable the ARM Parallel DSP pass"));
490b57cec5SDimitry Andric 
508bcb0991SDimitry Andric static cl::opt<unsigned>
518bcb0991SDimitry Andric NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16),
528bcb0991SDimitry Andric              cl::desc("Limit the number of loads analysed"));
538bcb0991SDimitry Andric 
540b57cec5SDimitry Andric namespace {
558bcb0991SDimitry Andric   struct MulCandidate;
560b57cec5SDimitry Andric   class Reduction;
570b57cec5SDimitry Andric 
588bcb0991SDimitry Andric   using MulCandList = SmallVector<std::unique_ptr<MulCandidate>, 8>;
598bcb0991SDimitry Andric   using MemInstList = SmallVectorImpl<LoadInst*>;
608bcb0991SDimitry Andric   using MulPairList = SmallVector<std::pair<MulCandidate*, MulCandidate*>, 8>;
610b57cec5SDimitry Andric 
628bcb0991SDimitry Andric   // 'MulCandidate' holds the multiplication instructions that are candidates
630b57cec5SDimitry Andric   // for parallel execution.
648bcb0991SDimitry Andric   struct MulCandidate {
658bcb0991SDimitry Andric     Instruction   *Root;
668bcb0991SDimitry Andric     Value*        LHS;
678bcb0991SDimitry Andric     Value*        RHS;
680b57cec5SDimitry Andric     bool          Exchange = false;
698bcb0991SDimitry Andric     bool          Paired = false;
708bcb0991SDimitry Andric     SmallVector<LoadInst*, 2> VecLd;    // Container for loads to widen.
710b57cec5SDimitry Andric 
728bcb0991SDimitry Andric     MulCandidate(Instruction *I, Value *lhs, Value *rhs) :
738bcb0991SDimitry Andric       Root(I), LHS(lhs), RHS(rhs) { }
748bcb0991SDimitry Andric 
758bcb0991SDimitry Andric     bool HasTwoLoadInputs() const {
768bcb0991SDimitry Andric       return isa<LoadInst>(LHS) && isa<LoadInst>(RHS);
770b57cec5SDimitry Andric     }
780b57cec5SDimitry Andric 
798bcb0991SDimitry Andric     LoadInst *getBaseLoad() const {
808bcb0991SDimitry Andric       return VecLd.front();
818bcb0991SDimitry Andric     }
820b57cec5SDimitry Andric   };
830b57cec5SDimitry Andric 
840b57cec5SDimitry Andric   /// Represent a sequence of multiply-accumulate operations with the aim to
850b57cec5SDimitry Andric   /// perform the multiplications in parallel.
860b57cec5SDimitry Andric   class Reduction {
870b57cec5SDimitry Andric     Instruction     *Root = nullptr;
880b57cec5SDimitry Andric     Value           *Acc = nullptr;
898bcb0991SDimitry Andric     MulCandList     Muls;
908bcb0991SDimitry Andric     MulPairList        MulPairs;
918bcb0991SDimitry Andric     SetVector<Instruction*> Adds;
920b57cec5SDimitry Andric 
930b57cec5SDimitry Andric   public:
940b57cec5SDimitry Andric     Reduction() = delete;
950b57cec5SDimitry Andric 
960b57cec5SDimitry Andric     Reduction (Instruction *Add) : Root(Add) { }
970b57cec5SDimitry Andric 
980b57cec5SDimitry Andric     /// Record an Add instruction that is a part of the this reduction.
990b57cec5SDimitry Andric     void InsertAdd(Instruction *I) { Adds.insert(I); }
1000b57cec5SDimitry Andric 
1018bcb0991SDimitry Andric     /// Create MulCandidates, each rooted at a Mul instruction, that is a part
1028bcb0991SDimitry Andric     /// of this reduction.
1038bcb0991SDimitry Andric     void InsertMuls() {
1048bcb0991SDimitry Andric       auto GetMulOperand = [](Value *V) -> Instruction* {
1058bcb0991SDimitry Andric         if (auto *SExt = dyn_cast<SExtInst>(V)) {
1068bcb0991SDimitry Andric           if (auto *I = dyn_cast<Instruction>(SExt->getOperand(0)))
1078bcb0991SDimitry Andric             if (I->getOpcode() == Instruction::Mul)
1088bcb0991SDimitry Andric               return I;
1098bcb0991SDimitry Andric         } else if (auto *I = dyn_cast<Instruction>(V)) {
1108bcb0991SDimitry Andric           if (I->getOpcode() == Instruction::Mul)
1118bcb0991SDimitry Andric             return I;
1128bcb0991SDimitry Andric         }
1138bcb0991SDimitry Andric         return nullptr;
1148bcb0991SDimitry Andric       };
1158bcb0991SDimitry Andric 
1168bcb0991SDimitry Andric       auto InsertMul = [this](Instruction *I) {
1178bcb0991SDimitry Andric         Value *LHS = cast<Instruction>(I->getOperand(0))->getOperand(0);
1188bcb0991SDimitry Andric         Value *RHS = cast<Instruction>(I->getOperand(1))->getOperand(0);
1198bcb0991SDimitry Andric         Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS));
1208bcb0991SDimitry Andric       };
1218bcb0991SDimitry Andric 
1228bcb0991SDimitry Andric       for (auto *Add : Adds) {
1238bcb0991SDimitry Andric         if (Add == Acc)
1248bcb0991SDimitry Andric           continue;
1258bcb0991SDimitry Andric         if (auto *Mul = GetMulOperand(Add->getOperand(0)))
1268bcb0991SDimitry Andric           InsertMul(Mul);
1278bcb0991SDimitry Andric         if (auto *Mul = GetMulOperand(Add->getOperand(1)))
1288bcb0991SDimitry Andric           InsertMul(Mul);
1298bcb0991SDimitry Andric       }
1300b57cec5SDimitry Andric     }
1310b57cec5SDimitry Andric 
1320b57cec5SDimitry Andric     /// Add the incoming accumulator value, returns true if a value had not
1330b57cec5SDimitry Andric     /// already been added. Returning false signals to the user that this
1340b57cec5SDimitry Andric     /// reduction already has a value to initialise the accumulator.
1350b57cec5SDimitry Andric     bool InsertAcc(Value *V) {
1360b57cec5SDimitry Andric       if (Acc)
1370b57cec5SDimitry Andric         return false;
1380b57cec5SDimitry Andric       Acc = V;
1390b57cec5SDimitry Andric       return true;
1400b57cec5SDimitry Andric     }
1410b57cec5SDimitry Andric 
1428bcb0991SDimitry Andric     /// Set two MulCandidates, rooted at muls, that can be executed as a single
1430b57cec5SDimitry Andric     /// parallel operation.
1448bcb0991SDimitry Andric     void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1,
1458bcb0991SDimitry Andric                     bool Exchange = false) {
1468bcb0991SDimitry Andric       LLVM_DEBUG(dbgs() << "Pairing:\n"
1478bcb0991SDimitry Andric                  << *Mul0->Root << "\n"
1488bcb0991SDimitry Andric                  << *Mul1->Root << "\n");
1498bcb0991SDimitry Andric       Mul0->Paired = true;
1508bcb0991SDimitry Andric       Mul1->Paired = true;
1518bcb0991SDimitry Andric       if (Exchange)
1528bcb0991SDimitry Andric         Mul1->Exchange = true;
1530b57cec5SDimitry Andric       MulPairs.push_back(std::make_pair(Mul0, Mul1));
1540b57cec5SDimitry Andric     }
1550b57cec5SDimitry Andric 
1560b57cec5SDimitry Andric     /// Return the add instruction which is the root of the reduction.
1570b57cec5SDimitry Andric     Instruction *getRoot() { return Root; }
1580b57cec5SDimitry Andric 
1598bcb0991SDimitry Andric     bool is64Bit() const { return Root->getType()->isIntegerTy(64); }
1608bcb0991SDimitry Andric 
1618bcb0991SDimitry Andric     Type *getType() const { return Root->getType(); }
1628bcb0991SDimitry Andric 
1630b57cec5SDimitry Andric     /// Return the incoming value to be accumulated. This maybe null.
1640b57cec5SDimitry Andric     Value *getAccumulator() { return Acc; }
1650b57cec5SDimitry Andric 
1660b57cec5SDimitry Andric     /// Return the set of adds that comprise the reduction.
1678bcb0991SDimitry Andric     SetVector<Instruction*> &getAdds() { return Adds; }
1680b57cec5SDimitry Andric 
1698bcb0991SDimitry Andric     /// Return the MulCandidate, rooted at mul instruction, that comprise the
1700b57cec5SDimitry Andric     /// the reduction.
1718bcb0991SDimitry Andric     MulCandList &getMuls() { return Muls; }
1720b57cec5SDimitry Andric 
1738bcb0991SDimitry Andric     /// Return the MulCandidate, rooted at mul instructions, that have been
1740b57cec5SDimitry Andric     /// paired for parallel execution.
1758bcb0991SDimitry Andric     MulPairList &getMulPairs() { return MulPairs; }
1760b57cec5SDimitry Andric 
1770b57cec5SDimitry Andric     /// To finalise, replace the uses of the root with the intrinsic call.
1780b57cec5SDimitry Andric     void UpdateRoot(Instruction *SMLAD) {
1790b57cec5SDimitry Andric       Root->replaceAllUsesWith(SMLAD);
1800b57cec5SDimitry Andric     }
1818bcb0991SDimitry Andric 
1828bcb0991SDimitry Andric     void dump() {
1838bcb0991SDimitry Andric       LLVM_DEBUG(dbgs() << "Reduction:\n";
1848bcb0991SDimitry Andric         for (auto *Add : Adds)
1858bcb0991SDimitry Andric           LLVM_DEBUG(dbgs() << *Add << "\n");
1868bcb0991SDimitry Andric         for (auto &Mul : Muls)
1878bcb0991SDimitry Andric           LLVM_DEBUG(dbgs() << *Mul->Root << "\n"
1888bcb0991SDimitry Andric                      << "  " << *Mul->LHS << "\n"
1898bcb0991SDimitry Andric                      << "  " << *Mul->RHS << "\n");
1908bcb0991SDimitry Andric         LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n")
1918bcb0991SDimitry Andric       );
1928bcb0991SDimitry Andric     }
1930b57cec5SDimitry Andric   };
1940b57cec5SDimitry Andric 
1950b57cec5SDimitry Andric   class WidenedLoad {
1960b57cec5SDimitry Andric     LoadInst *NewLd = nullptr;
1970b57cec5SDimitry Andric     SmallVector<LoadInst*, 4> Loads;
1980b57cec5SDimitry Andric 
1990b57cec5SDimitry Andric   public:
2000b57cec5SDimitry Andric     WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
2010b57cec5SDimitry Andric       : NewLd(Wide) {
202e8d8bef9SDimitry Andric       append_range(Loads, Lds);
2030b57cec5SDimitry Andric     }
2040b57cec5SDimitry Andric     LoadInst *getLoad() {
2050b57cec5SDimitry Andric       return NewLd;
2060b57cec5SDimitry Andric     }
2070b57cec5SDimitry Andric   };
2080b57cec5SDimitry Andric 
2098bcb0991SDimitry Andric   class ARMParallelDSP : public FunctionPass {
2100b57cec5SDimitry Andric     ScalarEvolution   *SE;
2110b57cec5SDimitry Andric     AliasAnalysis     *AA;
2120b57cec5SDimitry Andric     TargetLibraryInfo *TLI;
2130b57cec5SDimitry Andric     DominatorTree     *DT;
2140b57cec5SDimitry Andric     const DataLayout  *DL;
2150b57cec5SDimitry Andric     Module            *M;
2160b57cec5SDimitry Andric     std::map<LoadInst*, LoadInst*> LoadPairs;
2170b57cec5SDimitry Andric     SmallPtrSet<LoadInst*, 4> OffsetLoads;
2180b57cec5SDimitry Andric     std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
2190b57cec5SDimitry Andric 
2200b57cec5SDimitry Andric     template<unsigned>
2218bcb0991SDimitry Andric     bool IsNarrowSequence(Value *V);
2228bcb0991SDimitry Andric     bool Search(Value *V, BasicBlock *BB, Reduction &R);
2230b57cec5SDimitry Andric     bool RecordMemoryOps(BasicBlock *BB);
2240b57cec5SDimitry Andric     void InsertParallelMACs(Reduction &Reduction);
2250b57cec5SDimitry Andric     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
2268bcb0991SDimitry Andric     LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy);
2270b57cec5SDimitry Andric     bool CreateParallelPairs(Reduction &R);
2280b57cec5SDimitry Andric 
2290b57cec5SDimitry Andric     /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
2300b57cec5SDimitry Andric     /// Dual performs two signed 16x16-bit multiplications. It adds the
2310b57cec5SDimitry Andric     /// products to a 32-bit accumulate operand. Optionally, the instruction can
2320b57cec5SDimitry Andric     /// exchange the halfwords of the second operand before performing the
2330b57cec5SDimitry Andric     /// arithmetic.
2348bcb0991SDimitry Andric     bool MatchSMLAD(Function &F);
2350b57cec5SDimitry Andric 
2360b57cec5SDimitry Andric   public:
2370b57cec5SDimitry Andric     static char ID;
2380b57cec5SDimitry Andric 
2398bcb0991SDimitry Andric     ARMParallelDSP() : FunctionPass(ID) { }
2400b57cec5SDimitry Andric 
2410b57cec5SDimitry Andric     void getAnalysisUsage(AnalysisUsage &AU) const override {
2428bcb0991SDimitry Andric       FunctionPass::getAnalysisUsage(AU);
2430b57cec5SDimitry Andric       AU.addRequired<AssumptionCacheTracker>();
2440b57cec5SDimitry Andric       AU.addRequired<ScalarEvolutionWrapperPass>();
2450b57cec5SDimitry Andric       AU.addRequired<AAResultsWrapperPass>();
2460b57cec5SDimitry Andric       AU.addRequired<TargetLibraryInfoWrapperPass>();
2470b57cec5SDimitry Andric       AU.addRequired<DominatorTreeWrapperPass>();
2480b57cec5SDimitry Andric       AU.addRequired<TargetPassConfig>();
2498bcb0991SDimitry Andric       AU.addPreserved<ScalarEvolutionWrapperPass>();
2508bcb0991SDimitry Andric       AU.addPreserved<GlobalsAAWrapperPass>();
2510b57cec5SDimitry Andric       AU.setPreservesCFG();
2520b57cec5SDimitry Andric     }
2530b57cec5SDimitry Andric 
2548bcb0991SDimitry Andric     bool runOnFunction(Function &F) override {
2550b57cec5SDimitry Andric       if (DisableParallelDSP)
2560b57cec5SDimitry Andric         return false;
2578bcb0991SDimitry Andric       if (skipFunction(F))
2588bcb0991SDimitry Andric         return false;
2598bcb0991SDimitry Andric 
2600b57cec5SDimitry Andric       SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2610b57cec5SDimitry Andric       AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2628bcb0991SDimitry Andric       TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2630b57cec5SDimitry Andric       DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2640b57cec5SDimitry Andric       auto &TPC = getAnalysis<TargetPassConfig>();
2650b57cec5SDimitry Andric 
2660b57cec5SDimitry Andric       M = F.getParent();
2670b57cec5SDimitry Andric       DL = &M->getDataLayout();
2680b57cec5SDimitry Andric 
2690b57cec5SDimitry Andric       auto &TM = TPC.getTM<TargetMachine>();
2700b57cec5SDimitry Andric       auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
2710b57cec5SDimitry Andric 
2720b57cec5SDimitry Andric       if (!ST->allowsUnalignedMem()) {
2730b57cec5SDimitry Andric         LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
2740b57cec5SDimitry Andric                              "running pass ARMParallelDSP\n");
2750b57cec5SDimitry Andric         return false;
2760b57cec5SDimitry Andric       }
2770b57cec5SDimitry Andric 
2780b57cec5SDimitry Andric       if (!ST->hasDSP()) {
2790b57cec5SDimitry Andric         LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
2800b57cec5SDimitry Andric                              "ARMParallelDSP\n");
2810b57cec5SDimitry Andric         return false;
2820b57cec5SDimitry Andric       }
2830b57cec5SDimitry Andric 
2840b57cec5SDimitry Andric       if (!ST->isLittle()) {
2850b57cec5SDimitry Andric         LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
2860b57cec5SDimitry Andric                           << "ARMParallelDSP\n");
2870b57cec5SDimitry Andric         return false;
2880b57cec5SDimitry Andric       }
2890b57cec5SDimitry Andric 
2900b57cec5SDimitry Andric       LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
2910b57cec5SDimitry Andric       LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
2920b57cec5SDimitry Andric 
2938bcb0991SDimitry Andric       bool Changes = MatchSMLAD(F);
2940b57cec5SDimitry Andric       return Changes;
2950b57cec5SDimitry Andric     }
2960b57cec5SDimitry Andric   };
2970b57cec5SDimitry Andric }
2980b57cec5SDimitry Andric 
2990b57cec5SDimitry Andric bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
3000b57cec5SDimitry Andric                                         MemInstList &VecMem) {
3010b57cec5SDimitry Andric   if (!Ld0 || !Ld1)
3020b57cec5SDimitry Andric     return false;
3030b57cec5SDimitry Andric 
3040b57cec5SDimitry Andric   if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
3050b57cec5SDimitry Andric     return false;
3060b57cec5SDimitry Andric 
3070b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n";
3080b57cec5SDimitry Andric     dbgs() << "Ld0:"; Ld0->dump();
3090b57cec5SDimitry Andric     dbgs() << "Ld1:"; Ld1->dump();
3100b57cec5SDimitry Andric   );
3110b57cec5SDimitry Andric 
3120b57cec5SDimitry Andric   VecMem.clear();
3130b57cec5SDimitry Andric   VecMem.push_back(Ld0);
3140b57cec5SDimitry Andric   VecMem.push_back(Ld1);
3150b57cec5SDimitry Andric   return true;
3160b57cec5SDimitry Andric }
3170b57cec5SDimitry Andric 
3180b57cec5SDimitry Andric // MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
3190b57cec5SDimitry Andric // instructions, which is set to 16. So here we should collect all i8 and i16
3200b57cec5SDimitry Andric // narrow operations.
3210b57cec5SDimitry Andric // TODO: we currently only collect i16, and will support i8 later, so that's
3220b57cec5SDimitry Andric // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
3230b57cec5SDimitry Andric template<unsigned MaxBitWidth>
3248bcb0991SDimitry Andric bool ARMParallelDSP::IsNarrowSequence(Value *V) {
3258bcb0991SDimitry Andric   if (auto *SExt = dyn_cast<SExtInst>(V)) {
3268bcb0991SDimitry Andric     if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
3270b57cec5SDimitry Andric       return false;
3280b57cec5SDimitry Andric 
3298bcb0991SDimitry Andric     if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0))) {
3308bcb0991SDimitry Andric       // Check that this load could be paired.
3318bcb0991SDimitry Andric       return LoadPairs.count(Ld) || OffsetLoads.count(Ld);
3320b57cec5SDimitry Andric     }
3330b57cec5SDimitry Andric   }
3340b57cec5SDimitry Andric   return false;
3350b57cec5SDimitry Andric }
3360b57cec5SDimitry Andric 
3370b57cec5SDimitry Andric /// Iterate through the block and record base, offset pairs of loads which can
3380b57cec5SDimitry Andric /// be widened into a single load.
3390b57cec5SDimitry Andric bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
3400b57cec5SDimitry Andric   SmallVector<LoadInst*, 8> Loads;
3410b57cec5SDimitry Andric   SmallVector<Instruction*, 8> Writes;
3428bcb0991SDimitry Andric   LoadPairs.clear();
3438bcb0991SDimitry Andric   WideLoads.clear();
3440b57cec5SDimitry Andric 
3450b57cec5SDimitry Andric   // Collect loads and instruction that may write to memory. For now we only
3460b57cec5SDimitry Andric   // record loads which are simple, sign-extended and have a single user.
3470b57cec5SDimitry Andric   // TODO: Allow zero-extended loads.
3480b57cec5SDimitry Andric   for (auto &I : *BB) {
3490b57cec5SDimitry Andric     if (I.mayWriteToMemory())
3500b57cec5SDimitry Andric       Writes.push_back(&I);
3510b57cec5SDimitry Andric     auto *Ld = dyn_cast<LoadInst>(&I);
3520b57cec5SDimitry Andric     if (!Ld || !Ld->isSimple() ||
3530b57cec5SDimitry Andric         !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
3540b57cec5SDimitry Andric       continue;
3550b57cec5SDimitry Andric     Loads.push_back(Ld);
3560b57cec5SDimitry Andric   }
3570b57cec5SDimitry Andric 
3588bcb0991SDimitry Andric   if (Loads.empty() || Loads.size() > NumLoadLimit)
3598bcb0991SDimitry Andric     return false;
3608bcb0991SDimitry Andric 
3610b57cec5SDimitry Andric   using InstSet = std::set<Instruction*>;
3620b57cec5SDimitry Andric   using DepMap = std::map<Instruction*, InstSet>;
3630b57cec5SDimitry Andric   DepMap RAWDeps;
3640b57cec5SDimitry Andric 
3650b57cec5SDimitry Andric   // Record any writes that may alias a load.
366e8d8bef9SDimitry Andric   const auto Size = LocationSize::beforeOrAfterPointer();
367bdd1243dSDimitry Andric   for (auto *Write : Writes) {
368bdd1243dSDimitry Andric     for (auto *Read : Loads) {
3690b57cec5SDimitry Andric       MemoryLocation ReadLoc =
3700b57cec5SDimitry Andric         MemoryLocation(Read->getPointerOperand(), Size);
3710b57cec5SDimitry Andric 
372bdd1243dSDimitry Andric       if (!isModOrRefSet(AA->getModRefInfo(Write, ReadLoc)))
3730b57cec5SDimitry Andric         continue;
3745ffd83dbSDimitry Andric       if (Write->comesBefore(Read))
3750b57cec5SDimitry Andric         RAWDeps[Read].insert(Write);
3760b57cec5SDimitry Andric     }
3770b57cec5SDimitry Andric   }
3780b57cec5SDimitry Andric 
3790b57cec5SDimitry Andric   // Check whether there's not a write between the two loads which would
3800b57cec5SDimitry Andric   // prevent them from being safely merged.
3810b57cec5SDimitry Andric   auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
3825ffd83dbSDimitry Andric     bool BaseFirst = Base->comesBefore(Offset);
3835ffd83dbSDimitry Andric     LoadInst *Dominator = BaseFirst ? Base : Offset;
3845ffd83dbSDimitry Andric     LoadInst *Dominated = BaseFirst ? Offset : Base;
3850b57cec5SDimitry Andric 
3860b57cec5SDimitry Andric     if (RAWDeps.count(Dominated)) {
3870b57cec5SDimitry Andric       InstSet &WritesBefore = RAWDeps[Dominated];
3880b57cec5SDimitry Andric 
389bdd1243dSDimitry Andric       for (auto *Before : WritesBefore) {
3900b57cec5SDimitry Andric         // We can't move the second load backward, past a write, to merge
3910b57cec5SDimitry Andric         // with the first load.
3925ffd83dbSDimitry Andric         if (Dominator->comesBefore(Before))
3930b57cec5SDimitry Andric           return false;
3940b57cec5SDimitry Andric       }
3950b57cec5SDimitry Andric     }
3960b57cec5SDimitry Andric     return true;
3970b57cec5SDimitry Andric   };
3980b57cec5SDimitry Andric 
3990b57cec5SDimitry Andric   // Record base, offset load pairs.
4000b57cec5SDimitry Andric   for (auto *Base : Loads) {
4010b57cec5SDimitry Andric     for (auto *Offset : Loads) {
4028bcb0991SDimitry Andric       if (Base == Offset || OffsetLoads.count(Offset))
4030b57cec5SDimitry Andric         continue;
4040b57cec5SDimitry Andric 
405fe6060f1SDimitry Andric       if (isConsecutiveAccess(Base, Offset, *DL, *SE) &&
4060b57cec5SDimitry Andric           SafeToPair(Base, Offset)) {
4070b57cec5SDimitry Andric         LoadPairs[Base] = Offset;
4080b57cec5SDimitry Andric         OffsetLoads.insert(Offset);
4090b57cec5SDimitry Andric         break;
4100b57cec5SDimitry Andric       }
4110b57cec5SDimitry Andric     }
4120b57cec5SDimitry Andric   }
4130b57cec5SDimitry Andric 
4140b57cec5SDimitry Andric   LLVM_DEBUG(if (!LoadPairs.empty()) {
4150b57cec5SDimitry Andric                dbgs() << "Consecutive load pairs:\n";
4160b57cec5SDimitry Andric                for (auto &MapIt : LoadPairs) {
4170b57cec5SDimitry Andric                  LLVM_DEBUG(dbgs() << *MapIt.first << ", "
4180b57cec5SDimitry Andric                             << *MapIt.second << "\n");
4190b57cec5SDimitry Andric                }
4200b57cec5SDimitry Andric              });
4210b57cec5SDimitry Andric   return LoadPairs.size() > 1;
4220b57cec5SDimitry Andric }
4230b57cec5SDimitry Andric 
4248bcb0991SDimitry Andric // Search recursively back through the operands to find a tree of values that
4258bcb0991SDimitry Andric // form a multiply-accumulate chain. The search records the Add and Mul
4268bcb0991SDimitry Andric // instructions that form the reduction and allows us to find a single value
4278bcb0991SDimitry Andric // to be used as the initial input to the accumlator.
4288bcb0991SDimitry Andric bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) {
4298bcb0991SDimitry Andric   // If we find a non-instruction, try to use it as the initial accumulator
4308bcb0991SDimitry Andric   // value. This may have already been found during the search in which case
4318bcb0991SDimitry Andric   // this function will return false, signaling a search fail.
4328bcb0991SDimitry Andric   auto *I = dyn_cast<Instruction>(V);
4338bcb0991SDimitry Andric   if (!I)
4348bcb0991SDimitry Andric     return R.InsertAcc(V);
4358bcb0991SDimitry Andric 
4368bcb0991SDimitry Andric   if (I->getParent() != BB)
4378bcb0991SDimitry Andric     return false;
4388bcb0991SDimitry Andric 
4398bcb0991SDimitry Andric   switch (I->getOpcode()) {
4408bcb0991SDimitry Andric   default:
4418bcb0991SDimitry Andric     break;
4428bcb0991SDimitry Andric   case Instruction::PHI:
4438bcb0991SDimitry Andric     // Could be the accumulator value.
4448bcb0991SDimitry Andric     return R.InsertAcc(V);
4458bcb0991SDimitry Andric   case Instruction::Add: {
4468bcb0991SDimitry Andric     // Adds should be adding together two muls, or another add and a mul to
4478bcb0991SDimitry Andric     // be within the mac chain. One of the operands may also be the
4488bcb0991SDimitry Andric     // accumulator value at which point we should stop searching.
4498bcb0991SDimitry Andric     R.InsertAdd(I);
4508bcb0991SDimitry Andric     Value *LHS = I->getOperand(0);
4518bcb0991SDimitry Andric     Value *RHS = I->getOperand(1);
4528bcb0991SDimitry Andric     bool ValidLHS = Search(LHS, BB, R);
4538bcb0991SDimitry Andric     bool ValidRHS = Search(RHS, BB, R);
4548bcb0991SDimitry Andric 
4558bcb0991SDimitry Andric     if (ValidLHS && ValidRHS)
4568bcb0991SDimitry Andric       return true;
4578bcb0991SDimitry Andric 
45881ad6265SDimitry Andric     // Ensure we don't add the root as the incoming accumulator.
45981ad6265SDimitry Andric     if (R.getRoot() == I)
46081ad6265SDimitry Andric       return false;
46181ad6265SDimitry Andric 
4628bcb0991SDimitry Andric     return R.InsertAcc(I);
4638bcb0991SDimitry Andric   }
4648bcb0991SDimitry Andric   case Instruction::Mul: {
4658bcb0991SDimitry Andric     Value *MulOp0 = I->getOperand(0);
4668bcb0991SDimitry Andric     Value *MulOp1 = I->getOperand(1);
4678bcb0991SDimitry Andric     return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1);
4688bcb0991SDimitry Andric   }
4698bcb0991SDimitry Andric   case Instruction::SExt:
4708bcb0991SDimitry Andric     return Search(I->getOperand(0), BB, R);
4718bcb0991SDimitry Andric   }
4728bcb0991SDimitry Andric   return false;
4738bcb0991SDimitry Andric }
4748bcb0991SDimitry Andric 
4758bcb0991SDimitry Andric // The pass needs to identify integer add/sub reductions of 16-bit vector
4760b57cec5SDimitry Andric // multiplications.
4770b57cec5SDimitry Andric // To use SMLAD:
4780b57cec5SDimitry Andric // 1) we first need to find integer add then look for this pattern:
4790b57cec5SDimitry Andric //
4800b57cec5SDimitry Andric // acc0 = ...
4810b57cec5SDimitry Andric // ld0 = load i16
4820b57cec5SDimitry Andric // sext0 = sext i16 %ld0 to i32
4830b57cec5SDimitry Andric // ld1 = load i16
4840b57cec5SDimitry Andric // sext1 = sext i16 %ld1 to i32
4850b57cec5SDimitry Andric // mul0 = mul %sext0, %sext1
4860b57cec5SDimitry Andric // ld2 = load i16
4870b57cec5SDimitry Andric // sext2 = sext i16 %ld2 to i32
4880b57cec5SDimitry Andric // ld3 = load i16
4890b57cec5SDimitry Andric // sext3 = sext i16 %ld3 to i32
4900b57cec5SDimitry Andric // mul1 = mul i32 %sext2, %sext3
4910b57cec5SDimitry Andric // add0 = add i32 %mul0, %acc0
4920b57cec5SDimitry Andric // acc1 = add i32 %add0, %mul1
4930b57cec5SDimitry Andric //
4940b57cec5SDimitry Andric // Which can be selected to:
4950b57cec5SDimitry Andric //
4960b57cec5SDimitry Andric // ldr r0
4970b57cec5SDimitry Andric // ldr r1
4980b57cec5SDimitry Andric // smlad r2, r0, r1, r2
4990b57cec5SDimitry Andric //
5000b57cec5SDimitry Andric // If constants are used instead of loads, these will need to be hoisted
5010b57cec5SDimitry Andric // out and into a register.
5020b57cec5SDimitry Andric //
5030b57cec5SDimitry Andric // If loop invariants are used instead of loads, these need to be packed
5040b57cec5SDimitry Andric // before the loop begins.
5050b57cec5SDimitry Andric //
5068bcb0991SDimitry Andric bool ARMParallelDSP::MatchSMLAD(Function &F) {
5070b57cec5SDimitry Andric   bool Changed = false;
5080b57cec5SDimitry Andric 
5098bcb0991SDimitry Andric   for (auto &BB : F) {
5108bcb0991SDimitry Andric     SmallPtrSet<Instruction*, 4> AllAdds;
5118bcb0991SDimitry Andric     if (!RecordMemoryOps(&BB))
5128bcb0991SDimitry Andric       continue;
5138bcb0991SDimitry Andric 
5148bcb0991SDimitry Andric     for (Instruction &I : reverse(BB)) {
5150b57cec5SDimitry Andric       if (I.getOpcode() != Instruction::Add)
5160b57cec5SDimitry Andric         continue;
5170b57cec5SDimitry Andric 
5180b57cec5SDimitry Andric       if (AllAdds.count(&I))
5190b57cec5SDimitry Andric         continue;
5200b57cec5SDimitry Andric 
5210b57cec5SDimitry Andric       const auto *Ty = I.getType();
5220b57cec5SDimitry Andric       if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
5230b57cec5SDimitry Andric         continue;
5240b57cec5SDimitry Andric 
5250b57cec5SDimitry Andric       Reduction R(&I);
5268bcb0991SDimitry Andric       if (!Search(&I, &BB, R))
5270b57cec5SDimitry Andric         continue;
5280b57cec5SDimitry Andric 
5298bcb0991SDimitry Andric       R.InsertMuls();
5308bcb0991SDimitry Andric       LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump());
5318bcb0991SDimitry Andric 
5320b57cec5SDimitry Andric       if (!CreateParallelPairs(R))
5330b57cec5SDimitry Andric         continue;
5340b57cec5SDimitry Andric 
5350b57cec5SDimitry Andric       InsertParallelMACs(R);
5360b57cec5SDimitry Andric       Changed = true;
5370b57cec5SDimitry Andric       AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
53881ad6265SDimitry Andric       LLVM_DEBUG(dbgs() << "BB after inserting parallel MACs:\n" << BB);
5390b57cec5SDimitry Andric     }
5408bcb0991SDimitry Andric   }
5410b57cec5SDimitry Andric 
5420b57cec5SDimitry Andric   return Changed;
5430b57cec5SDimitry Andric }
5440b57cec5SDimitry Andric 
5450b57cec5SDimitry Andric bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
5460b57cec5SDimitry Andric 
5470b57cec5SDimitry Andric   // Not enough mul operations to make a pair.
5480b57cec5SDimitry Andric   if (R.getMuls().size() < 2)
5490b57cec5SDimitry Andric     return false;
5500b57cec5SDimitry Andric 
5510b57cec5SDimitry Andric   // Check that the muls operate directly upon sign extended loads.
5528bcb0991SDimitry Andric   for (auto &MulCand : R.getMuls()) {
5538bcb0991SDimitry Andric     if (!MulCand->HasTwoLoadInputs())
5540b57cec5SDimitry Andric       return false;
5550b57cec5SDimitry Andric   }
5560b57cec5SDimitry Andric 
5578bcb0991SDimitry Andric   auto CanPair = [&](Reduction &R, MulCandidate *PMul0, MulCandidate *PMul1) {
5580b57cec5SDimitry Andric     // The first elements of each vector should be loads with sexts. If we
5590b57cec5SDimitry Andric     // find that its two pairs of consecutive loads, then these can be
5600b57cec5SDimitry Andric     // transformed into two wider loads and the users can be replaced with
5610b57cec5SDimitry Andric     // DSP intrinsics.
5628bcb0991SDimitry Andric     auto Ld0 = static_cast<LoadInst*>(PMul0->LHS);
5638bcb0991SDimitry Andric     auto Ld1 = static_cast<LoadInst*>(PMul1->LHS);
5648bcb0991SDimitry Andric     auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
5658bcb0991SDimitry Andric     auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);
5660b57cec5SDimitry Andric 
5675ffd83dbSDimitry Andric     // Check that each mul is operating on two different loads.
5685ffd83dbSDimitry Andric     if (Ld0 == Ld2 || Ld1 == Ld3)
5695ffd83dbSDimitry Andric       return false;
5705ffd83dbSDimitry Andric 
5710b57cec5SDimitry Andric     if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
5720b57cec5SDimitry Andric       if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
5730b57cec5SDimitry Andric         LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
5740b57cec5SDimitry Andric         R.AddMulPair(PMul0, PMul1);
5750b57cec5SDimitry Andric         return true;
5760b57cec5SDimitry Andric       } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
5770b57cec5SDimitry Andric         LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
5780b57cec5SDimitry Andric         LLVM_DEBUG(dbgs() << "    exchanging Ld2 and Ld3\n");
5798bcb0991SDimitry Andric         R.AddMulPair(PMul0, PMul1, true);
5800b57cec5SDimitry Andric         return true;
5810b57cec5SDimitry Andric       }
5820b57cec5SDimitry Andric     } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
5830b57cec5SDimitry Andric                AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
5840b57cec5SDimitry Andric       LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
5850b57cec5SDimitry Andric       LLVM_DEBUG(dbgs() << "    exchanging Ld0 and Ld1\n");
5860b57cec5SDimitry Andric       LLVM_DEBUG(dbgs() << "    and swapping muls\n");
5870b57cec5SDimitry Andric       // Only the second operand can be exchanged, so swap the muls.
5888bcb0991SDimitry Andric       R.AddMulPair(PMul1, PMul0, true);
5890b57cec5SDimitry Andric       return true;
5900b57cec5SDimitry Andric     }
5910b57cec5SDimitry Andric     return false;
5920b57cec5SDimitry Andric   };
5930b57cec5SDimitry Andric 
5948bcb0991SDimitry Andric   MulCandList &Muls = R.getMuls();
5950b57cec5SDimitry Andric   const unsigned Elems = Muls.size();
5960b57cec5SDimitry Andric   for (unsigned i = 0; i < Elems; ++i) {
5978bcb0991SDimitry Andric     MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get());
5988bcb0991SDimitry Andric     if (PMul0->Paired)
5990b57cec5SDimitry Andric       continue;
6000b57cec5SDimitry Andric 
6010b57cec5SDimitry Andric     for (unsigned j = 0; j < Elems; ++j) {
6020b57cec5SDimitry Andric       if (i == j)
6030b57cec5SDimitry Andric         continue;
6040b57cec5SDimitry Andric 
6058bcb0991SDimitry Andric       MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get());
6068bcb0991SDimitry Andric       if (PMul1->Paired)
6070b57cec5SDimitry Andric         continue;
6080b57cec5SDimitry Andric 
6090b57cec5SDimitry Andric       const Instruction *Mul0 = PMul0->Root;
6100b57cec5SDimitry Andric       const Instruction *Mul1 = PMul1->Root;
6110b57cec5SDimitry Andric       if (Mul0 == Mul1)
6120b57cec5SDimitry Andric         continue;
6130b57cec5SDimitry Andric 
6140b57cec5SDimitry Andric       assert(PMul0 != PMul1 && "expected different chains");
6150b57cec5SDimitry Andric 
6168bcb0991SDimitry Andric       if (CanPair(R, PMul0, PMul1))
6170b57cec5SDimitry Andric         break;
6180b57cec5SDimitry Andric     }
6190b57cec5SDimitry Andric   }
6200b57cec5SDimitry Andric   return !R.getMulPairs().empty();
6210b57cec5SDimitry Andric }
6220b57cec5SDimitry Andric 
6230b57cec5SDimitry Andric void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
6240b57cec5SDimitry Andric 
6258bcb0991SDimitry Andric   auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1,
6260b57cec5SDimitry Andric                          Value *Acc, bool Exchange,
6270b57cec5SDimitry Andric                          Instruction *InsertAfter) {
6280b57cec5SDimitry Andric     // Replace the reduction chain with an intrinsic call
6290b57cec5SDimitry Andric 
6300b57cec5SDimitry Andric     Value* Args[] = { WideLd0, WideLd1, Acc };
6310b57cec5SDimitry Andric     Function *SMLAD = nullptr;
6320b57cec5SDimitry Andric     if (Exchange)
6330b57cec5SDimitry Andric       SMLAD = Acc->getType()->isIntegerTy(32) ?
6340b57cec5SDimitry Andric         Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) :
6350b57cec5SDimitry Andric         Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx);
6360b57cec5SDimitry Andric     else
6370b57cec5SDimitry Andric       SMLAD = Acc->getType()->isIntegerTy(32) ?
6380b57cec5SDimitry Andric         Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
6390b57cec5SDimitry Andric         Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
6400b57cec5SDimitry Andric 
6410b57cec5SDimitry Andric     IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
6428bcb0991SDimitry Andric                                 BasicBlock::iterator(InsertAfter));
6430b57cec5SDimitry Andric     Instruction *Call = Builder.CreateCall(SMLAD, Args);
6440b57cec5SDimitry Andric     NumSMLAD++;
6450b57cec5SDimitry Andric     return Call;
6460b57cec5SDimitry Andric   };
6470b57cec5SDimitry Andric 
6488bcb0991SDimitry Andric   // Return the instruction after the dominated instruction.
6498bcb0991SDimitry Andric   auto GetInsertPoint = [this](Value *A, Value *B) {
6508bcb0991SDimitry Andric     assert((isa<Instruction>(A) || isa<Instruction>(B)) &&
6518bcb0991SDimitry Andric            "expected at least one instruction");
6528bcb0991SDimitry Andric 
6538bcb0991SDimitry Andric     Value *V = nullptr;
6548bcb0991SDimitry Andric     if (!isa<Instruction>(A))
6558bcb0991SDimitry Andric       V = B;
6568bcb0991SDimitry Andric     else if (!isa<Instruction>(B))
6578bcb0991SDimitry Andric       V = A;
6588bcb0991SDimitry Andric     else
6598bcb0991SDimitry Andric       V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A;
6608bcb0991SDimitry Andric 
6618bcb0991SDimitry Andric     return &*++BasicBlock::iterator(cast<Instruction>(V));
6628bcb0991SDimitry Andric   };
6638bcb0991SDimitry Andric 
6640b57cec5SDimitry Andric   Value *Acc = R.getAccumulator();
6650b57cec5SDimitry Andric 
6668bcb0991SDimitry Andric   // For any muls that were discovered but not paired, accumulate their values
6678bcb0991SDimitry Andric   // as before.
6688bcb0991SDimitry Andric   IRBuilder<NoFolder> Builder(R.getRoot()->getParent());
6698bcb0991SDimitry Andric   MulCandList &MulCands = R.getMuls();
6708bcb0991SDimitry Andric   for (auto &MulCand : MulCands) {
6718bcb0991SDimitry Andric     if (MulCand->Paired)
6728bcb0991SDimitry Andric       continue;
6738bcb0991SDimitry Andric 
6748bcb0991SDimitry Andric     Instruction *Mul = cast<Instruction>(MulCand->Root);
6758bcb0991SDimitry Andric     LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");
6768bcb0991SDimitry Andric 
6778bcb0991SDimitry Andric     if (R.getType() != Mul->getType()) {
6788bcb0991SDimitry Andric       assert(R.is64Bit() && "expected 64-bit result");
6798bcb0991SDimitry Andric       Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul));
6808bcb0991SDimitry Andric       Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType()));
6818bcb0991SDimitry Andric     }
6828bcb0991SDimitry Andric 
6838bcb0991SDimitry Andric     if (!Acc) {
6848bcb0991SDimitry Andric       Acc = Mul;
6858bcb0991SDimitry Andric       continue;
6868bcb0991SDimitry Andric     }
6878bcb0991SDimitry Andric 
6888bcb0991SDimitry Andric     // If Acc is the original incoming value to the reduction, it could be a
6898bcb0991SDimitry Andric     // phi. But the phi will dominate Mul, meaning that Mul will be the
6908bcb0991SDimitry Andric     // insertion point.
6918bcb0991SDimitry Andric     Builder.SetInsertPoint(GetInsertPoint(Mul, Acc));
6928bcb0991SDimitry Andric     Acc = Builder.CreateAdd(Mul, Acc);
6938bcb0991SDimitry Andric   }
6948bcb0991SDimitry Andric 
6958bcb0991SDimitry Andric   if (!Acc) {
6968bcb0991SDimitry Andric     Acc = R.is64Bit() ?
6978bcb0991SDimitry Andric       ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) :
6988bcb0991SDimitry Andric       ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
6998bcb0991SDimitry Andric   } else if (Acc->getType() != R.getType()) {
7008bcb0991SDimitry Andric     Builder.SetInsertPoint(R.getRoot());
7018bcb0991SDimitry Andric     Acc = Builder.CreateSExt(Acc, R.getType());
7028bcb0991SDimitry Andric   }
7038bcb0991SDimitry Andric 
7048bcb0991SDimitry Andric   // Roughly sort the mul pairs in their program order.
7055ffd83dbSDimitry Andric   llvm::sort(R.getMulPairs(), [](auto &PairA, auto &PairB) {
7068bcb0991SDimitry Andric     const Instruction *A = PairA.first->Root;
7078bcb0991SDimitry Andric     const Instruction *B = PairB.first->Root;
7085ffd83dbSDimitry Andric     return A->comesBefore(B);
7098bcb0991SDimitry Andric   });
7108bcb0991SDimitry Andric 
7118bcb0991SDimitry Andric   IntegerType *Ty = IntegerType::get(M->getContext(), 32);
7120b57cec5SDimitry Andric   for (auto &Pair : R.getMulPairs()) {
7138bcb0991SDimitry Andric     MulCandidate *LHSMul = Pair.first;
7148bcb0991SDimitry Andric     MulCandidate *RHSMul = Pair.second;
7158bcb0991SDimitry Andric     LoadInst *BaseLHS = LHSMul->getBaseLoad();
7168bcb0991SDimitry Andric     LoadInst *BaseRHS = RHSMul->getBaseLoad();
7178bcb0991SDimitry Andric     LoadInst *WideLHS = WideLoads.count(BaseLHS) ?
7188bcb0991SDimitry Andric       WideLoads[BaseLHS]->getLoad() : CreateWideLoad(LHSMul->VecLd, Ty);
7198bcb0991SDimitry Andric     LoadInst *WideRHS = WideLoads.count(BaseRHS) ?
7208bcb0991SDimitry Andric       WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty);
7210b57cec5SDimitry Andric 
7228bcb0991SDimitry Andric     Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS);
7238bcb0991SDimitry Andric     InsertAfter = GetInsertPoint(InsertAfter, Acc);
7248bcb0991SDimitry Andric     Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
7250b57cec5SDimitry Andric   }
7260b57cec5SDimitry Andric   R.UpdateRoot(cast<Instruction>(Acc));
7270b57cec5SDimitry Andric }
7280b57cec5SDimitry Andric 
7298bcb0991SDimitry Andric LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
7300b57cec5SDimitry Andric                                          IntegerType *LoadTy) {
7310b57cec5SDimitry Andric   assert(Loads.size() == 2 && "currently only support widening two loads");
7320b57cec5SDimitry Andric 
7330b57cec5SDimitry Andric   LoadInst *Base = Loads[0];
7340b57cec5SDimitry Andric   LoadInst *Offset = Loads[1];
7350b57cec5SDimitry Andric 
7360b57cec5SDimitry Andric   Instruction *BaseSExt = dyn_cast<SExtInst>(Base->user_back());
7370b57cec5SDimitry Andric   Instruction *OffsetSExt = dyn_cast<SExtInst>(Offset->user_back());
7380b57cec5SDimitry Andric 
7390b57cec5SDimitry Andric   assert((BaseSExt && OffsetSExt)
7400b57cec5SDimitry Andric          && "Loads should have a single, extending, user");
7410b57cec5SDimitry Andric 
7420b57cec5SDimitry Andric   std::function<void(Value*, Value*)> MoveBefore =
7430b57cec5SDimitry Andric     [&](Value *A, Value *B) -> void {
7440b57cec5SDimitry Andric       if (!isa<Instruction>(A) || !isa<Instruction>(B))
7450b57cec5SDimitry Andric         return;
7460b57cec5SDimitry Andric 
7470b57cec5SDimitry Andric       auto *Source = cast<Instruction>(A);
7480b57cec5SDimitry Andric       auto *Sink = cast<Instruction>(B);
7490b57cec5SDimitry Andric 
7500b57cec5SDimitry Andric       if (DT->dominates(Source, Sink) ||
7510b57cec5SDimitry Andric           Source->getParent() != Sink->getParent() ||
7520b57cec5SDimitry Andric           isa<PHINode>(Source) || isa<PHINode>(Sink))
7530b57cec5SDimitry Andric         return;
7540b57cec5SDimitry Andric 
7550b57cec5SDimitry Andric       Source->moveBefore(Sink);
7568bcb0991SDimitry Andric       for (auto &Op : Source->operands())
7578bcb0991SDimitry Andric         MoveBefore(Op, Source);
7580b57cec5SDimitry Andric     };
7590b57cec5SDimitry Andric 
7600b57cec5SDimitry Andric   // Insert the load at the point of the original dominating load.
7610b57cec5SDimitry Andric   LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
7620b57cec5SDimitry Andric   IRBuilder<NoFolder> IRB(DomLoad->getParent(),
7630b57cec5SDimitry Andric                           ++BasicBlock::iterator(DomLoad));
7640b57cec5SDimitry Andric 
76506c3fb27SDimitry Andric   // Create the wide load, while making sure to maintain the original alignment
76606c3fb27SDimitry Andric   // as this prevents ldrd from being generated when it could be illegal due to
76706c3fb27SDimitry Andric   // memory alignment.
76806c3fb27SDimitry Andric   Value *VecPtr = Base->getPointerOperand();
7695ffd83dbSDimitry Andric   LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr, Base->getAlign());
7700b57cec5SDimitry Andric 
7710b57cec5SDimitry Andric   // Make sure everything is in the correct order in the basic block.
7720b57cec5SDimitry Andric   MoveBefore(Base->getPointerOperand(), VecPtr);
7730b57cec5SDimitry Andric   MoveBefore(VecPtr, WideLoad);
7740b57cec5SDimitry Andric 
7750b57cec5SDimitry Andric   // From the wide load, create two values that equal the original two loads.
7760b57cec5SDimitry Andric   // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
7770b57cec5SDimitry Andric   // TODO: Support big-endian as well.
7780b57cec5SDimitry Andric   Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
7798bcb0991SDimitry Andric   Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
7808bcb0991SDimitry Andric   BaseSExt->replaceAllUsesWith(NewBaseSExt);
7810b57cec5SDimitry Andric 
7820b57cec5SDimitry Andric   IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
7830b57cec5SDimitry Andric   Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
7840b57cec5SDimitry Andric   Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
7850b57cec5SDimitry Andric   Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
7868bcb0991SDimitry Andric   Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
7878bcb0991SDimitry Andric   OffsetSExt->replaceAllUsesWith(NewOffsetSExt);
7880b57cec5SDimitry Andric 
7898bcb0991SDimitry Andric   LLVM_DEBUG(dbgs() << "From Base and Offset:\n"
7908bcb0991SDimitry Andric              << *Base << "\n" << *Offset << "\n"
7918bcb0991SDimitry Andric              << "Created Wide Load:\n"
7928bcb0991SDimitry Andric              << *WideLoad << "\n"
7938bcb0991SDimitry Andric              << *Bottom << "\n"
7948bcb0991SDimitry Andric              << *NewBaseSExt << "\n"
7958bcb0991SDimitry Andric              << *Top << "\n"
7968bcb0991SDimitry Andric              << *Trunc << "\n"
7978bcb0991SDimitry Andric              << *NewOffsetSExt << "\n");
7980b57cec5SDimitry Andric   WideLoads.emplace(std::make_pair(Base,
7998bcb0991SDimitry Andric                                    std::make_unique<WidenedLoad>(Loads, WideLoad)));
8000b57cec5SDimitry Andric   return WideLoad;
8010b57cec5SDimitry Andric }
8020b57cec5SDimitry Andric 
8030b57cec5SDimitry Andric Pass *llvm::createARMParallelDSPPass() {
8040b57cec5SDimitry Andric   return new ARMParallelDSP();
8050b57cec5SDimitry Andric }
8060b57cec5SDimitry Andric 
8070b57cec5SDimitry Andric char ARMParallelDSP::ID = 0;
8080b57cec5SDimitry Andric 
8090b57cec5SDimitry Andric INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
8108bcb0991SDimitry Andric                 "Transform functions to use DSP intrinsics", false, false)
8110b57cec5SDimitry Andric INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
8128bcb0991SDimitry Andric                 "Transform functions to use DSP intrinsics", false, false)
813