xref: /freebsd-src/contrib/llvm-project/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
10b57cec5SDimitry Andric //===- InterleavedLoadCombine.cpp - Combine Interleaved Loads ---*- C++ -*-===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric // \file
100b57cec5SDimitry Andric //
110b57cec5SDimitry Andric // This file defines the interleaved-load-combine pass. The pass searches for
120b57cec5SDimitry Andric // ShuffleVectorInstruction that execute interleaving loads. If a matching
130b57cec5SDimitry Andric // pattern is found, it adds a combined load and further instructions in a
140b57cec5SDimitry Andric // pattern that is detectable by InterleavedAccesPass. The old instructions are
150b57cec5SDimitry Andric // left dead to be removed later. The pass is specifically designed to be
160b57cec5SDimitry Andric // executed just before InterleavedAccesPass to find any left-over instances
170b57cec5SDimitry Andric // that are not detected within former passes.
180b57cec5SDimitry Andric //
190b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
200b57cec5SDimitry Andric 
210b57cec5SDimitry Andric #include "llvm/ADT/Statistic.h"
220b57cec5SDimitry Andric #include "llvm/Analysis/MemorySSA.h"
230b57cec5SDimitry Andric #include "llvm/Analysis/MemorySSAUpdater.h"
240b57cec5SDimitry Andric #include "llvm/Analysis/OptimizationRemarkEmitter.h"
250b57cec5SDimitry Andric #include "llvm/Analysis/TargetTransformInfo.h"
265f757f3fSDimitry Andric #include "llvm/CodeGen/InterleavedLoadCombine.h"
270b57cec5SDimitry Andric #include "llvm/CodeGen/Passes.h"
280b57cec5SDimitry Andric #include "llvm/CodeGen/TargetLowering.h"
290b57cec5SDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h"
300b57cec5SDimitry Andric #include "llvm/CodeGen/TargetSubtargetInfo.h"
310b57cec5SDimitry Andric #include "llvm/IR/DataLayout.h"
320b57cec5SDimitry Andric #include "llvm/IR/Dominators.h"
330b57cec5SDimitry Andric #include "llvm/IR/Function.h"
34fe6060f1SDimitry Andric #include "llvm/IR/IRBuilder.h"
3581ad6265SDimitry Andric #include "llvm/IR/Instructions.h"
360b57cec5SDimitry Andric #include "llvm/IR/Module.h"
37480093f4SDimitry Andric #include "llvm/InitializePasses.h"
380b57cec5SDimitry Andric #include "llvm/Pass.h"
390b57cec5SDimitry Andric #include "llvm/Support/Debug.h"
400b57cec5SDimitry Andric #include "llvm/Support/ErrorHandling.h"
410b57cec5SDimitry Andric #include "llvm/Support/raw_ostream.h"
420b57cec5SDimitry Andric #include "llvm/Target/TargetMachine.h"
430b57cec5SDimitry Andric 
440b57cec5SDimitry Andric #include <algorithm>
450b57cec5SDimitry Andric #include <cassert>
460b57cec5SDimitry Andric #include <list>
470b57cec5SDimitry Andric 
480b57cec5SDimitry Andric using namespace llvm;
490b57cec5SDimitry Andric 
500b57cec5SDimitry Andric #define DEBUG_TYPE "interleaved-load-combine"
510b57cec5SDimitry Andric 
520b57cec5SDimitry Andric namespace {
530b57cec5SDimitry Andric 
540b57cec5SDimitry Andric /// Statistic counter
550b57cec5SDimitry Andric STATISTIC(NumInterleavedLoadCombine, "Number of combined loads");
560b57cec5SDimitry Andric 
570b57cec5SDimitry Andric /// Option to disable the pass
580b57cec5SDimitry Andric static cl::opt<bool> DisableInterleavedLoadCombine(
590b57cec5SDimitry Andric     "disable-" DEBUG_TYPE, cl::init(false), cl::Hidden,
600b57cec5SDimitry Andric     cl::desc("Disable combining of interleaved loads"));
610b57cec5SDimitry Andric 
620b57cec5SDimitry Andric struct VectorInfo;
630b57cec5SDimitry Andric 
640b57cec5SDimitry Andric struct InterleavedLoadCombineImpl {
650b57cec5SDimitry Andric public:
660b57cec5SDimitry Andric   InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
67*0fca6ea1SDimitry Andric                              const TargetTransformInfo &TTI,
685f757f3fSDimitry Andric                              const TargetMachine &TM)
690b57cec5SDimitry Andric       : F(F), DT(DT), MSSA(MSSA),
70*0fca6ea1SDimitry Andric         TLI(*TM.getSubtargetImpl(F)->getTargetLowering()), TTI(TTI) {}
710b57cec5SDimitry Andric 
720b57cec5SDimitry Andric   /// Scan the function for interleaved load candidates and execute the
730b57cec5SDimitry Andric   /// replacement if applicable.
740b57cec5SDimitry Andric   bool run();
750b57cec5SDimitry Andric 
760b57cec5SDimitry Andric private:
770b57cec5SDimitry Andric   /// Function this pass is working on
780b57cec5SDimitry Andric   Function &F;
790b57cec5SDimitry Andric 
800b57cec5SDimitry Andric   /// Dominator Tree Analysis
810b57cec5SDimitry Andric   DominatorTree &DT;
820b57cec5SDimitry Andric 
830b57cec5SDimitry Andric   /// Memory Alias Analyses
840b57cec5SDimitry Andric   MemorySSA &MSSA;
850b57cec5SDimitry Andric 
860b57cec5SDimitry Andric   /// Target Lowering Information
870b57cec5SDimitry Andric   const TargetLowering &TLI;
880b57cec5SDimitry Andric 
890b57cec5SDimitry Andric   /// Target Transform Information
90*0fca6ea1SDimitry Andric   const TargetTransformInfo &TTI;
910b57cec5SDimitry Andric 
920b57cec5SDimitry Andric   /// Find the instruction in sets LIs that dominates all others, return nullptr
930b57cec5SDimitry Andric   /// if there is none.
940b57cec5SDimitry Andric   LoadInst *findFirstLoad(const std::set<LoadInst *> &LIs);
950b57cec5SDimitry Andric 
960b57cec5SDimitry Andric   /// Replace interleaved load candidates. It does additional
970b57cec5SDimitry Andric   /// analyses if this makes sense. Returns true on success and false
980b57cec5SDimitry Andric   /// of nothing has been changed.
990b57cec5SDimitry Andric   bool combine(std::list<VectorInfo> &InterleavedLoad,
1000b57cec5SDimitry Andric                OptimizationRemarkEmitter &ORE);
1010b57cec5SDimitry Andric 
1020b57cec5SDimitry Andric   /// Given a set of VectorInfo containing candidates for a given interleave
1030b57cec5SDimitry Andric   /// factor, find a set that represents a 'factor' interleaved load.
1040b57cec5SDimitry Andric   bool findPattern(std::list<VectorInfo> &Candidates,
1050b57cec5SDimitry Andric                    std::list<VectorInfo> &InterleavedLoad, unsigned Factor,
1060b57cec5SDimitry Andric                    const DataLayout &DL);
1070b57cec5SDimitry Andric }; // InterleavedLoadCombine
1080b57cec5SDimitry Andric 
1090b57cec5SDimitry Andric /// First Order Polynomial on an n-Bit Integer Value
1100b57cec5SDimitry Andric ///
1110b57cec5SDimitry Andric /// Polynomial(Value) = Value * B + A + E*2^(n-e)
1120b57cec5SDimitry Andric ///
1130b57cec5SDimitry Andric /// A and B are the coefficients. E*2^(n-e) is an error within 'e' most
1140b57cec5SDimitry Andric /// significant bits. It is introduced if an exact computation cannot be proven
1150b57cec5SDimitry Andric /// (e.q. division by 2).
1160b57cec5SDimitry Andric ///
1170b57cec5SDimitry Andric /// As part of this optimization multiple loads will be combined. It necessary
1180b57cec5SDimitry Andric /// to prove that loads are within some relative offset to each other. This
1190b57cec5SDimitry Andric /// class is used to prove relative offsets of values loaded from memory.
1200b57cec5SDimitry Andric ///
1210b57cec5SDimitry Andric /// Representing an integer in this form is sound since addition in two's
1220b57cec5SDimitry Andric /// complement is associative (trivial) and multiplication distributes over the
1230b57cec5SDimitry Andric /// addition (see Proof(1) in Polynomial::mul). Further, both operations
1240b57cec5SDimitry Andric /// commute.
1250b57cec5SDimitry Andric //
1260b57cec5SDimitry Andric // Example:
1270b57cec5SDimitry Andric // declare @fn(i64 %IDX, <4 x float>* %PTR) {
1280b57cec5SDimitry Andric //   %Pa1 = add i64 %IDX, 2
1290b57cec5SDimitry Andric //   %Pa2 = lshr i64 %Pa1, 1
1300b57cec5SDimitry Andric //   %Pa3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pa2
1310b57cec5SDimitry Andric //   %Va = load <4 x float>, <4 x float>* %Pa3
1320b57cec5SDimitry Andric //
1330b57cec5SDimitry Andric //   %Pb1 = add i64 %IDX, 4
1340b57cec5SDimitry Andric //   %Pb2 = lshr i64 %Pb1, 1
1350b57cec5SDimitry Andric //   %Pb3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pb2
1360b57cec5SDimitry Andric //   %Vb = load <4 x float>, <4 x float>* %Pb3
1370b57cec5SDimitry Andric // ... }
1380b57cec5SDimitry Andric //
1390b57cec5SDimitry Andric // The goal is to prove that two loads load consecutive addresses.
1400b57cec5SDimitry Andric //
1410b57cec5SDimitry Andric // In this case the polynomials are constructed by the following
1420b57cec5SDimitry Andric // steps.
1430b57cec5SDimitry Andric //
1440b57cec5SDimitry Andric // The number tag #e specifies the error bits.
1450b57cec5SDimitry Andric //
1460b57cec5SDimitry Andric // Pa_0 = %IDX              #0
1470b57cec5SDimitry Andric // Pa_1 = %IDX + 2          #0 | add 2
1480b57cec5SDimitry Andric // Pa_2 = %IDX/2 + 1        #1 | lshr 1
1490b57cec5SDimitry Andric // Pa_3 = %IDX/2 + 1        #1 | GEP, step signext to i64
1500b57cec5SDimitry Andric // Pa_4 = (%IDX/2)*16 + 16  #0 | GEP, multiply index by sizeof(4) for floats
1510b57cec5SDimitry Andric // Pa_5 = (%IDX/2)*16 + 16  #0 | GEP, add offset of leading components
1520b57cec5SDimitry Andric //
1530b57cec5SDimitry Andric // Pb_0 = %IDX              #0
1540b57cec5SDimitry Andric // Pb_1 = %IDX + 4          #0 | add 2
1550b57cec5SDimitry Andric // Pb_2 = %IDX/2 + 2        #1 | lshr 1
1560b57cec5SDimitry Andric // Pb_3 = %IDX/2 + 2        #1 | GEP, step signext to i64
1570b57cec5SDimitry Andric // Pb_4 = (%IDX/2)*16 + 32  #0 | GEP, multiply index by sizeof(4) for floats
1580b57cec5SDimitry Andric // Pb_5 = (%IDX/2)*16 + 16  #0 | GEP, add offset of leading components
1590b57cec5SDimitry Andric //
1600b57cec5SDimitry Andric // Pb_5 - Pa_5 = 16         #0 | subtract to get the offset
1610b57cec5SDimitry Andric //
1620b57cec5SDimitry Andric // Remark: %PTR is not maintained within this class. So in this instance the
1630b57cec5SDimitry Andric // offset of 16 can only be assumed if the pointers are equal.
1640b57cec5SDimitry Andric //
1650b57cec5SDimitry Andric class Polynomial {
1660b57cec5SDimitry Andric   /// Operations on B
1670b57cec5SDimitry Andric   enum BOps {
1680b57cec5SDimitry Andric     LShr,
1690b57cec5SDimitry Andric     Mul,
1700b57cec5SDimitry Andric     SExt,
1710b57cec5SDimitry Andric     Trunc,
1720b57cec5SDimitry Andric   };
1730b57cec5SDimitry Andric 
1740b57cec5SDimitry Andric   /// Number of Error Bits e
17581ad6265SDimitry Andric   unsigned ErrorMSBs = (unsigned)-1;
1760b57cec5SDimitry Andric 
1770b57cec5SDimitry Andric   /// Value
17881ad6265SDimitry Andric   Value *V = nullptr;
1790b57cec5SDimitry Andric 
1800b57cec5SDimitry Andric   /// Coefficient B
1810b57cec5SDimitry Andric   SmallVector<std::pair<BOps, APInt>, 4> B;
1820b57cec5SDimitry Andric 
1830b57cec5SDimitry Andric   /// Coefficient A
1840b57cec5SDimitry Andric   APInt A;
1850b57cec5SDimitry Andric 
1860b57cec5SDimitry Andric public:
18781ad6265SDimitry Andric   Polynomial(Value *V) : V(V) {
1880b57cec5SDimitry Andric     IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
1890b57cec5SDimitry Andric     if (Ty) {
1900b57cec5SDimitry Andric       ErrorMSBs = 0;
1910b57cec5SDimitry Andric       this->V = V;
1920b57cec5SDimitry Andric       A = APInt(Ty->getBitWidth(), 0);
1930b57cec5SDimitry Andric     }
1940b57cec5SDimitry Andric   }
1950b57cec5SDimitry Andric 
1960b57cec5SDimitry Andric   Polynomial(const APInt &A, unsigned ErrorMSBs = 0)
19781ad6265SDimitry Andric       : ErrorMSBs(ErrorMSBs), A(A) {}
1980b57cec5SDimitry Andric 
1990b57cec5SDimitry Andric   Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0)
20081ad6265SDimitry Andric       : ErrorMSBs(ErrorMSBs), A(BitWidth, A) {}
2010b57cec5SDimitry Andric 
20281ad6265SDimitry Andric   Polynomial() = default;
2030b57cec5SDimitry Andric 
2040b57cec5SDimitry Andric   /// Increment and clamp the number of undefined bits.
2050b57cec5SDimitry Andric   void incErrorMSBs(unsigned amt) {
2060b57cec5SDimitry Andric     if (ErrorMSBs == (unsigned)-1)
2070b57cec5SDimitry Andric       return;
2080b57cec5SDimitry Andric 
2090b57cec5SDimitry Andric     ErrorMSBs += amt;
2100b57cec5SDimitry Andric     if (ErrorMSBs > A.getBitWidth())
2110b57cec5SDimitry Andric       ErrorMSBs = A.getBitWidth();
2120b57cec5SDimitry Andric   }
2130b57cec5SDimitry Andric 
2140b57cec5SDimitry Andric   /// Decrement and clamp the number of undefined bits.
2150b57cec5SDimitry Andric   void decErrorMSBs(unsigned amt) {
2160b57cec5SDimitry Andric     if (ErrorMSBs == (unsigned)-1)
2170b57cec5SDimitry Andric       return;
2180b57cec5SDimitry Andric 
2190b57cec5SDimitry Andric     if (ErrorMSBs > amt)
2200b57cec5SDimitry Andric       ErrorMSBs -= amt;
2210b57cec5SDimitry Andric     else
2220b57cec5SDimitry Andric       ErrorMSBs = 0;
2230b57cec5SDimitry Andric   }
2240b57cec5SDimitry Andric 
2250b57cec5SDimitry Andric   /// Apply an add on the polynomial
2260b57cec5SDimitry Andric   Polynomial &add(const APInt &C) {
2270b57cec5SDimitry Andric     // Note: Addition is associative in two's complement even when in case of
2280b57cec5SDimitry Andric     // signed overflow.
2290b57cec5SDimitry Andric     //
2300b57cec5SDimitry Andric     // Error bits can only propagate into higher significant bits. As these are
2310b57cec5SDimitry Andric     // already regarded as undefined, there is no change.
2320b57cec5SDimitry Andric     //
2330b57cec5SDimitry Andric     // Theorem: Adding a constant to a polynomial does not change the error
2340b57cec5SDimitry Andric     // term.
2350b57cec5SDimitry Andric     //
2360b57cec5SDimitry Andric     // Proof:
2370b57cec5SDimitry Andric     //
2380b57cec5SDimitry Andric     //   Since the addition is associative and commutes:
2390b57cec5SDimitry Andric     //
2400b57cec5SDimitry Andric     //   (B + A + E*2^(n-e)) + C = B + (A + C) + E*2^(n-e)
2410b57cec5SDimitry Andric     // [qed]
2420b57cec5SDimitry Andric 
2430b57cec5SDimitry Andric     if (C.getBitWidth() != A.getBitWidth()) {
2440b57cec5SDimitry Andric       ErrorMSBs = (unsigned)-1;
2450b57cec5SDimitry Andric       return *this;
2460b57cec5SDimitry Andric     }
2470b57cec5SDimitry Andric 
2480b57cec5SDimitry Andric     A += C;
2490b57cec5SDimitry Andric     return *this;
2500b57cec5SDimitry Andric   }
2510b57cec5SDimitry Andric 
2520b57cec5SDimitry Andric   /// Apply a multiplication onto the polynomial.
2530b57cec5SDimitry Andric   Polynomial &mul(const APInt &C) {
2540b57cec5SDimitry Andric     // Note: Multiplication distributes over the addition
2550b57cec5SDimitry Andric     //
2560b57cec5SDimitry Andric     // Theorem: Multiplication distributes over the addition
2570b57cec5SDimitry Andric     //
2580b57cec5SDimitry Andric     // Proof(1):
2590b57cec5SDimitry Andric     //
2600b57cec5SDimitry Andric     //   (B+A)*C =-
2610b57cec5SDimitry Andric     //        = (B + A) + (B + A) + .. {C Times}
2620b57cec5SDimitry Andric     //         addition is associative and commutes, hence
2630b57cec5SDimitry Andric     //        = B + B + .. {C Times} .. + A + A + .. {C times}
2640b57cec5SDimitry Andric     //        = B*C + A*C
2650b57cec5SDimitry Andric     //   (see (function add) for signed values and overflows)
2660b57cec5SDimitry Andric     // [qed]
2670b57cec5SDimitry Andric     //
2680b57cec5SDimitry Andric     // Theorem: If C has c trailing zeros, errors bits in A or B are shifted out
2690b57cec5SDimitry Andric     // to the left.
2700b57cec5SDimitry Andric     //
2710b57cec5SDimitry Andric     // Proof(2):
2720b57cec5SDimitry Andric     //
2730b57cec5SDimitry Andric     //   Let B' and A' be the n-Bit inputs with some unknown errors EA,
2740b57cec5SDimitry Andric     //   EB at e leading bits. B' and A' can be written down as:
2750b57cec5SDimitry Andric     //
2760b57cec5SDimitry Andric     //     B' = B + 2^(n-e)*EB
2770b57cec5SDimitry Andric     //     A' = A + 2^(n-e)*EA
2780b57cec5SDimitry Andric     //
2790b57cec5SDimitry Andric     //   Let C' be an input with c trailing zero bits. C' can be written as
2800b57cec5SDimitry Andric     //
2810b57cec5SDimitry Andric     //     C' = C*2^c
2820b57cec5SDimitry Andric     //
2830b57cec5SDimitry Andric     //   Therefore we can compute the result by using distributivity and
2840b57cec5SDimitry Andric     //   commutativity.
2850b57cec5SDimitry Andric     //
2860b57cec5SDimitry Andric     //     (B'*C' + A'*C') = [B + 2^(n-e)*EB] * C' + [A + 2^(n-e)*EA] * C' =
2870b57cec5SDimitry Andric     //                     = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =
2880b57cec5SDimitry Andric     //                     = (B'+A') * C' =
2890b57cec5SDimitry Andric     //                     = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =
2900b57cec5SDimitry Andric     //                     = [B + A + 2^(n-e)*EB + 2^(n-e)*EA] * C' =
2910b57cec5SDimitry Andric     //                     = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C' =
2920b57cec5SDimitry Andric     //                     = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C*2^c =
2930b57cec5SDimitry Andric     //                     = (B + A) * C' + C*(EB + EA)*2^(n-e)*2^c =
2940b57cec5SDimitry Andric     //
2950b57cec5SDimitry Andric     //   Let EC be the final error with EC = C*(EB + EA)
2960b57cec5SDimitry Andric     //
2970b57cec5SDimitry Andric     //                     = (B + A)*C' + EC*2^(n-e)*2^c =
2980b57cec5SDimitry Andric     //                     = (B + A)*C' + EC*2^(n-(e-c))
2990b57cec5SDimitry Andric     //
3000b57cec5SDimitry Andric     //   Since EC is multiplied by 2^(n-(e-c)) the resulting error contains c
3010b57cec5SDimitry Andric     //   less error bits than the input. c bits are shifted out to the left.
3020b57cec5SDimitry Andric     // [qed]
3030b57cec5SDimitry Andric 
3040b57cec5SDimitry Andric     if (C.getBitWidth() != A.getBitWidth()) {
3050b57cec5SDimitry Andric       ErrorMSBs = (unsigned)-1;
3060b57cec5SDimitry Andric       return *this;
3070b57cec5SDimitry Andric     }
3080b57cec5SDimitry Andric 
3090b57cec5SDimitry Andric     // Multiplying by one is a no-op.
310349cc55cSDimitry Andric     if (C.isOne()) {
3110b57cec5SDimitry Andric       return *this;
3120b57cec5SDimitry Andric     }
3130b57cec5SDimitry Andric 
3140b57cec5SDimitry Andric     // Multiplying by zero removes the coefficient B and defines all bits.
315349cc55cSDimitry Andric     if (C.isZero()) {
3160b57cec5SDimitry Andric       ErrorMSBs = 0;
3170b57cec5SDimitry Andric       deleteB();
3180b57cec5SDimitry Andric     }
3190b57cec5SDimitry Andric 
3200b57cec5SDimitry Andric     // See Proof(2): Trailing zero bits indicate a left shift. This removes
3210b57cec5SDimitry Andric     // leading bits from the result even if they are undefined.
32206c3fb27SDimitry Andric     decErrorMSBs(C.countr_zero());
3230b57cec5SDimitry Andric 
3240b57cec5SDimitry Andric     A *= C;
3250b57cec5SDimitry Andric     pushBOperation(Mul, C);
3260b57cec5SDimitry Andric     return *this;
3270b57cec5SDimitry Andric   }
3280b57cec5SDimitry Andric 
3290b57cec5SDimitry Andric   /// Apply a logical shift right on the polynomial
3300b57cec5SDimitry Andric   Polynomial &lshr(const APInt &C) {
3310b57cec5SDimitry Andric     // Theorem(1): (B + A + E*2^(n-e)) >> 1 => (B >> 1) + (A >> 1) + E'*2^(n-e')
3320b57cec5SDimitry Andric     //          where
3330b57cec5SDimitry Andric     //             e' = e + 1,
3340b57cec5SDimitry Andric     //             E is a e-bit number,
3350b57cec5SDimitry Andric     //             E' is a e'-bit number,
3360b57cec5SDimitry Andric     //   holds under the following precondition:
3370b57cec5SDimitry Andric     //          pre(1): A % 2 = 0
3380b57cec5SDimitry Andric     //          pre(2): e < n, (see Theorem(2) for the trivial case with e=n)
3390b57cec5SDimitry Andric     //   where >> expresses a logical shift to the right, with adding zeros.
3400b57cec5SDimitry Andric     //
3410b57cec5SDimitry Andric     //  We need to show that for every, E there is a E'
3420b57cec5SDimitry Andric     //
3430b57cec5SDimitry Andric     //  B = b_h * 2^(n-1) + b_m * 2 + b_l
3440b57cec5SDimitry Andric     //  A = a_h * 2^(n-1) + a_m * 2         (pre(1))
3450b57cec5SDimitry Andric     //
3460b57cec5SDimitry Andric     //  where a_h, b_h, b_l are single bits, and a_m, b_m are (n-2) bit numbers
3470b57cec5SDimitry Andric     //
3480b57cec5SDimitry Andric     //  Let X = (B + A + E*2^(n-e)) >> 1
3490b57cec5SDimitry Andric     //  Let Y = (B >> 1) + (A >> 1) + E*2^(n-e) >> 1
3500b57cec5SDimitry Andric     //
3510b57cec5SDimitry Andric     //    X = [B + A + E*2^(n-e)] >> 1 =
3520b57cec5SDimitry Andric     //      = [  b_h * 2^(n-1) + b_m * 2 + b_l +
3530b57cec5SDimitry Andric     //         + a_h * 2^(n-1) + a_m * 2 +
3540b57cec5SDimitry Andric     //         + E * 2^(n-e) ] >> 1 =
3550b57cec5SDimitry Andric     //
3560b57cec5SDimitry Andric     //    The sum is built by putting the overflow of [a_m + b+n] into the term
3570b57cec5SDimitry Andric     //    2^(n-1). As there are no more bits beyond 2^(n-1) the overflow within
3580b57cec5SDimitry Andric     //    this bit is discarded. This is expressed by % 2.
3590b57cec5SDimitry Andric     //
3600b57cec5SDimitry Andric     //    The bit in position 0 cannot overflow into the term (b_m + a_m).
3610b57cec5SDimitry Andric     //
3620b57cec5SDimitry Andric     //      = [  ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-1) +
3630b57cec5SDimitry Andric     //         + ((b_m + a_m) % 2^(n-2)) * 2 +
3640b57cec5SDimitry Andric     //         + b_l + E * 2^(n-e) ] >> 1 =
3650b57cec5SDimitry Andric     //
3660b57cec5SDimitry Andric     //    The shift is computed by dividing the terms by 2 and by cutting off
3670b57cec5SDimitry Andric     //    b_l.
3680b57cec5SDimitry Andric     //
3690b57cec5SDimitry Andric     //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
3700b57cec5SDimitry Andric     //         + ((b_m + a_m) % 2^(n-2)) +
3710b57cec5SDimitry Andric     //         + E * 2^(n-(e+1)) =
3720b57cec5SDimitry Andric     //
3730b57cec5SDimitry Andric     //    by the definition in the Theorem e+1 = e'
3740b57cec5SDimitry Andric     //
3750b57cec5SDimitry Andric     //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
3760b57cec5SDimitry Andric     //         + ((b_m + a_m) % 2^(n-2)) +
3770b57cec5SDimitry Andric     //         + E * 2^(n-e') =
3780b57cec5SDimitry Andric     //
3790b57cec5SDimitry Andric     //    Compute Y by applying distributivity first
3800b57cec5SDimitry Andric     //
3810b57cec5SDimitry Andric     //    Y =  (B >> 1) + (A >> 1) + E*2^(n-e') =
3820b57cec5SDimitry Andric     //      =    (b_h * 2^(n-1) + b_m * 2 + b_l) >> 1 +
3830b57cec5SDimitry Andric     //         + (a_h * 2^(n-1) + a_m * 2) >> 1 +
3840b57cec5SDimitry Andric     //         + E * 2^(n-e) >> 1 =
3850b57cec5SDimitry Andric     //
3860b57cec5SDimitry Andric     //    Again, the shift is computed by dividing the terms by 2 and by cutting
3870b57cec5SDimitry Andric     //    off b_l.
3880b57cec5SDimitry Andric     //
3890b57cec5SDimitry Andric     //      =     b_h * 2^(n-2) + b_m +
3900b57cec5SDimitry Andric     //         +  a_h * 2^(n-2) + a_m +
3910b57cec5SDimitry Andric     //         +  E * 2^(n-(e+1)) =
3920b57cec5SDimitry Andric     //
3930b57cec5SDimitry Andric     //    Again, the sum is built by putting the overflow of [a_m + b+n] into
3940b57cec5SDimitry Andric     //    the term 2^(n-1). But this time there is room for a second bit in the
3950b57cec5SDimitry Andric     //    term 2^(n-2) we add this bit to a new term and denote it o_h in a
3960b57cec5SDimitry Andric     //    second step.
3970b57cec5SDimitry Andric     //
3980b57cec5SDimitry Andric     //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] >> 1) * 2^(n-1) +
3990b57cec5SDimitry Andric     //         + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
4000b57cec5SDimitry Andric     //         + ((b_m + a_m) % 2^(n-2)) +
4010b57cec5SDimitry Andric     //         + E * 2^(n-(e+1)) =
4020b57cec5SDimitry Andric     //
4030b57cec5SDimitry Andric     //    Let o_h = [b_h + a_h + (b_m + a_m) >> (n-2)] >> 1
4040b57cec5SDimitry Andric     //    Further replace e+1 by e'.
4050b57cec5SDimitry Andric     //
4060b57cec5SDimitry Andric     //      =    o_h * 2^(n-1) +
4070b57cec5SDimitry Andric     //         + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
4080b57cec5SDimitry Andric     //         + ((b_m + a_m) % 2^(n-2)) +
4090b57cec5SDimitry Andric     //         + E * 2^(n-e') =
4100b57cec5SDimitry Andric     //
4110b57cec5SDimitry Andric     //    Move o_h into the error term and construct E'. To ensure that there is
4120b57cec5SDimitry Andric     //    no 2^x with negative x, this step requires pre(2) (e < n).
4130b57cec5SDimitry Andric     //
4140b57cec5SDimitry Andric     //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
4150b57cec5SDimitry Andric     //         + ((b_m + a_m) % 2^(n-2)) +
4160b57cec5SDimitry Andric     //         + o_h * 2^(e'-1) * 2^(n-e') +               | pre(2), move 2^(e'-1)
4170b57cec5SDimitry Andric     //                                                     | out of the old exponent
4180b57cec5SDimitry Andric     //         + E * 2^(n-e') =
4190b57cec5SDimitry Andric     //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
4200b57cec5SDimitry Andric     //         + ((b_m + a_m) % 2^(n-2)) +
4210b57cec5SDimitry Andric     //         + [o_h * 2^(e'-1) + E] * 2^(n-e') +         | move 2^(e'-1) out of
4220b57cec5SDimitry Andric     //                                                     | the old exponent
4230b57cec5SDimitry Andric     //
4240b57cec5SDimitry Andric     //    Let E' = o_h * 2^(e'-1) + E
4250b57cec5SDimitry Andric     //
4260b57cec5SDimitry Andric     //      =    ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
4270b57cec5SDimitry Andric     //         + ((b_m + a_m) % 2^(n-2)) +
4280b57cec5SDimitry Andric     //         + E' * 2^(n-e')
4290b57cec5SDimitry Andric     //
4300b57cec5SDimitry Andric     //    Because X and Y are distinct only in there error terms and E' can be
4310b57cec5SDimitry Andric     //    constructed as shown the theorem holds.
4320b57cec5SDimitry Andric     // [qed]
4330b57cec5SDimitry Andric     //
4340b57cec5SDimitry Andric     // For completeness in case of the case e=n it is also required to show that
4350b57cec5SDimitry Andric     // distributivity can be applied.
4360b57cec5SDimitry Andric     //
4370b57cec5SDimitry Andric     // In this case Theorem(1) transforms to (the pre-condition on A can also be
4380b57cec5SDimitry Andric     // dropped)
4390b57cec5SDimitry Andric     //
4400b57cec5SDimitry Andric     // Theorem(2): (B + A + E) >> 1 => (B >> 1) + (A >> 1) + E'
4410b57cec5SDimitry Andric     //          where
4420b57cec5SDimitry Andric     //             A, B, E, E' are two's complement numbers with the same bit
4430b57cec5SDimitry Andric     //             width
4440b57cec5SDimitry Andric     //
4450b57cec5SDimitry Andric     //   Let A + B + E = X
4460b57cec5SDimitry Andric     //   Let (B >> 1) + (A >> 1) = Y
4470b57cec5SDimitry Andric     //
4480b57cec5SDimitry Andric     //   Therefore we need to show that for every X and Y there is an E' which
4490b57cec5SDimitry Andric     //   makes the equation
4500b57cec5SDimitry Andric     //
4510b57cec5SDimitry Andric     //     X = Y + E'
4520b57cec5SDimitry Andric     //
4530b57cec5SDimitry Andric     //   hold. This is trivially the case for E' = X - Y.
4540b57cec5SDimitry Andric     //
4550b57cec5SDimitry Andric     // [qed]
4560b57cec5SDimitry Andric     //
4570b57cec5SDimitry Andric     // Remark: Distributing lshr with and arbitrary number n can be expressed as
4580b57cec5SDimitry Andric     //   ((((B + A) lshr 1) lshr 1) ... ) {n times}.
4590b57cec5SDimitry Andric     // This construction induces n additional error bits at the left.
4600b57cec5SDimitry Andric 
4610b57cec5SDimitry Andric     if (C.getBitWidth() != A.getBitWidth()) {
4620b57cec5SDimitry Andric       ErrorMSBs = (unsigned)-1;
4630b57cec5SDimitry Andric       return *this;
4640b57cec5SDimitry Andric     }
4650b57cec5SDimitry Andric 
466349cc55cSDimitry Andric     if (C.isZero())
4670b57cec5SDimitry Andric       return *this;
4680b57cec5SDimitry Andric 
4690b57cec5SDimitry Andric     // Test if the result will be zero
4700b57cec5SDimitry Andric     unsigned shiftAmt = C.getZExtValue();
4710b57cec5SDimitry Andric     if (shiftAmt >= C.getBitWidth())
4720b57cec5SDimitry Andric       return mul(APInt(C.getBitWidth(), 0));
4730b57cec5SDimitry Andric 
4740b57cec5SDimitry Andric     // The proof that shiftAmt LSBs are zero for at least one summand is only
4750b57cec5SDimitry Andric     // possible for the constant number.
4760b57cec5SDimitry Andric     //
4770b57cec5SDimitry Andric     // If this can be proven add shiftAmt to the error counter
4780b57cec5SDimitry Andric     // `ErrorMSBs`. Otherwise set all bits as undefined.
47906c3fb27SDimitry Andric     if (A.countr_zero() < shiftAmt)
4800b57cec5SDimitry Andric       ErrorMSBs = A.getBitWidth();
4810b57cec5SDimitry Andric     else
4820b57cec5SDimitry Andric       incErrorMSBs(shiftAmt);
4830b57cec5SDimitry Andric 
4840b57cec5SDimitry Andric     // Apply the operation.
4850b57cec5SDimitry Andric     pushBOperation(LShr, C);
4860b57cec5SDimitry Andric     A = A.lshr(shiftAmt);
4870b57cec5SDimitry Andric 
4880b57cec5SDimitry Andric     return *this;
4890b57cec5SDimitry Andric   }
4900b57cec5SDimitry Andric 
4910b57cec5SDimitry Andric   /// Apply a sign-extend or truncate operation on the polynomial.
4920b57cec5SDimitry Andric   Polynomial &sextOrTrunc(unsigned n) {
4930b57cec5SDimitry Andric     if (n < A.getBitWidth()) {
4940b57cec5SDimitry Andric       // Truncate: Clearly undefined Bits on the MSB side are removed
4950b57cec5SDimitry Andric       // if there are any.
4960b57cec5SDimitry Andric       decErrorMSBs(A.getBitWidth() - n);
4970b57cec5SDimitry Andric       A = A.trunc(n);
4980b57cec5SDimitry Andric       pushBOperation(Trunc, APInt(sizeof(n) * 8, n));
4990b57cec5SDimitry Andric     }
5000b57cec5SDimitry Andric     if (n > A.getBitWidth()) {
5010b57cec5SDimitry Andric       // Extend: Clearly extending first and adding later is different
5020b57cec5SDimitry Andric       // to adding first and extending later in all extended bits.
5030b57cec5SDimitry Andric       incErrorMSBs(n - A.getBitWidth());
5040b57cec5SDimitry Andric       A = A.sext(n);
5050b57cec5SDimitry Andric       pushBOperation(SExt, APInt(sizeof(n) * 8, n));
5060b57cec5SDimitry Andric     }
5070b57cec5SDimitry Andric 
5080b57cec5SDimitry Andric     return *this;
5090b57cec5SDimitry Andric   }
5100b57cec5SDimitry Andric 
5110b57cec5SDimitry Andric   /// Test if there is a coefficient B.
5120b57cec5SDimitry Andric   bool isFirstOrder() const { return V != nullptr; }
5130b57cec5SDimitry Andric 
5140b57cec5SDimitry Andric   /// Test coefficient B of two Polynomials are equal.
5150b57cec5SDimitry Andric   bool isCompatibleTo(const Polynomial &o) const {
5160b57cec5SDimitry Andric     // The polynomial use different bit width.
5170b57cec5SDimitry Andric     if (A.getBitWidth() != o.A.getBitWidth())
5180b57cec5SDimitry Andric       return false;
5190b57cec5SDimitry Andric 
5200b57cec5SDimitry Andric     // If neither Polynomial has the Coefficient B.
5210b57cec5SDimitry Andric     if (!isFirstOrder() && !o.isFirstOrder())
5220b57cec5SDimitry Andric       return true;
5230b57cec5SDimitry Andric 
5240b57cec5SDimitry Andric     // The index variable is different.
5250b57cec5SDimitry Andric     if (V != o.V)
5260b57cec5SDimitry Andric       return false;
5270b57cec5SDimitry Andric 
5280b57cec5SDimitry Andric     // Check the operations.
5290b57cec5SDimitry Andric     if (B.size() != o.B.size())
5300b57cec5SDimitry Andric       return false;
5310b57cec5SDimitry Andric 
532fcaf7f86SDimitry Andric     auto *ob = o.B.begin();
533fcaf7f86SDimitry Andric     for (const auto &b : B) {
5340b57cec5SDimitry Andric       if (b != *ob)
5350b57cec5SDimitry Andric         return false;
5360b57cec5SDimitry Andric       ob++;
5370b57cec5SDimitry Andric     }
5380b57cec5SDimitry Andric 
5390b57cec5SDimitry Andric     return true;
5400b57cec5SDimitry Andric   }
5410b57cec5SDimitry Andric 
5420b57cec5SDimitry Andric   /// Subtract two polynomials, return an undefined polynomial if
5430b57cec5SDimitry Andric   /// subtraction is not possible.
5440b57cec5SDimitry Andric   Polynomial operator-(const Polynomial &o) const {
5450b57cec5SDimitry Andric     // Return an undefined polynomial if incompatible.
5460b57cec5SDimitry Andric     if (!isCompatibleTo(o))
5470b57cec5SDimitry Andric       return Polynomial();
5480b57cec5SDimitry Andric 
5490b57cec5SDimitry Andric     // If the polynomials are compatible (meaning they have the same
5500b57cec5SDimitry Andric     // coefficient on B), B is eliminated. Thus a polynomial solely
5510b57cec5SDimitry Andric     // containing A is returned
5520b57cec5SDimitry Andric     return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs));
5530b57cec5SDimitry Andric   }
5540b57cec5SDimitry Andric 
5550b57cec5SDimitry Andric   /// Subtract a constant from a polynomial,
5560b57cec5SDimitry Andric   Polynomial operator-(uint64_t C) const {
5570b57cec5SDimitry Andric     Polynomial Result(*this);
5580b57cec5SDimitry Andric     Result.A -= C;
5590b57cec5SDimitry Andric     return Result;
5600b57cec5SDimitry Andric   }
5610b57cec5SDimitry Andric 
5620b57cec5SDimitry Andric   /// Add a constant to a polynomial,
5630b57cec5SDimitry Andric   Polynomial operator+(uint64_t C) const {
5640b57cec5SDimitry Andric     Polynomial Result(*this);
5650b57cec5SDimitry Andric     Result.A += C;
5660b57cec5SDimitry Andric     return Result;
5670b57cec5SDimitry Andric   }
5680b57cec5SDimitry Andric 
5690b57cec5SDimitry Andric   /// Returns true if it can be proven that two Polynomials are equal.
5700b57cec5SDimitry Andric   bool isProvenEqualTo(const Polynomial &o) {
5710b57cec5SDimitry Andric     // Subtract both polynomials and test if it is fully defined and zero.
5720b57cec5SDimitry Andric     Polynomial r = *this - o;
573349cc55cSDimitry Andric     return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isZero());
5740b57cec5SDimitry Andric   }
5750b57cec5SDimitry Andric 
5760b57cec5SDimitry Andric   /// Print the polynomial into a stream.
5770b57cec5SDimitry Andric   void print(raw_ostream &OS) const {
5780b57cec5SDimitry Andric     OS << "[{#ErrBits:" << ErrorMSBs << "} ";
5790b57cec5SDimitry Andric 
5800b57cec5SDimitry Andric     if (V) {
5810b57cec5SDimitry Andric       for (auto b : B)
5820b57cec5SDimitry Andric         OS << "(";
5830b57cec5SDimitry Andric       OS << "(" << *V << ") ";
5840b57cec5SDimitry Andric 
5850b57cec5SDimitry Andric       for (auto b : B) {
5860b57cec5SDimitry Andric         switch (b.first) {
5870b57cec5SDimitry Andric         case LShr:
5880b57cec5SDimitry Andric           OS << "LShr ";
5890b57cec5SDimitry Andric           break;
5900b57cec5SDimitry Andric         case Mul:
5910b57cec5SDimitry Andric           OS << "Mul ";
5920b57cec5SDimitry Andric           break;
5930b57cec5SDimitry Andric         case SExt:
5940b57cec5SDimitry Andric           OS << "SExt ";
5950b57cec5SDimitry Andric           break;
5960b57cec5SDimitry Andric         case Trunc:
5970b57cec5SDimitry Andric           OS << "Trunc ";
5980b57cec5SDimitry Andric           break;
5990b57cec5SDimitry Andric         }
6000b57cec5SDimitry Andric 
6010b57cec5SDimitry Andric         OS << b.second << ") ";
6020b57cec5SDimitry Andric       }
6030b57cec5SDimitry Andric     }
6040b57cec5SDimitry Andric 
6050b57cec5SDimitry Andric     OS << "+ " << A << "]";
6060b57cec5SDimitry Andric   }
6070b57cec5SDimitry Andric 
6080b57cec5SDimitry Andric private:
6090b57cec5SDimitry Andric   void deleteB() {
6100b57cec5SDimitry Andric     V = nullptr;
6110b57cec5SDimitry Andric     B.clear();
6120b57cec5SDimitry Andric   }
6130b57cec5SDimitry Andric 
6140b57cec5SDimitry Andric   void pushBOperation(const BOps Op, const APInt &C) {
6150b57cec5SDimitry Andric     if (isFirstOrder()) {
6160b57cec5SDimitry Andric       B.push_back(std::make_pair(Op, C));
6170b57cec5SDimitry Andric       return;
6180b57cec5SDimitry Andric     }
6190b57cec5SDimitry Andric   }
6200b57cec5SDimitry Andric };
6210b57cec5SDimitry Andric 
6220b57cec5SDimitry Andric #ifndef NDEBUG
6230b57cec5SDimitry Andric static raw_ostream &operator<<(raw_ostream &OS, const Polynomial &S) {
6240b57cec5SDimitry Andric   S.print(OS);
6250b57cec5SDimitry Andric   return OS;
6260b57cec5SDimitry Andric }
6270b57cec5SDimitry Andric #endif
6280b57cec5SDimitry Andric 
6290b57cec5SDimitry Andric /// VectorInfo stores abstract the following information for each vector
6300b57cec5SDimitry Andric /// element:
6310b57cec5SDimitry Andric ///
6325f757f3fSDimitry Andric /// 1) The memory address loaded into the element as Polynomial
6330b57cec5SDimitry Andric /// 2) a set of load instruction necessary to construct the vector,
6340b57cec5SDimitry Andric /// 3) a set of all other instructions that are necessary to create the vector and
6350b57cec5SDimitry Andric /// 4) a pointer value that can be used as relative base for all elements.
6360b57cec5SDimitry Andric struct VectorInfo {
6370b57cec5SDimitry Andric private:
6380b57cec5SDimitry Andric   VectorInfo(const VectorInfo &c) : VTy(c.VTy) {
6390b57cec5SDimitry Andric     llvm_unreachable(
6400b57cec5SDimitry Andric         "Copying VectorInfo is neither implemented nor necessary,");
6410b57cec5SDimitry Andric   }
6420b57cec5SDimitry Andric 
6430b57cec5SDimitry Andric public:
6440b57cec5SDimitry Andric   /// Information of a Vector Element
6450b57cec5SDimitry Andric   struct ElementInfo {
6460b57cec5SDimitry Andric     /// Offset Polynomial.
6470b57cec5SDimitry Andric     Polynomial Ofs;
6480b57cec5SDimitry Andric 
6490b57cec5SDimitry Andric     /// The Load Instruction used to Load the entry. LI is null if the pointer
6500b57cec5SDimitry Andric     /// of the load instruction does not point on to the entry
6510b57cec5SDimitry Andric     LoadInst *LI;
6520b57cec5SDimitry Andric 
6530b57cec5SDimitry Andric     ElementInfo(Polynomial Offset = Polynomial(), LoadInst *LI = nullptr)
6540b57cec5SDimitry Andric         : Ofs(Offset), LI(LI) {}
6550b57cec5SDimitry Andric   };
6560b57cec5SDimitry Andric 
6570b57cec5SDimitry Andric   /// Basic-block the load instructions are within
6581fd87a68SDimitry Andric   BasicBlock *BB = nullptr;
6590b57cec5SDimitry Andric 
6600b57cec5SDimitry Andric   /// Pointer value of all participation load instructions
6611fd87a68SDimitry Andric   Value *PV = nullptr;
6620b57cec5SDimitry Andric 
6630b57cec5SDimitry Andric   /// Participating load instructions
6640b57cec5SDimitry Andric   std::set<LoadInst *> LIs;
6650b57cec5SDimitry Andric 
6660b57cec5SDimitry Andric   /// Participating instructions
6670b57cec5SDimitry Andric   std::set<Instruction *> Is;
6680b57cec5SDimitry Andric 
6690b57cec5SDimitry Andric   /// Final shuffle-vector instruction
6701fd87a68SDimitry Andric   ShuffleVectorInst *SVI = nullptr;
6710b57cec5SDimitry Andric 
6720b57cec5SDimitry Andric   /// Information of the offset for each vector element
6730b57cec5SDimitry Andric   ElementInfo *EI;
6740b57cec5SDimitry Andric 
6750b57cec5SDimitry Andric   /// Vector Type
6765ffd83dbSDimitry Andric   FixedVectorType *const VTy;
6770b57cec5SDimitry Andric 
6781fd87a68SDimitry Andric   VectorInfo(FixedVectorType *VTy) : VTy(VTy) {
6790b57cec5SDimitry Andric     EI = new ElementInfo[VTy->getNumElements()];
6800b57cec5SDimitry Andric   }
6810b57cec5SDimitry Andric 
68206c3fb27SDimitry Andric   VectorInfo &operator=(const VectorInfo &other) = delete;
68306c3fb27SDimitry Andric 
6840b57cec5SDimitry Andric   virtual ~VectorInfo() { delete[] EI; }
6850b57cec5SDimitry Andric 
6860b57cec5SDimitry Andric   unsigned getDimension() const { return VTy->getNumElements(); }
6870b57cec5SDimitry Andric 
6880b57cec5SDimitry Andric   /// Test if the VectorInfo can be part of an interleaved load with the
6890b57cec5SDimitry Andric   /// specified factor.
6900b57cec5SDimitry Andric   ///
6910b57cec5SDimitry Andric   /// \param Factor of the interleave
6920b57cec5SDimitry Andric   /// \param DL Targets Datalayout
6930b57cec5SDimitry Andric   ///
6940b57cec5SDimitry Andric   /// \returns true if this is possible and false if not
6950b57cec5SDimitry Andric   bool isInterleaved(unsigned Factor, const DataLayout &DL) const {
6960b57cec5SDimitry Andric     unsigned Size = DL.getTypeAllocSize(VTy->getElementType());
6970b57cec5SDimitry Andric     for (unsigned i = 1; i < getDimension(); i++) {
6980b57cec5SDimitry Andric       if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor * Size)) {
6990b57cec5SDimitry Andric         return false;
7000b57cec5SDimitry Andric       }
7010b57cec5SDimitry Andric     }
7020b57cec5SDimitry Andric     return true;
7030b57cec5SDimitry Andric   }
7040b57cec5SDimitry Andric 
7050b57cec5SDimitry Andric   /// Recursively computes the vector information stored in V.
7060b57cec5SDimitry Andric   ///
7070b57cec5SDimitry Andric   /// This function delegates the work to specialized implementations
7080b57cec5SDimitry Andric   ///
7090b57cec5SDimitry Andric   /// \param V Value to operate on
7100b57cec5SDimitry Andric   /// \param Result Result of the computation
7110b57cec5SDimitry Andric   ///
7120b57cec5SDimitry Andric   /// \returns false if no sensible information can be gathered.
7130b57cec5SDimitry Andric   static bool compute(Value *V, VectorInfo &Result, const DataLayout &DL) {
7140b57cec5SDimitry Andric     ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V);
7150b57cec5SDimitry Andric     if (SVI)
7160b57cec5SDimitry Andric       return computeFromSVI(SVI, Result, DL);
7170b57cec5SDimitry Andric     LoadInst *LI = dyn_cast<LoadInst>(V);
7180b57cec5SDimitry Andric     if (LI)
7190b57cec5SDimitry Andric       return computeFromLI(LI, Result, DL);
7200b57cec5SDimitry Andric     BitCastInst *BCI = dyn_cast<BitCastInst>(V);
7210b57cec5SDimitry Andric     if (BCI)
7220b57cec5SDimitry Andric       return computeFromBCI(BCI, Result, DL);
7230b57cec5SDimitry Andric     return false;
7240b57cec5SDimitry Andric   }
7250b57cec5SDimitry Andric 
7260b57cec5SDimitry Andric   /// BitCastInst specialization to compute the vector information.
7270b57cec5SDimitry Andric   ///
7280b57cec5SDimitry Andric   /// \param BCI BitCastInst to operate on
7290b57cec5SDimitry Andric   /// \param Result Result of the computation
7300b57cec5SDimitry Andric   ///
7310b57cec5SDimitry Andric   /// \returns false if no sensible information can be gathered.
7320b57cec5SDimitry Andric   static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result,
7330b57cec5SDimitry Andric                              const DataLayout &DL) {
7340b57cec5SDimitry Andric     Instruction *Op = dyn_cast<Instruction>(BCI->getOperand(0));
7350b57cec5SDimitry Andric 
7360b57cec5SDimitry Andric     if (!Op)
7370b57cec5SDimitry Andric       return false;
7380b57cec5SDimitry Andric 
7395ffd83dbSDimitry Andric     FixedVectorType *VTy = dyn_cast<FixedVectorType>(Op->getType());
7400b57cec5SDimitry Andric     if (!VTy)
7410b57cec5SDimitry Andric       return false;
7420b57cec5SDimitry Andric 
7430b57cec5SDimitry Andric     // We can only cast from large to smaller vectors
7440b57cec5SDimitry Andric     if (Result.VTy->getNumElements() % VTy->getNumElements())
7450b57cec5SDimitry Andric       return false;
7460b57cec5SDimitry Andric 
7470b57cec5SDimitry Andric     unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements();
7480b57cec5SDimitry Andric     unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType());
7490b57cec5SDimitry Andric     unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType());
7500b57cec5SDimitry Andric 
7510b57cec5SDimitry Andric     if (NewSize * Factor != OldSize)
7520b57cec5SDimitry Andric       return false;
7530b57cec5SDimitry Andric 
7540b57cec5SDimitry Andric     VectorInfo Old(VTy);
7550b57cec5SDimitry Andric     if (!compute(Op, Old, DL))
7560b57cec5SDimitry Andric       return false;
7570b57cec5SDimitry Andric 
7580b57cec5SDimitry Andric     for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {
7590b57cec5SDimitry Andric       for (unsigned j = 0; j < Factor; j++) {
7600b57cec5SDimitry Andric         Result.EI[i + j] =
7610b57cec5SDimitry Andric             ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize,
7620b57cec5SDimitry Andric                         j == 0 ? Old.EI[i / Factor].LI : nullptr);
7630b57cec5SDimitry Andric       }
7640b57cec5SDimitry Andric     }
7650b57cec5SDimitry Andric 
7660b57cec5SDimitry Andric     Result.BB = Old.BB;
7670b57cec5SDimitry Andric     Result.PV = Old.PV;
7680b57cec5SDimitry Andric     Result.LIs.insert(Old.LIs.begin(), Old.LIs.end());
7690b57cec5SDimitry Andric     Result.Is.insert(Old.Is.begin(), Old.Is.end());
7700b57cec5SDimitry Andric     Result.Is.insert(BCI);
7710b57cec5SDimitry Andric     Result.SVI = nullptr;
7720b57cec5SDimitry Andric 
7730b57cec5SDimitry Andric     return true;
7740b57cec5SDimitry Andric   }
7750b57cec5SDimitry Andric 
7760b57cec5SDimitry Andric   /// ShuffleVectorInst specialization to compute vector information.
7770b57cec5SDimitry Andric   ///
7780b57cec5SDimitry Andric   /// \param SVI ShuffleVectorInst to operate on
7790b57cec5SDimitry Andric   /// \param Result Result of the computation
7800b57cec5SDimitry Andric   ///
7810b57cec5SDimitry Andric   /// Compute the left and the right side vector information and merge them by
7820b57cec5SDimitry Andric   /// applying the shuffle operation. This function also ensures that the left
7830b57cec5SDimitry Andric   /// and right side have compatible loads. This means that all loads are with
7840b57cec5SDimitry Andric   /// in the same basic block and are based on the same pointer.
7850b57cec5SDimitry Andric   ///
7860b57cec5SDimitry Andric   /// \returns false if no sensible information can be gathered.
7870b57cec5SDimitry Andric   static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result,
7880b57cec5SDimitry Andric                              const DataLayout &DL) {
7895ffd83dbSDimitry Andric     FixedVectorType *ArgTy =
7905ffd83dbSDimitry Andric         cast<FixedVectorType>(SVI->getOperand(0)->getType());
7910b57cec5SDimitry Andric 
7920b57cec5SDimitry Andric     // Compute the left hand vector information.
7930b57cec5SDimitry Andric     VectorInfo LHS(ArgTy);
7940b57cec5SDimitry Andric     if (!compute(SVI->getOperand(0), LHS, DL))
7950b57cec5SDimitry Andric       LHS.BB = nullptr;
7960b57cec5SDimitry Andric 
7970b57cec5SDimitry Andric     // Compute the right hand vector information.
7980b57cec5SDimitry Andric     VectorInfo RHS(ArgTy);
7990b57cec5SDimitry Andric     if (!compute(SVI->getOperand(1), RHS, DL))
8000b57cec5SDimitry Andric       RHS.BB = nullptr;
8010b57cec5SDimitry Andric 
8020b57cec5SDimitry Andric     // Neither operand produced sensible results?
8030b57cec5SDimitry Andric     if (!LHS.BB && !RHS.BB)
8040b57cec5SDimitry Andric       return false;
8050b57cec5SDimitry Andric     // Only RHS produced sensible results?
8060b57cec5SDimitry Andric     else if (!LHS.BB) {
8070b57cec5SDimitry Andric       Result.BB = RHS.BB;
8080b57cec5SDimitry Andric       Result.PV = RHS.PV;
8090b57cec5SDimitry Andric     }
8100b57cec5SDimitry Andric     // Only LHS produced sensible results?
8110b57cec5SDimitry Andric     else if (!RHS.BB) {
8120b57cec5SDimitry Andric       Result.BB = LHS.BB;
8130b57cec5SDimitry Andric       Result.PV = LHS.PV;
8140b57cec5SDimitry Andric     }
8150b57cec5SDimitry Andric     // Both operands produced sensible results?
8160b57cec5SDimitry Andric     else if ((LHS.BB == RHS.BB) && (LHS.PV == RHS.PV)) {
8170b57cec5SDimitry Andric       Result.BB = LHS.BB;
8180b57cec5SDimitry Andric       Result.PV = LHS.PV;
8190b57cec5SDimitry Andric     }
8200b57cec5SDimitry Andric     // Both operands produced sensible results but they are incompatible.
8210b57cec5SDimitry Andric     else {
8220b57cec5SDimitry Andric       return false;
8230b57cec5SDimitry Andric     }
8240b57cec5SDimitry Andric 
8250b57cec5SDimitry Andric     // Merge and apply the operation on the offset information.
8260b57cec5SDimitry Andric     if (LHS.BB) {
8270b57cec5SDimitry Andric       Result.LIs.insert(LHS.LIs.begin(), LHS.LIs.end());
8280b57cec5SDimitry Andric       Result.Is.insert(LHS.Is.begin(), LHS.Is.end());
8290b57cec5SDimitry Andric     }
8300b57cec5SDimitry Andric     if (RHS.BB) {
8310b57cec5SDimitry Andric       Result.LIs.insert(RHS.LIs.begin(), RHS.LIs.end());
8320b57cec5SDimitry Andric       Result.Is.insert(RHS.Is.begin(), RHS.Is.end());
8330b57cec5SDimitry Andric     }
8340b57cec5SDimitry Andric     Result.Is.insert(SVI);
8350b57cec5SDimitry Andric     Result.SVI = SVI;
8360b57cec5SDimitry Andric 
8370b57cec5SDimitry Andric     int j = 0;
8380b57cec5SDimitry Andric     for (int i : SVI->getShuffleMask()) {
8390b57cec5SDimitry Andric       assert((i < 2 * (signed)ArgTy->getNumElements()) &&
8400b57cec5SDimitry Andric              "Invalid ShuffleVectorInst (index out of bounds)");
8410b57cec5SDimitry Andric 
8420b57cec5SDimitry Andric       if (i < 0)
8430b57cec5SDimitry Andric         Result.EI[j] = ElementInfo();
8440b57cec5SDimitry Andric       else if (i < (signed)ArgTy->getNumElements()) {
8450b57cec5SDimitry Andric         if (LHS.BB)
8460b57cec5SDimitry Andric           Result.EI[j] = LHS.EI[i];
8470b57cec5SDimitry Andric         else
8480b57cec5SDimitry Andric           Result.EI[j] = ElementInfo();
8490b57cec5SDimitry Andric       } else {
8500b57cec5SDimitry Andric         if (RHS.BB)
8510b57cec5SDimitry Andric           Result.EI[j] = RHS.EI[i - ArgTy->getNumElements()];
8520b57cec5SDimitry Andric         else
8530b57cec5SDimitry Andric           Result.EI[j] = ElementInfo();
8540b57cec5SDimitry Andric       }
8550b57cec5SDimitry Andric       j++;
8560b57cec5SDimitry Andric     }
8570b57cec5SDimitry Andric 
8580b57cec5SDimitry Andric     return true;
8590b57cec5SDimitry Andric   }
8600b57cec5SDimitry Andric 
8610b57cec5SDimitry Andric   /// LoadInst specialization to compute vector information.
8620b57cec5SDimitry Andric   ///
8630b57cec5SDimitry Andric   /// This function also acts as abort condition to the recursion.
8640b57cec5SDimitry Andric   ///
8650b57cec5SDimitry Andric   /// \param LI LoadInst to operate on
8660b57cec5SDimitry Andric   /// \param Result Result of the computation
8670b57cec5SDimitry Andric   ///
8680b57cec5SDimitry Andric   /// \returns false if no sensible information can be gathered.
8690b57cec5SDimitry Andric   static bool computeFromLI(LoadInst *LI, VectorInfo &Result,
8700b57cec5SDimitry Andric                             const DataLayout &DL) {
8710b57cec5SDimitry Andric     Value *BasePtr;
8720b57cec5SDimitry Andric     Polynomial Offset;
8730b57cec5SDimitry Andric 
8740b57cec5SDimitry Andric     if (LI->isVolatile())
8750b57cec5SDimitry Andric       return false;
8760b57cec5SDimitry Andric 
8770b57cec5SDimitry Andric     if (LI->isAtomic())
8780b57cec5SDimitry Andric       return false;
8790b57cec5SDimitry Andric 
8803a079333SDimitry Andric     if (!DL.typeSizeEqualsStoreSize(Result.VTy->getElementType()))
8813a079333SDimitry Andric       return false;
8823a079333SDimitry Andric 
8830b57cec5SDimitry Andric     // Get the base polynomial
8840b57cec5SDimitry Andric     computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr, DL);
8850b57cec5SDimitry Andric 
8860b57cec5SDimitry Andric     Result.BB = LI->getParent();
8870b57cec5SDimitry Andric     Result.PV = BasePtr;
8880b57cec5SDimitry Andric     Result.LIs.insert(LI);
8890b57cec5SDimitry Andric     Result.Is.insert(LI);
8900b57cec5SDimitry Andric 
8910b57cec5SDimitry Andric     for (unsigned i = 0; i < Result.getDimension(); i++) {
8920b57cec5SDimitry Andric       Value *Idx[2] = {
8930b57cec5SDimitry Andric           ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0),
8940b57cec5SDimitry Andric           ConstantInt::get(Type::getInt32Ty(LI->getContext()), i),
8950b57cec5SDimitry Andric       };
896*0fca6ea1SDimitry Andric       int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, Idx);
8970b57cec5SDimitry Andric       Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI : nullptr);
8980b57cec5SDimitry Andric     }
8990b57cec5SDimitry Andric 
9000b57cec5SDimitry Andric     return true;
9010b57cec5SDimitry Andric   }
9020b57cec5SDimitry Andric 
9030b57cec5SDimitry Andric   /// Recursively compute polynomial of a value.
9040b57cec5SDimitry Andric   ///
9050b57cec5SDimitry Andric   /// \param BO Input binary operation
9060b57cec5SDimitry Andric   /// \param Result Result polynomial
9070b57cec5SDimitry Andric   static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) {
9080b57cec5SDimitry Andric     Value *LHS = BO.getOperand(0);
9090b57cec5SDimitry Andric     Value *RHS = BO.getOperand(1);
9100b57cec5SDimitry Andric 
9110b57cec5SDimitry Andric     // Find the RHS Constant if any
9120b57cec5SDimitry Andric     ConstantInt *C = dyn_cast<ConstantInt>(RHS);
9130b57cec5SDimitry Andric     if ((!C) && BO.isCommutative()) {
9140b57cec5SDimitry Andric       C = dyn_cast<ConstantInt>(LHS);
9150b57cec5SDimitry Andric       if (C)
9160b57cec5SDimitry Andric         std::swap(LHS, RHS);
9170b57cec5SDimitry Andric     }
9180b57cec5SDimitry Andric 
9190b57cec5SDimitry Andric     switch (BO.getOpcode()) {
9200b57cec5SDimitry Andric     case Instruction::Add:
9210b57cec5SDimitry Andric       if (!C)
9220b57cec5SDimitry Andric         break;
9230b57cec5SDimitry Andric 
9240b57cec5SDimitry Andric       computePolynomial(*LHS, Result);
9250b57cec5SDimitry Andric       Result.add(C->getValue());
9260b57cec5SDimitry Andric       return;
9270b57cec5SDimitry Andric 
9280b57cec5SDimitry Andric     case Instruction::LShr:
9290b57cec5SDimitry Andric       if (!C)
9300b57cec5SDimitry Andric         break;
9310b57cec5SDimitry Andric 
9320b57cec5SDimitry Andric       computePolynomial(*LHS, Result);
9330b57cec5SDimitry Andric       Result.lshr(C->getValue());
9340b57cec5SDimitry Andric       return;
9350b57cec5SDimitry Andric 
9360b57cec5SDimitry Andric     default:
9370b57cec5SDimitry Andric       break;
9380b57cec5SDimitry Andric     }
9390b57cec5SDimitry Andric 
9400b57cec5SDimitry Andric     Result = Polynomial(&BO);
9410b57cec5SDimitry Andric   }
9420b57cec5SDimitry Andric 
9430b57cec5SDimitry Andric   /// Recursively compute polynomial of a value
9440b57cec5SDimitry Andric   ///
9450b57cec5SDimitry Andric   /// \param V input value
9460b57cec5SDimitry Andric   /// \param Result result polynomial
9470b57cec5SDimitry Andric   static void computePolynomial(Value &V, Polynomial &Result) {
9488bcb0991SDimitry Andric     if (auto *BO = dyn_cast<BinaryOperator>(&V))
9498bcb0991SDimitry Andric       computePolynomialBinOp(*BO, Result);
9500b57cec5SDimitry Andric     else
9510b57cec5SDimitry Andric       Result = Polynomial(&V);
9520b57cec5SDimitry Andric   }
9530b57cec5SDimitry Andric 
9540b57cec5SDimitry Andric   /// Compute the Polynomial representation of a Pointer type.
9550b57cec5SDimitry Andric   ///
9560b57cec5SDimitry Andric   /// \param Ptr input pointer value
9570b57cec5SDimitry Andric   /// \param Result result polynomial
9580b57cec5SDimitry Andric   /// \param BasePtr pointer the polynomial is based on
9590b57cec5SDimitry Andric   /// \param DL Datalayout of the target machine
9600b57cec5SDimitry Andric   static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result,
9610b57cec5SDimitry Andric                                            Value *&BasePtr,
9620b57cec5SDimitry Andric                                            const DataLayout &DL) {
9630b57cec5SDimitry Andric     // Not a pointer type? Return an undefined polynomial
9640b57cec5SDimitry Andric     PointerType *PtrTy = dyn_cast<PointerType>(Ptr.getType());
9650b57cec5SDimitry Andric     if (!PtrTy) {
9660b57cec5SDimitry Andric       Result = Polynomial();
9670b57cec5SDimitry Andric       BasePtr = nullptr;
9680b57cec5SDimitry Andric       return;
9690b57cec5SDimitry Andric     }
9700b57cec5SDimitry Andric     unsigned PointerBits =
9710b57cec5SDimitry Andric         DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace());
9720b57cec5SDimitry Andric 
9730b57cec5SDimitry Andric     /// Skip pointer casts. Return Zero polynomial otherwise
9740b57cec5SDimitry Andric     if (isa<CastInst>(&Ptr)) {
9750b57cec5SDimitry Andric       CastInst &CI = *cast<CastInst>(&Ptr);
9760b57cec5SDimitry Andric       switch (CI.getOpcode()) {
9770b57cec5SDimitry Andric       case Instruction::BitCast:
9780b57cec5SDimitry Andric         computePolynomialFromPointer(*CI.getOperand(0), Result, BasePtr, DL);
9790b57cec5SDimitry Andric         break;
9800b57cec5SDimitry Andric       default:
9810b57cec5SDimitry Andric         BasePtr = &Ptr;
9820b57cec5SDimitry Andric         Polynomial(PointerBits, 0);
9830b57cec5SDimitry Andric         break;
9840b57cec5SDimitry Andric       }
9850b57cec5SDimitry Andric     }
9860b57cec5SDimitry Andric     /// Resolve GetElementPtrInst.
9870b57cec5SDimitry Andric     else if (isa<GetElementPtrInst>(&Ptr)) {
9880b57cec5SDimitry Andric       GetElementPtrInst &GEP = *cast<GetElementPtrInst>(&Ptr);
9890b57cec5SDimitry Andric 
9900b57cec5SDimitry Andric       APInt BaseOffset(PointerBits, 0);
9910b57cec5SDimitry Andric 
9920b57cec5SDimitry Andric       // Check if we can compute the Offset with accumulateConstantOffset
9930b57cec5SDimitry Andric       if (GEP.accumulateConstantOffset(DL, BaseOffset)) {
9940b57cec5SDimitry Andric         Result = Polynomial(BaseOffset);
9950b57cec5SDimitry Andric         BasePtr = GEP.getPointerOperand();
9960b57cec5SDimitry Andric         return;
9970b57cec5SDimitry Andric       } else {
9980b57cec5SDimitry Andric         // Otherwise we allow that the last index operand of the GEP is
9990b57cec5SDimitry Andric         // non-constant.
10000b57cec5SDimitry Andric         unsigned idxOperand, e;
10010b57cec5SDimitry Andric         SmallVector<Value *, 4> Indices;
10020b57cec5SDimitry Andric         for (idxOperand = 1, e = GEP.getNumOperands(); idxOperand < e;
10030b57cec5SDimitry Andric              idxOperand++) {
10040b57cec5SDimitry Andric           ConstantInt *IDX = dyn_cast<ConstantInt>(GEP.getOperand(idxOperand));
10050b57cec5SDimitry Andric           if (!IDX)
10060b57cec5SDimitry Andric             break;
10070b57cec5SDimitry Andric           Indices.push_back(IDX);
10080b57cec5SDimitry Andric         }
10090b57cec5SDimitry Andric 
10100b57cec5SDimitry Andric         // It must also be the last operand.
10110b57cec5SDimitry Andric         if (idxOperand + 1 != e) {
10120b57cec5SDimitry Andric           Result = Polynomial();
10130b57cec5SDimitry Andric           BasePtr = nullptr;
10140b57cec5SDimitry Andric           return;
10150b57cec5SDimitry Andric         }
10160b57cec5SDimitry Andric 
10170b57cec5SDimitry Andric         // Compute the polynomial of the index operand.
10180b57cec5SDimitry Andric         computePolynomial(*GEP.getOperand(idxOperand), Result);
10190b57cec5SDimitry Andric 
10200b57cec5SDimitry Andric         // Compute base offset from zero based index, excluding the last
10210b57cec5SDimitry Andric         // variable operand.
10220b57cec5SDimitry Andric         BaseOffset =
10230b57cec5SDimitry Andric             DL.getIndexedOffsetInType(GEP.getSourceElementType(), Indices);
10240b57cec5SDimitry Andric 
10250b57cec5SDimitry Andric         // Apply the operations of GEP to the polynomial.
10260b57cec5SDimitry Andric         unsigned ResultSize = DL.getTypeAllocSize(GEP.getResultElementType());
10270b57cec5SDimitry Andric         Result.sextOrTrunc(PointerBits);
10280b57cec5SDimitry Andric         Result.mul(APInt(PointerBits, ResultSize));
10290b57cec5SDimitry Andric         Result.add(BaseOffset);
10300b57cec5SDimitry Andric         BasePtr = GEP.getPointerOperand();
10310b57cec5SDimitry Andric       }
10320b57cec5SDimitry Andric     }
10330b57cec5SDimitry Andric     // All other instructions are handled by using the value as base pointer and
10340b57cec5SDimitry Andric     // a zero polynomial.
10350b57cec5SDimitry Andric     else {
10360b57cec5SDimitry Andric       BasePtr = &Ptr;
10370b57cec5SDimitry Andric       Polynomial(DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()), 0);
10380b57cec5SDimitry Andric     }
10390b57cec5SDimitry Andric   }
10400b57cec5SDimitry Andric 
10410b57cec5SDimitry Andric #ifndef NDEBUG
10420b57cec5SDimitry Andric   void print(raw_ostream &OS) const {
10430b57cec5SDimitry Andric     if (PV)
10440b57cec5SDimitry Andric       OS << *PV;
10450b57cec5SDimitry Andric     else
10460b57cec5SDimitry Andric       OS << "(none)";
10470b57cec5SDimitry Andric     OS << " + ";
10480b57cec5SDimitry Andric     for (unsigned i = 0; i < getDimension(); i++)
10490b57cec5SDimitry Andric       OS << ((i == 0) ? "[" : ", ") << EI[i].Ofs;
10500b57cec5SDimitry Andric     OS << "]";
10510b57cec5SDimitry Andric   }
10520b57cec5SDimitry Andric #endif
10530b57cec5SDimitry Andric };
10540b57cec5SDimitry Andric 
10550b57cec5SDimitry Andric } // anonymous namespace
10560b57cec5SDimitry Andric 
10570b57cec5SDimitry Andric bool InterleavedLoadCombineImpl::findPattern(
10580b57cec5SDimitry Andric     std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
10590b57cec5SDimitry Andric     unsigned Factor, const DataLayout &DL) {
10600b57cec5SDimitry Andric   for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
10610b57cec5SDimitry Andric     unsigned i;
10620b57cec5SDimitry Andric     // Try to find an interleaved load using the front of Worklist as first line
10630b57cec5SDimitry Andric     unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType());
10640b57cec5SDimitry Andric 
10650b57cec5SDimitry Andric     // List containing iterators pointing to the VectorInfos of the candidates
10660b57cec5SDimitry Andric     std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());
10670b57cec5SDimitry Andric 
10680b57cec5SDimitry Andric     for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C++) {
10690b57cec5SDimitry Andric       if (C->VTy != C0->VTy)
10700b57cec5SDimitry Andric         continue;
10710b57cec5SDimitry Andric       if (C->BB != C0->BB)
10720b57cec5SDimitry Andric         continue;
10730b57cec5SDimitry Andric       if (C->PV != C0->PV)
10740b57cec5SDimitry Andric         continue;
10750b57cec5SDimitry Andric 
10760b57cec5SDimitry Andric       // Check the current value matches any of factor - 1 remaining lines
10770b57cec5SDimitry Andric       for (i = 1; i < Factor; i++) {
10780b57cec5SDimitry Andric         if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) {
10790b57cec5SDimitry Andric           Res[i] = C;
10800b57cec5SDimitry Andric         }
10810b57cec5SDimitry Andric       }
10820b57cec5SDimitry Andric 
10830b57cec5SDimitry Andric       for (i = 1; i < Factor; i++) {
10840b57cec5SDimitry Andric         if (Res[i] == Candidates.end())
10850b57cec5SDimitry Andric           break;
10860b57cec5SDimitry Andric       }
10870b57cec5SDimitry Andric       if (i == Factor) {
10880b57cec5SDimitry Andric         Res[0] = C0;
10890b57cec5SDimitry Andric         break;
10900b57cec5SDimitry Andric       }
10910b57cec5SDimitry Andric     }
10920b57cec5SDimitry Andric 
10930b57cec5SDimitry Andric     if (Res[0] != Candidates.end()) {
10940b57cec5SDimitry Andric       // Move the result into the output
10950b57cec5SDimitry Andric       for (unsigned i = 0; i < Factor; i++) {
10960b57cec5SDimitry Andric         InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]);
10970b57cec5SDimitry Andric       }
10980b57cec5SDimitry Andric 
10990b57cec5SDimitry Andric       return true;
11000b57cec5SDimitry Andric     }
11010b57cec5SDimitry Andric   }
11020b57cec5SDimitry Andric   return false;
11030b57cec5SDimitry Andric }
11040b57cec5SDimitry Andric 
11050b57cec5SDimitry Andric LoadInst *
11060b57cec5SDimitry Andric InterleavedLoadCombineImpl::findFirstLoad(const std::set<LoadInst *> &LIs) {
11070b57cec5SDimitry Andric   assert(!LIs.empty() && "No load instructions given.");
11080b57cec5SDimitry Andric 
11090b57cec5SDimitry Andric   // All LIs are within the same BB. Select the first for a reference.
11100b57cec5SDimitry Andric   BasicBlock *BB = (*LIs.begin())->getParent();
1111e8d8bef9SDimitry Andric   BasicBlock::iterator FLI = llvm::find_if(
1112e8d8bef9SDimitry Andric       *BB, [&LIs](Instruction &I) -> bool { return is_contained(LIs, &I); });
11130b57cec5SDimitry Andric   assert(FLI != BB->end());
11140b57cec5SDimitry Andric 
11150b57cec5SDimitry Andric   return cast<LoadInst>(FLI);
11160b57cec5SDimitry Andric }
11170b57cec5SDimitry Andric 
11180b57cec5SDimitry Andric bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
11190b57cec5SDimitry Andric                                          OptimizationRemarkEmitter &ORE) {
11200b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Checking interleaved load\n");
11210b57cec5SDimitry Andric 
11220b57cec5SDimitry Andric   // The insertion point is the LoadInst which loads the first values. The
11230b57cec5SDimitry Andric   // following tests are used to proof that the combined load can be inserted
11240b57cec5SDimitry Andric   // just before InsertionPoint.
11250b57cec5SDimitry Andric   LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI;
11260b57cec5SDimitry Andric 
11270b57cec5SDimitry Andric   // Test if the offset is computed
11280b57cec5SDimitry Andric   if (!InsertionPoint)
11290b57cec5SDimitry Andric     return false;
11300b57cec5SDimitry Andric 
11310b57cec5SDimitry Andric   std::set<LoadInst *> LIs;
11320b57cec5SDimitry Andric   std::set<Instruction *> Is;
11330b57cec5SDimitry Andric   std::set<Instruction *> SVIs;
11340b57cec5SDimitry Andric 
1135e8d8bef9SDimitry Andric   InstructionCost InterleavedCost;
1136e8d8bef9SDimitry Andric   InstructionCost InstructionCost = 0;
1137349cc55cSDimitry Andric   const TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency;
11380b57cec5SDimitry Andric 
11390b57cec5SDimitry Andric   // Get the interleave factor
11400b57cec5SDimitry Andric   unsigned Factor = InterleavedLoad.size();
11410b57cec5SDimitry Andric 
11420b57cec5SDimitry Andric   // Merge all input sets used in analysis
11430b57cec5SDimitry Andric   for (auto &VI : InterleavedLoad) {
11440b57cec5SDimitry Andric     // Generate a set of all load instructions to be combined
11450b57cec5SDimitry Andric     LIs.insert(VI.LIs.begin(), VI.LIs.end());
11460b57cec5SDimitry Andric 
11470b57cec5SDimitry Andric     // Generate a set of all instructions taking part in load
11480b57cec5SDimitry Andric     // interleaved. This list excludes the instructions necessary for the
11490b57cec5SDimitry Andric     // polynomial construction.
11500b57cec5SDimitry Andric     Is.insert(VI.Is.begin(), VI.Is.end());
11510b57cec5SDimitry Andric 
11520b57cec5SDimitry Andric     // Generate the set of the final ShuffleVectorInst.
11530b57cec5SDimitry Andric     SVIs.insert(VI.SVI);
11540b57cec5SDimitry Andric   }
11550b57cec5SDimitry Andric 
11560b57cec5SDimitry Andric   // There is nothing to combine.
11570b57cec5SDimitry Andric   if (LIs.size() < 2)
11580b57cec5SDimitry Andric     return false;
11590b57cec5SDimitry Andric 
11600b57cec5SDimitry Andric   // Test if all participating instruction will be dead after the
11610b57cec5SDimitry Andric   // transformation. If intermediate results are used, no performance gain can
11620b57cec5SDimitry Andric   // be expected. Also sum the cost of the Instructions beeing left dead.
1163fcaf7f86SDimitry Andric   for (const auto &I : Is) {
11640b57cec5SDimitry Andric     // Compute the old cost
1165349cc55cSDimitry Andric     InstructionCost += TTI.getInstructionCost(I, CostKind);
11660b57cec5SDimitry Andric 
11670b57cec5SDimitry Andric     // The final SVIs are allowed not to be dead, all uses will be replaced
11680b57cec5SDimitry Andric     if (SVIs.find(I) != SVIs.end())
11690b57cec5SDimitry Andric       continue;
11700b57cec5SDimitry Andric 
11710b57cec5SDimitry Andric     // If there are users outside the set to be eliminated, we abort the
11720b57cec5SDimitry Andric     // transformation. No gain can be expected.
1173480093f4SDimitry Andric     for (auto *U : I->users()) {
11740b57cec5SDimitry Andric       if (Is.find(dyn_cast<Instruction>(U)) == Is.end())
11750b57cec5SDimitry Andric         return false;
11760b57cec5SDimitry Andric     }
11770b57cec5SDimitry Andric   }
11780b57cec5SDimitry Andric 
1179e8d8bef9SDimitry Andric   // We need to have a valid cost in order to proceed.
1180e8d8bef9SDimitry Andric   if (!InstructionCost.isValid())
1181e8d8bef9SDimitry Andric     return false;
1182e8d8bef9SDimitry Andric 
11830b57cec5SDimitry Andric   // We know that all LoadInst are within the same BB. This guarantees that
11840b57cec5SDimitry Andric   // either everything or nothing is loaded.
11850b57cec5SDimitry Andric   LoadInst *First = findFirstLoad(LIs);
11860b57cec5SDimitry Andric 
11870b57cec5SDimitry Andric   // To be safe that the loads can be combined, iterate over all loads and test
11880b57cec5SDimitry Andric   // that the corresponding defining access dominates first LI. This guarantees
11890b57cec5SDimitry Andric   // that there are no aliasing stores in between the loads.
11900b57cec5SDimitry Andric   auto FMA = MSSA.getMemoryAccess(First);
1191fcaf7f86SDimitry Andric   for (auto *LI : LIs) {
11920b57cec5SDimitry Andric     auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();
11930b57cec5SDimitry Andric     if (!MSSA.dominates(MADef, FMA))
11940b57cec5SDimitry Andric       return false;
11950b57cec5SDimitry Andric   }
11960b57cec5SDimitry Andric   assert(!LIs.empty() && "There are no LoadInst to combine");
11970b57cec5SDimitry Andric 
11980b57cec5SDimitry Andric   // It is necessary that insertion point dominates all final ShuffleVectorInst.
11990b57cec5SDimitry Andric   for (auto &VI : InterleavedLoad) {
12000b57cec5SDimitry Andric     if (!DT.dominates(InsertionPoint, VI.SVI))
12010b57cec5SDimitry Andric       return false;
12020b57cec5SDimitry Andric   }
12030b57cec5SDimitry Andric 
12040b57cec5SDimitry Andric   // All checks are done. Add instructions detectable by InterleavedAccessPass
12050b57cec5SDimitry Andric   // The old instruction will are left dead.
12060b57cec5SDimitry Andric   IRBuilder<> Builder(InsertionPoint);
12070b57cec5SDimitry Andric   Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
12080b57cec5SDimitry Andric   unsigned ElementsPerSVI =
12095ffd83dbSDimitry Andric       cast<FixedVectorType>(InterleavedLoad.front().SVI->getType())
12105ffd83dbSDimitry Andric           ->getNumElements();
12115ffd83dbSDimitry Andric   FixedVectorType *ILTy = FixedVectorType::get(ETy, Factor * ElementsPerSVI);
12120b57cec5SDimitry Andric 
121381ad6265SDimitry Andric   auto Indices = llvm::to_vector<4>(llvm::seq<unsigned>(0, Factor));
12140b57cec5SDimitry Andric   InterleavedCost = TTI.getInterleavedMemoryOpCost(
12155ffd83dbSDimitry Andric       Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlign(),
1216349cc55cSDimitry Andric       InsertionPoint->getPointerAddressSpace(), CostKind);
12170b57cec5SDimitry Andric 
12180b57cec5SDimitry Andric   if (InterleavedCost >= InstructionCost) {
12190b57cec5SDimitry Andric     return false;
12200b57cec5SDimitry Andric   }
12210b57cec5SDimitry Andric 
12220b57cec5SDimitry Andric   // Create the wide load and update the MemorySSA.
12235f757f3fSDimitry Andric   auto Ptr = InsertionPoint->getPointerOperand();
12245f757f3fSDimitry Andric   auto LI = Builder.CreateAlignedLoad(ILTy, Ptr, InsertionPoint->getAlign(),
12250b57cec5SDimitry Andric                                       "interleaved.wide.load");
12260b57cec5SDimitry Andric   auto MSSAU = MemorySSAUpdater(&MSSA);
12270b57cec5SDimitry Andric   MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
12280b57cec5SDimitry Andric       LI, nullptr, MSSA.getMemoryAccess(InsertionPoint)));
122981ad6265SDimitry Andric   MSSAU.insertUse(MSSALoad, /*RenameUses=*/ true);
12300b57cec5SDimitry Andric 
12310b57cec5SDimitry Andric   // Create the final SVIs and replace all uses.
12320b57cec5SDimitry Andric   int i = 0;
12330b57cec5SDimitry Andric   for (auto &VI : InterleavedLoad) {
12345ffd83dbSDimitry Andric     SmallVector<int, 4> Mask;
12350b57cec5SDimitry Andric     for (unsigned j = 0; j < ElementsPerSVI; j++)
12360b57cec5SDimitry Andric       Mask.push_back(i + j * Factor);
12370b57cec5SDimitry Andric 
12380b57cec5SDimitry Andric     Builder.SetInsertPoint(VI.SVI);
1239e8d8bef9SDimitry Andric     auto SVI = Builder.CreateShuffleVector(LI, Mask, "interleaved.shuffle");
12400b57cec5SDimitry Andric     VI.SVI->replaceAllUsesWith(SVI);
12410b57cec5SDimitry Andric     i++;
12420b57cec5SDimitry Andric   }
12430b57cec5SDimitry Andric 
12440b57cec5SDimitry Andric   NumInterleavedLoadCombine++;
12450b57cec5SDimitry Andric   ORE.emit([&]() {
12460b57cec5SDimitry Andric     return OptimizationRemark(DEBUG_TYPE, "Combined Interleaved Load", LI)
12470b57cec5SDimitry Andric            << "Load interleaved combined with factor "
12480b57cec5SDimitry Andric            << ore::NV("Factor", Factor);
12490b57cec5SDimitry Andric   });
12500b57cec5SDimitry Andric 
12510b57cec5SDimitry Andric   return true;
12520b57cec5SDimitry Andric }
12530b57cec5SDimitry Andric 
12540b57cec5SDimitry Andric bool InterleavedLoadCombineImpl::run() {
12550b57cec5SDimitry Andric   OptimizationRemarkEmitter ORE(&F);
12560b57cec5SDimitry Andric   bool changed = false;
12570b57cec5SDimitry Andric   unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();
12580b57cec5SDimitry Andric 
1259*0fca6ea1SDimitry Andric   auto &DL = F.getDataLayout();
12600b57cec5SDimitry Andric 
12610b57cec5SDimitry Andric   // Start with the highest factor to avoid combining and recombining.
12620b57cec5SDimitry Andric   for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
12630b57cec5SDimitry Andric     std::list<VectorInfo> Candidates;
12640b57cec5SDimitry Andric 
12650b57cec5SDimitry Andric     for (BasicBlock &BB : F) {
12660b57cec5SDimitry Andric       for (Instruction &I : BB) {
12670b57cec5SDimitry Andric         if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) {
12685ffd83dbSDimitry Andric           // We don't support scalable vectors in this pass.
12695ffd83dbSDimitry Andric           if (isa<ScalableVectorType>(SVI->getType()))
12705ffd83dbSDimitry Andric             continue;
12710b57cec5SDimitry Andric 
12725ffd83dbSDimitry Andric           Candidates.emplace_back(cast<FixedVectorType>(SVI->getType()));
12730b57cec5SDimitry Andric 
12740b57cec5SDimitry Andric           if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
12750b57cec5SDimitry Andric             Candidates.pop_back();
12760b57cec5SDimitry Andric             continue;
12770b57cec5SDimitry Andric           }
12780b57cec5SDimitry Andric 
12790b57cec5SDimitry Andric           if (!Candidates.back().isInterleaved(Factor, DL)) {
12800b57cec5SDimitry Andric             Candidates.pop_back();
12810b57cec5SDimitry Andric           }
12820b57cec5SDimitry Andric         }
12830b57cec5SDimitry Andric       }
12840b57cec5SDimitry Andric     }
12850b57cec5SDimitry Andric 
12860b57cec5SDimitry Andric     std::list<VectorInfo> InterleavedLoad;
12870b57cec5SDimitry Andric     while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
12880b57cec5SDimitry Andric       if (combine(InterleavedLoad, ORE)) {
12890b57cec5SDimitry Andric         changed = true;
12900b57cec5SDimitry Andric       } else {
12910b57cec5SDimitry Andric         // Remove the first element of the Interleaved Load but put the others
12920b57cec5SDimitry Andric         // back on the list and continue searching
12930b57cec5SDimitry Andric         Candidates.splice(Candidates.begin(), InterleavedLoad,
12940b57cec5SDimitry Andric                           std::next(InterleavedLoad.begin()),
12950b57cec5SDimitry Andric                           InterleavedLoad.end());
12960b57cec5SDimitry Andric       }
12970b57cec5SDimitry Andric       InterleavedLoad.clear();
12980b57cec5SDimitry Andric     }
12990b57cec5SDimitry Andric   }
13000b57cec5SDimitry Andric 
13010b57cec5SDimitry Andric   return changed;
13020b57cec5SDimitry Andric }
13030b57cec5SDimitry Andric 
13040b57cec5SDimitry Andric namespace {
13050b57cec5SDimitry Andric /// This pass combines interleaved loads into a pattern detectable by
13060b57cec5SDimitry Andric /// InterleavedAccessPass.
13070b57cec5SDimitry Andric struct InterleavedLoadCombine : public FunctionPass {
13080b57cec5SDimitry Andric   static char ID;
13090b57cec5SDimitry Andric 
13100b57cec5SDimitry Andric   InterleavedLoadCombine() : FunctionPass(ID) {
13110b57cec5SDimitry Andric     initializeInterleavedLoadCombinePass(*PassRegistry::getPassRegistry());
13120b57cec5SDimitry Andric   }
13130b57cec5SDimitry Andric 
13140b57cec5SDimitry Andric   StringRef getPassName() const override {
13150b57cec5SDimitry Andric     return "Interleaved Load Combine Pass";
13160b57cec5SDimitry Andric   }
13170b57cec5SDimitry Andric 
13180b57cec5SDimitry Andric   bool runOnFunction(Function &F) override {
13190b57cec5SDimitry Andric     if (DisableInterleavedLoadCombine)
13200b57cec5SDimitry Andric       return false;
13210b57cec5SDimitry Andric 
13220b57cec5SDimitry Andric     auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
13230b57cec5SDimitry Andric     if (!TPC)
13240b57cec5SDimitry Andric       return false;
13250b57cec5SDimitry Andric 
13260b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName()
13270b57cec5SDimitry Andric                       << "\n");
13280b57cec5SDimitry Andric 
13290b57cec5SDimitry Andric     return InterleavedLoadCombineImpl(
13300b57cec5SDimitry Andric                F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
13310b57cec5SDimitry Andric                getAnalysis<MemorySSAWrapperPass>().getMSSA(),
1332*0fca6ea1SDimitry Andric                getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
13330b57cec5SDimitry Andric                TPC->getTM<TargetMachine>())
13340b57cec5SDimitry Andric         .run();
13350b57cec5SDimitry Andric   }
13360b57cec5SDimitry Andric 
13370b57cec5SDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override {
13380b57cec5SDimitry Andric     AU.addRequired<MemorySSAWrapperPass>();
13390b57cec5SDimitry Andric     AU.addRequired<DominatorTreeWrapperPass>();
1340*0fca6ea1SDimitry Andric     AU.addRequired<TargetTransformInfoWrapperPass>();
13410b57cec5SDimitry Andric     FunctionPass::getAnalysisUsage(AU);
13420b57cec5SDimitry Andric   }
13430b57cec5SDimitry Andric 
13440b57cec5SDimitry Andric private:
13450b57cec5SDimitry Andric };
13460b57cec5SDimitry Andric } // anonymous namespace
13470b57cec5SDimitry Andric 
13485f757f3fSDimitry Andric PreservedAnalyses
13495f757f3fSDimitry Andric InterleavedLoadCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
13505f757f3fSDimitry Andric 
13515f757f3fSDimitry Andric   auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
13525f757f3fSDimitry Andric   auto &MemSSA = FAM.getResult<MemorySSAAnalysis>(F).getMSSA();
1353*0fca6ea1SDimitry Andric   auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
1354*0fca6ea1SDimitry Andric   bool Changed = InterleavedLoadCombineImpl(F, DT, MemSSA, TTI, *TM).run();
13555f757f3fSDimitry Andric   return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
13565f757f3fSDimitry Andric }
13575f757f3fSDimitry Andric 
13580b57cec5SDimitry Andric char InterleavedLoadCombine::ID = 0;
13590b57cec5SDimitry Andric 
13600b57cec5SDimitry Andric INITIALIZE_PASS_BEGIN(
13610b57cec5SDimitry Andric     InterleavedLoadCombine, DEBUG_TYPE,
13620b57cec5SDimitry Andric     "Combine interleaved loads into wide loads and shufflevector instructions",
13630b57cec5SDimitry Andric     false, false)
13640b57cec5SDimitry Andric INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
13650b57cec5SDimitry Andric INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
1366*0fca6ea1SDimitry Andric INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
13670b57cec5SDimitry Andric INITIALIZE_PASS_END(
13680b57cec5SDimitry Andric     InterleavedLoadCombine, DEBUG_TYPE,
13690b57cec5SDimitry Andric     "Combine interleaved loads into wide loads and shufflevector instructions",
13700b57cec5SDimitry Andric     false, false)
13710b57cec5SDimitry Andric 
13720b57cec5SDimitry Andric FunctionPass *
13730b57cec5SDimitry Andric llvm::createInterleavedLoadCombinePass() {
13740b57cec5SDimitry Andric   auto P = new InterleavedLoadCombine();
13750b57cec5SDimitry Andric   return P;
13760b57cec5SDimitry Andric }
1377