Transforms/Vectorize/LoadStoreVectorizer.cpp

//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores.  Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited.  So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures.  It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers.  Instead, a
// "vector load" loads directly into a series of scalar registers.  In effect,
// extracting the elements of the vector is free.  It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code.  This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load.  In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
//
// Overview of the algorithm and terminology in this pass:
//
//  - Break up each basic block into pseudo-BBs, composed of instructions which
//    are guaranteed to transfer control to their successors.
//  - Within a single pseudo-BB, find all loads, and group them into
//    "equivalence classes" according to getUnderlyingObject() and loaded
//    element size.  Do the same for stores.
//  - For each equivalence class, greedily build "chains".  Each chain has a
//    leader instruction, and every other member of the chain has a known
//    constant offset from the first instr in the chain.
//  - Break up chains so that they contain only contiguous accesses of legal
//    size with no intervening may-alias instrs.
//  - Convert each chain to vector instructions.
//
// The O(n^2) behavior of this pass comes from initially building the chains.
// In the worst case we have to compare each new instruction to all of those
// that came before. To limit this, we only calculate the offset to the leaders
// of the N most recently-used chains.

#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <numeric>
#include <optional>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

namespace {

// Equivalence class key, the initial tuple by which we group loads/stores.
// Loads/stores with different EqClassKeys are never merged.
//
// (We could in theory remove element-size from the this tuple.  We'd just need
// to fix up the vector packing/unpacking code.)
using EqClassKey =
    std::tuple<const Value * /* result of getUnderlyingObject() */,
               unsigned /* AddrSpace */,
               unsigned /* Load/Store element size bits */,
               char /* IsLoad; char b/c bool can't be a DenseMap key */
               >;
[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                                               const EqClassKey &K) {
  const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
  return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
            << " of element size " << ElementSize << " bits in addrspace "
            << AddrSpace;
}

// A Chain is a set of instructions such that:
//  - All instructions have the same equivalence class, so in particular all are
//    loads, or all are stores.
//  - We know the address accessed by the i'th chain elem relative to the
//    chain's leader instruction, which is the first instr of the chain in BB
//    order.
//
// Chains have two canonical orderings:
//  - BB order, sorted by Instr->comesBefore.
//  - Offset order, sorted by OffsetFromLeader.
// This pass switches back and forth between these orders.
struct ChainElem {
  Instruction *Inst;
  APInt OffsetFromLeader;
};
using Chain = SmallVector<ChainElem, 1>;

void sortChainInBBOrder(Chain &C) {
  sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
}

void sortChainInOffsetOrder(Chain &C) {
  sort(C, [](const auto &A, const auto &B) {
    if (A.OffsetFromLeader != B.OffsetFromLeader)
      return A.OffsetFromLeader.slt(B.OffsetFromLeader);
    return A.Inst->comesBefore(B.Inst); // stable tiebreaker
  });
}

[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
  for (const auto &E : C) {
    dbgs() << "  " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
  }
}

using EquivalenceClassMap =
    MapVector<EqClassKey, SmallVector<Instruction *, 8>>;

// FIXME: Assuming stack alignment of 4 is always good enough
constexpr unsigned StackAdjustedAlignment = 4;

Instruction *propagateMetadata(Instruction *I, const Chain &C) {
  SmallVector<Value *, 8> Values;
  for (const ChainElem &E : C)
    Values.push_back(E.Inst);
  return propagateMetadata(I, Values);
}

bool isInvariantLoad(const Instruction *I) {
  const LoadInst *LI = dyn_cast<LoadInst>(I);
  return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
}

/// Reorders the instructions that I depends on (the instructions defining its
/// operands), to ensure they dominate I.
void reorder(Instruction *I) {
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      if (!IM->comesBefore(I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
    Instruction *IM = &*(BBI++);
    if (!InstructionsToMove.count(IM))
      continue;
    IM->moveBefore(I);
  }
}

class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  AssumptionCache &AC;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

  // We could erase instrs right after vectorizing them, but that can mess up
  // our BB iterators, and also can make the equivalence class keys point to
  // freed memory.  This is fixable, but it's simpler just to wait until we're
  // done with the BB and erase all at once.
  SmallVector<Instruction *, 128> ToErase;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
             DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  static const unsigned MaxDepth = 3;

  /// Runs the vectorizer on a "pseudo basic block", which is a range of
  /// instructions [Begin, End) within one BB all of which have
  /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
  bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);

  /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
  /// in the same BB with the same value for getUnderlyingObject() etc.
  bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
                             ArrayRef<Instruction *> EqClass);

  /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
  /// where all instructions access a known, constant offset from the first
  /// instruction.
  bool runOnChain(Chain &C);

  /// Splits the chain into subchains of instructions which read/write a
  /// contiguous block of memory.  Discards any length-1 subchains (because
  /// there's nothing to vectorize in there).
  std::vector<Chain> splitChainByContiguity(Chain &C);

  /// Splits the chain into subchains where it's safe to hoist loads up to the
  /// beginning of the sub-chain and it's safe to sink loads up to the end of
  /// the sub-chain.  Discards any length-1 subchains.
  std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);

  /// Splits the chain into subchains that make legal, aligned accesses.
  /// Discards any length-1 subchains.
  std::vector<Chain> splitChainByAlignment(Chain &C);

  /// Converts the instrs in the chain into a single vectorized load or store.
  /// Adds the old scalar loads/stores to ToErase.
  bool vectorizeChain(Chain &C);

  /// Tries to compute the offset in bytes PtrB - PtrA.
  std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
                                         unsigned Depth = 0);
  std::optional<APInt> gtConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
                                                    unsigned Depth);
  std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
                                                unsigned Depth);

  /// Gets the element type of the vector that the chain will load or store.
  /// This is nontrivial because the chain may contain elements of different
  /// types; e.g. it's legal to have a chain that contains both i32 and float.
  Type *getChainElemTy(const Chain &C);

  /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
  /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
  /// instructions.
  ///
  /// The map ChainElemOffsets must contain all of the elements in
  /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
  /// address.  It's ok if it contains additional entries.
  template <bool IsLoadChain>
  bool isSafeToMove(
      Instruction *ChainElem, Instruction *ChainBegin,
      const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);

  /// Collects loads and stores grouped by "equivalence class", where:
  ///   - all elements in an eq class are a load or all are a store,
  ///   - they all load/store the same element size (it's OK to have e.g. i8 and
  ///     <4 x i8> in the same class, but not i32 and <4 x i8>), and
  ///   - they all have the same value for getUnderlyingObject().
  EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
                                                BasicBlock::iterator End);

  /// Partitions Instrs into "chains" where every instruction has a known
  /// constant offset from the first instr in the chain.
  ///
  /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
  /// in the chain is the leader, and an instr touches distance 0 from itself.
  std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
};

class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
    initializeLoadStoreVectorizerLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace

char LoadStoreVectorizerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                      "Vectorize load and Store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizerLegacyPass();
}

bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

  return Vectorizer(F, AA, AC, DT, SE, TTI).run();
}

PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return PreservedAnalyses::all();

  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);

  bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return Changed ? PA : PreservedAnalyses::all();
}

bool Vectorizer::run() {
  bool Changed = false;
  // Break up the BB if there are any instrs which aren't guaranteed to transfer
  // execution to their successor.
  //
  // Consider, for example:
  //
  //   def assert_arr_len(int n) { if (n < 2) exit(); }
  //
  //   load arr[0]
  //   call assert_array_len(arr.length)
  //   load arr[1]
  //
  // Even though assert_arr_len does not read or write any memory, we can't
  // speculate the second load before the call.  More info at
  // https://github.com/llvm/llvm-project/issues/52950.
  for (BasicBlock *BB : post_order(&F)) {
    // BB must at least have a terminator.
    assert(!BB->empty());

    SmallVector<BasicBlock::iterator, 8> Barriers;
    Barriers.push_back(BB->begin());
    for (Instruction &I : *BB)
      if (!isGuaranteedToTransferExecutionToSuccessor(&I))
        Barriers.push_back(I.getIterator());
    Barriers.push_back(BB->end());

    for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
         ++It)
      Changed |= runOnPseudoBB(*It, *std::next(It));

    for (Instruction *I : ToErase) {
      auto *PtrOperand = getLoadStorePointerOperand(I);
      if (I->use_empty())
        I->eraseFromParent();
      RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
    }
    ToErase.clear();
  }

  return Changed;
}

bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
                               BasicBlock::iterator End) {
  LLVM_DEBUG({
    dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
    if (End != Begin->getParent()->end())
      dbgs() << *End;
    else
      dbgs() << "<BB end>";
    dbgs() << ")\n";
  });

  bool Changed = false;
  for (const auto &[EqClassKey, EqClass] :
       collectEquivalenceClasses(Begin, End))
    Changed |= runOnEquivalenceClass(EqClassKey, EqClass);

  return Changed;
}

bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
                                       ArrayRef<Instruction *> EqClass) {
  bool Changed = false;

  LLVM_DEBUG({
    dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
           << " keyed on " << EqClassKey << ":\n";
    for (Instruction *I : EqClass)
      dbgs() << "  " << *I << "\n";
  });

  std::vector<Chain> Chains = gatherChains(EqClass);
  LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
                    << " nontrivial chains.\n";);
  for (Chain &C : Chains)
    Changed |= runOnChain(C);
  return Changed;
}

bool Vectorizer::runOnChain(Chain &C) {
  LLVM_DEBUG({
    dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
    dumpChain(C);
  });

  // Split up the chain into increasingly smaller chains, until we can finally
  // vectorize the chains.
  //
  // (Don't be scared by the depth of the loop nest here.  These operations are
  // all at worst O(n lg n) in the number of instructions, and splitting chains
  // doesn't change the number of instrs.  So the whole loop nest is O(n lg n).)
  bool Changed = false;
  for (auto &C : splitChainByMayAliasInstrs(C))
    for (auto &C : splitChainByContiguity(C))
      for (auto &C : splitChainByAlignment(C))
        Changed |= vectorizeChain(C);
  return Changed;
}

std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
  if (C.empty())
    return {};

  sortChainInBBOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
    dumpChain(C);
  });

  // We know that elements in the chain with nonverlapping offsets can't
  // alias, but AA may not be smart enough to figure this out.  Use a
  // hashtable.
  DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
  for (const auto &E : C)
    ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});

  // Loads get hoisted up to the first load in the chain.  Stores get sunk
  // down to the last store in the chain.  Our algorithm for loads is:
  //
  //  - Take the first element of the chain.  This is the start of a new chain.
  //  - Take the next element of `Chain` and check for may-alias instructions
  //    up to the start of NewChain.  If no may-alias instrs, add it to
  //    NewChain.  Otherwise, start a new NewChain.
  //
  // For stores it's the same except in the reverse direction.
  //
  // We expect IsLoad to be an std::bool_constant.
  auto Impl = [&](auto IsLoad) {
    // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
    auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
      if constexpr (IsLoad())
        return std::make_pair(C.begin(), C.end());
      else
        return std::make_pair(C.rbegin(), C.rend());
    }(IsLoad);
    assert(ChainBegin != ChainEnd);

    std::vector<Chain> Chains;
    SmallVector<ChainElem, 1> NewChain;
    NewChain.push_back(*ChainBegin);
    for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
      if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
                               ChainOffsets)) {
        LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
                          << *ChainIt->Inst << " into " << *ChainBegin->Inst
                          << "\n");
        NewChain.push_back(*ChainIt);
      } else {
        LLVM_DEBUG(
            dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
                   << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
        if (NewChain.size() > 1) {
          LLVM_DEBUG({
            dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
            dumpChain(NewChain);
          });
          Chains.push_back(std::move(NewChain));
        }

        // Start a new chain.
        NewChain = SmallVector<ChainElem, 1>({*ChainIt});
      }
    }
    if (NewChain.size() > 1) {
      LLVM_DEBUG({
        dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
        dumpChain(NewChain);
      });
      Chains.push_back(std::move(NewChain));
    }
    return Chains;
  };

  if (isa<LoadInst>(C[0].Inst))
    return Impl(/*IsLoad=*/std::bool_constant<true>());

  assert(isa<StoreInst>(C[0].Inst));
  return Impl(/*IsLoad=*/std::bool_constant<false>());
}

std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
  if (C.empty())
    return {};

  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByContiguity considering chain:\n";
    dumpChain(C);
  });

  std::vector<Chain> Ret;
  Ret.push_back({C.front()});

  for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
    // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
    auto &CurChain = Ret.back();
    const ChainElem &Prev = CurChain.back();
    unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
    assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
                              "collectEquivalenceClass");
    APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;

    // Add this instruction to the end of the current chain, or start a new one.
    bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
    LLVM_DEBUG(dbgs() << "LSV: Instructions are "
                      << (AreContiguous ? "" : "not ") << "contiguous: "
                      << *Prev.Inst << " (ends at offset " << PrevReadEnd
                      << ") -> " << *It->Inst << " (starts at offset "
                      << It->OffsetFromLeader << ")\n");
    if (AreContiguous)
      CurChain.push_back(*It);
    else
      Ret.push_back({*It});
  }

  // Filter out length-1 chains, these are uninteresting.
  llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
  return Ret;
}

Type *Vectorizer::getChainElemTy(const Chain &C) {
  assert(!C.empty());
  // The rules are:
  //  - If there are any pointer types in the chain, use an integer type.
  //  - Prefer an integer type if it appears in the chain.
  //  - Otherwise, use the first type in the chain.
  //
  // The rule about pointer types is a simplification when we merge e.g.  a load
  // of a ptr and a double.  There's no direct conversion from a ptr to a
  // double; it requires a ptrtoint followed by a bitcast.
  //
  // It's unclear to me if the other rules have any practical effect, but we do
  // it to match this pass's previous behavior.
  if (any_of(C, [](const ChainElem &E) {
        return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
      })) {
    return Type::getIntNTy(
        F.getContext(),
        DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
  }

  for (const ChainElem &E : C)
    if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
      return T;
  return getLoadStoreType(C[0].Inst)->getScalarType();
}

std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
  // We use a simple greedy algorithm.
  //  - Given a chain of length N, find all prefixes that
  //    (a) are not longer than the max register length, and
  //    (b) are a power of 2.
  //  - Starting from the longest prefix, try to create a vector of that length.
  //  - If one of them works, great.  Repeat the algorithm on any remaining
  //    elements in the chain.
  //  - If none of them work, discard the first element and repeat on a chain
  //    of length N-1.
  if (C.empty())
    return {};

  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByAlignment considering chain:\n";
    dumpChain(C);
  });

  bool IsLoadChain = isa<LoadInst>(C[0].Inst);
  auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
                             unsigned ChainSizeBytes, VectorType *VecTy) {
    return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
                                                 ChainSizeBytes, VecTy)
                       : TTI.getStoreVectorFactor(VF, LoadStoreSize,
                                                  ChainSizeBytes, VecTy);
  };

#ifndef NDEBUG
  for (const auto &E : C) {
    Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
    assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
           "Should have filtered out non-power-of-two elements in "
           "collectEquivalenceClasses.");
  }
#endif

  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
  unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;

  std::vector<Chain> Ret;
  for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
    // Find candidate chains of size not greater than the largest vector reg.
    // These chains are over the closed interval [CBegin, CEnd].
    SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
        CandidateChains;
    for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
      APInt Sz = C[CEnd].OffsetFromLeader +
                 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
                 C[CBegin].OffsetFromLeader;
      if (Sz.sgt(VecRegBytes))
        break;
      CandidateChains.push_back(
          {CEnd, static_cast<unsigned>(Sz.getLimitedValue())});
    }

    // Consider the longest chain first.
    for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
         It != End; ++It) {
      auto [CEnd, SizeBytes] = *It;
      LLVM_DEBUG(
          dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
                 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");

      Type *VecElemTy = getChainElemTy(C);
      // Note, VecElemTy is a power of 2, but might be less than one byte.  For
      // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
      // VecElemTy would be i4.
      unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);

      // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
      assert((8 * SizeBytes) % VecElemBits == 0);
      unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
      FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
      unsigned VF = 8 * VecRegBytes / VecElemBits;

      // Check that TTI is happy with this vectorization factor.
      unsigned TargetVF = getVectorFactor(VF, VecElemBits,
                                          VecElemBits * NumVecElems / 8, VecTy);
      if (TargetVF != VF && TargetVF < NumVecElems) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because TargetVF="
                   << TargetVF << " != VF=" << VF
                   << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
        continue;
      }

      // Is a load/store with this alignment allowed by TTI and at least as fast
      // as an unvectorized load/store?
      //
      // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
      auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
                               &F = F](Align Alignment) {
        if (Alignment.value() % SizeBytes == 0)
          return true;
        unsigned VectorizedSpeed = 0;
        bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
            F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
        if (!AllowsMisaligned) {
          LLVM_DEBUG(dbgs()
                     << "LSV: Access of " << SizeBytes << "B in addrspace "
                     << AS << " with alignment " << Alignment.value()
                     << " is misaligned, and therefore can't be vectorized.\n");
          return false;
        }

        unsigned ElementwiseSpeed = 0;
        (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
                                             Alignment, &ElementwiseSpeed);
        if (VectorizedSpeed < ElementwiseSpeed) {
          LLVM_DEBUG(dbgs()
                     << "LSV: Access of " << SizeBytes << "B in addrspace "
                     << AS << " with alignment " << Alignment.value()
                     << " has relative speed " << VectorizedSpeed
                     << ", which is lower than the elementwise speed of "
                     << ElementwiseSpeed
                     << ".  Therefore this access won't be vectorized.\n");
          return false;
        }
        return true;
      };

      // If we're loading/storing from an alloca, align it if possible.
      //
      // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
      // tells us this is beneficial.  This feels a bit odd, but it matches
      // existing tests.  This isn't *so* bad, because at most we align to 4
      // bytes (current value of StackAdjustedAlignment).
      //
      // FIXME: We will upgrade the alignment of the alloca even if it turns out
      // we can't vectorize for some other reason.
      Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
      if (AS == DL.getAllocaAddrSpace() && Alignment.value() % SizeBytes != 0 &&
          IsAllowedAndFast(Align(StackAdjustedAlignment))) {
        Align NewAlign = getOrEnforceKnownAlignment(
            getLoadStorePointerOperand(C[CBegin].Inst),
            Align(StackAdjustedAlignment), DL, C[CBegin].Inst, nullptr, &DT);
        if (NewAlign >= Alignment) {
          LLVM_DEBUG(dbgs()
                     << "LSV: splitByChain upgrading alloca alignment from "
                     << Alignment.value() << " to " << NewAlign.value()
                     << "\n");
          Alignment = NewAlign;
        }
      }

      if (!IsAllowedAndFast(Alignment)) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because its alignment is not AllowedAndFast: "
                   << Alignment.value() << "\n");
        continue;
      }

      if ((IsLoadChain &&
           !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
          (!IsLoadChain &&
           !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because !isLegalToVectorizeLoad/StoreChain.");
        continue;
      }

      // Hooray, we can vectorize this chain!
      Chain &NewChain = Ret.emplace_back();
      for (unsigned I = CBegin; I <= CEnd; ++I)
        NewChain.push_back(C[I]);
      CBegin = CEnd; // Skip over the instructions we've added to the chain.
      break;
    }
  }
  return Ret;
}

bool Vectorizer::vectorizeChain(Chain &C) {
  if (C.size() < 2)
    return false;

  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
    dumpChain(C);
  });

  Type *VecElemTy = getChainElemTy(C);
  bool IsLoadChain = isa<LoadInst>(C[0].Inst);
  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
  unsigned ChainBytes = std::accumulate(
      C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
        return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
      });
  assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
  // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
  // than 1 byte (e.g. VecTy == <32 x i1>).
  Type *VecTy = FixedVectorType::get(
      VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));

  Align Alignment = getLoadStoreAlignment(C[0].Inst);
  // If this is a load/store of an alloca, we might have upgraded the alloca's
  // alignment earlier.  Get the new alignment.
  if (AS == DL.getAllocaAddrSpace()) {
    Alignment = std::max(
        Alignment,
        getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
                                   MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
  }

  // All elements of the chain must have the same scalar-type size.
#ifndef NDEBUG
  for (const ChainElem &E : C)
    assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
           DL.getTypeStoreSize(VecElemTy));
#endif

  Instruction *VecInst;
  if (IsLoadChain) {
    // Loads get hoisted to the location of the first load in the chain.  We may
    // also need to hoist the (transitive) operands of the loads.
    Builder.SetInsertPoint(
        std::min_element(C.begin(), C.end(), [](const auto &A, const auto &B) {
          return A.Inst->comesBefore(B.Inst);
        })->Inst);

    // Chain is in offset order, so C[0] is the instr with the lowest offset,
    // i.e. the root of the vector.
    Value *Bitcast = Builder.CreateBitCast(
        getLoadStorePointerOperand(C[0].Inst), VecTy->getPointerTo(AS));
    VecInst = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);

    unsigned VecIdx = 0;
    for (const ChainElem &E : C) {
      Instruction *I = E.Inst;
      Value *V;
      Type *T = getLoadStoreType(I);
      if (auto *VT = dyn_cast<FixedVectorType>(T)) {
        auto Mask = llvm::to_vector<8>(
            llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
        V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
        VecIdx += VT->getNumElements();
      } else {
        V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
                                         I->getName());
        ++VecIdx;
      }
      if (V->getType() != I->getType())
        V = Builder.CreateBitOrPointerCast(V, I->getType());
      I->replaceAllUsesWith(V);
    }

    // Finally, we need to reorder the instrs in the BB so that the (transitive)
    // operands of VecInst appear before it.  To see why, suppose we have
    // vectorized the following code:
    //
    //   ptr1  = gep a, 1
    //   load1 = load i32 ptr1
    //   ptr0  = gep a, 0
    //   load0 = load i32 ptr0
    //
    // We will put the vectorized load at the location of the earliest load in
    // the BB, i.e. load1.  We get:
    //
    //   ptr1  = gep a, 1
    //   loadv = load <2 x i32> ptr0
    //   load0 = extractelement loadv, 0
    //   load1 = extractelement loadv, 1
    //   ptr0 = gep a, 0
    //
    // Notice that loadv uses ptr0, which is defined *after* it!
    reorder(VecInst);
  } else {
    // Stores get sunk to the location of the last store in the chain.
    Builder.SetInsertPoint(
        std::max_element(C.begin(), C.end(), [](auto &A, auto &B) {
          return A.Inst->comesBefore(B.Inst);
        })->Inst);

    // Build the vector to store.
    Value *Vec = PoisonValue::get(VecTy);
    unsigned VecIdx = 0;
    auto InsertElem = [&](Value *V) {
      if (V->getType() != VecElemTy)
        V = Builder.CreateBitOrPointerCast(V, VecElemTy);
      Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
    };
    for (const ChainElem &E : C) {
      auto I = cast<StoreInst>(E.Inst);
      if (FixedVectorType *VT =
              dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
        for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
          InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
                                                  Builder.getInt32(J)));
        }
      } else {
        InsertElem(I->getValueOperand());
      }
    }

    // Chain is in offset order, so C[0] is the instr with the lowest offset,
    // i.e. the root of the vector.
    VecInst = Builder.CreateAlignedStore(
        Vec,
        Builder.CreateBitCast(getLoadStorePointerOperand(C[0].Inst),
                              VecTy->getPointerTo(AS)),
        Alignment);
  }

  propagateMetadata(VecInst, C);

  for (const ChainElem &E : C)
    ToErase.push_back(E.Inst);

  ++NumVectorInstructions;
  NumScalarsVectorized += C.size();
  return true;
}

template <bool IsLoadChain>
bool Vectorizer::isSafeToMove(
    Instruction *ChainElem, Instruction *ChainBegin,
    const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
  LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
                    << *ChainBegin << ")\n");

  assert(isa<LoadInst>(ChainElem) == IsLoadChain);
  if (ChainElem == ChainBegin)
    return true;

  // Invariant loads can always be reordered; by definition they are not
  // clobbered by stores.
  if (isInvariantLoad(ChainElem))
    return true;

  auto BBIt = std::next([&] {
    if constexpr (IsLoadChain)
      return BasicBlock::reverse_iterator(ChainElem);
    else
      return BasicBlock::iterator(ChainElem);
  }());
  auto BBItEnd = std::next([&] {
    if constexpr (IsLoadChain)
      return BasicBlock::reverse_iterator(ChainBegin);
    else
      return BasicBlock::iterator(ChainBegin);
  }());

  const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
  const unsigned ChainElemSize =
      DL.getTypeStoreSize(getLoadStoreType(ChainElem));

  for (; BBIt != BBItEnd; ++BBIt) {
    Instruction *I = &*BBIt;

    if (!I->mayReadOrWriteMemory())
      continue;

    // Loads can be reordered with other loads.
    if (IsLoadChain && isa<LoadInst>(I))
      continue;

    // Stores can be sunk below invariant loads.
    if (!IsLoadChain && isInvariantLoad(I))
      continue;

    // If I is in the chain, we can tell whether it aliases ChainIt by checking
    // what offset ChainIt accesses.  This may be better than AA is able to do.
    //
    // We should really only have duplicate offsets for stores (the duplicate
    // loads should be CSE'ed), but in case we have a duplicate load, we'll
    // split the chain so we don't have to handle this case specially.
    if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
      // I and ChainElem overlap if:
      //   - I and ChainElem have the same offset, OR
      //   - I's offset is less than ChainElem's, but I touches past the
      //     beginning of ChainElem, OR
      //   - ChainElem's offset is less than I's, but ChainElem touches past the
      //     beginning of I.
      const APInt &IOffset = OffsetIt->second;
      unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
      if (IOffset == ChainElemOffset ||
          (IOffset.sle(ChainElemOffset) &&
           (IOffset + IElemSize).sgt(ChainElemOffset)) ||
          (ChainElemOffset.sle(IOffset) &&
           (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
        LLVM_DEBUG({
          // Double check that AA also sees this alias.  If not, we probably
          // have a bug.
          ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
          assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
          dbgs() << "LSV: Found alias in chain: " << *I << "\n";
        });
        return false; // We found an aliasing instruction; bail.
      }

      continue; // We're confident there's no alias.
    }

    LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
    ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
    if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
      LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
                        << "  Aliasing instruction:\n"
                        << "    " << *I << '\n'
                        << "  Aliased instruction and pointer:\n"
                        << "    " << *ChainElem << '\n'
                        << "    " << *getLoadStorePointerOperand(ChainElem)
                        << '\n');

      return false;
    }
  }
  return true;
}

static bool checkNoWrapFlags(Instruction *I, bool Signed) {
  BinaryOperator *BinOpI = cast<BinaryOperator>(I);
  return (Signed && BinOpI->hasNoSignedWrap()) ||
         (!Signed && BinOpI->hasNoUnsignedWrap());
}

static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
                                   unsigned MatchingOpIdxA, Instruction *AddOpB,
                                   unsigned MatchingOpIdxB, bool Signed) {
  LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
                    << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
                    << MatchingOpIdxA << ", AddOpB=" << *AddOpB
                    << ", MatchingOpIdxB=" << MatchingOpIdxB
                    << ", Signed=" << Signed << "\n");
  // If both OpA and OpB are adds with NSW/NUW and with one of the operands
  // being the same, we can guarantee that the transformation is safe if we can
  // prove that OpA won't overflow when Ret added to the other operand of OpA.
  // For example:
  //  %tmp7 = add nsw i32 %tmp2, %v0
  //  %tmp8 = sext i32 %tmp7 to i64
  //  ...
  //  %tmp11 = add nsw i32 %v0, 1
  //  %tmp12 = add nsw i32 %tmp2, %tmp11
  //  %tmp13 = sext i32 %tmp12 to i64
  //
  //  Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
  //  It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
  //  1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
  assert(AddOpA->getOpcode() == Instruction::Add &&
         AddOpB->getOpcode() == Instruction::Add &&
         checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
  if (AddOpA->getOperand(MatchingOpIdxA) ==
      AddOpB->getOperand(MatchingOpIdxB)) {
    Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
    Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
    Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
    Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
    // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
    if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrB->getOperand(0) == OtherOperandA &&
          IdxDiff.getSExtValue() == CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
    if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherOperandB &&
          IdxDiff.getSExtValue() == -CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw c)` and
    // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
    if (OtherInstrA && OtherInstrB &&
        OtherInstrA->getOpcode() == Instruction::Add &&
        OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstValA =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      int64_t CstValB =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
          IdxDiff.getSExtValue() == (CstValB - CstValA))
        return true;
    }
  }
  return false;
}

std::optional<APInt> Vectorizer::gtConstantOffsetComplexAddrs(Value *PtrA,
                                                              Value *PtrB,
                                                              unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: gtConstantOffsetComplexAddrs PtrA=" << *PtrA
                    << " PtrB=" << *PtrB << " Depth=" << Depth << "\n");
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return getConstantOffsetSelects(PtrA, PtrB, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return std::nullopt;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return std::nullopt;
    ++GTIA;
    ++GTIB;
  }

  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return std::nullopt;

  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return std::nullopt;

  bool Signed = isa<SExtInst>(OpA);

  // At this point A could be a function parameter, i.e. not an instruction
  Value *ValA = OpA->getOperand(0);
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpB || ValA->getType() != OpB->getType())
    return std::nullopt;

  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
  if (IdxDiffSCEV == SE.getCouldNotCompute())
    return std::nullopt;

  ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
  if (!IdxDiffRange.isSingleElement())
    return std::nullopt;
  APInt IdxDiff = *IdxDiffRange.getSingleElement();

  LLVM_DEBUG(dbgs() << "LSV: gtConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
                    << "\n");

  // Now we need to prove that adding IdxDiff to ValA won't overflow.
  bool Safe = false;

  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
  // ValA, we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
      checkNoWrapFlags(OpB, Signed))
    Safe = true;

  // Second attempt: check if we have eligible add NSW/NUW instruction
  // sequences.
  OpA = dyn_cast<Instruction>(ValA);
  if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
      OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
      checkNoWrapFlags(OpB, Signed)) {
    // In the checks below a matching operand in OpA and OpB is an operand which
    // is the same in those two instructions.  Below we account for possible
    // orders of the operands of these add instructions.
    for (unsigned MatchingOpIdxA : {0, 1})
      for (unsigned MatchingOpIdxB : {0, 1})
        if (!Safe)
          Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
                                        MatchingOpIdxB, Signed);
  }

  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

  // Third attempt:
  //
  // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
  // order bit other than the sign bit are known to be zero in ValA, we can add
  // Diff to it while guaranteeing no overflow of any sort.
  //
  // If IdxDiff is negative, do the same, but swap ValA and ValB.
  if (!Safe) {
    // When computing known bits, use the GEPs as context instructions, since
    // they likely are in the same BB as the load/store.
    Instruction *ContextInst = GEPA->comesBefore(GEPB) ? GEPB : GEPA;
    KnownBits Known(BitWidth);
    computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,
                     ContextInst, &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
      return std::nullopt;
    Safe = true;
  }

  if (Safe)
    return IdxDiff * Stride;
  return std::nullopt;
}

std::optional<APInt>
Vectorizer::getConstantOffsetSelects(Value *PtrA, Value *PtrB, unsigned Depth) {
  if (Depth++ == MaxDepth)
    return std::nullopt;

  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
      if (SelectA->getCondition() != SelectB->getCondition())
        return std::nullopt;
      LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
                        << ", PtrB=" << *PtrB << ", Depth=" << Depth << "\n");
      std::optional<APInt> TrueDiff = getConstantOffset(
          SelectA->getTrueValue(), SelectB->getTrueValue(), Depth);
      if (!TrueDiff.has_value())
        return std::nullopt;
      std::optional<APInt> FalseDiff = getConstantOffset(
          SelectA->getFalseValue(), SelectB->getFalseValue(), Depth);
      if (TrueDiff == FalseDiff)
        return TrueDiff;
    }
  }
  return std::nullopt;
}

EquivalenceClassMap
Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
                                      BasicBlock::iterator End) {
  EquivalenceClassMap Ret;

  auto getUnderlyingObject = [](const Value *Ptr) -> const Value * {
    const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
    if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
      // The select's themselves are distinct instructions even if they share
      // the same condition and evaluate to consecutive pointers for true and
      // false values of the condition. Therefore using the select's themselves
      // for grouping instructions would put consecutive accesses into different
      // lists and they won't be even checked for being consecutive, and won't
      // be vectorized.
      return Sel->getCondition();
    }
    return ObjPtr;
  };

  for (Instruction &I : make_range(Begin, End)) {
    auto *LI = dyn_cast<LoadInst>(&I);
    auto *SI = dyn_cast<StoreInst>(&I);
    if (!LI && !SI)
      continue;

    if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
      continue;

    if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
        (SI && !TTI.isLegalToVectorizeStore(SI)))
      continue;

    Type *Ty = getLoadStoreType(&I);
    if (!VectorType::isValidElementType(Ty->getScalarType()))
      continue;

    // Skip weird non-byte sizes. They probably aren't worth the effort of
    // handling correctly.
    unsigned TySize = DL.getTypeSizeInBits(Ty);
    if ((TySize % 8) != 0)
      continue;

    // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
    // functions are currently using an integer type for the vectorized
    // load/store, and does not support casting between the integer type and a
    // vector of pointers (e.g. i64 to <2 x i16*>)
    if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
      continue;

    Value *Ptr = getLoadStorePointerOperand(&I);
    unsigned AS = Ptr->getType()->getPointerAddressSpace();
    unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

    unsigned VF = VecRegSize / TySize;
    VectorType *VecTy = dyn_cast<VectorType>(Ty);

    // Only handle power-of-two sized elements.
    if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
        (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
      continue;

    // No point in looking at these if they're too big to vectorize.
    if (TySize > VecRegSize / 2 ||
        (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
      continue;

    Ret[{getUnderlyingObject(Ptr), AS,
         DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
         /*IsLoad=*/LI != nullptr}]
        .push_back(&I);
  }

  return Ret;
}

std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
  if (Instrs.empty())
    return {};

  unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
  unsigned ASPtrBits = DL.getIndexSizeInBits(AS);

#ifndef NDEBUG
  // Check that Instrs is in BB order and all have the same addr space.
  for (size_t I = 1; I < Instrs.size(); ++I) {
    assert(Instrs[I - 1]->comesBefore(Instrs[I]));
    assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
  }
#endif

  // Machinery to build an MRU-hashtable of Chains.
  //
  // (Ideally this could be done with MapVector, but as currently implemented,
  // moving an element to the front of a MapVector is O(n).)
  struct InstrListElem : ilist_node<InstrListElem>,
                         std::pair<Instruction *, Chain> {
    explicit InstrListElem(Instruction *I)
        : std::pair<Instruction *, Chain>(I, {}) {}
  };
  struct InstrListElemDenseMapInfo {
    using PtrInfo = DenseMapInfo<InstrListElem *>;
    using IInfo = DenseMapInfo<Instruction *>;
    static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
    static InstrListElem *getTombstoneKey() {
      return PtrInfo::getTombstoneKey();
    }
    static unsigned getHashValue(const InstrListElem *E) {
      return IInfo::getHashValue(E->first);
    }
    static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
      if (A == getEmptyKey() || B == getEmptyKey())
        return A == getEmptyKey() && B == getEmptyKey();
      if (A == getTombstoneKey() || B == getTombstoneKey())
        return A == getTombstoneKey() && B == getTombstoneKey();
      return IInfo::isEqual(A->first, B->first);
    }
  };
  SpecificBumpPtrAllocator<InstrListElem> Allocator;
  simple_ilist<InstrListElem> MRU;
  DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;

  // Compare each instruction in `instrs` to leader of the N most recently-used
  // chains.  This limits the O(n^2) behavior of this pass while also allowing
  // us to build arbitrarily long chains.
  for (Instruction *I : Instrs) {
    constexpr int MaxChainsToTry = 64;

    bool MatchFound = false;
    auto ChainIter = MRU.begin();
    for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
         ++J, ++ChainIter) {
      std::optional<APInt> Offset =
          getConstantOffset(getLoadStorePointerOperand(ChainIter->first),
                            getLoadStorePointerOperand(I));
      if (Offset.has_value()) {
        // `Offset` might not have the expected number of bits, if e.g. AS has a
        // different number of bits than opaque pointers.
        ChainIter->second.push_back(
            ChainElem{I, Offset.value().sextOrTrunc(ASPtrBits)});
        // Move ChainIter to the front of the MRU list.
        MRU.remove(*ChainIter);
        MRU.push_front(*ChainIter);
        MatchFound = true;
        break;
      }
    }

    if (!MatchFound) {
      APInt ZeroOffset(ASPtrBits, 0);
      InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
      E->second.push_back(ChainElem{I, ZeroOffset});
      MRU.push_front(*E);
      Chains.insert(E);
    }
  }

  std::vector<Chain> Ret;
  Ret.reserve(Chains.size());
  // Iterate over MRU rather than Chains so the order is deterministic.
  for (auto &E : MRU)
    if (E.second.size() > 1)
      Ret.push_back(std::move(E.second));
  return Ret;
}

std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
                                                   unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
                    << ", PtrB=" << *PtrB << ", Depth=" << Depth << "\n");
  unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
  APInt OffsetA(OffsetBitWidth, 0);
  APInt OffsetB(OffsetBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return std::nullopt;

  // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
  // should properly handle a possible overflow and the value should fit into
  // the smallest data type used in the cast/gep chain.
  assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
         OffsetB.getSignificantBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
  if (PtrA == PtrB)
    return OffsetB - OffsetA;

  // Try to compute B - A.
  const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (DistScev != SE.getCouldNotCompute()) {
    LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
    ConstantRange DistRange = SE.getSignedRange(DistScev);
    if (DistRange.isSingleElement())
      return OffsetB - OffsetA + *DistRange.getSingleElement();
  }
  std::optional<APInt> Diff = gtConstantOffsetComplexAddrs(PtrA, PtrB, Depth);
  if (Diff.has_value())
    return OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth());
  return std::nullopt;
}