//===- RISCVGatherScatterLowering.cpp - Gather/Scatter lowering -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass custom lowers llvm.masked.gather and llvm.masked.scatter (and
// their llvm.vp.gather/llvm.vp.scatter counterparts) to the
// llvm.experimental.vp.strided.load/store intrinsics when the vector of
// pointers can be shown to have a constant stride.
//
//===----------------------------------------------------------------------===//
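//
// A rough sketch of the rewrite this pass performs (illustrative IR only, not
// taken from an actual test):
//
//   %ptrs = getelementptr i32, ptr %base,
//                         <4 x i64> <i64 0, i64 2, i64 4, i64 6>
//   %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(
//            <4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> poison)
//
// becomes, roughly, a strided load of four elements spaced 8 bytes apart:
//
//   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
//            ptr %base, i64 8, <4 x i1> splat (i1 true), i32 4)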

#include "RISCV.h"
#include "RISCVTargetMachine.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/Local.h"
#include <optional>

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "riscv-gather-scatter-lowering"

namespace {

class RISCVGatherScatterLowering : public FunctionPass {
  const RISCVSubtarget *ST = nullptr;
  const RISCVTargetLowering *TLI = nullptr;
  LoopInfo *LI = nullptr;
  const DataLayout *DL = nullptr;

  SmallVector<WeakTrackingVH> MaybeDeadPHIs;

  // Cache of the BasePtr and Stride determined from this GEP. When a GEP is
  // used by multiple gathers/scatters, this allows us to reuse the scalar
  // instructions we created for the first gather/scatter for the others.
  DenseMap<GetElementPtrInst *, std::pair<Value *, Value *>> StridedAddrs;

public:
  static char ID; // Pass identification, replacement for typeid

  RISCVGatherScatterLowering() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<LoopInfoWrapperPass>();
  }

  StringRef getPassName() const override {
    return "RISC-V gather/scatter lowering";
  }

private:
  bool tryCreateStridedLoadStore(IntrinsicInst *II);

  std::pair<Value *, Value *> determineBaseAndStride(Instruction *Ptr,
                                                     IRBuilderBase &Builder);

  bool matchStridedRecurrence(Value *Index, Loop *L, Value *&Stride,
                              PHINode *&BasePtr, BinaryOperator *&Inc,
                              IRBuilderBase &Builder);
};

} // end anonymous namespace

char RISCVGatherScatterLowering::ID = 0;

INITIALIZE_PASS(RISCVGatherScatterLowering, DEBUG_TYPE,
                "RISC-V gather/scatter lowering pass", false, false)

FunctionPass *llvm::createRISCVGatherScatterLoweringPass() {
  return new RISCVGatherScatterLowering();
}

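// matchStridedConstant returns the (start, stride) pair of a constant vector
// whose elements form an arithmetic sequence. Illustrative example (not from
// the source): <4 x i64> <i64 10, i64 12, i64 14, i64 16> yields start 10 and
// stride 2, while <4 x i64> <i64 0, i64 1, i64 3, i64 6> fails because the
// element-to-element difference is not uniform.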
// TODO: Should we consider the mask when looking for a stride?
static std::pair<Value *, Value *> matchStridedConstant(Constant *StartC) {
  if (!isa<FixedVectorType>(StartC->getType()))
    return std::make_pair(nullptr, nullptr);

  unsigned NumElts = cast<FixedVectorType>(StartC->getType())->getNumElements();

  // Check that the start value is a strided constant.
  auto *StartVal =
      dyn_cast_or_null<ConstantInt>(StartC->getAggregateElement((unsigned)0));
  if (!StartVal)
    return std::make_pair(nullptr, nullptr);
  APInt StrideVal(StartVal->getValue().getBitWidth(), 0);
  ConstantInt *Prev = StartVal;
  for (unsigned i = 1; i != NumElts; ++i) {
    auto *C = dyn_cast_or_null<ConstantInt>(StartC->getAggregateElement(i));
    if (!C)
      return std::make_pair(nullptr, nullptr);

    APInt LocalStride = C->getValue() - Prev->getValue();
    if (i == 1)
      StrideVal = LocalStride;
    else if (StrideVal != LocalStride)
      return std::make_pair(nullptr, nullptr);

    Prev = C;
  }

  Value *Stride = ConstantInt::get(StartVal->getType(), StrideVal);

  return std::make_pair(StartVal, Stride);
}

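// matchStridedStart peels splatted add/or/mul/shl layers off a start value
// until it reaches a strided constant or a stepvector. Illustrative example
// (not from the source): for
//   %step = call <4 x i64> @llvm.stepvector.v4i64()
//   %idx  = shl <4 x i64> %step, splat (i64 1)
// the stepvector base case gives (start 0, stride 1), and the shl layer shifts
// both, so the overall result is (start 0, stride 2).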
static std::pair<Value *, Value *> matchStridedStart(Value *Start,
                                                     IRBuilderBase &Builder) {
  // Base case, start is a strided constant.
  auto *StartC = dyn_cast<Constant>(Start);
  if (StartC)
    return matchStridedConstant(StartC);

  // Base case, start is a stepvector.
  if (match(Start, m_Intrinsic<Intrinsic::stepvector>())) {
    auto *Ty = Start->getType()->getScalarType();
    return std::make_pair(ConstantInt::get(Ty, 0), ConstantInt::get(Ty, 1));
  }

  // Not a constant or stepvector; maybe it's a strided start with a splat
  // added or multiplied.
  auto *BO = dyn_cast<BinaryOperator>(Start);
  if (!BO || (BO->getOpcode() != Instruction::Add &&
              BO->getOpcode() != Instruction::Or &&
              BO->getOpcode() != Instruction::Shl &&
              BO->getOpcode() != Instruction::Mul))
    return std::make_pair(nullptr, nullptr);

  if (BO->getOpcode() == Instruction::Or &&
      !cast<PossiblyDisjointInst>(BO)->isDisjoint())
    return std::make_pair(nullptr, nullptr);

  // Look for an operand that is splatted.
  unsigned OtherIndex = 0;
  Value *Splat = getSplatValue(BO->getOperand(1));
  if (!Splat && Instruction::isCommutative(BO->getOpcode())) {
    Splat = getSplatValue(BO->getOperand(0));
    OtherIndex = 1;
  }
  if (!Splat)
    return std::make_pair(nullptr, nullptr);

  Value *Stride;
  std::tie(Start, Stride) = matchStridedStart(BO->getOperand(OtherIndex),
                                              Builder);
  if (!Start)
    return std::make_pair(nullptr, nullptr);

  Builder.SetInsertPoint(BO);
  Builder.SetCurrentDebugLocation(DebugLoc());
  // Add the splat value to the start or multiply the start and stride by the
  // splat.
  switch (BO->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode");
  case Instruction::Or:
    // TODO: We'd be better off creating disjoint or here, but we don't yet
    // have an IRBuilder API for that.
    [[fallthrough]];
  case Instruction::Add:
    Start = Builder.CreateAdd(Start, Splat);
    break;
  case Instruction::Mul:
    Start = Builder.CreateMul(Start, Splat);
    Stride = Builder.CreateMul(Stride, Splat);
    break;
  case Instruction::Shl:
    Start = Builder.CreateShl(Start, Splat);
    Stride = Builder.CreateShl(Stride, Splat);
    break;
  }

  return std::make_pair(Start, Stride);
}

// Recursively walk back up the use-def chain until we find a Phi with a
// strided start value. Build and update a scalar recurrence as we unwind the
// recursion. We also update the Stride as we unwind. Our goal is to move all
// of the arithmetic out of the loop.
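//
// Illustrative example (not from the source): given the vector recurrence
//   %vec.iv  = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %ph ],
//                            [ %vec.inc, %loop ]
//   %vec.inc = add <4 x i64> %vec.iv, splat (i64 4)
// we build the scalar recurrence
//   %iv.scalar  = phi i64 [ 0, %ph ], [ %inc.scalar, %loop ]
//   %inc.scalar = add i64 %iv.scalar, 4
// and report a stride of 1.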
bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L,
                                                        Value *&Stride,
                                                        PHINode *&BasePtr,
                                                        BinaryOperator *&Inc,
                                                        IRBuilderBase &Builder) {
  // Our base case is a Phi.
  if (auto *Phi = dyn_cast<PHINode>(Index)) {
    // A phi node we want to perform this function on should be from the
    // loop header.
    if (Phi->getParent() != L->getHeader())
      return false;

    Value *Step, *Start;
    if (!matchSimpleRecurrence(Phi, Inc, Start, Step) ||
        Inc->getOpcode() != Instruction::Add)
      return false;
    assert(Phi->getNumIncomingValues() == 2 && "Expected 2 operand phi.");
    unsigned IncrementingBlock = Phi->getIncomingValue(0) == Inc ? 0 : 1;
    assert(Phi->getIncomingValue(IncrementingBlock) == Inc &&
           "Expected one operand of phi to be Inc");

    // Step should be a splat.
    Step = getSplatValue(Step);
    if (!Step)
      return false;

    std::tie(Start, Stride) = matchStridedStart(Start, Builder);
    if (!Start)
      return false;
    assert(Stride != nullptr);

    // Build scalar phi and increment.
    BasePtr = PHINode::Create(Start->getType(), 2, Phi->getName() + ".scalar",
                              Phi->getIterator());
    Inc = BinaryOperator::CreateAdd(BasePtr, Step, Inc->getName() + ".scalar",
                                    Inc->getIterator());
    BasePtr->addIncoming(Start, Phi->getIncomingBlock(1 - IncrementingBlock));
    BasePtr->addIncoming(Inc, Phi->getIncomingBlock(IncrementingBlock));

    // Note that this Phi might be eligible for removal.
    MaybeDeadPHIs.push_back(Phi);
    return true;
  }

  // Otherwise look for binary operator.
  auto *BO = dyn_cast<BinaryOperator>(Index);
  if (!BO)
    return false;

  switch (BO->getOpcode()) {
  default:
    return false;
  case Instruction::Or:
    // We need to be able to treat Or as Add.
    if (!cast<PossiblyDisjointInst>(BO)->isDisjoint())
      return false;
    break;
  case Instruction::Add:
  case Instruction::Shl:
  case Instruction::Mul:
    break;
  }

  // We should have one operand in the loop and one splat.
  Value *OtherOp;
  if (isa<Instruction>(BO->getOperand(0)) &&
      L->contains(cast<Instruction>(BO->getOperand(0)))) {
    Index = cast<Instruction>(BO->getOperand(0));
    OtherOp = BO->getOperand(1);
  } else if (isa<Instruction>(BO->getOperand(1)) &&
             L->contains(cast<Instruction>(BO->getOperand(1))) &&
             Instruction::isCommutative(BO->getOpcode())) {
    Index = cast<Instruction>(BO->getOperand(1));
    OtherOp = BO->getOperand(0);
  } else {
    return false;
  }

  // Make sure other op is loop invariant.
  if (!L->isLoopInvariant(OtherOp))
    return false;

  // Make sure we have a splat.
  Value *SplatOp = getSplatValue(OtherOp);
  if (!SplatOp)
    return false;

  // Recurse up the use-def chain.
  if (!matchStridedRecurrence(Index, L, Stride, BasePtr, Inc, Builder))
    return false;

  // Locate the Step and Start values from the recurrence.
  unsigned StepIndex = Inc->getOperand(0) == BasePtr ? 1 : 0;
  unsigned StartBlock = BasePtr->getOperand(0) == Inc ? 1 : 0;
  Value *Step = Inc->getOperand(StepIndex);
  Value *Start = BasePtr->getOperand(StartBlock);

  // We need to adjust the start value in the preheader.
  Builder.SetInsertPoint(
      BasePtr->getIncomingBlock(StartBlock)->getTerminator());
  Builder.SetCurrentDebugLocation(DebugLoc());

  // TODO: Share this switch with matchStridedStart?
  switch (BO->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case Instruction::Add:
  case Instruction::Or: {
    // An add only affects the start value. It's ok to do this for Or because
    // we already checked that there are no common set bits.
    Start = Builder.CreateAdd(Start, SplatOp, "start");
    break;
  }
  case Instruction::Mul: {
    Start = Builder.CreateMul(Start, SplatOp, "start");
    Stride = Builder.CreateMul(Stride, SplatOp, "stride");
    break;
  }
  case Instruction::Shl: {
    Start = Builder.CreateShl(Start, SplatOp, "start");
    Stride = Builder.CreateShl(Stride, SplatOp, "stride");
    break;
  }
  }
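
  // Illustrative effect so far (not from the source): if Index is
  // mul(%rec, splat 3) and %rec has start S and step T, the scalar recurrence
  // now starts at 3*S (computed above in the preheader), Stride has been
  // tripled, and the step is adjusted to 3*T below.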

  // If the Step was defined inside the loop, adjust it before its definition
  // instead of in the preheader.
  if (auto *StepI = dyn_cast<Instruction>(Step); StepI && L->contains(StepI))
    Builder.SetInsertPoint(*StepI->getInsertionPointAfterDef());

  switch (BO->getOpcode()) {
  default:
    break;
  case Instruction::Mul:
    Step = Builder.CreateMul(Step, SplatOp, "step");
    break;
  case Instruction::Shl:
    Step = Builder.CreateShl(Step, SplatOp, "step");
    break;
  }

  Inc->setOperand(StepIndex, Step);
  BasePtr->setIncomingValue(StartBlock, Start);
  return true;
}

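// determineBaseAndStride tries to express the pointer operand of a
// gather/scatter as a (scalar base pointer, byte stride) pair. Illustrative
// example (not from the source):
//   %ptrs = getelementptr i64, ptr %p, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
// yields base %p and stride 8: the element stride of 1 recovered from the
// constant index is scaled by the 8-byte size of i64.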
std::pair<Value *, Value *>
RISCVGatherScatterLowering::determineBaseAndStride(Instruction *Ptr,
                                                   IRBuilderBase &Builder) {

  // A gather/scatter of a splat is a zero strided load/store.
  if (auto *BasePtr = getSplatValue(Ptr)) {
    Type *IntPtrTy = DL->getIntPtrType(BasePtr->getType());
    return std::make_pair(BasePtr, ConstantInt::get(IntPtrTy, 0));
  }

  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
  if (!GEP)
    return std::make_pair(nullptr, nullptr);

  auto I = StridedAddrs.find(GEP);
  if (I != StridedAddrs.end())
    return I->second;

  SmallVector<Value *, 2> Ops(GEP->operands());

  // If the base pointer is a vector, check if it's strided.
  Value *Base = GEP->getPointerOperand();
  if (auto *BaseInst = dyn_cast<Instruction>(Base);
      BaseInst && BaseInst->getType()->isVectorTy()) {
    // If all of the GEP's indices are scalar, we can fold them into the base
    // pointer's base.
    auto IsScalar = [](Value *Idx) { return !Idx->getType()->isVectorTy(); };
    if (all_of(GEP->indices(), IsScalar)) {
      auto [BaseBase, Stride] = determineBaseAndStride(BaseInst, Builder);
      if (BaseBase) {
        Builder.SetInsertPoint(GEP);
        SmallVector<Value *> Indices(GEP->indices());
        Value *OffsetBase =
            Builder.CreateGEP(GEP->getSourceElementType(), BaseBase, Indices,
                              GEP->getName() + "offset", GEP->isInBounds());
        return {OffsetBase, Stride};
      }
    }
  }
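
  // Illustrative example of the vector-base case above (not from the source):
  //   %vbase = getelementptr i32, ptr %p, <4 x i64> <i64 0, i64 2, i64 4, i64 6>
  //   %ptrs  = getelementptr i32, <4 x ptr> %vbase, i64 1
  // recurses on %vbase (base %p, byte stride 8) and folds the scalar index
  // into a scalar GEP on that base, keeping the same stride.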

  // Base pointer needs to be a scalar.
  Value *ScalarBase = Base;
  if (ScalarBase->getType()->isVectorTy()) {
    ScalarBase = getSplatValue(ScalarBase);
    if (!ScalarBase)
      return std::make_pair(nullptr, nullptr);
  }

  std::optional<unsigned> VecOperand;
  unsigned TypeScale = 0;

  // Look for a vector operand and scale.
  gep_type_iterator GTI = gep_type_begin(GEP);
  for (unsigned i = 1, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
    if (!Ops[i]->getType()->isVectorTy())
      continue;

    if (VecOperand)
      return std::make_pair(nullptr, nullptr);

    VecOperand = i;

    TypeSize TS = GTI.getSequentialElementStride(*DL);
    if (TS.isScalable())
      return std::make_pair(nullptr, nullptr);

    TypeScale = TS.getFixedValue();
  }

  // We need to find a vector index to simplify.
  if (!VecOperand)
    return std::make_pair(nullptr, nullptr);

  // We can't extract the stride if the arithmetic is done at a different size
  // than the pointer type. Adding the stride later may not wrap correctly.
  // Technically we could handle wider indices, but I don't expect that in
  // practice. Handle one special case here: constants. This simplifies
  // writing test cases.
  Value *VecIndex = Ops[*VecOperand];
  Type *VecIntPtrTy = DL->getIntPtrType(GEP->getType());
  if (VecIndex->getType() != VecIntPtrTy) {
    auto *VecIndexC = dyn_cast<Constant>(VecIndex);
    if (!VecIndexC)
      return std::make_pair(nullptr, nullptr);
    if (VecIndex->getType()->getScalarSizeInBits() >
        VecIntPtrTy->getScalarSizeInBits())
      VecIndex = ConstantFoldCastInstruction(Instruction::Trunc, VecIndexC,
                                             VecIntPtrTy);
    else
      VecIndex = ConstantFoldCastInstruction(Instruction::SExt, VecIndexC,
                                             VecIntPtrTy);
  }
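
  // Illustrative example of the cast above (not from the source): with 64-bit
  // pointers, a constant index <2 x i32> <i32 0, i32 1> is constant folded
  // (sext) to <2 x i64> <i64 0, i64 1> so the stride arithmetic happens at
  // pointer width.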

  // Handle the non-recursive case.  This is what we see if the vectorizer
  // decides to use a scalar IV + vid on demand instead of a vector IV.
  auto [Start, Stride] = matchStridedStart(VecIndex, Builder);
  if (Start) {
    assert(Stride);
    Builder.SetInsertPoint(GEP);

    // Replace the vector index with the scalar start and build a scalar GEP.
    Ops[*VecOperand] = Start;
    Type *SourceTy = GEP->getSourceElementType();
    Value *BasePtr =
        Builder.CreateGEP(SourceTy, ScalarBase, ArrayRef(Ops).drop_front());

    // The stride should already be at pointer width.
    Type *IntPtrTy = DL->getIntPtrType(BasePtr->getType());
    assert(Stride->getType() == IntPtrTy && "Unexpected type");

    // Scale the stride by the size of the indexed type.
    if (TypeScale != 1)
      Stride = Builder.CreateMul(Stride, ConstantInt::get(IntPtrTy, TypeScale));

    auto P = std::make_pair(BasePtr, Stride);
    StridedAddrs[GEP] = P;
    return P;
  }

  // Make sure we're in a loop and that it has a preheader and a single latch.
  Loop *L = LI->getLoopFor(GEP->getParent());
  if (!L || !L->getLoopPreheader() || !L->getLoopLatch())
    return std::make_pair(nullptr, nullptr);

  BinaryOperator *Inc;
  PHINode *BasePhi;
  if (!matchStridedRecurrence(VecIndex, L, Stride, BasePhi, Inc, Builder))
    return std::make_pair(nullptr, nullptr);

  assert(BasePhi->getNumIncomingValues() == 2 && "Expected 2 operand phi.");
  unsigned IncrementingBlock = BasePhi->getOperand(0) == Inc ? 0 : 1;
  assert(BasePhi->getIncomingValue(IncrementingBlock) == Inc &&
         "Expected one operand of phi to be Inc");

  Builder.SetInsertPoint(GEP);

  // Replace the vector index with the scalar phi and build a scalar GEP.
  Ops[*VecOperand] = BasePhi;
  Type *SourceTy = GEP->getSourceElementType();
  Value *BasePtr =
      Builder.CreateGEP(SourceTy, ScalarBase, ArrayRef(Ops).drop_front());

  // Final adjustments to stride should go in the start block.
  Builder.SetInsertPoint(
      BasePhi->getIncomingBlock(1 - IncrementingBlock)->getTerminator());

  // The stride should already be at pointer width.
  Type *IntPtrTy = DL->getIntPtrType(BasePtr->getType());
  assert(Stride->getType() == IntPtrTy && "Unexpected type");

  // Scale the stride by the size of the indexed type.
  if (TypeScale != 1)
    Stride = Builder.CreateMul(Stride, ConstantInt::get(IntPtrTy, TypeScale));

  auto P = std::make_pair(BasePtr, Stride);
  StridedAddrs[GEP] = P;
  return P;
}

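// tryCreateStridedLoadStore rewrites a single gather/scatter whose address is
// strided. Illustrative shape of the masked.gather case (not from the source):
// the passthru operand is preserved by selecting between the strided load and
// the passthru under the original mask, roughly
//   %l = call @llvm.experimental.vp.strided.load(ptr %base, i64 %stride,
//                                                <n x i1> %mask, i32 %evl)
//   %v = call @llvm.vp.select(<n x i1> %mask, %l, <passthru>, i32 %evl)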
bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II) {
  VectorType *DataType;
  Value *StoreVal = nullptr, *Ptr, *Mask, *EVL = nullptr;
  MaybeAlign MA;
  switch (II->getIntrinsicID()) {
  case Intrinsic::masked_gather:
    DataType = cast<VectorType>(II->getType());
    Ptr = II->getArgOperand(0);
    MA = cast<ConstantInt>(II->getArgOperand(1))->getMaybeAlignValue();
    Mask = II->getArgOperand(2);
    break;
  case Intrinsic::vp_gather:
    DataType = cast<VectorType>(II->getType());
    Ptr = II->getArgOperand(0);
    MA = II->getParamAlign(0).value_or(
        DL->getABITypeAlign(DataType->getElementType()));
    Mask = II->getArgOperand(1);
    EVL = II->getArgOperand(2);
    break;
  case Intrinsic::masked_scatter:
    DataType = cast<VectorType>(II->getArgOperand(0)->getType());
    StoreVal = II->getArgOperand(0);
    Ptr = II->getArgOperand(1);
    MA = cast<ConstantInt>(II->getArgOperand(2))->getMaybeAlignValue();
    Mask = II->getArgOperand(3);
    break;
  case Intrinsic::vp_scatter:
    DataType = cast<VectorType>(II->getArgOperand(0)->getType());
    StoreVal = II->getArgOperand(0);
    Ptr = II->getArgOperand(1);
    MA = II->getParamAlign(1).value_or(
        DL->getABITypeAlign(DataType->getElementType()));
    Mask = II->getArgOperand(2);
    EVL = II->getArgOperand(3);
    break;
  default:
    llvm_unreachable("Unexpected intrinsic");
  }

  // Make sure the operation will be supported by the backend.
  EVT DataTypeVT = TLI->getValueType(*DL, DataType);
  if (!MA || !TLI->isLegalStridedLoadStore(DataTypeVT, *MA))
    return false;

  // FIXME: Let the backend type legalize by splitting/widening?
  if (!TLI->isTypeLegal(DataTypeVT))
    return false;

  // Pointer should be an instruction.
  auto *PtrI = dyn_cast<Instruction>(Ptr);
  if (!PtrI)
    return false;

  LLVMContext &Ctx = PtrI->getContext();
  IRBuilder Builder(Ctx, InstSimplifyFolder(*DL));
  Builder.SetInsertPoint(PtrI);

  Value *BasePtr, *Stride;
  std::tie(BasePtr, Stride) = determineBaseAndStride(PtrI, Builder);
  if (!BasePtr)
    return false;
  assert(Stride != nullptr);

  Builder.SetInsertPoint(II);

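  // Non-VP gathers/scatters carry no EVL operand; default to the full element
  // count of the vector type.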
  if (!EVL)
    EVL = Builder.CreateElementCount(
        Builder.getInt32Ty(), cast<VectorType>(DataType)->getElementCount());

  CallInst *Call;

  if (!StoreVal) {
    Call = Builder.CreateIntrinsic(
        Intrinsic::experimental_vp_strided_load,
        {DataType, BasePtr->getType(), Stride->getType()},
        {BasePtr, Stride, Mask, EVL});

    // Merge llvm.masked.gather's passthru.
    if (II->getIntrinsicID() == Intrinsic::masked_gather)
      Call = Builder.CreateIntrinsic(Intrinsic::vp_select, {DataType},
                                     {Mask, Call, II->getArgOperand(3), EVL});
  } else
    Call = Builder.CreateIntrinsic(
        Intrinsic::experimental_vp_strided_store,
        {DataType, BasePtr->getType(), Stride->getType()},
        {StoreVal, BasePtr, Stride, Mask, EVL});

  Call->takeName(II);
  II->replaceAllUsesWith(Call);
  II->eraseFromParent();

  if (PtrI->use_empty())
    RecursivelyDeleteTriviallyDeadInstructions(PtrI);

  return true;
}

bool RISCVGatherScatterLowering::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto &TPC = getAnalysis<TargetPassConfig>();
  auto &TM = TPC.getTM<RISCVTargetMachine>();
  ST = &TM.getSubtarget<RISCVSubtarget>(F);
  if (!ST->hasVInstructions() || !ST->useRVVForFixedLengthVectors())
    return false;

  TLI = ST->getTargetLowering();
  DL = &F.getDataLayout();
  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();

  StridedAddrs.clear();

  SmallVector<IntrinsicInst *, 4> Worklist;

  bool Changed = false;

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
      if (!II)
        continue;
      switch (II->getIntrinsicID()) {
      case Intrinsic::masked_gather:
      case Intrinsic::masked_scatter:
      case Intrinsic::vp_gather:
      case Intrinsic::vp_scatter:
        Worklist.push_back(II);
        break;
      default:
        break;
      }
    }
  }

  // Rewrite gather/scatter to form strided load/store if possible.
  for (auto *II : Worklist)
    Changed |= tryCreateStridedLoadStore(II);

  // Remove any dead phis.
  while (!MaybeDeadPHIs.empty()) {
    if (auto *Phi = dyn_cast_or_null<PHINode>(MaybeDeadPHIs.pop_back_val()))
      RecursivelyDeleteDeadPHINode(Phi);
  }

  return Changed;
}