//===- SparseVectorization.cpp - Vectorization of sparsified loops --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A pass that converts loops generated by the sparse compiler into a form that
// can exploit SIMD instructions of the target architecture. Note that this pass
// ensures the sparse compiler can generate efficient SIMD (including ArmSVE
// support) with proper separation of concerns as far as sparsification and
// vectorization is concerned. However, this pass is not the final abstraction
// level we want, and not the general vectorizer we want either. It forms a good
// stepping stone for incremental future improvements though.
//
//===----------------------------------------------------------------------===//

#include "CodegenUtils.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Complex/IR/Complex.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Matchers.h"

using namespace mlir;
using namespace mlir::sparse_tensor;

namespace {

/// Target SIMD properties:
///   vectorLength: # packed data elements (viz. vector<16xf32> has length 16)
///   enableVLAVectorization: enables scalable vectors (viz. ARMSve)
///   enableSIMDIndex32: uses 32-bit indices in gather/scatter for efficiency
///
/// This is a plain aggregate that is brace-initialized positionally, so the
/// order of the fields below is part of its contract.
struct VL {
  // Size of the single vector dimension built by vectorType(). It is also the
  // base loop step of the vectorized loop (multiplied by vscale when scalable
  // vectors are enabled).
  unsigned vectorLength;
  // When set, vector types are created with one scalable dimension and the
  // loop step is scaled by vector.vscale.
  bool enableVLAVectorization;
  // When set, 32-bit index vectors loaded for gather/scatter are used as is
  // instead of being zero extended to 64-bit.
  bool enableSIMDIndex32;
};

/// Returns true iff `val` is known to be a constant integer equal to `idx`.
/// Values that are not compile-time constants never match.
static bool isIntValue(Value val, int64_t idx) {
  auto maybeConst = getConstantIntValue(val);
  return maybeConst && *maybeConst == idx;
}

/// Builds the one-dimensional vector type with `vl.vectorLength` elements of
/// type `etp`. The dimension is scalable iff VLA vectorization is enabled.
static VectorType vectorType(VL vl, Type etp) {
  return VectorType::get(
      vl.vectorLength, etp,
      /*numScalableDims=*/vl.enableVLAVectorization ? 1u : 0u);
}

/// Builds the vector type whose element type is taken from the memref that
/// `ptr` refers to (`ptr` must be of memref type).
static VectorType vectorType(VL vl, Value ptr) {
  auto memTp = ptr.getType().cast<MemRefType>();
  return vectorType(vl, memTp.getElementType());
}

/// Builds the i1 vector mask that confines one vector iteration, starting at
/// induction value `iv`, to the iteration space [lo, hi) walked with `step`.
static Value genVectorMask(PatternRewriter &rewriter, Location loc, VL vl,
                           Value iv, Value lo, Value hi, Value step) {
  VectorType maskType = vectorType(vl, rewriter.getI1Type());
  // Fast path: with constant bounds and a step that evenly divides the trip
  // count (for example, "for i = 0, 128, 16") no iteration is ever partial.
  // Emitting a constant all-true mask lets every subsequent masked memory
  // operation fold into its unconditional counterpart right away.
  IntegerAttr loAttr, hiAttr, stepAttr;
  const bool boundsAreConstant = matchPattern(lo, m_Constant(&loAttr)) &&
                                 matchPattern(hi, m_Constant(&hiAttr)) &&
                                 matchPattern(step, m_Constant(&stepAttr));
  if (boundsAreConstant) {
    const int64_t span = hiAttr.getInt() - loAttr.getInt();
    if (span % stepAttr.getInt() == 0) {
      Value allTrue = constantI1(rewriter, loc, true);
      return rewriter.create<vector::BroadcastOp>(loc, maskType, allTrue);
    }
  }
  // General path: enable only the first min(step, hi - iv) lanes, so that the
  // last vector iteration does not run past the upper bound. We rely on later
  // loop optimizations (e.g. splitting into an unconditional vector loop and
  // a scalar cleanup loop) to avoid paying for the mask in every iteration.
  AffineExpr fullStep = rewriter.getAffineSymbolExpr(0);
  AffineExpr remaining =
      rewriter.getAffineDimExpr(0) - rewriter.getAffineDimExpr(1);
  AffineMap minMap = AffineMap::get(/*dimCount=*/2, /*symbolCount=*/1,
                                    {fullStep, remaining},
                                    rewriter.getContext());
  // Operand order follows the map: d0 = hi, d1 = iv, s0 = step.
  Value activeLanes =
      rewriter.createOrFold<AffineMinOp>(loc, minMap, ValueRange{hi, iv, step});
  return rewriter.create<vector::CreateMaskOp>(loc, maskType, activeLanes);
}

/// Broadcasts the loop-invariant value `val` into a vector, at the location
/// of `val` itself. Hoisting the broadcast out of the vector loop is left to
/// subsequent loop optimizations.
static Value genVectorInvariantValue(PatternRewriter &rewriter, VL vl,
                                     Value val) {
  return rewriter.create<vector::BroadcastOp>(
      val.getLoc(), vectorType(vl, val.getType()), val);
}

/// Generates a vectorized load lhs = a[ind[lo:hi]] or lhs = a[lo:hi],
/// where 'lo' denotes the current index and 'hi = lo + vl - 1'.
///
/// The form is selected by the type of the last subscript in `idxs` (which
/// must be non-empty): a vector subscript yields a gather, anything else a
/// contiguous masked load. Disabled lanes read as zero in both forms.
static Value genVectorLoad(PatternRewriter &rewriter, Location loc, VL vl,
                           Value ptr, ArrayRef<Value> idxs, Value vmask) {
  VectorType resultType = vectorType(vl, ptr);
  Value passThru = constantZero(rewriter, loc, resultType);
  Value innermost = idxs.back();
  // Contiguous case: a[lo:hi].
  if (!innermost.getType().isa<VectorType>())
    return rewriter.create<vector::MaskedLoadOp>(loc, resultType, ptr, idxs,
                                                 vmask, passThru);
  // Indirect case: a[ind[lo:hi]]. The index vector supplies the per-lane
  // offsets, so the innermost base subscript becomes zero.
  ArrayRef<Value> outer = idxs.drop_back();
  SmallVector<Value> base(outer.begin(), outer.end());
  base.push_back(constantIndex(rewriter, loc, 0));
  return rewriter.create<vector::GatherOp>(loc, resultType, ptr, base,
                                           innermost, vmask, passThru);
}

/// Generates a vectorized store a[ind[lo:hi]] = rhs or a[lo:hi] = rhs
/// where 'lo' denotes the current index and 'hi = lo + vl - 1'.
///
/// The form is selected by the type of the last subscript in `idxs` (which
/// must be non-empty): a vector subscript yields a scatter, anything else a
/// contiguous masked store.
static void genVectorStore(PatternRewriter &rewriter, Location loc, Value ptr,
                           ArrayRef<Value> idxs, Value vmask, Value rhs) {
  Value innermost = idxs.back();
  // Contiguous case: a[lo:hi] = rhs.
  if (!innermost.getType().isa<VectorType>()) {
    rewriter.create<vector::MaskedStoreOp>(loc, ptr, idxs, vmask, rhs);
    return;
  }
  // Indirect case: a[ind[lo:hi]] = rhs. The index vector supplies the
  // per-lane offsets, so the innermost base subscript becomes zero.
  ArrayRef<Value> outer = idxs.drop_back();
  SmallVector<Value> base(outer.begin(), outer.end());
  base.push_back(constantIndex(rewriter, loc, 0));
  rewriter.create<vector::ScatterOp>(loc, ptr, base, innermost, vmask, rhs);
}

/// Maps the scalar accumulation operation `def` onto the combining kind of
/// the matching vector reduction. Subtractions map onto ADD as well. Any
/// operation that is not listed here is a programming error.
static vector::CombiningKind getCombiningKind(Operation *def) {
  using Kind = vector::CombiningKind;
  if (isa<arith::AddFOp, arith::AddIOp, arith::SubFOp, arith::SubIOp>(def))
    return Kind::ADD;
  if (isa<arith::MulFOp, arith::MulIOp>(def))
    return Kind::MUL;
  if (isa<arith::AndIOp>(def))
    return Kind::AND;
  if (isa<arith::OrIOp>(def))
    return Kind::OR;
  if (isa<arith::XOrIOp>(def))
    return Kind::XOR;
  llvm_unreachable("unknown reduction kind");
}

/// Generates the initial value of a vector reduction, following the scheme
/// of Chapter 5 of "The Software Vectorization Handbook": the initial scalar
/// is embedded in the vector accumulator in such a way that a plain
/// horizontal reduction at the end completes the operation.
///
/// Value 'r' is the initial scalar value of the accumulator and 'vtp' the
/// type of the vector accumulator. Value 'rd' is the scalar accumulation
/// operation, which is inspected solely to determine the kind of combining
/// reduction (viz. addf -> sum-accumulation).
static Value genVectorReducInit(PatternRewriter &rewriter, Location loc,
                                VectorType vtp, Value r, Value rd) {
  using Kind = vector::CombiningKind;
  const Kind kind = getCombiningKind(rd.getDefiningOp());
  // AND/OR are idempotent, so every lane can simply start out as 'r':
  //   | r | .. | r | r |
  if (kind == Kind::AND || kind == Kind::OR)
    return rewriter.create<vector::BroadcastOp>(loc, vtp, r);
  // All other kinds fill the vector with the neutral element of the
  // operation and place 'r' in element 0 only:
  //   ADD/XOR: | 0 | .. | 0 | r |
  //   MUL:     | 1 | .. | 1 | r |
  Value neutral;
  switch (kind) {
  case Kind::ADD:
  case Kind::XOR:
    neutral = constantZero(rewriter, loc, vtp);
    break;
  case Kind::MUL:
    neutral = constantOne(rewriter, loc, vtp);
    break;
  default:
    llvm_unreachable("unknown reduction kind");
  }
  Value firstLane = constantIndex(rewriter, loc, 0);
  return rewriter.create<vector::InsertElementOp>(loc, r, neutral, firstLane);
}

/// Generates the final scalar value of a vector reduction by horizontally
/// reducing the vector accumulator 'vexp'. The combining kind is derived from
/// the scalar accumulation operation 'rd'.
static Value genVectorReducEnd(PatternRewriter &rewriter, Location loc,
                               Value vexp, Value rd) {
  return rewriter.create<vector::ReductionOp>(
      loc, getCombiningKind(rd.getDefiningOp()), vexp);
}

/// This method is called twice to analyze and rewrite the given subscripts.
/// The first call (!codegen) only performs the analysis and leaves the IR and
/// 'idxs' untouched. On success, the second call (codegen) appends the proper
/// vector form of every subscript to the output parameter 'idxs'. Sharing one
/// routine for both steps keeps analysis and rewriting code in sync.
///
/// See https://llvm.org/docs/GetElementPtr.html for some background on
/// the complications described below.
///
/// We need to generate a pointer/index load from the sparse storage scheme.
/// Narrower data types need to be zero extended before casting the value
/// into the index type used for looping and indexing.
///
/// For the scalar case, subscripts simply zero extend narrower indices
/// into 64-bit values before casting to an index type without a performance
/// penalty. Indices that already are 64-bit, in theory, cannot express the
/// full range since the LLVM backend defines addressing in terms of an
/// unsigned pointer/signed index pair.
static bool vectorizeSubscripts(PatternRewriter &rewriter, scf::ForOp forOp,
                                VL vl, ValueRange subs, bool codegen,
                                Value vmask, SmallVectorImpl<Value> &idxs) {
  Block *loopBody = &forOp.getRegion().front();
  for (Value subscript : subs) {
    // Block arguments and values computed outside the loop-body simply pass
    // through unchanged.
    Operation *producer = subscript.getDefiningOp();
    if (!producer || producer->getBlock() != loopBody) {
      if (codegen)
        idxs.push_back(subscript);
      continue; // success so far
    }
    // Peel off any chain of index casts and zero extensions to find the
    // value that actually feeds the subscript.
    Value source = subscript;
    for (Operation *op = producer;
         op && isa<arith::IndexCastOp, arith::ExtUIOp>(op);
         op = source.getDefiningOp())
      source = op->getOperand(0);
    // The only supported in-loop subscript is an index loaded from memory.
    auto load = source.getDefiningOp<memref::LoadOp>();
    if (!load)
      return false;
    if (!codegen)
      continue; // success so far
    // Since the index vector is used in a subsequent gather/scatter
    // operation, which effectively defines an unsigned pointer + signed
    // index, we must zero extend the vector to an index width. For 8-bit
    // and 16-bit values, a 32-bit index width suffices. For 32-bit values,
    // zero extending the elements into 64-bit loses some performance since
    // the 32-bit indexed gather/scatter is more efficient than the 64-bit
    // index variant (if the negative 32-bit index space is unused, the
    // enableSIMDIndex32 flag can preserve this performance). For 64-bit
    // values, there is no good way to state that the indices are unsigned,
    // which creates the potential of incorrect address calculations in the
    // unlikely case we need such extremely large offsets.
    Location loc = forOp.getLoc();
    SmallVector<Value> loadIdxs(load.getIndices()); // no need to analyze
    Value indexVec =
        genVectorLoad(rewriter, loc, vl, load.getMemRef(), loadIdxs, vmask);
    Type elemTp = indexVec.getType().cast<VectorType>().getElementType();
    if (!elemTp.isa<IndexType>()) {
      const unsigned width = elemTp.getIntOrFloatBitWidth();
      Type widerTp; // remains null when no extension is required
      if (width < 32)
        widerTp = rewriter.getI32Type();
      else if (width < 64 && !vl.enableSIMDIndex32)
        widerTp = rewriter.getI64Type();
      if (widerTp)
        indexVec = rewriter.create<arith::ExtUIOp>(
            loc, vectorType(vl, widerTp), indexVec);
    }
    idxs.push_back(indexVec);
  }
  return true;
}

// Helper macros for vectorizeExpr(). Each one expands to a statement that
// tests whether the scalar operation `def` is of type `xxx` and, if so,
// returns true from the *enclosing function*, after building the vector form
// of the operation into `vexp` when `codegen` is set. They rely on the names
// `def`, `codegen`, `vexp`, `rewriter`, and `loc` being in scope at the point
// of use. When `def` is not of type `xxx`, control simply falls through.

// Unary operation: the already vectorized operand is expected in `vx`.
#define UNAOP(xxx)                                                             \
  if (isa<xxx>(def)) {                                                         \
    if (codegen)                                                               \
      vexp = rewriter.create<xxx>(loc, vx);                                    \
    return true;                                                               \
  }

// Binary operation: the already vectorized operands are expected in `vx`
// (left-hand side) and `vy` (right-hand side).
#define BINOP(xxx)                                                             \
  if (isa<xxx>(def)) {                                                         \
    if (codegen)                                                               \
      vexp = rewriter.create<xxx>(loc, vx, vy);                                \
    return true;                                                               \
  }

/// This method is called twice to analyze and rewrite the given expression.
/// The first call (!codegen) does the analysis. Then, on success, the second
/// call (codegen) yields the proper vector form in the output parameter 'vexp'.
/// This mechanism ensures that analysis and rewriting code stay in sync.
///
/// Parameters:
///   forOp:   the scalar loop that is being vectorized
///   vl:      the target SIMD properties
///   exp:     the scalar expression to analyze/vectorize
///   codegen: false for analysis only, true to also emit vector code
///   vmask:   the iteration mask used by memory operations (codegen only)
///   vexp:    receives the vector form of 'exp' (codegen only)
///
/// Returns true if 'exp' can be vectorized. Supported are invariants, loads
/// with vectorizable subscripts, and the unary/binary operations listed in
/// the body, applied to vectorizable operands.
static bool vectorizeExpr(PatternRewriter &rewriter, scf::ForOp forOp, VL vl,
                          Value exp, bool codegen, Value vmask, Value &vexp) {
  // A block argument is invariant, with one exception.
  if (auto arg = exp.dyn_cast<BlockArgument>()) {
    // The induction variable of the loop being vectorized takes a different
    // value in every lane (i, i+1, ...). Broadcasting it like an invariant
    // would silently compute the wrong result for code such as a[i] = i,
    // so we conservatively refuse to vectorize such expressions.
    if (arg == forOp.getInductionVar())
      return false;
    if (codegen)
      vexp = genVectorInvariantValue(rewriter, vl, exp);
    return true;
  }
  // Something defined outside the loop-body is invariant as well.
  Operation *def = exp.getDefiningOp();
  if (def->getBlock() != &forOp.getRegion().front()) {
    if (codegen)
      vexp = genVectorInvariantValue(rewriter, vl, exp);
    return true;
  }
  // Inside loop-body unary and binary operations. Note that it would be
  // nicer if we could somehow test and build the operations in a more
  // concise manner than just listing them all (although this way we know
  // for certain that they can vectorize).
  //
  // Note that each UNAOP/BINOP below returns true from this function as soon
  // as it matches 'def'. Falling off the end of a list means the operation
  // is not supported.
  Location loc = forOp.getLoc();
  if (auto load = dyn_cast<memref::LoadOp>(def)) {
    auto subs = load.getIndices();
    SmallVector<Value> idxs;
    if (vectorizeSubscripts(rewriter, forOp, vl, subs, codegen, vmask, idxs)) {
      if (codegen)
        vexp = genVectorLoad(rewriter, loc, vl, load.getMemRef(), idxs, vmask);
      return true;
    }
  } else if (def->getNumOperands() == 1) {
    Value vx;
    if (vectorizeExpr(rewriter, forOp, vl, def->getOperand(0), codegen, vmask,
                      vx)) {
      UNAOP(math::AbsFOp)
      UNAOP(math::AbsIOp)
      UNAOP(math::CeilOp)
      UNAOP(math::FloorOp)
      UNAOP(math::SqrtOp)
      UNAOP(math::ExpM1Op)
      UNAOP(math::Log1pOp)
      UNAOP(math::SinOp)
      UNAOP(math::TanhOp)
      UNAOP(arith::NegFOp)
    }
  } else if (def->getNumOperands() == 2) {
    Value vx, vy;
    if (vectorizeExpr(rewriter, forOp, vl, def->getOperand(0), codegen, vmask,
                      vx) &&
        vectorizeExpr(rewriter, forOp, vl, def->getOperand(1), codegen, vmask,
                      vy)) {
      BINOP(arith::MulFOp)
      BINOP(arith::MulIOp)
      BINOP(arith::DivFOp)
      BINOP(arith::DivSIOp)
      BINOP(arith::DivUIOp)
      BINOP(arith::AddFOp)
      BINOP(arith::AddIOp)
      BINOP(arith::SubFOp)
      BINOP(arith::SubIOp)
      BINOP(arith::AndIOp)
      BINOP(arith::OrIOp)
      BINOP(arith::XOrIOp)
    }
  }
  return false;
}

#undef UNAOP
#undef BINOP

/// This method is called twice to analyze and rewrite the given for-loop.
/// The first call (!codegen) does the analysis. Then, on success, the second
/// call (codegen) rewriters the IR into vector form. This mechanism ensures
/// that analysis and rewriting code stay in sync.
static bool vectorizeStmt(PatternRewriter &rewriter, scf::ForOp forOp, VL vl,
                          bool codegen) {
  Location loc = forOp.getLoc();
  Block &block = forOp.getRegion().front();
  scf::YieldOp yield = cast<scf::YieldOp>(block.getTerminator());
  auto &last = *++block.rbegin();
  scf::ForOp forOpNew;

  // Perform initial set up during codegen (we know that the first analysis
  // pass was successful). For reductions, we need to construct a completely
  // new for-loop, since the incoming and outgoing reduction type
  // changes into SIMD form. For stores, we can simply adjust the stride
  // and insert in the existing for-loop. In both cases, we set up a vector
  // mask for all operations which takes care of confining vectors to
  // the original iteration space (later cleanup loops or other
  // optimizations can take care of those).
  Value vmask;
  if (codegen) {
    Value step = constantIndex(rewriter, loc, vl.vectorLength);
    if (vl.enableVLAVectorization) {
      Value vscale =
          rewriter.create<vector::VectorScaleOp>(loc, rewriter.getIndexType());
      step = rewriter.create<arith::MulIOp>(loc, vscale, step);
    }
    if (!yield.getResults().empty()) {
      Value init = forOp.getInitArgs()[0];
      VectorType vtp = vectorType(vl, init.getType());
      Value vinit =
          genVectorReducInit(rewriter, loc, vtp, init, yield->getOperand(0));
      forOpNew = rewriter.create<scf::ForOp>(
          loc, forOp.getLowerBound(), forOp.getUpperBound(), step, vinit);
      rewriter.setInsertionPointToStart(forOpNew.getBody());
    } else {
      forOp.setStep(step);
      rewriter.setInsertionPoint(yield);
    }
    vmask = genVectorMask(rewriter, loc, vl, forOp.getInductionVar(),
                          forOp.getLowerBound(), forOp.getUpperBound(), step);
  }

  // Sparse for-loops either are terminated by a non-empty yield operation
  // (reduction loop) or otherwise by a store operation (pararallel loop).
  if (!yield.getResults().empty()) {
    if (yield->getNumOperands() != 1)
      return false;
    Value redOp = yield->getOperand(0);
    // Analyze/vectorize reduction.
    // TODO: use linalg utils to verify the actual reduction?
    Value vrhs;
    if (vectorizeExpr(rewriter, forOp, vl, redOp, codegen, vmask, vrhs)) {
      if (codegen) {
        Value vpass =
            genVectorInvariantValue(rewriter, vl, forOp.getRegionIterArg(0));
        Value vred = rewriter.create<arith::SelectOp>(loc, vmask, vrhs, vpass);
        rewriter.create<scf::YieldOp>(loc, vred);
        rewriter.setInsertionPointAfter(forOpNew);
        Value vres = genVectorReducEnd(rewriter, loc, forOpNew.getResult(0), redOp);
        // Now do some relinking (last one is not completely type safe
        // but all bad ones are removed right away). This also folds away
        // nop broadcast operations.
        forOp.getResult(0).replaceAllUsesWith(vres);
        forOp.getInductionVar().replaceAllUsesWith(forOpNew.getInductionVar());
        forOp.getRegionIterArg(0).replaceAllUsesWith(
            forOpNew.getRegionIterArg(0));
        rewriter.eraseOp(forOp);
      }
      return true;
    }
  } else if (auto store = dyn_cast<memref::StoreOp>(last)) {
    // Analyze/vectorize store operation.
    auto subs = store.getIndices();
    SmallVector<Value> idxs;
    Value rhs = store.getValue();
    Value vrhs;
    if (vectorizeSubscripts(rewriter, forOp, vl, subs, codegen, vmask, idxs) &&
        vectorizeExpr(rewriter, forOp, vl, rhs, codegen, vmask, vrhs)) {
      if (codegen) {
        genVectorStore(rewriter, loc, store.getMemRef(), idxs, vmask, vrhs);
        rewriter.eraseOp(store);
      }
      return true;
    }
  }

  assert(!codegen && "cannot call codegen when analysis failed");
  return false;
}

/// Basic for-loop vectorizer: a rewrite pattern that converts eligible
/// scf.for loops into vector form according to the given SIMD properties.
struct ForOpRewriter : public OpRewritePattern<scf::ForOp> {
public:
  using OpRewritePattern<scf::ForOp>::OpRewritePattern;

  /// Sets up the pattern for the given target SIMD properties.
  ForOpRewriter(MLIRContext *context, unsigned vectorLength,
                bool enableVLAVectorization, bool enableSIMDIndex32)
      : OpRewritePattern(context),
        vl{vectorLength, enableVLAVectorization, enableSIMDIndex32} {}

  /// Vectorizes `op` if it qualifies and fails to match otherwise.
  LogicalResult matchAndRewrite(scf::ForOp op,
                                PatternRewriter &rewriter) const override {
    // Only consider single block, unit-stride for-loops that were generated
    // by the sparse compiler. For those, no data dependence analysis is
    // required and the loop-body is very restricted in form.
    if (!op.getRegion().hasOneBlock())
      return failure();
    if (!isIntValue(op.getStep(), 1))
      return failure();
    if (!op->hasAttr(SparseTensorLoopEmitter::getLoopEmitterLoopAttrName()))
      return failure();
    // First analyze the loop-body (!codegen). Only when that succeeds is the
    // loop actually rewritten (codegen).
    const bool canVectorize =
        vectorizeStmt(rewriter, op, vl, /*codegen=*/false);
    return success(canVectorize &&
                   vectorizeStmt(rewriter, op, vl, /*codegen=*/true));
  }

private:
  const VL vl;
};

} // namespace

//===----------------------------------------------------------------------===//
// Public method for populating vectorization rules.
//===----------------------------------------------------------------------===//

/// Adds the for-loop vectorization pattern, configured with the given target
/// SIMD properties, to the given patterns list.
void mlir::populateSparseVectorizationPatterns(RewritePatternSet &patterns,
                                               unsigned vectorLength,
                                               bool enableVLAVectorization,
                                               bool enableSIMDIndex32) {
  MLIRContext *context = patterns.getContext();
  patterns.add<ForOpRewriter>(context, vectorLength, enableVLAVectorization,
                              enableSIMDIndex32);
}