//===- SparseTensorRewriting.cpp - Sparse tensor rewriting rules ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements rewriting rules that are specific to sparse tensors.
//
//===----------------------------------------------------------------------===//

#include "Utils/CodegenUtils.h"
#include "Utils/LoopEmitter.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Complex/IR/Complex.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensorStorageLayout.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
#include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Matchers.h"
#include "mlir/Support/LLVM.h"

using namespace mlir;
using namespace mlir::bufferization;
using namespace mlir::linalg;
using namespace mlir::sparse_tensor;

//===---------------------------------------------------------------------===//
// Helper methods for the actual rewriting rules.
//===---------------------------------------------------------------------===//

// Helper method to match any typed zero.
static bool isZeroValue(Value val) {
  return matchPattern(val, m_Zero()) || matchPattern(val, m_AnyZeroFloat());
}

// Helper to detect a sparse tensor type operand.
static bool isSparseTensor(Value v) {
  auto enc = getSparseTensorEncoding(v.getType());
  return enc && !llvm::all_of(enc.getLvlTypes(),
                              [](auto lt) { return lt == LevelFormat::Dense; });
}
static bool isSparseTensor(OpOperand *op) { return isSparseTensor(op->get()); }

// Helper method to find zero/uninitialized tensor materialization.
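// For example, this matches a plain `bufferization.alloc_tensor()` when
// isZero is false, and an `alloc_tensor` with a zero-valued copy operand
// (or a value that is zero altogether) when isZero is true.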
static bool isMaterializing(OpOperand *op, bool isZero) {
  Value val = op->get();
  // Check allocation, with zero alloc when required.
  if (auto alloc = val.getDefiningOp<AllocTensorOp>()) {
    Value copy = alloc.getCopy();
    if (isZero)
      return copy && isZeroValue(copy);
    return !copy;
  }
  // Check for empty tensor materialization.
  if (auto empty = val.getDefiningOp<tensor::EmptyOp>())
    return !isZero;
  // Last resort for zero alloc: the whole value is zero.
  return isZero && isZeroValue(val);
}

// Helper to detect sampling operation.
static bool isSampling(GenericOp op) {
  auto yieldOp = cast<linalg::YieldOp>(op.getRegion().front().getTerminator());
  if (auto *def = yieldOp.getOperand(0).getDefiningOp()) {
    if (isa<arith::MulFOp>(def) || isa<arith::MulIOp>(def)) {
      // Both scalar input arguments used exactly once.
      Value s1 = op.getBlock()->getArgument(0);
      Value s2 = op.getBlock()->getArgument(1);
      return (def->getOperand(0) == s1 && def->getOperand(1) == s2) ||
             (def->getOperand(1) == s1 && def->getOperand(0) == s2);
    }
  }
  return false;
}

// Helper to detect chain of multiplications that do not involve x.
static bool isMulChain(Value val, Value x) {
  if (auto arg = dyn_cast<BlockArgument>(val))
    return arg != x;
  if (auto *def = val.getDefiningOp()) {
    if (isa<arith::MulFOp>(def) || isa<arith::MulIOp>(def))
      return isMulChain(def->getOperand(0), x) &&
             isMulChain(def->getOperand(1), x);
  }
  return false;
}

// Helper to detect x = x + <multiplications>.
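// For example, this matches the accumulator update x = x + a * b * c,
// where x is the output block argument of the kernel.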
static bool isSumOfMul(GenericOp op) {
  auto yieldOp = cast<linalg::YieldOp>(op.getRegion().front().getTerminator());
  if (auto *def = yieldOp.getOperand(0).getDefiningOp()) {
    if (isa<arith::AddFOp>(def) || isa<arith::AddIOp>(def)) {
      Value x = op.getBlock()->getArguments().back();
      return (def->getOperand(0) == x && isMulChain(def->getOperand(1), x)) ||
             (def->getOperand(1) == x && isMulChain(def->getOperand(0), x));
    }
  }
  return false;
}

// Helper to detect direct yield of a zero value.
static bool isZeroYield(GenericOp op) {
  auto yieldOp = cast<linalg::YieldOp>(op.getRegion().front().getTerminator());
  if (auto arg = dyn_cast<BlockArgument>(yieldOp.getOperand(0))) {
    if (arg.getOwner()->getParentOp() == op) {
      return isZeroValue(op->getOperand(arg.getArgNumber()));
    }
  }
  return isZeroValue(yieldOp.getOperand(0));
}

/// Populates given sizes array from type (for static sizes) and from
/// the tensor (for dynamic sizes).
static void sizesForTensor(OpBuilder &builder, SmallVectorImpl<Value> &sizes,
                           Location loc, ShapedType stp, Value tensor) {
  for (const auto &d : enumerate(stp.getShape())) {
    Value dim;
    if (d.value() == ShapedType::kDynamic)
      dim = builder.create<tensor::DimOp>(loc, tensor, d.index());
    else
      dim = constantIndex(builder, loc, d.value());
    sizes.push_back(dim);
  }
}

static RankedTensorType getBufferType(const SparseTensorType &stt,
                                      bool needTmpCOO) {
  return needTmpCOO ? stt.getCOOType(/*ordered=*/false)
                    : stt.getRankedTensorType();
}

/// Collects the dynamic dimension sizes for `tp` with the assumption that
/// `sizes` are the dimension sizes for the type. Stores the dynamic dimension
/// sizes to dynSizes.
static void getDynamicSizes(RankedTensorType tp, ValueRange sizes,
                            SmallVectorImpl<Value> &dynSizes) {
  for (const auto &d : enumerate(tp.getShape())) {
    if (d.value() == ShapedType::kDynamic)
      dynSizes.push_back(sizes[d.index()]);
  }
}

static LogicalResult genForeachOnSparseConstant(ForeachOp op,
                                                RewriterBase &rewriter,
                                                SparseElementsAttr attr) {
  auto loc = op.getLoc();
  SmallVector<Value> reduc = op.getInitArgs();

  // Foreach on constant.
  foreachInSparseConstant(
      rewriter, loc, attr, op.getOrder().value_or(AffineMap()),
      [&reduc, &rewriter, op](ArrayRef<Value> cvs, Value v) mutable {
        SmallVector<Value> args;
        args.append(cvs.begin(), cvs.end());
        args.push_back(v);
        args.append(reduc);
        // Clones the foreach op to get a copy of the loop body.
        auto cloned = cast<ForeachOp>(rewriter.clone(*op.getOperation()));
        assert(args.size() == cloned.getBody()->getNumArguments());
        Operation *yield = cloned.getBody()->getTerminator();
        rewriter.inlineBlockBefore(cloned.getBody(), op, args);
        // clean up
        rewriter.eraseOp(cloned);
        reduc = yield->getOperands();
        rewriter.eraseOp(yield);
      });

  rewriter.replaceOp(op, reduc);
  return success();
}

/// Populates the given sizes array for concatenation from types (for static
/// sizes) and from the source tensors (for dynamic sizes).
static void concatSizesFromInputs(OpBuilder &builder,
                                  SmallVectorImpl<Value> &sizes, Location loc,
                                  ShapedType dstTp, ValueRange srcs,
                                  unsigned dim) {
  auto dstShape = dstTp.getShape();
  sizesFromSrc(builder, sizes, loc, srcs[0]);

  // Sum up on the `dim` if the dimension is dynamic.
  if (dstShape[dim] != ShapedType::kDynamic) {
    // Faithfully take the static size.
    sizes[dim] = constantIndex(builder, loc, dstShape[dim]);
  } else {
    // Else, compute the shape dynamically.
    for (const auto &src : srcs.drop_front()) {
      Value srcSz = linalg::createOrFoldDimOp(builder, loc, src, dim);
      // Sum up all the sizes.
      sizes[dim] = builder.create<arith::AddIOp>(loc, sizes[dim], srcSz);
    }
  }
}

//===---------------------------------------------------------------------===//
// The actual sparse tensor rewriting rules.
//===---------------------------------------------------------------------===//

namespace {

/// TODO: move it to tensor dialect instead.
///
/// Fold `tensor.concat` and `tensor.extract_slice`
///
/// %concat = tensor.concat dim(2) %t0, %t1
///   : (tensor<1x64x1xf32>, tensor<1x64x1xf32>) -> tensor<1x64x2xf32>
/// %extracted0 = tensor.extract_slice %concat[0, 0, 0][1, 64, 1][1, 1, 1]
///   : tensor<1x64x2xf32> to tensor<1x64x1xf32>
/// %extracted1 = tensor.extract_slice %concat[0, 0, 1][1, 64, 1][1, 1, 1]
///   : tensor<1x64x2xf32> to tensor<1x64x1xf32>
///
/// Becomes
///
/// %extract0, %extract1 = %t0, %t1
struct FuseExtractSliceWithConcat
    : public OpRewritePattern<tensor::ExtractSliceOp> {
  using OpRewritePattern<tensor::ExtractSliceOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(tensor::ExtractSliceOp extractOp,
                                PatternRewriter &rewriter) const override {
    auto concatOp = extractOp.getSource().getDefiningOp<tensor::ConcatOp>();
    if (!concatOp)
      return failure();

    Location loc = extractOp.getLoc();
    int64_t dim = concatOp.getDim();
    int64_t rank = extractOp.getResultType().getRank();

    SmallVector<OpFoldResult> srcStrides(rank, rewriter.getIndexAttr(1));
    SmallVector<OpFoldResult> srcOffsets(rank, rewriter.getIndexAttr(0));

    // Compute the partial sums for the slice offsets.
    AffineExpr sum = rewriter.getAffineDimExpr(0);
    SmallVector<AffineExpr> partialSums = {sum};
    SmallVector<OpFoldResult> offsetStrides = {rewriter.getIndexAttr(0)};
    for (auto [idx, input] :
         llvm::enumerate(concatOp.getInputs().drop_back())) {
      sum = sum + rewriter.getAffineDimExpr(idx + 1);
      partialSums.push_back(sum);
      offsetStrides.push_back(
          rewriter.createOrFold<tensor::DimOp>(loc, input, dim));
    }
    auto partialSumMap = AffineMap::get(concatOp.getInputs().size(), 0,
                                        partialSums, rewriter.getContext());
    SmallVector<OpFoldResult> dimOffsets =
        affine::makeComposedFoldedMultiResultAffineApply(
            rewriter, loc, partialSumMap, offsetStrides);

    auto allEqual = [](ArrayRef<OpFoldResult> lhs, ArrayRef<OpFoldResult> rhs) {
      for (auto [l, r] : llvm::zip(lhs, rhs)) {
        std::optional<int64_t> staticVal = getConstantIntValue(l);
        if (!staticVal.has_value() || staticVal != getConstantIntValue(r))
          return false;
      }
      return lhs.size() == rhs.size();
    };

    for (auto [i, input, offset] :
         llvm::enumerate(concatOp.getInputs(), dimOffsets)) {
      SmallVector<OpFoldResult> srcSizes =
          tensor::getMixedSizes(rewriter, loc, input);
      srcOffsets[dim] = offset;

      SmallVector<OpFoldResult> dstSizes = extractOp.getMixedSizes();
      SmallVector<OpFoldResult> dstOffsets = extractOp.getMixedOffsets();
      SmallVector<OpFoldResult> dstStrides = extractOp.getMixedStrides();

      if (allEqual(srcSizes, dstSizes) && allEqual(srcOffsets, dstOffsets) &&
          allEqual(srcStrides, dstStrides)) {
        Value operand = concatOp.getOperand(i);
        if (operand.getType() == extractOp.getResultType())
          rewriter.replaceOp(extractOp, operand);
        break;
      }
    }

    return success();
  }
};

/// Rewriting rule that fuses sparse_tensor.convert into producer.
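/// A minimal sketch of the intent (with #CSR as an assumed encoding):
///
///   %0 = linalg.generic ... outs(%init : tensor<?x?xf64>) -> tensor<?x?xf64>
///   %1 = sparse_tensor.convert %0 : tensor<?x?xf64> to tensor<?x?xf64, #CSR>
///
/// becomes a single linalg.generic whose result directly carries the
/// sparse type, eliminating the conversion.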
struct FoldConvertIntoProducer : public OpRewritePattern<ConvertOp> {
public:
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(ConvertOp op,
                                PatternRewriter &rewriter) const override {
    auto producer = op.getSource().getDefiningOp<GenericOp>();
    if (!producer || producer.getDpsInits().size() != 1 ||
        !isMaterializing(producer.getDpsInitOperand(0), false) ||
        !producer.getResult(0).hasOneUse()) {
      return failure();
    }
    rewriter.modifyOpInPlace(producer, [&]() {
      producer.getResult(0).setType(op.getResult().getType());
    });

    Operation *materializeOp =
        producer.getDpsInitOperand(0)->get().getDefiningOp();

    rewriter.modifyOpInPlace(materializeOp, [&]() {
      materializeOp->getResult(0).setType(op.getResult().getType());
    });

    rewriter.replaceAllOpUsesWith(op, producer);
    op->erase();

    return success();
  }
};

/// Rewriting rule that converts direct yield of zero with initial allocation.
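/// A minimal sketch of the matched shape (with #SV as an assumed encoding):
///
///   %init = bufferization.alloc_tensor() : tensor<8xf64, #SV>
///   %0 = linalg.generic ... outs(%init) { linalg.yield %cst_0 }
///
/// which folds to the materialization itself (or, for static-shaped dense
/// outputs, to a zero constant).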
struct FoldInvariantYield : public OpRewritePattern<GenericOp> {
public:
  using OpRewritePattern<GenericOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(GenericOp op,
                                PatternRewriter &rewriter) const override {
    if (!op.hasPureTensorSemantics() || op.getNumResults() != 1 ||
        !isMaterializing(op.getDpsInitOperand(0), /*isZero=*/false) ||
        !isZeroYield(op) || !op.getDpsInitOperand(0)->get().hasOneUse())
      return failure();
    auto outputType = getRankedTensorType(op.getResult(0));
    // Yielding zero on newly materialized sparse tensor can be
    // optimized directly (regardless of dynamic or static size).
    if (getSparseTensorEncoding(outputType)) {
      rewriter.replaceOp(op, op.getDpsInitOperand(0)->get());
      return success();
    }
    // Use static zero value directly instead of materialization.
    if (!outputType.hasStaticShape())
      return failure();
    Operation *def = op.getDpsInitOperand(0)->get().getDefiningOp();
    rewriter.replaceOp(op, constantZero(rewriter, op.getLoc(), outputType));
    rewriter.eraseOp(def);
    return success();
  }
};

/// Rewriting rule that converts two kernels:
///
///      T(i,j) = SUM(k, A(i,j,k) * B(i,j,k) * ... )
///      X(i,j) = S(i,j) * T(i,j)
///
/// into a single kernel, using distributive law:
///
///      X(i,j) = SUM(k, S(i,j) * A(i,j,k) * B(i,j,k) * ... )
///
/// This kind of fusion (merging two ops into one but using arithmetic
/// equalities that may not hold for floating-point computations) would
/// be undesirable in the dense case, since we distribute the multiplication
/// into the reduction loop. However, for sparse sampling tensor S, such
/// a fusion may actually reduce the asymptotic complexity of the kernel,
/// since intermediate results may be nullified.
struct FuseSparseMultiplyOverAdd : public OpRewritePattern<GenericOp> {
public:
  using OpRewritePattern<GenericOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(GenericOp op,
                                PatternRewriter &rewriter) const override {
    // Check consumer.
    if (!op.hasPureTensorSemantics() || op.getNumDpsInputs() != 2 ||
        op.getNumResults() != 1 ||
        op.getNumParallelLoops() != op.getNumLoops() ||
        !op.getMatchingIndexingMap(op.getDpsInitOperand(0)).isIdentity() ||
        !op.getMatchingIndexingMap(op.getDpsInputOperand(0)).isIdentity() ||
        !op.getMatchingIndexingMap(op.getDpsInputOperand(1)).isIdentity())
      return failure();
    // Find consuming OP2(sparse, other) or OP2(other, sparse). The other
    // operand can be sparse or dense, since the point of this rewriting rule
    // is detecting a situation in which *more* sparsity is introduced into
    // a computation, be it already sparse or still dense.
    unsigned other = 0;
    if (isSparseTensor(op.getDpsInputOperand(0)))
      other = 1;
    else if (!isSparseTensor(op.getDpsInputOperand(1)))
      return failure();
    // Check producer.
    auto prod = dyn_cast_or_null<GenericOp>(
        op.getDpsInputOperand(other)->get().getDefiningOp());
    if (!prod || !prod.hasPureTensorSemantics() || prod.getNumResults() != 1 ||
        !prod.getResult(0).hasOneUse())
      return failure();
    // Sampling consumer and sum of multiplication chain producer.
    if (!isMaterializing(op.getDpsInitOperand(0), /*isZero=*/false) ||
        !isMaterializing(prod.getDpsInitOperand(0), /*isZero=*/true) ||
        !isSampling(op) || !isSumOfMul(prod))
      return failure();
    // Modify operand structure of producer and consumer.
    Location loc = prod.getLoc();
    SmallVector<Value> inputOps = prod.getInputs();
    SmallVector<Value> outputOps = op.getOutputs();
    SmallVector<AffineMap> fusedIndexMaps = prod.getIndexingMapsArray();
    inputOps.push_back(op.getDpsInputOperand(1 - other)->get());
    fusedIndexMaps.push_back(fusedIndexMaps.back()); // mimic other
    // Fuse producer and consumer into a new generic op.
    auto fusedOp = rewriter.create<GenericOp>(
        loc, op.getResult(0).getType(), inputOps, outputOps,
        rewriter.getAffineMapArrayAttr(fusedIndexMaps), prod.getIteratorTypes(),
        /*doc=*/nullptr, /*library_call=*/nullptr);
    Block &prodBlock = prod.getRegion().front();
    Block &consBlock = op.getRegion().front();
    IRMapping mapper;
    Block *fusedBlock = rewriter.createBlock(&fusedOp.getRegion());
    unsigned num = prodBlock.getNumArguments();
    for (unsigned i = 0; i < num - 1; i++)
      addArg(mapper, fusedBlock, prodBlock.getArgument(i));
    addArg(mapper, fusedBlock, consBlock.getArgument(1 - other));
    addArg(mapper, fusedBlock, prodBlock.getArgument(num - 1));
    // Clone bodies of the producer and consumer in new evaluation order.
    auto *acc = prodBlock.getTerminator()->getOperand(0).getDefiningOp();
    auto *sampler = consBlock.getTerminator()->getOperand(0).getDefiningOp();
    Value last;
    for (auto &op : prodBlock.without_terminator())
      if (&op != acc) {
        last = op.getResult(0);
        rewriter.clone(op, mapper);
      }
    mapper.map(consBlock.getArgument(other), fusedBlock->back().getResult(0));
    mapper.map(last, rewriter.clone(*sampler, mapper)->getResult(0));
    last = rewriter.clone(*acc, mapper)->getResult(0);
    rewriter.create<linalg::YieldOp>(loc, last);
    // Force initial value on merged allocation for dense outputs.
    // TODO: deal with non alloc tensor here one day
    if (!getSparseTensorEncoding(op.getResult(0).getType())) {
      Value init = prod.getDpsInitOperand(0)
                       ->get()
                       .getDefiningOp<AllocTensorOp>()
                       .getCopy();
      AllocTensorOp a =
          op.getDpsInitOperand(0)->get().getDefiningOp<AllocTensorOp>();
      rewriter.modifyOpInPlace(a, [&]() { a.getCopyMutable().assign(init); });
    }
    // Replace consumer with fused operation. Old producer
    // and consumer ops will be removed by DCE.
    rewriter.replaceOp(op, fusedOp->getResults());
    return success();
  }

private:
  // Helper to add argument and record the mapping.
  static void addArg(IRMapping &mapper, Block *b, BlockArgument a) {
    mapper.map(a, b->addArgument(a.getType(), a.getLoc()));
  }
};

// Fuse a tensor cast into producing operation. Note that a tensor.cast
// should really not be used to convert between sparse encodings. Since
// the pattern currently appears as a result of some prior rewriting,
// we make an attempt to repair very obvious cases.
// TODO: audit the pure tensor dialect rewriting rules
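//
// A minimal sketch of the repair (with #CSR as an assumed encoding):
//
//   %0 = tensor.cast %t : tensor<?x?xf64> to tensor<?x?xf64, #CSR>
//
// is rewritten to the properly supported
//
//   %0 = sparse_tensor.convert %t : tensor<?x?xf64> to tensor<?x?xf64, #CSR>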
struct FuseTensorCast : public OpRewritePattern<tensor::CastOp> {
public:
  using OpRewritePattern<tensor::CastOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(tensor::CastOp op,
                                PatternRewriter &rewriter) const override {
    Type srcType = op.getSource().getType();
    Type dstType = op.getDest().getType();
    // A nop cast simply folds away.
    if (srcType == dstType) {
      rewriter.replaceOp(op, op->getResults());
      return success();
    }
    // See if a sparsity changing cast can be fused into producer.
    if (tensor::isSameTypeWithoutEncoding(srcType, dstType)) {
      if (Operation *def = op.getSource().getDefiningOp()) {
        if (def->hasOneUse() && isa<tensor::ExtractSliceOp>(def)) {
          rewriter.modifyOpInPlace(def, [&]() {
            def->getResult(0).setType(op->getResultTypes()[0]);
          });
          rewriter.replaceOp(op, def->getResult(0));
          return success();
        }
      }
    }
    // Repair tensor casts with at least one sparse operand into the
    // properly supported sparse_tensor.convert.
    if (getSparseTensorEncoding(srcType) || getSparseTensorEncoding(dstType)) {
      rewriter.replaceOpWithNewOp<ConvertOp>(op, dstType, op.getSource());
      return success();
    }
    // Fail otherwise.
    return failure();
  }
};

/// Rewrites a sequence of operations for sparse tensor selections into
/// semi-ring operations such that they can be compiled correctly by the
/// sparsifier. E.g., transforming the following sequence
///
/// %sel = arith.select %cond, %sp1, %sp2
///
/// to
///
/// %sel = binary %sp1, %sp2:
///         both  (%l, %r) {yield select %cond, %l, %r}
///         left  (%l)     {yield select %cond, %l,  0}
///         right (%r)     {yield select %cond,  0, %r}
///
/// TODO: We require the tensor used for extracting conditions to be dense
/// to sparsify the code. To support a sparse condition tensor, we need a
/// ternary operation.
struct GenSemiRingSelect : public OpRewritePattern<GenericOp> {
public:
  using OpRewritePattern<GenericOp>::OpRewritePattern;
  LogicalResult matchAndRewrite(GenericOp op,
                                PatternRewriter &rewriter) const override {
    // Rejects non sparse kernels.
    if (!op.hasPureTensorSemantics() || !hasAnySparseOperand(op))
      return failure();

    Location loc = op.getLoc();
    SmallVector<std::pair<Operation *, sparse_tensor::BinaryOp>> semiRings;
    for (Operation &inst : *op.getBody()) {
      // Matches pattern.
      auto matched = isRewritablePattern(op, &inst);
      if (!matched.has_value())
        continue;

      rewriter.setInsertionPoint(&inst);
      auto [c, t, f] = matched.value();
      assert(t.getType() == f.getType());
      auto selTp = t.getType();
      auto c0 = constantZero(rewriter, loc, selTp);
      auto binOp = rewriter.create<sparse_tensor::BinaryOp>(loc, selTp, t, f);
      // Initializes all the blocks.
      rewriter.createBlock(&binOp.getOverlapRegion(), {}, {selTp, selTp},
                           {t.getLoc(), f.getLoc()});
      rewriter.createBlock(&binOp.getRightRegion(), {}, selTp, f.getLoc());
      rewriter.createBlock(&binOp.getLeftRegion(), {}, selTp, t.getLoc());

      for (auto *r : binOp.getRegions()) {
        Block *b = &r->front();
        rewriter.setInsertionPointToStart(b);

        IRMapping irMap;
        // Clones the cmp operations into the region to make the binary op
        // admissible.
        Value newC = c;
        if (auto *def = c.getDefiningOp())
          newC = rewriter.clone(*def, irMap)->getResult(0);

        irMap.map(c, newC);
        if (r == &binOp.getLeftRegion()) {
          irMap.map(t, b->getArgument(0));
          irMap.map(f, c0);
        } else if (r == &binOp.getRightRegion()) {
          irMap.map(t, c0);
          irMap.map(f, b->getArgument(0));
        } else {
          irMap.map(t, b->getArgument(0));
          irMap.map(f, b->getArgument(1));
        }
        auto y = rewriter.clone(inst, irMap)->getResult(0);
        rewriter.create<sparse_tensor::YieldOp>(loc, y);
      }

      // We successfully rewrote the operation. We cannot do the replacement
      // here because it would invalidate the iterator used by the current
      // loop to traverse the instructions.
      semiRings.emplace_back(&inst, binOp);
    }

    // Finalizes the replacement.
    for (auto [sel, semi] : semiRings)
      rewriter.replaceOp(sel, semi->getResults());

    return success(!semiRings.empty());
  }

private:
  static std::optional<std::tuple<Value, BlockArgument, BlockArgument>>
  isRewritablePattern(GenericOp op, Operation *v) {
    auto sel = dyn_cast<arith::SelectOp>(v);
    if (!sel)
      return std::nullopt;

    auto tVal = dyn_cast<BlockArgument>(sel.getTrueValue());
    auto fVal = dyn_cast<BlockArgument>(sel.getFalseValue());
    // TODO: For simplicity, we only handle cases where both true/false values
    // are directly loaded from the input tensor. We can probably admit more
    // cases in theory.
    if (!tVal || !fVal)
      return std::nullopt;

    // Helper lambda to determine whether the value is loaded from a dense input
    // or is a loop invariant.
    auto isValFromDenseInputOrInvariant = [&op](Value v) -> bool {
      if (auto bArg = dyn_cast<BlockArgument>(v);
          bArg && !isSparseTensor(op.getDpsInputOperand(bArg.getArgNumber())))
        return true;
      // If the value is defined outside the loop, it is a loop invariant.
      return v.getDefiningOp() && v.getDefiningOp()->getBlock() != op.getBody();
    };

    // If the condition value is loaded directly from a dense tensor or is
    // a loop invariant, we can sparsify the kernel.
    auto cond = sel.getCondition();
    if (isValFromDenseInputOrInvariant(cond))
      return std::make_tuple(cond, tVal, fVal);

    Value cmpL, cmpR;
    if (matchPattern(cond, m_Op<arith::CmpIOp>(matchers::m_Any(&cmpL),
                                               matchers::m_Any(&cmpR))) ||
        matchPattern(cond, m_Op<arith::CmpFOp>(matchers::m_Any(&cmpL),
                                               matchers::m_Any(&cmpR)))) {
      // TODO: we can do it recursively to check whether all the leaf values are
      // loaded from dense tensors or are loop invariants.
      if (isValFromDenseInputOrInvariant(cmpL) ||
          isValFromDenseInputOrInvariant(cmpR))
        return std::make_tuple(cond, tVal, fVal);
    }

    return std::nullopt;
  };
};

/// Rewrites a sparse reduction that would not sparsify directly since
/// doing so would only iterate over the stored elements, ignoring the
/// implicit zeros, into a semi-ring. Applies to all prod/and/min/max
/// (note that reductions like add/sub/or/xor can directly be sparsified
/// since the implicit zeros do not contribute to the final result).
/// Note that prod/and are still included since, even though they often
/// are nullified in sparse data, they may still occur for special
/// situations in which e.g. some rows in a sparse matrix are fully
/// dense. For min/max, including the implicit zeros is a much more
/// common situation.
///
/// TODO: this essentially "densifies" the operation; we want to implement
///       this much more efficiently by performing the reduction over the
///       stored values, and feed in the zero once if there were *any*
///       implicit zeros as well; but for now, at least we provide
///       the functionality
///
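/// As a minimal sketch, a min-reduction x = min(x, a) over a sparse input
/// is rewritten so that operand a first passes through
///
///   sparse_tensor.unary a : present -> a, absent -> 0
///
/// and the update itself becomes a sparse_tensor.reduce whose identity is
/// the value extracted from the initialization tensor.
///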
struct GenSemiRingReduction : public OpRewritePattern<GenericOp> {
public:
  using OpRewritePattern<GenericOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(GenericOp op,
                                PatternRewriter &rewriter) const override {
    // Reject non-reductions.
    if (!op.hasPureTensorSemantics() || op.getNumDpsInputs() != 1 ||
        op.getNumReductionLoops() == 0 || op.getNumResults() != 1)
      return failure();
    auto *inp = op.getDpsInputOperand(0);
    auto *init = op.getDpsInitOperand(0);
    if (!isSparseTensor(inp))
      return failure();
    // Look for direct x = x OP y for semi-ring ready reductions.
    auto *red = cast<linalg::YieldOp>(op.getRegion().front().getTerminator())
                    .getOperand(0)
                    .getDefiningOp();
    if (!isa<arith::AndIOp, arith::MulIOp, arith::MulFOp, arith::MinimumFOp,
             arith::MinSIOp, arith::MinUIOp, arith::MaximumFOp, arith::MaxSIOp,
             arith::MaxUIOp>(red))
      return failure();
    Value s0 = op.getBlock()->getArgument(0);
    Value s1 = op.getBlock()->getArgument(1);
    if ((red->getOperand(0) != s0 || red->getOperand(1) != s1) &&
        (red->getOperand(0) != s1 || red->getOperand(1) != s0))
      return failure();
    // Identity.
    Location loc = op.getLoc();
    Value identity =
        rewriter.create<tensor::ExtractOp>(loc, init->get(), ValueRange());
    // Unary {
    //    present -> value
    //    absent  -> zero.
    // }
    Type rtp = s0.getType();
    rewriter.setInsertionPointToStart(&op.getRegion().front());
    auto semiring = rewriter.create<sparse_tensor::UnaryOp>(loc, rtp, s0);
    Block *present =
        rewriter.createBlock(&semiring.getPresentRegion(), {}, rtp, loc);
    rewriter.setInsertionPointToStart(&semiring.getPresentRegion().front());
    rewriter.create<sparse_tensor::YieldOp>(loc, present->getArgument(0));
    rewriter.createBlock(&semiring.getAbsentRegion(), {}, {}, {});
    rewriter.setInsertionPointToStart(&semiring.getAbsentRegion().front());
    auto zero =
        rewriter.create<arith::ConstantOp>(loc, rewriter.getZeroAttr(rtp));
    rewriter.create<sparse_tensor::YieldOp>(loc, zero);
    rewriter.setInsertionPointAfter(semiring);
    // CustomReduce {
    //    x = x REDUC y, identity
    // }
    auto custom = rewriter.create<sparse_tensor::ReduceOp>(
        loc, rtp, semiring.getResult(), s1, identity);
    Block *region =
        rewriter.createBlock(&custom.getRegion(), {}, {rtp, rtp}, {loc, loc});
    rewriter.setInsertionPointToStart(&custom.getRegion().front());
    IRMapping irMap;
    irMap.map(red->getOperand(0), region->getArgument(0));
    irMap.map(red->getOperand(1), region->getArgument(1));
    auto *cloned = rewriter.clone(*red, irMap);
    rewriter.create<sparse_tensor::YieldOp>(loc, cloned->getResult(0));
    rewriter.setInsertionPointAfter(custom);
    rewriter.replaceOp(red, custom.getResult());
    return success();
  }
};

/// Sparse rewriting rule for the print operator. This operation is mainly used
/// for debugging and testing. As such, it lowers to the vector.print operation
/// which requires only very lightweight runtime support.
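///
/// The generated output has roughly the following shape (the values shown
/// here are purely illustrative):
///
///   ---- Sparse Tensor ----
///   nse = 3
///   dim = ( 4, 8 )
///   lvl = ( 4, 8 )
///   pos[1] : ( 0, 2, 2, 3, 3 )
///   crd[1] : ( 1, 3, 2 )
///   values : ( 1, 2, 3 )
///   ----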
struct PrintRewriter : public OpRewritePattern<PrintOp> {
public:
  using OpRewritePattern::OpRewritePattern;
  LogicalResult matchAndRewrite(PrintOp op,
                                PatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    auto tensor = op.getTensor();
    auto stt = getSparseTensorType(tensor);
    // Header with NSE.
    auto nse = rewriter.create<NumberOfEntriesOp>(loc, tensor);
    rewriter.create<vector::PrintOp>(
        loc, rewriter.getStringAttr("---- Sparse Tensor ----\nnse = "));
    rewriter.create<vector::PrintOp>(loc, nse);
    // Print run-time contents for dim/lvl sizes.
    rewriter.create<vector::PrintOp>(loc, rewriter.getStringAttr("dim = "));
    printSizes(rewriter, loc, tensor, stt.getDimRank(), /*isDim=*/true);
    rewriter.create<vector::PrintOp>(loc, rewriter.getStringAttr("lvl = "));
    printSizes(rewriter, loc, tensor, stt.getLvlRank(), /*isDim=*/false);
    // Use the "codegen" foreach loop construct to iterate over
    // all typical sparse tensor components for printing.
    foreachFieldAndTypeInSparseTensor(stt, [&rewriter, &loc, &tensor,
                                            &stt](Type, FieldIndex,
                                                  SparseTensorFieldKind kind,
                                                  Level l, LevelType) {
      switch (kind) {
      case SparseTensorFieldKind::StorageSpec: {
        break;
      }
      case SparseTensorFieldKind::PosMemRef: {
        auto lvl = constantIndex(rewriter, loc, l);
        rewriter.create<vector::PrintOp>(loc, rewriter.getStringAttr("pos["));
        rewriter.create<vector::PrintOp>(
            loc, lvl, vector::PrintPunctuation::NoPunctuation);
        rewriter.create<vector::PrintOp>(loc, rewriter.getStringAttr("] : "));
        auto pos = rewriter.create<ToPositionsOp>(loc, tensor, l);
        printContents(rewriter, loc, pos);
        break;
      }
      case SparseTensorFieldKind::CrdMemRef: {
        auto lvl = constantIndex(rewriter, loc, l);
        rewriter.create<vector::PrintOp>(loc, rewriter.getStringAttr("crd["));
        rewriter.create<vector::PrintOp>(
            loc, lvl, vector::PrintPunctuation::NoPunctuation);
        rewriter.create<vector::PrintOp>(loc, rewriter.getStringAttr("] : "));
        Value crd = nullptr;
        // For COO AoS storage, we want to print a single, linear view of
        // the full coordinate storage at this level. For any other storage,
        // we show the coordinate storage for every individual level.
        if (stt.getAoSCOOStart() == l)
          crd = rewriter.create<ToCoordinatesBufferOp>(loc, tensor);
        else
          crd = rewriter.create<ToCoordinatesOp>(loc, tensor, l);
        printContents(rewriter, loc, crd);
        break;
      }
      case SparseTensorFieldKind::ValMemRef: {
        rewriter.create<vector::PrintOp>(loc,
                                         rewriter.getStringAttr("values : "));
        auto val = rewriter.create<ToValuesOp>(loc, tensor);
        printContents(rewriter, loc, val);
        break;
      }
      }
      return true;
    });
    rewriter.create<vector::PrintOp>(loc, rewriter.getStringAttr("----\n"));
    rewriter.eraseOp(op);
    return success();
  }

private:
  // Helper to print contents of a single memref. For "push_back" vectors,
  // we assume that the previous getters for pos/crd/val have added a
  // slice-to-size view to make sure we just print the size and not the
  // full capacity.
  //
  // Generates code to print (1-dim or higher):
  //    ( a0, a1, ... )
  static void printContents(PatternRewriter &rewriter, Location loc,
                            Value vec) {
    auto shape = cast<ShapedType>(vec.getType()).getShape();
    SmallVector<Value> idxs;
    printContentsLevel(rewriter, loc, vec, 0, shape, idxs);
    rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::NewLine);
  }

  // Helper to the helper.
  static void printContentsLevel(PatternRewriter &rewriter, Location loc,
                                 Value vec, unsigned i, ArrayRef<int64_t> shape,
                                 SmallVectorImpl<Value> &idxs) {
    // Open bracket.
    rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::Open);
    // Generate for loop.
    auto zero = constantIndex(rewriter, loc, 0);
    auto index = constantIndex(rewriter, loc, i);
    auto size = rewriter.create<memref::DimOp>(loc, vec, index);
    auto step = constantIndex(rewriter, loc, 1);
    auto forOp = rewriter.create<scf::ForOp>(loc, zero, size, step);
    idxs.push_back(forOp.getInductionVar());
    rewriter.setInsertionPointToStart(forOp.getBody());
    if (i < shape.size() - 1) {
      // Enter deeper loop nest.
      printContentsLevel(rewriter, loc, vec, i + 1, shape, idxs);
    } else {
      // Actual contents printing.
      auto val = rewriter.create<memref::LoadOp>(loc, vec, idxs);
      if (llvm::isa<ComplexType>(val.getType())) {
        // Since the vector dialect does not support complex types in any op,
        // we split those into (real, imag) pairs here.
        Value real = rewriter.create<complex::ReOp>(loc, val);
        Value imag = rewriter.create<complex::ImOp>(loc, val);
        rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::Open);
        rewriter.create<vector::PrintOp>(loc, real,
                                         vector::PrintPunctuation::Comma);
        rewriter.create<vector::PrintOp>(loc, imag,
                                         vector::PrintPunctuation::Close);
        rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::Comma);
      } else {
        rewriter.create<vector::PrintOp>(loc, val,
                                         vector::PrintPunctuation::Comma);
      }
    }
    idxs.pop_back();
    rewriter.setInsertionPointAfter(forOp);
    // Close bracket.
    rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::Close);
  }

  // Helper method to print run-time lvl/dim sizes.
  static void printSizes(PatternRewriter &rewriter, Location loc, Value tensor,
                         unsigned size, bool isDim) {
    // Open bracket.
    rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::Open);
    // Print unrolled contents (dimop requires constant value).
    for (unsigned i = 0; i < size; i++) {
      auto idx = constantIndex(rewriter, loc, i);
      Value val;
      if (isDim)
        val = rewriter.create<tensor::DimOp>(loc, tensor, idx);
      else
        val = rewriter.create<LvlOp>(loc, tensor, idx);
      rewriter.create<vector::PrintOp>(
          loc, val,
          i != size - 1 ? vector::PrintPunctuation::Comma
                        : vector::PrintPunctuation::NoPunctuation);
    }
    // Close bracket and end of line.
    rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::Close);
    rewriter.create<vector::PrintOp>(loc, vector::PrintPunctuation::NewLine);
  }
};

/// Sparse rewriting rule for the sparse-to-sparse tensor.reshape operator.
struct TensorReshapeRewriter : public OpRewritePattern<tensor::ReshapeOp> {
public:
  using OpRewritePattern<tensor::ReshapeOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(tensor::ReshapeOp op,
                                PatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    Value srcTensor = op.getSource();
    const auto srcTp = getSparseTensorType(srcTensor);
    const auto dstTp = getSparseTensorType(op.getResult());

    if (!srcTp.hasEncoding() || !dstTp.hasEncoding() ||
        !dstTp.hasStaticDimShape())
      return failure();

    SmallVector<Value> srcSizes;
    sizesForTensor(rewriter, srcSizes, loc, srcTp, srcTensor);
    SmallVector<Value> dstSizes;
    for (Dimension d : dstTp.getDimShape())
      dstSizes.push_back(constantIndex(rewriter, loc, d));

    Value nnz = rewriter.create<NumberOfEntriesOp>(loc, srcTensor);
    // Only need an unordered COO buffer if input and output are not sorted
    // in the same way.
    Type bufferTp = getBufferType(
        dstTp.withoutDimToLvl(),
        !srcTp.isAllOrdered() || !srcTp.isIdentity() || !dstTp.isIdentity());
    SmallVector<Value> dynSizes;
    Value buffer = rewriter
                       .create<AllocTensorOp>(loc, bufferTp, dynSizes, Value(),
                                              nnz, Attribute())
                       .getResult();

    // Convert src coordinates to dst coordinates by first collapsing them to
    // 1D and then expanding them to match the rank of the destination tensor.
    // Implemented as follows:
    //   foreach srcCoords %srcTensor
    //     collapsedCoords = reshapeCvs(srcCoords, [1, ..., srcRank])
    //     expandedCoords = reshapeCvs(collapsedCoords, [1, ..., dstRank])
    //     insert expandedCoords, %buffer
    //
    // followed by an optional
    //   %t = sparse_tensor.cast %tmp
    // depending on whether the input/output are sorted in the same way.
    const auto encSrc = srcTp.getEncoding();
    ForeachOp foreachOp = rewriter.create<ForeachOp>(
        loc, srcTensor, buffer,
        [&](OpBuilder &builder, Location loc, ValueRange srcLcvs, Value v,
            ValueRange reduc) {
          const Dimension srcRank = srcTp.getDimRank();
          SmallVector<Value> srcDcvs;
          srcDcvs.reserve(srcRank);
          for (Dimension d = 0; d < srcRank; d++) {
            Level lvl = toLvl(encSrc, d);
            srcDcvs.push_back(srcLcvs[lvl]);
          }

          Value collapseSize = constantIndex(builder, loc, 1);
          for (Dimension d = 0; d < srcRank; d++)
            collapseSize =
                builder.create<arith::MulIOp>(loc, collapseSize, srcSizes[d]);
          SmallVector<Value, 1> collapsedSizes = {collapseSize};

          ReassociationIndices collapseIdx;
          for (Dimension i = 0; i < srcRank; i++)
            collapseIdx.push_back(i);
          SmallVector<ReassociationIndices, 1> collapseReass = {collapseIdx};
          SmallVector<Value, 1> collapsedDcvs;
          reshapeCvs(builder, loc, collapseReass, srcSizes, srcDcvs,
                     collapsedSizes, collapsedDcvs);

          ReassociationIndices expandIdx;
          for (Dimension i = 0; i < dstTp.getDimRank(); i++)
            expandIdx.push_back(i);
          SmallVector<ReassociationIndices, 1> expandReass = {expandIdx};
          SmallVector<Value> dstDcvs;
          reshapeCvs(builder, loc, expandReass, collapsedSizes, collapsedDcvs,
                     dstSizes, dstDcvs);

          auto t =
              builder.create<tensor::InsertOp>(loc, v, reduc.front(), dstDcvs);
          builder.create<sparse_tensor::YieldOp>(loc, t);
        });

    Value t = rewriter.create<LoadOp>(loc, foreachOp.getResult(0), true);
    if (bufferTp != dstTp) {
      auto dstRTT = dstTp.getRankedTensorType();
      Value converted = rewriter.create<ConvertOp>(loc, dstRTT, t).getResult();
      rewriter.create<DeallocTensorOp>(loc, t);
      t = converted;
    }
    rewriter.replaceOp(op, t);
    return success();
  }
};

/// Sparse rewriting rule for sparse-to-sparse reshape (expand/collapse shape)
/// operators.
template <typename ReshapeOp>
struct Sparse2SparseReshapeRewriter : public OpRewritePattern<ReshapeOp> {
public:
  using OpRewritePattern<ReshapeOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ReshapeOp op,
                                PatternRewriter &rewriter) const override {
    Location loc = op.getLoc();
    Value srcTensor = op.getSrc();
    const auto srcTp = getSparseTensorType(srcTensor);
    const auto dstTp = getSparseTensorType(op.getResult());
    if (!srcTp.hasEncoding() || !dstTp.hasEncoding())
      return failure();

    // Generate code to represent the static dimension constants or compute
    // the dynamic dimension values.
    SmallVector<Value> srcSizes;
    sizesForTensor(rewriter, srcSizes, loc, srcTp, srcTensor);
    SmallVector<Value> dstSizes;
    SmallVector<Value> dstDynSizes;
    if (dstTp.hasStaticDimShape()) {
      for (Dimension d : dstTp.getDimShape())
        dstSizes.push_back(constantIndex(rewriter, loc, d));
    } else {
      ArrayRef<Size> dstShape = dstTp.getDimShape();
      genReshapeDstShape(rewriter, loc, dstSizes, srcSizes, dstShape,
                         op.getReassociationIndices());
      for (auto [idx, shape] : llvm::enumerate(dstShape)) {
        if (shape == ShapedType::kDynamic)
          dstDynSizes.push_back(dstSizes[idx]);
      }
    }
    Value nnz = rewriter.create<NumberOfEntriesOp>(loc, srcTensor);
    // Only need an unordered COO buffer if input and output are not sorted
    // in the same way.
    Type bufferTp = getBufferType(
        dstTp.withoutDimToLvl(),
        !srcTp.isAllOrdered() || !srcTp.isIdentity() || !dstTp.isIdentity());

    Value buffer =
        rewriter
            .create<AllocTensorOp>(loc, bufferTp, dstDynSizes, Value(),
                                   /*sizeHint=*/nnz, Attribute())
            .getResult();

    // Implement the sparse2sparse reshape as follows:
    //   foreach srcCoords %srcTensor
    //     insert reshapeCvs(srcCoords), %buffer
    //
    // followed by an optional
    //   %t = sparse_tensor.cast %tmp
    // depending on whether the input/output are sorted in the same way.
    const auto encSrc = srcTp.getEncoding();
    ForeachOp foreachOp = rewriter.create<ForeachOp>(
        loc, srcTensor, buffer,
        [&](OpBuilder &builder, Location loc, ValueRange srcLcvs, Value v,
            ValueRange reduc) {
          const Dimension dimRank = srcTp.getDimRank();
          SmallVector<Value> srcDcvs;
          srcDcvs.reserve(dimRank);
          for (Dimension d = 0; d < dimRank; d++) {
            Level lvl = toLvl(encSrc, d);
            srcDcvs.push_back(srcLcvs[lvl]);
          }
          SmallVector<Value> dstDcvs;
          reshapeCvs(builder, loc, op.getReassociationIndices(), srcSizes,
                     srcDcvs, dstSizes, dstDcvs);
          auto t =
              builder.create<tensor::InsertOp>(loc, v, reduc.front(), dstDcvs);
          builder.create<sparse_tensor::YieldOp>(loc, t);
        });

    Value t = rewriter.create<LoadOp>(loc, foreachOp.getResult(0), true);
    if (bufferTp != dstTp) {
      auto dstRTT = dstTp.getRankedTensorType();
      Value converted = rewriter.create<ConvertOp>(loc, dstRTT, t).getResult();
      rewriter.create<DeallocTensorOp>(loc, t);
      t = converted;
    }
    rewriter.replaceOp(op, t);
    return success();
  }
};

/// Sparse rewriting rule for sparse-to-dense and dense-to-sparse reshape
/// operator.
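///
/// For example (a dense-to-sparse expansion, with #SV as an assumed
/// encoding), the reshape
///
///   %0 = tensor.expand_shape %d ... : tensor<12xf64> into tensor<3x4xf64, #SV>
///
/// is unfused into a dense tensor.expand_shape followed by a
/// sparse_tensor.convert into the annotated type.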
template <typename ReshapeOp>
struct ReshapeRewriter : public OpRewritePattern<ReshapeOp> {
public:
  using OpRewritePattern<ReshapeOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(ReshapeOp op,
                                PatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    auto encDst = getSparseTensorEncoding(op.getResult().getType());
    auto encSrc = getSparseTensorEncoding(op.getSrc().getType());
    // Since a pure dense expansion is very cheap (a change of view), for
    // a sparse2dense or dense2sparse reshape, we can simply unfuse the
    // sparse conversion from the reshape operation itself.
    // All other cases are handled elsewhere.
    if (encDst && encSrc) {
      return failure();
    }
    if (encSrc) {
      auto rtp = getRankedTensorType(op.getSrc());
      auto denseTp =
          RankedTensorType::get(rtp.getShape(), rtp.getElementType());
      auto convert = rewriter.create<ConvertOp>(loc, denseTp, op.getSrc());
      rewriter.modifyOpInPlace(op, [&]() { op->setOperand(0, convert); });
      return success();
    }
    if (encDst) {
      auto rtp = getRankedTensorType(op.getResult());
      auto denseTp =
          RankedTensorType::get(rtp.getShape(), rtp.getElementType());
      ReshapeOp reshape;
      if constexpr (std::is_same<ReshapeOp, tensor::ExpandShapeOp>::value) {
        reshape = rewriter.create<ReshapeOp>(
            loc, denseTp, op.getSrc(), op.getReassociation(),
            op.getOutputShape(), op.getStaticOutputShape());
      } else {
        reshape = rewriter.create<ReshapeOp>(loc, denseTp, op.getSrc(),
                                             op.getReassociation());
      }
      Value convert = rewriter.create<ConvertOp>(loc, rtp, reshape);
      rewriter.replaceOp(op, convert);
      return success();
    }
    return failure();
  }
};

// A trivial wrapper to help generate different operations for dense/sparse
// tensors.
struct TensorLike {
  TensorLike(OpBuilder &builder, Location loc, RankedTensorType rtt,
             ValueRange sizes) {
    SmallVector<Value> dynSzs;
    getDynamicSizes(rtt, sizes, dynSzs);

    val = builder.create<AllocTensorOp>(loc, rtt, dynSzs);
    if (!isSparse()) {
      Value c0 = constantZero(builder, loc, rtt.getElementType());
      val = builder.create<linalg::FillOp>(loc, c0, val).getResult(0);
    }
  }

  void insert(OpBuilder &builder, Location loc, Value v, ValueRange crds) {
    val = builder.create<tensor::InsertOp>(loc, v, val, crds);
  }

  Value finalize(OpBuilder &builder, Location loc, RankedTensorType rtp) const {
    if (isSparse())
      return builder.create<LoadOp>(loc, val, true);
    return val;
  }

  bool isSparse() const {
    return getSparseTensorEncoding(val.getType()) != nullptr;
  }

  Value val;
};

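/// Sparse rewriting rule for the tensor.dim operator on sparse tensors,
/// which resolves the queried dimension size either directly through the
/// corresponding level size (for permutation maps) or through the lvl2dim
/// translation implemented below.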
struct SparseTensorDimOpRewriter : public OpRewritePattern<tensor::DimOp> {
  using OpRewritePattern::OpRewritePattern;
  LogicalResult matchAndRewrite(tensor::DimOp op,
                                PatternRewriter &rewriter) const override {
    std::optional<int64_t> dim = op.getConstantIndex();
    auto stt = getSparseTensorType(op.getSource());
    if (!dim || !stt.hasEncoding())
      return failure();

    if (stt.isPermutation()) {
      rewriter.replaceOpWithNewOp<LvlOp>(op, op.getSource(),
                                         toLvl(stt.getEncoding(), *dim));
      return success();
    }

    // Non-permutation dim2lvl/lvl2dim maps.
    // Compute as follows:
    //   affine.apply #map (l0 - 1, l1 - 1, ...) + 1
    // Note that this is not the most efficient way (but a more general one)
    // for the lvl to dim translation; e.g., for BSR, the dimension size can
    // be computed simply as lvl_size * block_size.
1153     SmallVector<Value> maxLvlCrds;
1154     for (Level l = 0; l < stt.getLvlRank(); l++) {
1155       Value lvlSz = rewriter.create<LvlOp>(loc, op.getSource(), l);
1156       Value maxLvlCrd = rewriter.create<arith::SubIOp>(
1157           loc, lvlSz, constantOne(rewriter, loc, rewriter.getIndexType()));
1158       maxLvlCrds.push_back(maxLvlCrd);
1159     }
1160 
1161     AffineExpr lvl2DimExp = stt.getLvlToDim().getResult(*dim);
1162     Value maxDimCrd = rewriter.create<affine::AffineApplyOp>(
1163         op.getLoc(), AffineMap::get(stt.getLvlRank(), 0, lvl2DimExp),
1164         maxLvlCrds);
1165 
1166     Value dimSz = rewriter.create<arith::AddIOp>(
1167         loc, maxDimCrd, constantOne(rewriter, loc, rewriter.getIndexType()));
1168     rewriter.replaceOp(op, dimSz);
1169     return success();
1170   }
1171 };
1172 
1173 struct ConcatenateRewriter : public OpRewritePattern<ConcatenateOp> {
1174   using OpRewritePattern::OpRewritePattern;
1175   LogicalResult matchAndRewrite(ConcatenateOp op,
1176                                 PatternRewriter &rewriter) const override {
1177     if (op.needsExtraSort())
1178       op.emitError("ConcatenateOp not staged");
1179 
1180     const Location loc = op.getLoc();
1181     const auto dstTp = getSparseTensorType(op);
1182     const Dimension conDim = op.getDimension();
1183     SmallVector<Value> sizes;
1184     concatSizesFromInputs(rewriter, sizes, loc, dstTp, op.getInputs(), conDim);
1185 
1186     // %t = concatenate %s1, %s2, %s3 {dim = 1}
1187     // ==>
1188     // if (isSparseDst)
1189     //   if (allDense)
1190     //     %tmp = bufferization.alloc_tensor dstTp
1191     //   else
1192     //     %tmp = bufferization.alloc_tensor : unordered COO
1193     // else
1194     //   %tmp = memref.alloc : dense tensor
1195     // foreach in %s1 : insert d0, d1, %tmp
1196     // foreach in %s2 : insert d0, d1 + size(s1), %tmp
1197     // foreach in %s3 : insert d0, d1 + size(s1) + size(s2), %tmp
1198 
1199     TensorLike dstBuf(rewriter, loc, dstTp.getRankedTensorType(), sizes);
1200     Value offset = constantIndex(rewriter, loc, 0);
1201     Value iterArg = dstBuf.val;
1202 
1203     ForeachOp foreachOp;
1204     for (Value input : op.getInputs()) {
1205       // Builds a for op for each input tensor to append new values into the
1206       // output tensor.
1207       foreachOp = rewriter.create<ForeachOp>(
1208           loc, input, iterArg,
1209           [&](OpBuilder &builder, Location loc, ValueRange dcvs, Value v,
1210               ValueRange reduc) {
1211             SmallVector<Value> offDimCrd(dcvs);
1212             offDimCrd[conDim] =
1213                 builder.create<arith::AddIOp>(loc, offDimCrd[conDim], offset);
1214 
1215             // Enters foreach, updates the SSA chain.
1216             dstBuf.val = reduc.front();
1217             if (!dstTp.isAllDense()) {
1218               Value cond = genIsNonzero(builder, loc, v);
1219               auto ifOp = builder.create<scf::IfOp>(loc, reduc.getTypes(), cond,
1220                                                     /*else*/ true);
1221               builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
1222               builder.create<scf::YieldOp>(loc, dstBuf.val);
1223 
1224               builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
1225               dstBuf.insert(builder, loc, v, offDimCrd);
1226               builder.create<scf::YieldOp>(loc, dstBuf.val);
1227 
              // Exits the ifOp, updates the sparse tensor SSA value.
              builder.setInsertionPointAfter(ifOp);
              dstBuf.val = ifOp.getResult(0);
            } else {
              dstBuf.insert(builder, loc, v, offDimCrd);
            }
            builder.create<sparse_tensor::YieldOp>(loc, dstBuf.val);
          });
      // Accumulates the offset. Note that only static-shaped inputs are
      // allowed by the concatenate op verifier, which saves us from computing
      // the offset dynamically.
1239       const Size sz = getSparseTensorType(input).getDynamicDimSize(conDim);
1240       assert(!ShapedType::isDynamic(sz));
1241       offset = rewriter.create<arith::AddIOp>(loc, offset,
1242                                               constantIndex(rewriter, loc, sz));
1243       iterArg = foreachOp.getResult(0);
1244       dstBuf.val = iterArg;
1245     }
1246 
1247     dstBuf.val = iterArg;
1248     Value ret = dstBuf.finalize(rewriter, loc, dstTp.getRankedTensorType());
1249     rewriter.replaceOp(op, ret);
1250     return success();
1251   }
1252 };
1253 
1254 struct DirectConvertRewriter : public OpRewritePattern<ConvertOp> {
1255   using OpRewritePattern::OpRewritePattern;
1256   LogicalResult matchAndRewrite(ConvertOp op,
1257                                 PatternRewriter &rewriter) const override {
1258     if (op.needsExtraSort())
1259       return op.emitError("ConvertOp not staged.");
1260 
1261     // TODO: Maybe we want a different operation for this too.
1262     auto encDst = getSparseTensorEncoding(op.getType());
1263     auto encSrc = getSparseTensorEncoding(op.getSource().getType());
1264     if (encDst && encSrc && !encSrc.isSlice() &&
1265         encSrc.withoutBitWidths() == encDst.withoutBitWidths()) {
1266       // Trivial tensor conversion and simple element type conversion are
1267       // handled in codegen.
1268       return failure();
1269     }
1270 
1271     Location loc = op.getLoc();
1272     Value src = op.getSource();
1273 
1274     SparseTensorType srcStt = getSparseTensorType(op.getSource());
1275     SparseTensorType dstStt = getSparseTensorType(op.getDest());
1276 
1277     bool fromSparseConst = false;
1278     if (auto constOp = op.getSource().getDefiningOp<arith::ConstantOp>())
1279       if (isa<SparseElementsAttr>(constOp.getValue()))
1280         fromSparseConst = true;
1281 
1282     const AffineMapAttr foreachOrder =
1283         (!dstStt.isIdentity() && fromSparseConst)
1284             ? AffineMapAttr::get(dstStt.getExpandedDimToLvl())
1285             : nullptr;
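    // A sparse constant stores its elements in dimension order. When the
    // destination has a non-identity dimToLvl map, the foreach order above
    // asks the loop to visit the elements in level order instead, so that
    // insertions arrive already sorted.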
1286 
1287     bool skipZeroCheck = srcStt.hasEncoding() || fromSparseConst;
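    // A sparse source or a sparse constant enumerates stored (specified)
    // elements only, so the nonzero test guarding each insertion below can
    // be skipped.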
1288 
1289     SmallVector<Value> sizes;
1290     sizesFromSrc(rewriter, sizes, loc, src);
1292     TensorLike dstBuf(rewriter, loc, dstStt.getRankedTensorType(), sizes);
1293 
1294     auto foreachOp = rewriter.create<ForeachOp>(
1295         loc, src, dstBuf.val, foreachOrder,
1296         [&](OpBuilder &builder, Location loc, ValueRange dcvs, Value v,
1297             ValueRange reduc) {
1298           // Enters the loop; updates the SSA value for the insertion chain.
1299           dstBuf.val = reduc.front();
1300           if (!skipZeroCheck) {
1301             Value cond = genIsNonzero(builder, loc, v);
1302             auto ifOp = builder.create<scf::IfOp>(loc, reduc.getTypes(), cond,
1303                                                   /*else*/ true);
1304             builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
1305             builder.create<scf::YieldOp>(loc, dstBuf.val);
1306 
1307             builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
1308             dstBuf.insert(builder, loc, v, dcvs);
1309             builder.create<scf::YieldOp>(loc, dstBuf.val);
1310 
1311             // Exits the ifOp; updates the sparse tensor SSA value.
1312             builder.setInsertionPointAfter(ifOp);
1313             dstBuf.val = ifOp.getResult(0);
1314           } else {
1315             dstBuf.insert(builder, loc, v, dcvs);
1316           }
1317           builder.create<sparse_tensor::YieldOp>(loc, dstBuf.val);
1318         });
1319 
1320     rewriter.setInsertionPointAfter(foreachOp);
1321 
1322     // Exits the for loop, links the SSA chain.
1323     dstBuf.val = foreachOp.getResult(0);
1324 
1325     Value ret = dstBuf.finalize(rewriter, loc, dstStt.getRankedTensorType());
1326     rewriter.replaceOp(op, ret);
1327     return success();
1328   }
1329 };
1330 
1331 struct CrdTranslateRewriter : public OpRewritePattern<CrdTranslateOp> {
1332   using OpRewritePattern::OpRewritePattern;
1333   LogicalResult matchAndRewrite(CrdTranslateOp op,
1334                                 PatternRewriter &rewriter) const override {
1335     AffineMap map = op.getDirection() == CrdTransDirectionKind::dim2lvl
1336                         ? op.getEncoder().getDimToLvl()
1337                         : op.getEncoder().getLvlToDim();
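    // For example (a 2x2 block map, shown for illustration only), translating
    // coordinates under
    //   (d0, d1) -> (d0 floordiv 2, d1 floordiv 2, d0 mod 2, d1 mod 2)
    // in the dim2lvl direction expands below into one affine.apply per result:
    //   %l0 = affine.apply affine_map<(d0, d1) -> (d0 floordiv 2)>(%d0, %d1)
    //   %l1 = affine.apply affine_map<(d0, d1) -> (d1 floordiv 2)>(%d0, %d1)
    //   ... etc.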
1338 
1339     SmallVector<Value> outCrds;
1340     for (AffineExpr result : map.getResults()) {
1341       // TODO: we should probably expand the affine map to IR using our own
1342       // rules, since affine.apply assumes signed values, while the
1343       // coordinates we provide must always be signless.
1344       Value trans = rewriter.create<affine::AffineApplyOp>(
1345           op.getLoc(), AffineMap::get(map.getNumDims(), 0, result),
1346           op.getInCrds());
1347       outCrds.push_back(trans);
1348     }
1349     rewriter.replaceOp(op, outCrds);
1350     return success();
1351   }
1352 };
1353 
1354 /// Sparse rewriting rule for the foreach operator.
1355 struct ForeachRewriter : public OpRewritePattern<ForeachOp> {
1356 public:
1357   using OpRewritePattern::OpRewritePattern;
1358 
1359   LogicalResult matchAndRewrite(ForeachOp op,
1360                                 PatternRewriter &rewriter) const override {
1361 
1362     auto loc = op.getLoc();
1363     Value input = op.getTensor();
1364     SmallVector<Value> reduc = op.getInitArgs();
1365     const auto stt = getSparseTensorType(input);
1366     const Level lvlRank = stt.getLvlRank();
1367 
1368     // Special case: foreach over a sparse constant uses its own rewriting
1369     // rule.
1370     if (auto constOp = input.getDefiningOp<arith::ConstantOp>()) {
1371       if (auto attr = dyn_cast<SparseElementsAttr>(constOp.getValue())) {
1372         return genForeachOnSparseConstant(op, rewriter, attr);
1373       }
1374     }
1375 
1376     // Otherwise, use loop emitter to generate loops.
1377     const auto enc = stt.getEncoding();
1378 
1379     // 1. Generates loop for the sparse input.
1380     LoopEmitter loopEmitter(
1381         ValueRange{input},
1382         StringAttr::get(getContext(), ForeachOp::getOperationName()));
1383     loopEmitter.initializeLoopEmit(rewriter, loc);
1384     for (Level l = 0; l < lvlRank; l++) {
1385       // TODO: provide a utility function for loop sequences that contain
1386       // only a single for loop?
1387       const SmallVector<TensorLevel, 1> tidLvls{
1388           loopEmitter.makeTensorLevel(0, l)};
1389       loopEmitter.enterNewLoopSeq(rewriter, loc, tidLvls);
1390       // Note that reduc will be taken care of by the loop emitter and gets
1391       // updated in place.
1392       loopEmitter.enterCoIterationOverTensorsAtLvls(rewriter, loc, tidLvls,
1393                                                     reduc);
1394     }
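    // At this point the emitter has produced a nested loop over the levels,
    // roughly of the following shape (a sketch; the actual loop kinds depend
    // on the level types):
    //   scf.for %l0 = ... {
    //     ...
    //       scf.for %lN = ... { <the foreach body is inlined here below> }
    //   }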
1395 
1396     SmallVector<Value> lcvs = loopEmitter.getLoopIVs();
1397     if (op.getOrder()) {
1398       // TODO: Support it so that we can do direct conversion from CSR->BSR.
1399       llvm_unreachable(
1400           "Level order not yet implemented on non-constant input tensors.");
1401     }
1402 
1403     Value vals = loopEmitter.getValBuffer()[0];
1404     SmallVector<Value> pos = loopEmitter.getValPosits(0);
1405     // Loads the value from a sparse tensor using the position index;
1406     // loads the value from a dense tensor using the coordinates.
1407     Value val = enc ? rewriter.create<memref::LoadOp>(loc, vals, pos)
1408                     : rewriter.create<memref::LoadOp>(loc, vals, lcvs);
1409 
1410     // 2. Inline the block in the foreach operator.
1411     Block *srcBlock = op.getBody();
1412 
1413     // Remap coordinates.
1414     SmallVector<Value> args =
1415         enc.translateCrds(rewriter, loc, lcvs, CrdTransDirectionKind::lvl2dim);
1416 
1417     // Remap value.
1418     args.push_back(val);
1419     // Remap reduction variables.
1420     args.append(reduc);
1421 
1422     // Remove sparse_tensor.yield.
1423     SmallVector<Value> reducValue = srcBlock->getTerminator()->getOperands();
1424     rewriter.eraseOp(srcBlock->getTerminator());
1425 
1426     Operation &last = rewriter.getBlock()->back();
1427     if (llvm::isa<scf::YieldOp>(last)) {
1428       // Because `scf.for` inserts an implicit yield op upon creation when
1429       // there is no reduction variable, we reset the insertion point such
1430       // that the block is inlined before the yield op.
1431       rewriter.setInsertionPoint(&last);
1432     }
1433 
1434     rewriter.inlineBlockBefore(srcBlock, rewriter.getBlock(),
1435                                rewriter.getInsertionPoint(), args);
1436     rewriter.setInsertionPointToEnd(rewriter.getBlock());
1437     for (Level l = 0; l < lvlRank; l++) {
1438       // Link the reduction chain. Note that the loop emitter updates
1439       // reducValue in place.
1440       loopEmitter.exitCurrentLoop(rewriter, loc, reducValue);
1441       loopEmitter.exitCurrentLoopSeq(rewriter, loc);
1442     }
1443 
1444     // Replace the foreach operator with the value returned by the outermost
1445     // for loop.
1446     rewriter.replaceOp(op, reducValue);
1447     return success();
1448   }
1449 };
1450 
1451 /// Sparse rewriting rule for the new operator.
1452 struct NewRewriter : public OpRewritePattern<NewOp> {
1453   using OpRewritePattern::OpRewritePattern;
1454   LogicalResult matchAndRewrite(NewOp op,
1455                                 PatternRewriter &rewriter) const override {
1456     Location loc = op.getLoc();
1457     auto stt = getSparseTensorType(op.getResult());
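    // Only rewrite when reading requires a change of format; a dense result,
    // or one that already is AoS COO from the top level, needs no intermediate
    // COO conversion.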
1458     if (!stt.hasEncoding() || stt.getAoSCOOStart() == 0)
1459       return failure();
1460 
1461     // Implement the NewOp as follows:
1462     //   %orderedCoo = sparse_tensor.new %filename
1463     //   %t = sparse_tensor.convert %orderedCoo
1464     // with enveloping reinterpret_map ops for non-permutations.
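    //
    // For example (CSR result, a sketch for illustration only):
    //   %t = sparse_tensor.new %src : !Source to tensor<?x?xf64, #CSR>
    // becomes
    //   %coo = sparse_tensor.new %src : !Source to tensor<?x?xf64, #SortedCOO>
    //   %t = sparse_tensor.convert %coo
    //      : tensor<?x?xf64, #SortedCOO> to tensor<?x?xf64, #CSR>
    // followed by a dealloc of the temporary %coo.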
1465     RankedTensorType dstTp = stt.getRankedTensorType();
1466     RankedTensorType cooTp = stt.getCOOType(/*ordered=*/true);
1467     Value cooTensor = rewriter.create<NewOp>(loc, cooTp, op.getSource());
1468     Value convert = cooTensor;
1469     auto enc = stt.getEncoding();
1470     if (!stt.isPermutation()) { // demap coo, demap dstTp
1471       auto coo = getSparseTensorType(cooTensor).getEncoding().withoutDimToLvl();
1472       convert = rewriter.create<ReinterpretMapOp>(loc, coo, convert);
1473       dstTp = getSparseTensorType(convert).withEncoding(enc.withoutDimToLvl());
1474     }
1475     convert = rewriter.create<ConvertOp>(loc, dstTp, convert);
1476     if (!stt.isPermutation()) // remap to original enc
1477       convert = rewriter.create<ReinterpretMapOp>(loc, enc, convert);
1478     rewriter.replaceOp(op, convert);
1479 
1480     // Release the temporary ordered COO tensor.
1481     rewriter.setInsertionPointAfterValue(convert);
1482     rewriter.create<DeallocTensorOp>(loc, cooTensor);
1483 
1484     return success();
1485   }
1486 };
1487 
1488 /// Sparse rewriting rule for the out operator.
1489 struct OutRewriter : public OpRewritePattern<OutOp> {
1490   using OpRewritePattern::OpRewritePattern;
1491   LogicalResult matchAndRewrite(OutOp op,
1492                                 PatternRewriter &rewriter) const override {
1493     Location loc = op.getLoc();
1494     // Calculate NNZ.
1495     Value src = op.getTensor();
1496     Value nnz = rewriter.create<NumberOfEntriesOp>(loc, src);
1497 
1498     // Allocate a temporary buffer for storing dimension sizes/coordinates.
1499     const auto srcTp = getSparseTensorType(src);
1500     const Dimension dimRank = srcTp.getDimRank();
1501     Type indexTp = rewriter.getIndexType();
1502     Value dimSizes = genAlloca(rewriter, loc, dimRank, indexTp);
1503 
1504     // Generate code to calculate the dimension size values and store them
1505     // into the buffer.
1506     SmallVector<Value> dims;
1507     sizesForTensor(rewriter, dims, loc, srcTp, src);
1508     for (Dimension d = 0; d < dimRank; d++) {
1509       rewriter.create<memref::StoreOp>(loc, dims[d], dimSizes,
1510                                        constantIndex(rewriter, loc, d));
1511     }
1512 
1513     // Create a sparse tensor writer and output the metadata.
1514     Type opaqueTp = getOpaquePointerType(rewriter);
1515     Value writer =
1516         createFuncCall(rewriter, loc, "createSparseTensorWriter", {opaqueTp},
1517                        {op.getDest()}, EmitCInterface::Off)
1518             .getResult(0);
1519     Value rankValue = constantIndex(rewriter, loc, dimRank);
1520     createFuncCall(rewriter, loc, "outSparseTensorWriterMetaData", {},
1521                    {writer, rankValue, nnz, dimSizes}, EmitCInterface::On);
1522 
1523     Value dimCoords = dimSizes; // Reuse the dimSizes buffer for dimCoords.
1524     Type eltTp = srcTp.getElementType();
1525     SmallString<29> outNextFuncName{"outSparseTensorWriterNext",
1526                                     primaryTypeFunctionSuffix(eltTp)};
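    // E.g., the callee is "outSparseTensorWriterNextF64" when the element
    // type is f64.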
1527     Value value = genAllocaScalar(rewriter, loc, eltTp);
1528     ModuleOp module = op->getParentOfType<ModuleOp>();
1529 
1530     // For each element in the source tensor, output the element.
1531     rewriter.create<ForeachOp>(
1532         loc, src, std::nullopt,
1533         [&](OpBuilder &builder, Location loc, ValueRange dcvs, Value v,
1534             ValueRange reduc) {
1535           for (Dimension d = 0; d < dimRank; d++) {
1536             builder.create<memref::StoreOp>(loc, dcvs[d], dimCoords,
1537                                             constantIndex(builder, loc, d));
1538           }
1539           builder.create<memref::StoreOp>(loc, v, value);
1540           SmallVector<Value> operands{writer, rankValue, dimCoords, value};
1541           FlatSymbolRefAttr fn = getFunc(module, outNextFuncName, {}, operands,
1542                                          EmitCInterface::On);
1543           builder.create<func::CallOp>(loc, TypeRange(), fn, operands);
1544           builder.create<sparse_tensor::YieldOp>(loc);
1545         });
1546 
1547     // Release the writer.
1548     createFuncCall(rewriter, loc, "delSparseTensorWriter", {}, {writer},
1549                    EmitCInterface::Off);
1550 
1551     rewriter.eraseOp(op);
1552     return success();
1553   }
1554 };
1555 
1556 } // namespace
1557 
1558 //===---------------------------------------------------------------------===//
1559 // Methods that add patterns described in this file to a pattern list.
1560 //===---------------------------------------------------------------------===//
1561 
1562 void mlir::populatePreSparsificationRewriting(RewritePatternSet &patterns) {
1563   patterns.add<FuseExtractSliceWithConcat, FoldConvertIntoProducer,
1564                FoldInvariantYield, FuseSparseMultiplyOverAdd, FuseTensorCast,
1565                GenSemiRingReduction, GenSemiRingSelect, PrintRewriter>(
1566       patterns.getContext());
1567 }
1568 
1569 void mlir::populateLowerSparseOpsToForeachPatterns(RewritePatternSet &patterns,
1570                                                    bool enableRT,
1571                                                    bool enableConvert) {
1572   patterns.add<ConcatenateRewriter, ReshapeRewriter<tensor::ExpandShapeOp>,
1573                ReshapeRewriter<tensor::CollapseShapeOp>,
1574                Sparse2SparseReshapeRewriter<tensor::ExpandShapeOp>,
1575                Sparse2SparseReshapeRewriter<tensor::CollapseShapeOp>,
1576                SparseTensorDimOpRewriter, TensorReshapeRewriter, OutRewriter>(
1577       patterns.getContext());
1578 
1579   if (enableConvert)
1580     patterns.add<DirectConvertRewriter>(patterns.getContext());
1581   if (!enableRT)
1582     patterns.add<NewRewriter>(patterns.getContext());
1583 }
1584 
1585 void mlir::populateLowerForeachToSCFPatterns(RewritePatternSet &patterns) {
1586   // Run CrdTranslateRewriter later in the pipeline so that the operation
1587   // can be folded before lowering to affine.apply.
1588   patterns.add<CrdTranslateRewriter, ForeachRewriter>(patterns.getContext());
1589 }
1590