//===- Promotion.cpp - Implementation of linalg Promotion -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the linalg dialect Promotion pass.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/Complex/IR/Complex.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineExprVisitor.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/FoldUtils.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"

using namespace mlir;
using namespace mlir::linalg;
using namespace mlir::scf;

using llvm::MapVector;

#define DEBUG_TYPE "linalg-promotion"

/// Allocate a new buffer of `size` * `width` i8, where `width` is the byte
/// size of `elementType` given by the data `layout`.
/// Use AllocOp or AllocaOp depending on `options`.
/// Take an optional alignment.
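/// As an illustrative sketch (not verbatim output), promoting 512 f32
/// elements with a 128-byte alignment through the static path produces
/// roughly:
///
///   %buf = memref.alloc() {alignment = 128 : i64} : memref<2048xi8>
///
/// while a dynamic `allocSize` takes the fallback path:
///
///   %c4 = arith.constant 4 : index
///   %bytes = arith.muli %c4, %size : index
///   %buf = memref.alloc(%bytes) {alignment = 128 : i64} : memref<?xi8>
///
/// (memref.alloca is used instead when `options.useAlloca` is set.)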
static Value allocBuffer(ImplicitLocOpBuilder &b,
                         const LinalgPromotionOptions &options,
                         Type elementType, Value allocSize, DataLayout &layout,
                         std::optional<unsigned> alignment = std::nullopt) {
  auto width = layout.getTypeSize(elementType);

  IntegerAttr alignmentAttr;
  if (alignment.has_value())
    alignmentAttr = b.getI64IntegerAttr(alignment.value());

  // Static buffer.
  if (auto cst = allocSize.getDefiningOp<arith::ConstantIndexOp>()) {
    auto staticBufferType =
        MemRefType::get(width * cst.value(), b.getIntegerType(8));
    if (options.useAlloca) {
      return b.createOrFold<memref::AllocaOp>(staticBufferType, ValueRange{},
                                              alignmentAttr);
    }
    return b.createOrFold<memref::AllocOp>(staticBufferType, ValueRange{},
                                           alignmentAttr);
  }

  // Fallback dynamic buffer.
  auto dynamicBufferType =
      MemRefType::get(ShapedType::kDynamic, b.getIntegerType(8));
  Value mul = b.createOrFold<arith::MulIOp>(
      b.create<arith::ConstantIndexOp>(width), allocSize);
  if (options.useAlloca)
    return b.create<memref::AllocaOp>(dynamicBufferType, mul, alignmentAttr);
  return b.create<memref::AllocOp>(dynamicBufferType, mul, alignmentAttr);
}

/// Default allocation callback function. This allocates a promoted buffer when
/// no callback to do so is provided. The default is to allocate a flat
/// memref<..xi8> buffer and return a view of it with shape
/// `boundingSubViewSize` and the element type of `subView`.
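/// As a rough sketch (assuming a 2-D f32 subview with dynamic bounding sizes
/// %sz0 and %sz1; constant sizes instead fold into a static buffer), the IR
/// produced looks like:
///
///   %c0 = arith.constant 0 : index
///   %prod = arith.muli %sz0, %sz1 : index
///   %c4 = arith.constant 4 : index
///   %bytes = arith.muli %c4, %prod : index
///   %buf = memref.alloc(%bytes) : memref<?xi8>
///   %view = memref.view %buf[%c0][%sz0, %sz1]
///             : memref<?xi8> to memref<?x?xf32>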
static std::optional<Value> defaultAllocBufferCallBack(
    const LinalgPromotionOptions &options, OpBuilder &builder,
    memref::SubViewOp subView, ArrayRef<Value> boundingSubViewSize,
    std::optional<unsigned> alignment, DataLayout &layout) {
  ShapedType viewType = subView.getType();
  ImplicitLocOpBuilder b(subView.getLoc(), builder);
  auto zero = b.createOrFold<arith::ConstantIndexOp>(0);
  auto one = b.createOrFold<arith::ConstantIndexOp>(1);

  Value allocSize = one;
  for (const auto &size : llvm::enumerate(boundingSubViewSize))
    allocSize = b.createOrFold<arith::MulIOp>(allocSize, size.value());
  Value buffer = allocBuffer(b, options, viewType.getElementType(), allocSize,
                             layout, alignment);
  SmallVector<int64_t, 4> dynSizes(boundingSubViewSize.size(),
                                   ShapedType::kDynamic);
  Value view = b.createOrFold<memref::ViewOp>(
      MemRefType::get(dynSizes, viewType.getElementType()), buffer, zero,
      boundingSubViewSize);
  return view;
}

/// Default implementation of the deallocation of the buffer used for
/// promotion. It expects to get the same value that the default allocation
/// method returned, i.e., the result of a ViewOp.
static LogicalResult
defaultDeallocBufferCallBack(const LinalgPromotionOptions &options,
                             OpBuilder &b, Value fullLocalView) {
  if (!options.useAlloca) {
    auto viewOp = cast<memref::ViewOp>(fullLocalView.getDefiningOp());
    b.create<memref::DeallocOp>(viewOp.getSource().getLoc(),
                                viewOp.getSource());
  }
  return success();
}

namespace {

/// Helper struct that captures the information required to apply the
/// transformation on each op. This bridges the abstraction gap with the
/// user-facing API, which exposes positional arguments to control which
/// operands are promoted.
struct LinalgOpInstancePromotionOptions {
  LinalgOpInstancePromotionOptions(LinalgOp op,
                                   const LinalgPromotionOptions &options);
  /// SubViews to promote.
  MapVector<int64_t, Value> subViews;
  /// True if the full view should be used for the promoted buffer.
  DenseMap<Value, bool> useFullTileBuffers;

  /// Callback functions for allocation and deallocation of promoted buffers,
  /// as well as to copy the data into and out of these buffers.
  AllocBufferCallbackFn allocationFn;
  DeallocBufferCallbackFn deallocationFn;
  CopyCallbackFn copyInFn;
  CopyCallbackFn copyOutFn;

  /// Alignment of promoted buffer.
  std::optional<unsigned> alignment;
};
} // namespace

LinalgOpInstancePromotionOptions::LinalgOpInstancePromotionOptions(
    LinalgOp linalgOp, const LinalgPromotionOptions &options)
    : subViews(), alignment(options.alignment) {
  assert(linalgOp.hasBufferSemantics() && "revisit usage of shaped operand");
  auto vUseFullTileBuffers =
      options.useFullTileBuffers.value_or(llvm::SmallBitVector());
  vUseFullTileBuffers.resize(linalgOp->getNumOperands(),
                             options.useFullTileBuffersDefault);

  for (OpOperand &opOperand : linalgOp->getOpOperands()) {
    int64_t operandNumber = opOperand.getOperandNumber();
    if (options.operandsToPromote &&
        !options.operandsToPromote->count(operandNumber))
      continue;
    Operation *op = opOperand.get().getDefiningOp();
    if (auto sv = dyn_cast_or_null<memref::SubViewOp>(op)) {
      subViews[operandNumber] = sv;
      useFullTileBuffers[sv] = vUseFullTileBuffers[operandNumber];
    }
  }

  if (options.allocationFn) {
    allocationFn = *options.allocationFn;
  } else {
    allocationFn = [&](OpBuilder &b, memref::SubViewOp subViewOp,
                       ArrayRef<Value> boundingSubViewSize,
                       DataLayout &layout) -> std::optional<Value> {
      return defaultAllocBufferCallBack(options, b, subViewOp,
                                        boundingSubViewSize, alignment, layout);
    };
  }

  if (options.deallocationFn) {
    deallocationFn = *options.deallocationFn;
  } else {
    deallocationFn = [&](OpBuilder &b, Value buffer) {
      return defaultDeallocBufferCallBack(options, b, buffer);
    };
  }

  // Save the loc because `linalgOp` goes out of scope.
  Location loc = linalgOp.getLoc();
  auto defaultCopyCallBack = [loc](OpBuilder &b, Value src,
                                   Value dst) -> LogicalResult {
    b.create<memref::CopyOp>(loc, src, dst);
    return success();
  };
  copyInFn = (options.copyInFn ? *(options.copyInFn) : defaultCopyCallBack);
  copyOutFn = (options.copyOutFn ? *(options.copyOutFn) : defaultCopyCallBack);
}

// Performs promotion of a `subView` into a local buffer of the size of the
// *ranges* of the `subView`. This produces a buffer whose size may be bigger
// than the actual size of the `subView` at the boundaries.
// This is related to the full/partial tile problem.
// Returns a PromotionInfo containing a `buffer`, `fullLocalView` and
// `partialLocalView` such that:
//   * `buffer` is always the size of the full tile.
//   * `fullLocalView` is a dense contiguous view into that buffer.
//   * `partialLocalView` is a dense non-contiguous slice of `fullLocalView`
//     that corresponds to the size of `subView`, accounting for boundary
//     effects.
// The point of the full tile buffer is that constant static tile sizes are
// folded and result in a buffer type with statically known size and alignment
// properties.
// To account for general boundary effects, padding must be performed on the
// boundary tiles. For now this is done with an unconditional `fill` op followed
// by a partial `copy` op.
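//
// As a rough sketch for a rank-2 subview (names and types are illustrative,
// not verbatim output):
//
//   %full = <result of allocationFn>                       // fullLocalView
//   %d0 = memref.dim %subview, %c0 : memref<?x?xf32, strided<[?, ?], offset: ?>>
//   %d1 = memref.dim %subview, %c1 : memref<?x?xf32, strided<[?, ?], offset: ?>>
//   %partial = memref.subview %full[0, 0] [%d0, %d1] [1, 1]
//       : memref<?x?xf32> to memref<?x?xf32, strided<[?, 1]>>  // partialLocalView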
FailureOr<PromotionInfo> mlir::linalg::promoteSubviewAsNewBuffer(
    OpBuilder &b, Location loc, memref::SubViewOp subView,
    const AllocBufferCallbackFn &allocationFn, DataLayout &layout) {
  auto viewType = subView.getType();
  auto rank = viewType.getRank();
  SmallVector<Value, 4> fullSizes;
  SmallVector<OpFoldResult> partialSizes;
  fullSizes.reserve(rank);
  partialSizes.reserve(rank);
  llvm::SmallBitVector droppedDims = subView.getDroppedDims();
  int64_t resultDimIdx = 0;
  for (const auto &en : llvm::enumerate(subView.getOrCreateRanges(b, loc))) {
    if (droppedDims[en.index()])
      continue;
    auto rangeValue = en.value();
    // Try to extract a tight constant. If the size is known statically, no need
    // to look for the bound.
    LLVM_DEBUG(llvm::dbgs() << "Extract tightest: " << rangeValue.size << "\n");
    Value size;
    if (auto attr = rangeValue.size.dyn_cast<Attribute>()) {
      size = getValueOrCreateConstantIndexOp(b, loc, rangeValue.size);
    } else {
      Value materializedSize =
          getValueOrCreateConstantIndexOp(b, loc, rangeValue.size);
      FailureOr<int64_t> upperBound =
          getConstantUpperBoundForIndex(materializedSize);
      size = failed(upperBound)
                 ? materializedSize
                 : b.create<arith::ConstantIndexOp>(loc, *upperBound);
    }
    LLVM_DEBUG(llvm::dbgs() << "Extracted tightest: " << size << "\n");
    fullSizes.push_back(size);
    partialSizes.push_back(
        b.createOrFold<memref::DimOp>(loc, subView, resultDimIdx++));
  }
  SmallVector<int64_t, 4> dynSizes(fullSizes.size(), ShapedType::kDynamic);
  // Allocate the promoted buffer through the allocation callback (the default
  // implementation is used when the caller did not provide one).
  std::optional<Value> fullLocalView =
      allocationFn(b, subView, fullSizes, layout);
  if (!fullLocalView)
    return failure();
  SmallVector<OpFoldResult, 4> zeros(fullSizes.size(), b.getIndexAttr(0));
  SmallVector<OpFoldResult, 4> ones(fullSizes.size(), b.getIndexAttr(1));
  auto partialLocalView = b.createOrFold<memref::SubViewOp>(
      loc, *fullLocalView, zeros, partialSizes, ones);
  return PromotionInfo{*fullLocalView, partialLocalView};
}

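/// Promote the sub-views in `options` to new buffers: allocate each buffer,
/// optionally fill it (only when the full tile buffer is used), and copy the
/// original data in via `copyInFn`. A hedged sketch of the IR emitted per
/// promoted f32 operand (illustrative only):
///
///   %cst = arith.constant 0.000000e+00 : f32
///   linalg.fill ins(%cst : f32) outs(%full : memref<?x?xf32>)
///   memref.copy %subview, %partial : ... to ...   // default copyInFn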
static FailureOr<MapVector<int64_t, PromotionInfo>>
promoteSubViews(ImplicitLocOpBuilder &b,
                LinalgOpInstancePromotionOptions options, DataLayout &layout) {
  if (options.subViews.empty())
    return failure();

  MapVector<int64_t, PromotionInfo> promotionInfoMap;

  for (auto v : options.subViews) {
    memref::SubViewOp subView =
        cast<memref::SubViewOp>(v.second.getDefiningOp());
    auto promotionInfo = promoteSubviewAsNewBuffer(
        b, b.getLoc(), subView, options.allocationFn, layout);
    if (failed(promotionInfo))
      return failure();
    promotionInfoMap[v.first] = *promotionInfo;

    // Only fill the buffer if the full local view is used.
    if (!options.useFullTileBuffers[v.second])
      continue;
    Type subviewEltType = subView.getType().getElementType();
    Value fillVal =
        llvm::TypeSwitch<Type, Value>(subviewEltType)
            .Case([&](FloatType t) {
              return b.create<arith::ConstantOp>(FloatAttr::get(t, 0.0));
            })
            .Case([&](IntegerType t) {
              return b.create<arith::ConstantOp>(IntegerAttr::get(t, 0));
            })
            .Case([&](ComplexType t) {
              Value tmp;
              if (auto et = t.getElementType().dyn_cast<FloatType>())
                tmp = b.create<arith::ConstantOp>(FloatAttr::get(et, 0.0));
              else if (auto et = t.getElementType().cast<IntegerType>())
                tmp = b.create<arith::ConstantOp>(IntegerAttr::get(et, 0));
              return b.create<complex::CreateOp>(t, tmp, tmp);
            })
            .Default([](auto) { return Value(); });
    if (!fillVal)
      return failure();
    b.create<linalg::FillOp>(fillVal, promotionInfo->fullLocalView);
  }

  // Copy data into the promoted buffers. Use callback if provided.
  for (auto v : options.subViews) {
    auto info = promotionInfoMap.find(v.first);
    if (info == promotionInfoMap.end())
      continue;
    if (failed(options.copyInFn(
            b, cast<memref::SubViewOp>(v.second.getDefiningOp()),
            info->second.partialLocalView)))
      return failure();
  }
  return promotionInfoMap;
}

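/// Promote the sub-view operands of `op`: replace the promoted operands with
/// their full or partial local views, write back the promoted outputs, and
/// deallocate the temporary buffers. The surrounding IR ends up shaped roughly
/// like this hedged sketch (illustrative only):
///
///   <alloc + view (+ fill) + copy-in for each promoted operand>
///   <the original linalg op, now operating on the promoted views>
///   memref.copy %partial_out, %original_out_subview : ...  // copyOutFn default
///   memref.dealloc %buffer : memref<?xi8>                  // deallocationFn default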
static FailureOr<LinalgOp>
promoteSubViews(ImplicitLocOpBuilder &b, LinalgOp op,
                LinalgOpInstancePromotionOptions options, DataLayout &layout) {
  assert(op.hasBufferSemantics() && "expected linalg op with buffer semantics");

  // 1. Promote the specified views and use them in the new op.
  auto promotedBuffersAndViews = promoteSubViews(b, options, layout);
  if (failed(promotedBuffersAndViews) ||
      promotedBuffersAndViews->size() != options.subViews.size())
    return failure();

  // 2. Append all other operands as they appear; this enforces that such
  // operands are not views. This supports cases such as FillOp taking extra
  // scalars, etc. Keep a reference to the output buffers.
  SmallVector<Value, 8> opViews;
  opViews.reserve(op->getNumOperands());
  SmallVector<std::pair<Value, Value>, 8> writebackViews;
  writebackViews.reserve(promotedBuffersAndViews->size());
  for (OpOperand &opOperand : op->getOpOperands()) {
    int64_t operandNumber = opOperand.getOperandNumber();
    if (options.subViews.count(operandNumber) != 0) {
      if (options.useFullTileBuffers[opOperand.get()])
        opViews.push_back(
            (*promotedBuffersAndViews)[operandNumber].fullLocalView);
      else
        opViews.push_back(
            (*promotedBuffersAndViews)[operandNumber].partialLocalView);
      if (operandNumber >= op.getNumDpsInputs())
        writebackViews.emplace_back(std::make_pair(
            opOperand.get(),
            (*promotedBuffersAndViews)[operandNumber].partialLocalView));
    } else {
      opViews.push_back(opOperand.get());
    }
  }
  op->setOperands(0, opViews.size(), opViews);

  OpBuilder::InsertionGuard guard(b);
  b.setInsertionPointAfter(op);
  // 3. Emit write-back for the promoted output views: copy the partial view.
  for (auto viewAndPartialLocalView : writebackViews) {
    if (failed(options.copyOutFn(b, viewAndPartialLocalView.second,
                                 viewAndPartialLocalView.first)))
      return failure();
  }

  // 4. Dealloc all local buffers.
  for (const auto &pi : *promotedBuffersAndViews)
    (void)options.deallocationFn(b, pi.second.fullLocalView);
  return op;
}

LogicalResult
mlir::linalg::promoteSubviewsPrecondition(Operation *op,
                                          LinalgPromotionOptions options) {
  LinalgOp linalgOp = dyn_cast<LinalgOp>(op);
  // Transformation applies to buffers only.
  if (!linalgOp || !linalgOp.hasBufferSemantics())
    return failure();
  // Check that at least one of the requested operands is indeed a subview.
  for (OpOperand &opOperand : linalgOp->getOpOperands()) {
    auto sv =
        isa_and_nonnull<memref::SubViewOp>(opOperand.get().getDefiningOp());
    if (sv) {
      if (!options.operandsToPromote ||
          options.operandsToPromote->count(opOperand.getOperandNumber()))
        return success();
    }
  }
  // TODO: Check all subviews requested are bound by a static constant.
  // TODO: Check that the total footprint fits within a given size.
  return failure();
}

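// A minimal usage sketch (the surrounding pattern or pass, the `rewriter`, and
// the promoted operand indices are hypothetical, not part of this file):
//
//   LinalgPromotionOptions options;
//   options.setOperandsToPromote({0, 1}).setUseFullTileBuffers({true, true});
//   if (failed(promoteSubviewsPrecondition(linalgOp, options)))
//     return failure();
//   FailureOr<LinalgOp> promoted =
//       linalg::promoteSubViews(rewriter, linalgOp, options);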
FailureOr<LinalgOp>
mlir::linalg::promoteSubViews(OpBuilder &builder, LinalgOp linalgOp,
                              const LinalgPromotionOptions &options) {
  LinalgOpInstancePromotionOptions linalgOptions(linalgOp, options);
  auto layout = DataLayout::closest(linalgOp);
  ImplicitLocOpBuilder b(linalgOp.getLoc(), builder);
  auto res = ::promoteSubViews(b, linalgOp, linalgOptions, layout);
  if (failed(res))
    return failure();
  return res;
}

/// Allocate the given subview in a GPU memory address space by creating an
/// allocation operation and setting the memref type's address space attribute
/// to the desired address space.
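/// As a hedged sketch, promoting a statically bounded 32x32 f32 subview to
/// workgroup memory hoists an allocation to the function entry, roughly:
///
///   %buf = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
///
/// The private address space uses memref.alloca instead.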
static std::optional<Value> allocateSubviewGPUMemoryInAddressSpace(
    OpBuilder &builder, memref::SubViewOp subview, ArrayRef<Value> sizeBounds,
    gpu::AddressSpace addressSpace) {
  OpBuilder::InsertionGuard guard(builder);

  func::FuncOp funcOp = subview->getParentOfType<func::FuncOp>();
  if (!funcOp)
    return std::nullopt;

  // The subview size bounds are expected to be constant; they specify the shape
  // of the allocation.
  SmallVector<int64_t> shape;
  for (Value bound : sizeBounds) {
    APInt value;
    if (!matchPattern(bound, m_ConstantInt(&value)))
      return std::nullopt;
    shape.push_back(value.getSExtValue());
  }

  builder.setInsertionPoint(&funcOp.front(), funcOp.front().begin());
  auto type = MemRefType::get(
      shape, subview.getType().getElementType(), MemRefLayoutAttrInterface{},
      gpu::AddressSpaceAttr::get(builder.getContext(), addressSpace));
  Value buffer;
  if (addressSpace == gpu::GPUDialect::getWorkgroupAddressSpace()) {
    buffer = builder.create<memref::AllocOp>(funcOp.getLoc(), type);
  } else if (addressSpace == gpu::GPUDialect::getPrivateAddressSpace()) {
    buffer = builder.create<memref::AllocaOp>(funcOp.getLoc(), type);
  } else {
    return std::nullopt;
  }
  return buffer;
}

/// Allocate the subview in the GPU workgroup memory.
std::optional<Value> mlir::linalg::allocateWorkgroupMemory(
    OpBuilder &builder, memref::SubViewOp subview, ArrayRef<Value> sizeBounds,
    DataLayout &) {
  return allocateSubviewGPUMemoryInAddressSpace(
      builder, subview, sizeBounds,
      gpu::GPUDialect::getWorkgroupAddressSpace());
}

/// In the case of GPU workgroup memory there is no need to deallocate.
LogicalResult mlir::linalg::deallocateWorkgroupMemory(OpBuilder &,
                                                      Value /*buffer*/) {
  return success();
}

/// Create a memref copy operation, with gpu barrier guards before and after
/// the copy, to ensure data integrity.
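/// The emitted sequence is simply (types elided):
///
///   gpu.barrier
///   memref.copy %src, %dst : ... to ...
///   gpu.barrier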
LogicalResult mlir::linalg::copyToWorkgroupMemory(OpBuilder &b, Value src,
                                                  Value dst) {
  b.create<gpu::BarrierOp>(src.getLoc());
  Operation *copyOp = b.create<memref::CopyOp>(src.getLoc(), src, dst);
  b.create<gpu::BarrierOp>(copyOp->getLoc());
  return success();
}

/// Allocate the subview in the GPU private memory.
std::optional<Value> mlir::linalg::allocateGPUPrivateMemory(
    OpBuilder &builder, memref::SubViewOp subview, ArrayRef<Value> sizeBounds,
    DataLayout &) {
  return allocateSubviewGPUMemoryInAddressSpace(
      builder, subview, sizeBounds, gpu::GPUDialect::getPrivateAddressSpace());
}

/// Normal copy between src and dst.
LogicalResult mlir::linalg::copyToGPUPrivateMemory(OpBuilder &b, Value src,
                                                   Value dst) {
  b.create<memref::CopyOp>(src.getLoc(), src, dst);
  return success();
}

/// In the case of GPU private memory there is no need to deallocate, since the
/// memory is freed when it goes out of scope.
LogicalResult mlir::linalg::deallocateGPUPrivateMemory(OpBuilder &,
                                                       Value /*buffer*/) {
  return success();
}
486