//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise loads/stores or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses and TLB misses,
// reduced use of hardware prefetch streams, and reduced false sharing. It is
// also necessary for hardware with explicitly managed levels in the memory
// hierarchy, where DMAs may have to be used. This optimization is often
// performed on already tiled code.
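//
// As a rough illustration (not the exact output of this pass), an access to a
// memref %A in the slow memory space within a loop may be rewritten to go
// through a buffer allocated in the fast memory space, with explicit copy
// loop nests generated around the compute loop, along these lines:
//
//   %buf = memref.alloc() : memref<256xf32, 2>   // fast memory space buffer
//   affine.for %i = 0 to 256 {                   // copy-in nest
//     %v = affine.load %A[%i] : memref<256xf32>
//     affine.store %v, %buf[%i] : memref<256xf32, 2>
//   }
//   affine.for %i = 0 to 256 {                   // original compute loop
//     %v = affine.load %buf[%i] : memref<256xf32, 2>
//     ...
//   }
//   memref.dealloc %buf : memref<256xf32, 2>
//
// When DMA generation is enabled (the generateDma option), the point-wise
// copy loops are realized with affine.dma_start/affine.dma_wait operations
// instead.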
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/Passes.h"

#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <optional>

namespace mlir {
#define GEN_PASS_DEF_AFFINEDATACOPYGENERATION
#include "mlir/Dialect/Affine/Passes.h.inc"
} // namespace mlir

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;

namespace {

/// Replaces all loads and stores on memrefs living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original loads/stores to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses the nesting structure, recursing to inner
/// levels if necessary to determine at what depth copies need to be placed so
/// that the allocated buffers fit within the memory capacity provided.
// TODO: We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public impl::AffineDataCopyGenerationBase<AffineDataCopyGeneration> {
  AffineDataCopyGeneration() = default;
  explicit AffineDataCopyGeneration(unsigned slowMemorySpace,
                                    unsigned fastMemorySpace,
                                    unsigned tagMemorySpace,
                                    int minDmaTransferSize,
                                    uint64_t fastMemCapacityBytes) {
    this->slowMemorySpace = slowMemorySpace;
    this->fastMemorySpace = fastMemorySpace;
    this->tagMemorySpace = tagMemorySpace;
    this->minDmaTransferSize = minDmaTransferSize;
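    // The fastMemoryCapacity pass option is expressed in KiB, so the byte
    // capacity supplied here is converted accordingly.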
    this->fastMemoryCapacity = fastMemCapacityBytes / 1024;
  }

  void runOnOperation() override;
  void runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // namespace

/// Generates copies for memrefs living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter. Only load ops are handled for now.
/// TODO: extend this to store ops.
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::createAffineDataCopyGenerationPass(unsigned slowMemorySpace,
                                         unsigned fastMemorySpace,
                                         unsigned tagMemorySpace,
                                         int minDmaTransferSize,
                                         uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}

std::unique_ptr<OperationPass<func::FuncOp>>
mlir::createAffineDataCopyGenerationPass() {
  return std::make_unique<AffineDataCopyGeneration>();
}
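
// For illustration, a hypothetical driver could schedule this pass on each
// function roughly as follows (a sketch only; 'pm' and 'ctx' are assumed to
// be set up by the caller, and the memory space IDs and sizes are just
// example values):
//
//   PassManager pm(ctx);
//   pm.addNestedPass<func::FuncOp>(createAffineDataCopyGenerationPass(
//       /*slowMemorySpace=*/0, /*fastMemorySpace=*/2, /*tagMemorySpace=*/3,
//       /*minDmaTransferSize=*/1024, /*fastMemCapacityBytes=*/16 * 1024));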

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// may have other affine.for ops nested within it).
void AffineDataCopyGeneration::runOnBlock(Block *block,
                                          DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return;

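  // The fast memory capacity option is expressed in KiB; its maximum value
  // acts as an "unlimited" sentinel and is passed through unchanged.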
  uint64_t fastMemCapacityBytes =
      fastMemoryCapacity != std::numeric_limits<uint64_t>::max()
          ? fastMemoryCapacity * 1024
          : fastMemoryCapacity;
  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for copying;
  // in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straightline code (a contiguous chunk of
  // operations excluding AffineForOps) is always assumed not to exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.
  // TODO: An 'affine.if' operation is currently treated like any other
  // operation; 'affine.if's could have 'affine.for's nested within them and
  // should be treated separately.
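  //
  // For example (illustrative only), a block of the form
  //   affine.load ...; affine.store ...   <- copy range 1
  //   affine.for ... { ... }              <- copy range 2 (may recurse inside)
  //   affine.load ...                     <- copy range 3 (up to terminator)
  // is split into three ranges, each handled by a separate call to
  // affineDataCopyGenerate.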

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin =
      std::find_if(block->begin(), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy 'for' loop, we split the range there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
      (void)affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                                   /*filterMemRef=*/std::nullopt, copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        std::optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp, /*memorySpace=*/0);
        return (footprint.has_value() &&
                static_cast<uint64_t>(*footprint) > fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than the
      // fast memory capacity (when provided), we recurse to copy at an inner
      // level until we find a depth at which the footprint fits in the fast
      // memory capacity. If the footprint can't be calculated, we assume for
      // now that it fits. When skipNonUnitStrideLoops is set, we recurse
      // inside if the step size is not one; otherwise, we recurse if the
      // footprint for 'forOp' exceeds the capacity.
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp':
        // recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'. Note
        // that for the latter, the copies are placed just before this loop (for
        // incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope, so we don't update the
        // consumed capacity. The footprint check above guarantees that this
        // inner loop's footprint fits.
        (void)affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it),
                                     copyOptions,
                                     /*filterMemRef=*/std::nullopt, copyNests);
      }
      // Get to the next load, store, or 'for' op after 'forOp' (that is not a
      // copy nest itself).
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->hasTrait<OpTrait::IsTerminator>() &&
           "can't be a terminator");
    // Exclude the affine.yield, hence the std::prev.
    (void)affineDataCopyGenerate(/*begin=*/curBegin,
                                 /*end=*/std::prev(block->end()), copyOptions,
                                 /*filterMemRef=*/std::nullopt, copyNests);
  }
}

void AffineDataCopyGeneration::runOnOperation() {
  func::FuncOp f = getOperation();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);

  // Nests that are copy-ins or copy-outs; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests and collect
  // load/stores to simplify.
  SmallVector<Operation *, 4> copyOps;
  for (Operation *nest : copyNests)
    // With a post order walk, the erasure of loops does not affect
    // continuation of the walk or the collection of load/store ops.
    nest->walk([&](Operation *op) {
      if (auto forOp = dyn_cast<AffineForOp>(op))
        (void)promoteIfSingleIteration(forOp);
      else if (isa<AffineLoadOp, AffineStoreOp>(op))
        copyOps.push_back(op);
    });

  // Promoting single-iteration loops could lead to simplification of the
  // contained loads/stores, and the latter can in any case also be
  // canonicalized.
  RewritePatternSet patterns(&getContext());
  AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
  AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
  FrozenRewritePatternSet frozenPatterns(std::move(patterns));
  GreedyRewriteConfig config;
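  // Restrict the rewrite to the collected load/store ops (and any ops created
  // while simplifying them) rather than to the entire function.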
  config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps;
  (void)applyOpPatternsAndFold(copyOps, frozenPatterns, config);
}