//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise loads/stores or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses and TLB misses, reduced
// use of hardware prefetch streams, and reduced false sharing. It is also
// necessary for hardware with explicitly managed levels in the memory
// hierarchy, where DMAs may have to be used. This optimization is often
// performed on already tiled code.
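//
// As a rough illustration (a schematic sketch, not verbatim pass output), for
// an already tiled nest such as:
//
//   affine.for %i = 0 to 256 {
//     affine.for %j = 0 to 256 {
//       %v = affine.load %A[%i, %j] : memref<256x256xf32>
//       ...
//     }
//   }
//
// the pass allocates a buffer in the fast memory space (e.g. a
// memref<256x256xf32, 2>), emits an affine.for copy nest (or
// affine.dma_start/affine.dma_wait ops when DMA generation is enabled) to fill
// the buffer before the nest and write results back after it, and rewrites the
// enclosed loads/stores to address the fast buffer instead.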
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/Passes.h"

#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <optional>

namespace mlir {
namespace affine {
#define GEN_PASS_DEF_AFFINEDATACOPYGENERATION
#include "mlir/Dialect/Affine/Passes.h.inc"
} // namespace affine
} // namespace mlir

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;
using namespace mlir::affine;

namespace {

/// Replaces all loads and stores on memrefs living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original loads/stores to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses the nesting structure, recursing to
/// inner levels if necessary to determine at what depth copies need to be
/// placed so that the allocated buffers fit within the memory capacity
/// provided.
// TODO: We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public affine::impl::AffineDataCopyGenerationBase<
          AffineDataCopyGeneration> {
  AffineDataCopyGeneration() = default;
  explicit AffineDataCopyGeneration(unsigned slowMemorySpace,
                                    unsigned fastMemorySpace,
                                    unsigned tagMemorySpace,
                                    int minDmaTransferSize,
                                    uint64_t fastMemCapacityBytes) {
    this->slowMemorySpace = slowMemorySpace;
    this->fastMemorySpace = fastMemorySpace;
    this->tagMemorySpace = tagMemorySpace;
    this->minDmaTransferSize = minDmaTransferSize;
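    // The 'fastMemoryCapacity' pass option is kept in KiB (runOnBlock converts
    // it back to bytes), hence the division by 1024 here.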
    this->fastMemoryCapacity = fastMemCapacityBytes / 1024;
  }

  void runOnOperation() override;
  void runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // namespace

/// Generates copies for memrefs living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter. Only load ops are handled for now.
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass(
    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass() {
  return std::make_unique<AffineDataCopyGeneration>();
}
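
// A minimal usage sketch (assuming a pre-configured `PassManager pm`); the
// memory space and size arguments below are illustrative values, not defaults:
//
//   pm.addNestedPass<func::FuncOp>(affine::createAffineDataCopyGenerationPass(
//       /*slowMemorySpace=*/0, /*fastMemorySpace=*/2, /*tagMemorySpace=*/3,
//       /*minDmaTransferSize=*/1024, /*fastMemCapacityBytes=*/64 * 1024));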

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine.for ops nested within).
void AffineDataCopyGeneration::runOnBlock(Block *block,
                                          DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return;

  uint64_t fastMemCapacityBytes =
      fastMemoryCapacity != std::numeric_limits<uint64_t>::max()
          ? fastMemoryCapacity * 1024
          : fastMemoryCapacity;
  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};
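  // These options are forwarded as-is to the affineDataCopyGenerate() calls
  // below, which perform the actual fast buffer allocation and copy/DMA nest
  // generation.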

  // Every affine.for op in the block starts and ends a block range for copying;
  // in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straightline code (a contiguous chunk of
  // operations excluding AffineForOps) is always assumed to not exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin =
      std::find_if(block->begin(), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy 'for' loop, we split there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
      (void)affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                                   /*filterMemRef=*/std::nullopt, copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        std::optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.has_value() &&
                static_cast<uint64_t>(*footprint) > fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than fast
      // memory capacity (when provided), we recurse to copy at an inner level
      // until we find a depth at which footprint fits in fast mem capacity. If
      // the footprint can't be calculated, we assume for now it fits. Recurse
      // inside if footprint for 'forOp' exceeds capacity, or when
      // skipNonUnitStrideLoops is set and the step size is not one.
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'. Note
        // that for the latter, the copies are placed just before this loop (for
        // incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope - we thus don't update
        // consumed capacity. The footprint check above guarantees this inner
        // loop's footprint fits.
        (void)affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it),
                                     copyOptions,
                                     /*filterMemRef=*/std::nullopt, copyNests);
      }
      // Get to the next load or store op after 'forOp'.
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->hasTrait<OpTrait::IsTerminator>() &&
           "can't be a terminator");
    // Exclude the affine.yield - hence, the std::prev.
    (void)affineDataCopyGenerate(/*begin=*/curBegin,
                                 /*end=*/std::prev(block->end()), copyOptions,
                                 /*filterMemRef=*/std::nullopt, copyNests);
  }
}

void AffineDataCopyGeneration::runOnOperation() {
  func::FuncOp f = getOperation();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);

  // Nests that are copy-ins or copy-outs; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  // Clear recorded copy nests.
  copyNests.clear();

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests and collect
  // load/stores to simplify.
  SmallVector<Operation *, 4> copyOps;
  for (Operation *nest : copyNests)
    // With a post order walk, the erasure of loops does not affect
    // continuation of the walk or the collection of load/store ops.
    nest->walk([&](Operation *op) {
      if (auto forOp = dyn_cast<AffineForOp>(op))
        (void)promoteIfSingleIteration(forOp);
      else if (isa<AffineLoadOp, AffineStoreOp>(op))
        copyOps.push_back(op);
    });

  // Promoting single iteration loops could lead to simplification of
  // contained loads/stores, and the latter could anyway also be
  // canonicalized.
  RewritePatternSet patterns(&getContext());
  AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
  AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
  FrozenRewritePatternSet frozenPatterns(std::move(patterns));
  GreedyRewriteConfig config;
  config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps;
  (void)applyOpPatternsGreedily(copyOps, frozenPatterns, config);
}