//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster, explicitly managed memory space, with the necessary
// data movement performed through either regular point-wise loads/stores or
// DMAs. Such explicit copying (also referred to as array packing/unpacking in
// the literature), when done on arrays that exhibit reuse, results in near
// elimination of conflict misses and TLB misses, reduced use of hardware
// prefetch streams, and reduced false sharing. It is also necessary for
// hardware with explicitly managed levels in the memory hierarchy, where DMAs
// may have to be used. This optimization is often performed on already tiled
// code.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/Passes.h"

#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <optional>

namespace mlir {
namespace affine {
#define GEN_PASS_DEF_AFFINEDATACOPYGENERATION
#include "mlir/Dialect/Affine/Passes.h.inc"
} // namespace affine
} // namespace mlir

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;
using namespace mlir::affine;

namespace {

/// Replaces all loads and stores on memrefs living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into 'fastMemorySpace' and
/// rewriting the original loads/stores to instead load/store from the
/// allocated fast memory buffers.
/// Additional options specify the identifier corresponding to the fast memory
/// space and the amount of fast memory space available. The pass traverses
/// through the nesting structure, recursing to inner levels if necessary to
/// determine at what depth copies need to be placed so that the allocated
/// buffers fit within the memory capacity provided.
// TODO: We currently can't generate copies correctly when stores are strided.
// Check for strided stores.
struct AffineDataCopyGeneration
    : public affine::impl::AffineDataCopyGenerationBase<
          AffineDataCopyGeneration> {
  AffineDataCopyGeneration() = default;
  explicit AffineDataCopyGeneration(unsigned slowMemorySpace,
                                    unsigned fastMemorySpace,
                                    unsigned tagMemorySpace,
                                    int minDmaTransferSize,
                                    uint64_t fastMemCapacityBytes) {
    this->slowMemorySpace = slowMemorySpace;
    this->fastMemorySpace = fastMemorySpace;
    this->tagMemorySpace = tagMemorySpace;
    this->minDmaTransferSize = minDmaTransferSize;
    this->fastMemoryCapacity = fastMemCapacityBytes / 1024;
  }

  void runOnOperation() override;
  void runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // namespace

/// Generates copies for memrefs living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter. Only load ops are handled for now.
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass(
    unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
    int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::affine::createAffineDataCopyGenerationPass() {
  return std::make_unique<AffineDataCopyGeneration>();
}

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine.for ops nested within).
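///
/// As an illustrative sketch (a hypothetical input block; not actual pass
/// input or output), a block of the form
///
///   %v = affine.load %A[%i] : memref<1024xf32>
///   affine.store %v, %B[%i] : memref<1024xf32>
///   affine.for %j = 0 to 1024 { ... }
///   %w = affine.load %C[%i] : memref<1024xf32>
///
/// is partitioned into three ranges: the leading load/store pair, the
/// affine.for op by itself, and the trailing load, with copies generated
/// independently for each range.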
void AffineDataCopyGeneration::runOnBlock(Block *block,
                                          DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return;

  uint64_t fastMemCapacityBytes =
      fastMemoryCapacity != std::numeric_limits<uint64_t>::max()
          ? fastMemoryCapacity * 1024
          : fastMemoryCapacity;
  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for
  // copying; in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straightline code (a contiguous chunk of
  // operations excluding AffineForOp's) is always assumed not to exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin =
      std::find_if(block->begin(), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy for loop, we split there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
      (void)affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                                   /*filterMemRef=*/std::nullopt, copyNests);

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        std::optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.has_value() &&
                static_cast<uint64_t>(*footprint) > fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than the
      // fast memory capacity (when provided), we recurse to copy at an inner
      // level until we find a depth at which the footprint fits in the fast
      // memory capacity. If the footprint can't be calculated, we assume for
      // now that it fits. Recurse inside if the footprint for 'forOp' exceeds
      // capacity, or when skipNonUnitStrideLoops is set and the step size is
      // not one.
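      // For example (illustrative numbers only): a nest touching a 128x128
      // tile of f32 values has a footprint of 128 * 128 * 4 bytes = 64 KiB;
      // with a 32 KiB fast memory capacity, it would exceed capacity and we
      // would recurse into the loop body.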
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'.
        // Note that for the latter, the copies are placed just before this
        // loop (for incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope - we thus don't update the
        // consumed capacity. The footprint check above guarantees this inner
        // loop's footprint fits.
        (void)affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it),
                                     copyOptions,
                                     /*filterMemRef=*/std::nullopt, copyNests);
      }
      // Get to the next load or store op after 'forOp'.
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->hasTrait<OpTrait::IsTerminator>() &&
           "can't be a terminator");
    // Exclude the affine.yield - hence, the std::prev.
    (void)affineDataCopyGenerate(/*begin=*/curBegin,
                                 /*end=*/std::prev(block->end()), copyOptions,
                                 /*filterMemRef=*/std::nullopt, copyNests);
  }
}
void AffineDataCopyGeneration::runOnOperation() {
  func::FuncOp f = getOperation();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);

  // Nests that are copy-ins or copy-outs; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests and collect
  // load/stores to simplify.
  SmallVector<Operation *, 4> copyOps;
  for (Operation *nest : copyNests)
    // With a post order walk, the erasure of loops does not affect
    // continuation of the walk or the collection of load/store ops.
    nest->walk([&](Operation *op) {
      if (auto forOp = dyn_cast<AffineForOp>(op))
        (void)promoteIfSingleIteration(forOp);
      else if (isa<AffineLoadOp, AffineStoreOp>(op))
        copyOps.push_back(op);
    });

  // Promoting single iteration loops could lead to simplification of the
  // contained loads/stores, and the latter could anyway also be canonicalized.
  RewritePatternSet patterns(&getContext());
  AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
  AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
  FrozenRewritePatternSet frozenPatterns(std::move(patterns));
  GreedyRewriteConfig config;
  config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps;
  (void)applyOpPatternsGreedily(copyOps, frozenPatterns, config);
}
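
// A minimal usage sketch (an illustrative assumption, not part of this file's
// API surface): the pass would typically be scheduled on functions via a pass
// manager, e.g.:
//
//   PassManager pm(module->getContext());
//   pm.addNestedPass<func::FuncOp>(
//       affine::createAffineDataCopyGenerationPass(
//           /*slowMemorySpace=*/0, /*fastMemorySpace=*/1,
//           /*tagMemorySpace=*/2, /*minDmaTransferSize=*/1024,
//           /*fastMemCapacityBytes=*/32 * 1024));
//   if (failed(pm.run(module)))
//     ... // handle the failure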