//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise load/store's or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses and TLB misses, as
// well as reduced use of hardware prefetch streams and reduced false sharing.
// It is also necessary for hardware with explicitly managed levels in the
// memory hierarchy, where DMAs may have to be used. This optimization is often
// performed on already tiled code.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/Passes.h"

#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>
#include <optional>

namespace mlir {
#define GEN_PASS_DEF_AFFINEDATACOPYGENERATION
#include "mlir/Dialect/Affine/Passes.h.inc"
} // namespace mlir

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;

namespace {

/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original load's/store's to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses through the nesting structure, recursing to
/// inner levels if necessary to determine at what depth copies need to be
/// placed so that the allocated buffers fit within the memory capacity
/// provided.
// TODO: We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
    : public impl::AffineDataCopyGenerationBase<AffineDataCopyGeneration> {
  AffineDataCopyGeneration() = default;
  explicit AffineDataCopyGeneration(unsigned slowMemorySpace,
                                    unsigned fastMemorySpace,
                                    unsigned tagMemorySpace,
                                    int minDmaTransferSize,
                                    uint64_t fastMemCapacityBytes) {
    this->slowMemorySpace = slowMemorySpace;
    this->fastMemorySpace = fastMemorySpace;
    this->tagMemorySpace = tagMemorySpace;
    this->minDmaTransferSize = minDmaTransferSize;
    this->fastMemoryCapacity = fastMemCapacityBytes / 1024;
  }

  void runOnOperation() override;
  void runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

  // Constant zero index to avoid too many duplicates.
  Value zeroIndex = nullptr;
};

} // namespace

/// Generates copies for memref's living in 'slowMemorySpace' into newly
/// created buffers in 'fastMemorySpace', and replaces memory operations to the
/// former by the latter. Only load op's are handled for now.
/// TODO: extend this to store op's.
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::createAffineDataCopyGenerationPass(unsigned slowMemorySpace,
                                         unsigned fastMemorySpace,
                                         unsigned tagMemorySpace,
                                         int minDmaTransferSize,
                                         uint64_t fastMemCapacityBytes) {
  return std::make_unique<AffineDataCopyGeneration>(
      slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
      fastMemCapacityBytes);
}
std::unique_ptr<OperationPass<func::FuncOp>>
mlir::createAffineDataCopyGenerationPass() {
  return std::make_unique<AffineDataCopyGeneration>();
}

/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine.for op's nested within).
void AffineDataCopyGeneration::runOnBlock(Block *block,
                                          DenseSet<Operation *> &copyNests) {
  if (block->empty())
    return;

  uint64_t fastMemCapacityBytes =
      fastMemoryCapacity != std::numeric_limits<uint64_t>::max()
          ? fastMemoryCapacity * 1024
          : fastMemoryCapacity;
  AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                   fastMemorySpace, tagMemorySpace,
                                   fastMemCapacityBytes};

  // Every affine.for op in the block starts and ends a block range for
  // copying; in addition, a contiguous sequence of operations starting with a
  // load/store op but not including any copy nests themselves is also
  // identified as a copy block range. Straight-line code (a contiguous chunk
  // of operations excluding AffineForOp's) is always assumed to not exhaust
  // memory. As a result, this approach is conservative in some cases at the
  // moment; we do a check later and report an error with location info.
  // TODO: An 'affine.if' operation is currently treated like any other
  // operation; 'affine.if's could have 'affine.for's in them and should be
  // treated separately.

  // Get to the first load, store, or for op (that is not a copy nest itself).
  auto curBegin =
      std::find_if(block->begin(), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });

  // Create [begin, end) ranges.
  auto it = curBegin;
  while (it != block->end()) {
    AffineForOp forOp;
    // If we hit a non-copy for loop, we split there.
    if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
      // Perform the copying up until this 'for' op first.
      (void)affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
                                   /*filterMemRef=*/std::nullopt, copyNests);
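      // Note: the copy nests generated by the call above are recorded in
      // 'copyNests'; the range-forming find_if's in this function skip over
      // them so generated copies are not processed again.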

      // Returns true if the footprint is known to exceed capacity.
      auto exceedsCapacity = [&](AffineForOp forOp) {
        std::optional<int64_t> footprint =
            getMemoryFootprintBytes(forOp,
                                    /*memorySpace=*/0);
        return (footprint.has_value() &&
                static_cast<uint64_t>(*footprint) > fastMemCapacityBytes);
      };

      // If the memory footprint of the 'affine.for' loop is higher than the
      // fast memory capacity (when provided), we recurse to copy at an inner
      // level until we find a depth at which the footprint fits in the fast
      // memory capacity. If the footprint can't be calculated, we assume for
      // now that it fits. Recurse inside if the footprint for 'forOp' exceeds
      // capacity, or when skipNonUnitStrideLoops is set and the step size is
      // not one.
      bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
                                                 : exceedsCapacity(forOp);
      if (recurseInner) {
        // We'll recurse and do the copies at an inner level for 'forOp'.
        // Recurse onto the body of this loop.
        runOnBlock(forOp.getBody(), copyNests);
      } else {
        // We have enough capacity, i.e., copies will be computed for the
        // portion of the block until 'it', and for 'it', which is 'forOp'.
        // Note that for the latter, the copies are placed just before this
        // loop (for incoming copies) and right after (for outgoing ones).

        // Inner loop copies have their own scope - we thus don't update
        // consumed capacity. The footprint check above guarantees this inner
        // loop's footprint fits.
        (void)affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it),
                                     copyOptions,
                                     /*filterMemRef=*/std::nullopt, copyNests);
      }
      // Get to the next load or store op after 'forOp'.
      curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
        return isa<AffineLoadOp, AffineStoreOp, AffineForOp>(op) &&
               copyNests.count(&op) == 0;
      });
      it = curBegin;
    } else {
      assert(copyNests.count(&*it) == 0 &&
             "all copy nests generated should have been skipped above");
      // We simply include this op in the current range and continue for more.
      ++it;
    }
  }

  // Generate the copy for the final block range.
  if (curBegin != block->end()) {
    // Can't be a terminator because it would have been skipped above.
    assert(!curBegin->hasTrait<OpTrait::IsTerminator>() &&
           "can't be a terminator");
    // Exclude the affine.yield - hence, the std::prev.
    (void)affineDataCopyGenerate(/*begin=*/curBegin,
                                 /*end=*/std::prev(block->end()), copyOptions,
                                 /*filterMemRef=*/std::nullopt, copyNests);
  }
}

void AffineDataCopyGeneration::runOnOperation() {
  func::FuncOp f = getOperation();
  OpBuilder topBuilder(f.getBody());
  zeroIndex = topBuilder.create<arith::ConstantIndexOp>(f.getLoc(), 0);

  // Nests that are copy-in's or copy-out's; the root AffineForOps of those
  // nests are stored herein.
  DenseSet<Operation *> copyNests;

  // Clear recorded copy nests.
  copyNests.clear();

  for (auto &block : f)
    runOnBlock(&block, copyNests);

  // Promote any single iteration loops in the copy nests and collect
  // load/stores to simplify.
  SmallVector<Operation *, 4> copyOps;
  for (Operation *nest : copyNests)
    // With a post order walk, the erasure of loops does not affect
    // continuation of the walk or the collection of load/store ops.
    nest->walk([&](Operation *op) {
      if (auto forOp = dyn_cast<AffineForOp>(op))
        (void)promoteIfSingleIteration(forOp);
      else if (isa<AffineLoadOp, AffineStoreOp>(op))
        copyOps.push_back(op);
    });

  // Promoting single iteration loops could lead to simplification of
  // contained load's/store's, and the latter could anyway also be
  // canonicalized.
  RewritePatternSet patterns(&getContext());
  AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
  AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
  FrozenRewritePatternSet frozenPatterns(std::move(patterns));
  GreedyRewriteConfig config;
  config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps;
  (void)applyOpPatternsAndFold(copyOps, frozenPatterns, config);
}
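
// A minimal usage sketch (illustrative only, not compiled as part of this
// file): the pass is scheduled over func.func ops through the factory defined
// above. 'context' and 'module' are placeholder names, and the option values
// are arbitrary examples.
//
//   PassManager pm(&context);
//   pm.addNestedPass<func::FuncOp>(createAffineDataCopyGenerationPass(
//       /*slowMemorySpace=*/0, /*fastMemorySpace=*/1, /*tagMemorySpace=*/2,
//       /*minDmaTransferSize=*/1024,
//       /*fastMemCapacityBytes=*/32 * 1024));
//   (void)pm.run(module);
//
// From mlir-opt, the pass is reachable via its registered command-line
// argument, which coincides with the DEBUG_TYPE above
// ("affine-data-copy-generate"); see the generated pass documentation for the
// exact option names.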