1 //===- LoopTiling.cpp --- Loop tiling pass ------------------------------*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a pass to tile loop nests. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "mlir/Dialect/Affine/Passes.h" 14 15 #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" 16 #include "mlir/Dialect/Affine/Analysis/AffineStructures.h" 17 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" 18 #include "mlir/Dialect/Affine/Analysis/Utils.h" 19 #include "mlir/Dialect/Affine/IR/AffineOps.h" 20 #include "mlir/Dialect/Affine/IR/AffineValueMap.h" 21 #include "mlir/Dialect/Affine/LoopUtils.h" 22 #include "mlir/Dialect/Affine/Utils.h" 23 #include "mlir/Dialect/Func/IR/FuncOps.h" 24 #include "mlir/IR/Builders.h" 25 #include "mlir/IR/IRMapping.h" 26 #include "llvm/Support/CommandLine.h" 27 #include "llvm/Support/Debug.h" 28 #include <optional> 29 30 namespace mlir { 31 namespace affine { 32 #define GEN_PASS_DEF_AFFINELOOPTILING 33 #include "mlir/Dialect/Affine/Passes.h.inc" 34 } // namespace affine 35 } // namespace mlir 36 37 using namespace mlir; 38 using namespace mlir::affine; 39 40 #define DEBUG_TYPE "affine-loop-tile" 41 42 namespace { 43 44 /// A pass to perform loop tiling on all suitable loop nests of a Function. 45 struct LoopTiling : public affine::impl::AffineLoopTilingBase<LoopTiling> { 46 LoopTiling() = default; 47 explicit LoopTiling(uint64_t cacheSizeBytes, bool avoidMaxMinBounds = true) 48 : avoidMaxMinBounds(avoidMaxMinBounds) { 49 this->cacheSizeInKiB = cacheSizeBytes / 1024; 50 } 51 52 void runOnOperation() override; 53 void getTileSizes(ArrayRef<AffineForOp> band, 54 SmallVectorImpl<unsigned> *tileSizes); 55 56 // Default tile size if nothing is provided. 57 constexpr static unsigned kDefaultTileSize = 4; 58 59 // If true, tile sizes are set to avoid max/min in bounds if possible. 60 bool avoidMaxMinBounds = true; 61 }; 62 63 } // namespace 64 65 /// Creates a pass to perform loop tiling on all suitable loop nests of a 66 /// Function. 67 std::unique_ptr<OperationPass<func::FuncOp>> 68 mlir::affine::createLoopTilingPass(uint64_t cacheSizeBytes) { 69 return std::make_unique<LoopTiling>(cacheSizeBytes); 70 } 71 std::unique_ptr<OperationPass<func::FuncOp>> 72 mlir::affine::createLoopTilingPass() { 73 return std::make_unique<LoopTiling>(); 74 } 75 76 /// Reduces each tile size to the largest divisor of the corresponding trip 77 /// count (if the trip count is known). 78 static void adjustToDivisorsOfTripCounts(ArrayRef<AffineForOp> band, 79 SmallVectorImpl<unsigned> *tileSizes) { 80 assert(band.size() == tileSizes->size() && "invalid tile size count"); 81 for (unsigned i = 0, e = band.size(); i < e; i++) { 82 unsigned &tSizeAdjusted = (*tileSizes)[i]; 83 std::optional<uint64_t> mayConst = getConstantTripCount(band[i]); 84 if (!mayConst) 85 continue; 86 // Adjust the tile size to largest factor of the trip count less than 87 // tSize. 88 uint64_t constTripCount = *mayConst; 89 if (constTripCount > 1 && tSizeAdjusted > constTripCount / 2) 90 tSizeAdjusted = constTripCount / 2; 91 while (constTripCount % tSizeAdjusted != 0) 92 tSizeAdjusted--; 93 } 94 } 95 96 /// Checks whether hyper-rectangular loop tiling of the nest represented by 97 /// `origLoops` is valid. The validity condition is from Irigoin and Triolet, 98 /// which states that two tiles cannot depend on each other. We simplify such 99 /// condition to just checking whether there is any negative dependence 100 /// direction, since we have the prior knowledge that the tiling results will be 101 /// hyper-rectangles, which are scheduled in the lexicographically increasing 102 /// order on the vector of loop indices. This function will return failure when 103 /// any dependence component is negative along any of `origLoops`. 104 static bool checkTilingLegality(MutableArrayRef<AffineForOp> origLoops) { 105 assert(!origLoops.empty() && "no original loops provided"); 106 107 // We first find out all dependences we intend to check. 108 SmallVector<Operation *, 8> loadAndStoreOps; 109 origLoops[0]->walk([&](Operation *op) { 110 if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) 111 loadAndStoreOps.push_back(op); 112 }); 113 114 unsigned numOps = loadAndStoreOps.size(); 115 unsigned numLoops = origLoops.size(); 116 for (unsigned d = 1; d <= numLoops + 1; ++d) { 117 for (unsigned i = 0; i < numOps; ++i) { 118 Operation *srcOp = loadAndStoreOps[i]; 119 MemRefAccess srcAccess(srcOp); 120 for (unsigned j = 0; j < numOps; ++j) { 121 Operation *dstOp = loadAndStoreOps[j]; 122 MemRefAccess dstAccess(dstOp); 123 124 SmallVector<DependenceComponent, 2> depComps; 125 DependenceResult result = checkMemrefAccessDependence( 126 srcAccess, dstAccess, d, /*dependenceConstraints=*/nullptr, 127 &depComps); 128 129 // Skip if there is no dependence in this case. 130 if (!hasDependence(result)) 131 continue; 132 133 // Check whether there is any negative direction vector in the 134 // dependence components found above, which means that dependence is 135 // violated by the default hyper-rect tiling method. 136 LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated " 137 "for dependence at depth: " 138 << Twine(d) << " between:\n";); 139 LLVM_DEBUG(srcAccess.opInst->dump();); 140 LLVM_DEBUG(dstAccess.opInst->dump();); 141 for (unsigned k = 0, e = depComps.size(); k < e; k++) { 142 DependenceComponent depComp = depComps[k]; 143 if (depComp.lb.has_value() && depComp.ub.has_value() && 144 *depComp.lb < *depComp.ub && *depComp.ub < 0) { 145 LLVM_DEBUG(llvm::dbgs() 146 << "Dependence component lb = " << Twine(*depComp.lb) 147 << " ub = " << Twine(*depComp.ub) 148 << " is negative at depth: " << Twine(d) 149 << " and thus violates the legality rule.\n"); 150 return false; 151 } 152 } 153 } 154 } 155 } 156 157 return true; 158 } 159 160 // Returns tile sizes to use. Checks CL options; if none are specified, sets it 161 // based on a simple model that looks at the memory footprint and determines 162 // tile sizes assuming identity accesses / 1:1 tile size proportional footprint 163 // along each of the dimensions being tiled. 164 // TODO: evolve this model. Tile size determination is a large area 165 // to play with in general. 166 void LoopTiling::getTileSizes(ArrayRef<AffineForOp> band, 167 SmallVectorImpl<unsigned> *tileSizes) { 168 if (band.empty()) 169 return; 170 171 // Use command-line tileSize for all loops if specified. 172 if (tileSize) { 173 tileSizes->assign(band.size(), tileSize); 174 return; 175 } 176 177 // Use tileSizes and fill them with default tile size if it's short. 178 if (!this->tileSizes.empty()) { 179 tileSizes->assign(this->tileSizes.begin(), this->tileSizes.end()); 180 tileSizes->resize(band.size(), kDefaultTileSize); 181 return; 182 } 183 tileSizes->resize(band.size()); 184 185 // The first loop in the band. 186 AffineForOp rootForOp = band[0]; 187 (void)rootForOp; 188 189 // Obtain memory footprint and set tile sizes so that a tile fits in 190 // the cache size. This is an approximation with the assumption that the 191 // footprint increases with the tile size linearly in that dimension (i.e., 192 // assumes one-to-one access function). 193 std::optional<int64_t> fp = getMemoryFootprintBytes(band[0], 0); 194 if (!fp) { 195 // Fill with default tile sizes if footprint is unknown. 196 std::fill(tileSizes->begin(), tileSizes->end(), 197 LoopTiling::kDefaultTileSize); 198 if (avoidMaxMinBounds) 199 adjustToDivisorsOfTripCounts(band, tileSizes); 200 LLVM_DEBUG( 201 rootForOp.emitWarning("memory footprint unknown: using default tile " 202 "sizes adjusted to trip count divisors")); 203 return; 204 } 205 206 // Check how many times larger the cache size is when compared to footprint. 207 uint64_t cacheSizeBytes = cacheSizeInKiB * 1024; 208 uint64_t excessFactor = llvm::divideCeil(*fp, cacheSizeBytes); 209 if (excessFactor <= 1) { 210 // No need of any tiling - set tile size to 1. 211 std::fill(tileSizes->begin(), tileSizes->end(), 1); 212 return; 213 } 214 215 // Divide all loops equally in an attempt to reduce footprint. 216 // TODO: this is approximate. Ideally, obtain reuse factor / 217 // profitability along each dimension and weight tile sizes based on that as 218 // one possible approach. Or compute a polynomial in tile sizes and solve for 219 // it. 220 221 // For an n-d tileable band, compute the n^th root of the excess. 222 unsigned tSize = 223 static_cast<unsigned>(floorl(std::pow(excessFactor, 1.0 / band.size()))); 224 // We'll keep a running product to determine the last tile size better. 225 unsigned cumulProductOfTileSizes = 1; 226 for (unsigned i = 0, e = band.size(); i < e; i++) { 227 if (i < e - 1) 228 (*tileSizes)[i] = tSize; 229 else 230 // Set last tile size to cover the balance. 231 (*tileSizes)[i] = std::max( 232 1U, static_cast<unsigned>(excessFactor / cumulProductOfTileSizes)); 233 cumulProductOfTileSizes *= (*tileSizes)[i]; 234 } 235 if (avoidMaxMinBounds) 236 adjustToDivisorsOfTripCounts(band, tileSizes); 237 } 238 239 void LoopTiling::runOnOperation() { 240 // Bands of loops to tile. 241 std::vector<SmallVector<AffineForOp, 6>> bands; 242 getTileableBands(getOperation(), &bands); 243 244 // Tile each band. 245 for (auto &band : bands) { 246 if (!checkTilingLegality(band)) { 247 band.front().emitRemark("tiling code is illegal due to dependences"); 248 continue; 249 } 250 251 // Set up tile sizes; fill missing tile sizes at the end with default tile 252 // size or tileSize if one was provided. 253 SmallVector<unsigned, 6> tileSizes; 254 getTileSizes(band, &tileSizes); 255 if (llvm::DebugFlag) { 256 auto diag = band[0].emitRemark("using tile sizes ["); 257 for (unsigned tSize : tileSizes) 258 diag << tSize << ' '; 259 diag << "]\n"; 260 } 261 SmallVector<AffineForOp, 6> tiledNest; 262 if (failed(tilePerfectlyNested(band, tileSizes, &tiledNest))) { 263 // An empty band always succeeds. 264 assert(!band.empty() && "guaranteed to succeed on empty bands"); 265 LLVM_DEBUG(band.front()->emitRemark("loop tiling failed!\n")); 266 continue; 267 } 268 269 // Separate full and partial tiles. 270 if (separate) { 271 auto intraTileLoops = 272 MutableArrayRef<AffineForOp>(tiledNest).drop_front(band.size()); 273 if (failed(separateFullTiles(intraTileLoops))) { 274 assert(!intraTileLoops.empty() && 275 "guaranteed to succeed on empty bands"); 276 LLVM_DEBUG(intraTileLoops.front()->emitRemark( 277 "separation post tiling failed!\n")); 278 } 279 } 280 } 281 } 282 283 constexpr unsigned LoopTiling::kDefaultTileSize; 284