Lines Matching +full:depth +full:- +full:wise
1 //===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
11 //===----------------------------------------------------------------------===//
31 #define DEBUG_TYPE "loop-utils"
64 // lb + tr1 - tr1 % ufactor, lb + tr2 - tr2 % ufactor; the results of all
71 bumpExprs[i] = (tripCountExpr - tripCountExpr % unrollFactor) * step;
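The comment and statement above compute the lower bound of the cleanup loop left behind after unrolling. A minimal numeric sketch of that arithmetic, assuming a constant trip count; the name cleanupLowerBound and its arguments are illustrative, not part of this file:

  #include <cstdint>
  #include <iostream>

  // Lower bound of the cleanup loop: the unrolled loop covers the first
  // (tc - tc % unrollFactor) iterations; the cleanup loop starts right after.
  int64_t cleanupLowerBound(int64_t lb, int64_t tc, int64_t step,
                            int64_t unrollFactor) {
    return lb + (tc - tc % unrollFactor) * step;
  }

  int main() {
    // lb = 0, trip count 10, step 2, unroll factor 4: the unrolled loop handles
    // iterations 0..7, and the cleanup loop starts at 0 + 8 * 2 = 16.
    std::cout << cleanupLowerBound(0, 10, 2, 4) << "\n"; // prints 16
  }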
94 v.getDefiningOp()->erase();
111 auto innerResults = forOp.getBody()->getTerminator()->getOperands();
129 auto *parentBlock = forOp->getBlock();
132 auto func = forOp->getParentOfType<FunctionOpInterface>();
133 OpBuilder builder(forOp->getContext());
160 forOp.getBody()->back().erase();
161 parentBlock->getOperations().splice(Block::iterator(forOp),
162 forOp.getBody()->getOperations());
198 // Generate the remapping if the shift is not zero: remappedIV = newIV -
204 -static_cast<int64_t>(srcForOp.getStepAsInt() * shift)),
225 // in the body of the for loop (using the 'sweep line' paradigm). This method
227 // memory-based dependence preservation check rests with the users of this
232 assert(forOp.getBody()->getOperations().size() == shifts.size() &&
234 if (forOp.getBody()->begin() == std::prev(forOp.getBody()->end()))
243 LLVM_DEBUG(forOp.emitRemark("non-constant trip count loop not handled"));
268 for (auto &op : forOp.getBody()->without_terminator()) {
359 /// Checks whether a loop nest is hyper-rectangular or not.
364 // 0-d or 1-d is trivially hyper-rectangular.
373 << "Non-hyperrectangular nests not supported for tiling!\n");
399 // TODO: handle non hyper-rectangular spaces.
410 auto &ops = src.getBody()->getOperations();
411 dest.getBody()->getOperations().splice(loc, ops, ops.begin(),
418 moveLoopBodyImpl(src, dest, dest.getBody()->begin());
432 // Add intra-tile (or point) loops.
437 pointLoop.getBody()->getOperations().splice(
438 pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
440 tiledLoops[2 * width - 1 - i] = pointLoop;
451 tileSpaceLoop.getBody()->getOperations().splice(
452 tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(),
454 tiledLoops[2 * width - i - 1] = tileSpaceLoop;
462 /// Set lower and upper bounds of intra-tile loops for parametric tiling.
463 // TODO: Handle non-constant lower bounds.
468 // The lower bound for the intra-tile loop is represented by an affine map
469 // as (%i, %t0)->((%i - %origlb) * %t0 + %origlb). Similarly, the upper bound
470 // for the intra-tile loop is represented by an affine map as (%i, %t0)->((%i
471 // - %origlb) * %t0 + (%t0 * %origLoopStep) + %origlb), where %i is loop IV
472 // of the corresponding inter-tile loop, %t0 is the corresponding tiling
474 // corresponding inter-tile loop.
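A plain-arithmetic sketch of the two intra-tile bound maps described above, with made-up values for %origlb, %origLoopStep, and the tiling parameter %t0; this only illustrates the formulas, not the AffineMap construction the code performs:

  #include <cstdint>
  #include <iostream>

  int main() {
    int64_t origLb = 4, origStep = 1, t0 = 32; // tiling parameter (an SSA value at runtime)
    int64_t i = 6;                             // IV of the corresponding inter-tile loop
    // lb map: (%i, %t0) -> ((%i - %origlb) * %t0 + %origlb)
    int64_t intraLb = (i - origLb) * t0 + origLb;
    // ub map: (%i, %t0) -> ((%i - %origlb) * %t0 + (%t0 * %origLoopStep) + %origlb)
    int64_t intraUb = (i - origLb) * t0 + t0 * origStep + origLb;
    std::cout << intraLb << " " << intraUb << "\n"; // 68 100: one tile of 32 iterations
  }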
504 AffineExpr lbLoopIvExpr = b.getAffineDimExpr(lbOperands.size() - 1);
505 AffineExpr ubLoopIvExpr = b.getAffineDimExpr(ubOperands.size() - 1);
526 // Insert lb as inter-tile ((loop IV - origlb) * tilingParameter) + origlb.
528 ((lbLoopIvExpr - origLowerBoundExpr) * lbTileParameter) +
534 // Insert ub as inter-tile ((loop IV - origlb) * tilingParameter) +
537 ((ubLoopIvExpr - origLowerBoundExpr) * ubTileParameter) +
557 /// Set lower and upper bounds of inter-tile loops for parametric tiling.
558 // TODO: Handle non-constant lower bounds.
563 // The lower bounds for inter-tile loops are the same as the corresponding lower
567 // The new upper bound map for inter-tile loops, assuming constant lower
568 // bounds, is now originalLowerBound + ceildiv((originalUpperBound -
571 // ()->(1024), the new map will be
572 // ()[s0]->(lb + (1024 - lb) ceildiv s0), where s0 is the tiling parameter.
615 // Insert the bound as originalLowerBound + ceildiv((originalUpperBound -
619 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
627 // new loop bounds here will be like ()[s0, s1] -> ((s0 - 5) ceildiv s1 + 5)
630 // #map0 = affine_map<()[s0, s1] -> (s0, s1)>
634 // new loop bounds will be like ()[s0, s1, s2] -> ((s0 - 5) ceildiv s2 + 5,
635 // (s1 - 5) ceildiv s2 + 5), where s2 is the tiling parameter.
637 // Insert the bounds as originalLowerBound + ceildiv((originalUpperBound -
642 (origUpperBoundExpr - origLowerBoundExpr).ceilDiv(tileParameter));
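Worked numbers for the bound built above, using illustrative values (origLb = 5, origUb = 1024, tile parameter 32); ceilDiv here is a local helper for the sketch, not an API from this file:

  #include <cstdint>
  #include <iostream>

  int64_t ceilDiv(int64_t a, int64_t b) { return (a + b - 1) / b; } // positive operands

  int main() {
    int64_t origLb = 5, origUb = 1024, tileParam = 32;
    // New inter-tile upper bound: originalLowerBound + ceildiv(ub - lb, tileParam).
    int64_t newUb = origLb + ceilDiv(origUb - origLb, tileParam);
    std::cout << newUb << "\n"; // 37: the inter-tile loop visits 32 tile origins, 5..36
  }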
655 /// hyper-rectangular index sets, where the bounds of one dimension do not
675 // Set bounds for intra-tile loops.
683 /// hyper-rectangular index sets, where the bounds of one dimension do not
707 // Bounds for intra-tile loops.
712 // The lower bound is just the tile-space loop.
716 // The step size of each intra-tile loop is just the original loop's step size.
728 // Intra-tile loop ii goes from i to min(i + tileSize * stepSize, ub_i).
730 // with 'i' (tile-space loop) appended to it. The new upper bound map is
798 // Replace original IVs with intra-tile loop IVs.
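The nest structure being assembled in this region, rendered as plain nested loops under the simplifying assumption of constant bounds (the actual code emits affine.for ops with min upper-bound maps); tiledNest and its parameters are illustrative names:

  #include <algorithm>
  #include <cstdint>

  void tiledNest(int64_t lb, int64_t ub, int64_t step, int64_t tileSize) {
    // Tile-space loop: strides over the iteration space tileSize iterations at a time.
    for (int64_t i = lb; i < ub; i += tileSize * step) {
      // Intra-tile loop: original step; upper bound is min(i + tileSize * step, ub).
      for (int64_t ii = i; ii < std::min(i + tileSize * step, ub); ii += step) {
        // original loop body, with the original IV replaced by ii
      }
    }
  }

  int main() { tiledNest(0, 100, 1, 32); }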
811 /// Tiles the specified band of perfectly nested loops creating tile-space
812 /// loops and intra-tile loops, using SSA values as tiling parameters. A band
839 // Replace original IVs with intra-tile loop IVs.
853 /// (the first op being another AffineFor, and the second op being a terminator).
880 bands->push_back(band);
926 // Keep a pointer to the last non-terminator operation in the original block
927 // so that we know what to clone (since we are doing this in-place).
928 Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
930 // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
947 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
958 if (defOp && defOp->getBlock() == loopBodyBlock)
965 for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++)
969 loopBodyBlock->getTerminator()->setOperands(lastYielded);
972 /// Helper to generate cleanup loop for unroll or unroll-and-jam when the trip
977 OpBuilder builder(forOp->getBlock(), std::next(Block::iterator(forOp)));
988 cleanupForOp->replaceUsesOfWith(std::get<2>(e), std::get<0>(e));
1024 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1055 auto yieldedValues = forOp.getBody()->getTerminator()->getOperands();
1112 if (llvm::hasSingleElement(forOp.getBody()->getOperations()))
1117 LLVM_DEBUG(llvm::dbgs() << "[failed] trip count < unroll-jam factor\n");
1126 // Gather all sub-blocks to jam upon the loop being unrolled.
1158 // `operandMaps[i - 1]` carries old->new operand mapping for the ith unrolled
1159 // iteration. There are (`unrollJamFactor` - 1) iterations.
1160 SmallVector<IRMapping, 4> operandMaps(unrollJamFactor - 1);
1172 cast<AffineYieldOp>(oldForOp.getBody()->getTerminator()).getOperands();
1174 // fix iterOperands and yield operands after cloning of sub-blocks.
1175 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1199 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1201 // `newForOp` has `unrollJamFactor` - 1 new sets of iterArgs and
1202 // results. Update `operandMaps[i - 1]` to map old iterArgs and results
1204 operandMaps[i - 1].map(newIterArgs[j],
1206 operandMaps[i - 1].map(newResults[j],
1212 // Scale the step of loop being unroll-jammed by the unroll-jam factor.
1217 // Unroll and jam (appends unrollJamFactor - 1 additional copies).
1218 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1220 // Builder to insert unroll-jammed bodies. Insert right at the end of
1221 // sub-block.
1222 OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second));
1227 // iv' = iv + i * step, i = 1 to unrollJamFactor-1.
1232 operandMaps[i - 1].map(forOpIV, ivUnroll);
1234 // Clone the sub-block being unroll-jammed.
1236 builder.clone(*it, operandMaps[i - 1]);
1243 auto yieldOp = cast<AffineYieldOp>(newForOp.getBody()->getTerminator());
1249 // needs to be replaced with a mapped value from `operandMaps[i - 1]`
1252 operandMaps[i - 1].lookupOrDefault(
1256 operandMaps[i - 1].lookupOrDefault(yieldOp.getOperand(j)));
1273 for (unsigned i = unrollJamFactor - 1; i >= 1; --i) {
1294 /// nested within 'forOpA' as the only non-terminator operation in its block.
1296 assert(&*forOpA.getBody()->begin() == forOpB.getOperation());
1297 auto &forOpABody = forOpA.getBody()->getOperations();
1298 auto &forOpBBody = forOpB.getBody()->getOperations();
1300 // 1) Splice forOpA's non-terminator operations (which is just forOpB) just
1303 forOpA->getBlock()->getOperations().splice(Block::iterator(forOpA),
1306 // 2) Splice forOpB's non-terminator operations into the beginning of forOpA's
1311 forOpBBody.splice(forOpBBody.begin(), forOpA->getBlock()->getOperations(),
1331 // Example 1: [-1, 1][0, 0]
1332 // Example 2: [0, 0][-1, 1]
1335 // Check if the first non-zero dependence component is positive.
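A sketch of the check named in the comment above: a dependence remains valid when, in the new loop order, its first non-zero component is positive (the ranges in the two examples are reduced to single components here for illustration; this is not the function defined in this file):

  #include <iostream>
  #include <vector>

  // True iff the vector's first non-zero component is positive (an all-zero
  // vector means the dependence is loop-independent and is also fine).
  bool isLexicographicallyNonNegative(const std::vector<int> &dep) {
    for (int c : dep) {
      if (c < 0) return false;
      if (c > 0) return true;
    }
    return true;
  }

  int main() {
    std::cout << isLexicographicallyNonNegative({-1, 1}) << " "  // 0: violated
              << isLexicographicallyNonNegative({0, 1}) << "\n"; // 1: preserved
  }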
1371 auto secondOpIt = std::next(block->begin());
1372 return secondOpIt != block->end() && &*secondOpIt == &block->back();
1377 auto parentForOp = dyn_cast<AffineForOp>(loop->getParentOp());
1386 // input[i] should move from position i -> permMap[i]. Returns the position in
1391 // Check whether the permutation spec is valid. This is a small vector - we'll
1414 if (permMap.back() != input.size() - 1) {
1417 destBody->getOperations().splice(destBody->begin(),
1418 srcBody->getOperations(), srcBody->begin(),
1419 std::prev(srcBody->end()));
1424 for (int i = input.size() - 1; i >= 0; --i) {
1432 auto *parentBlock = input[0]->getBlock();
1433 parentBlock->getOperations().splice(Block::iterator(input[0]),
1434 input[i]->getBlock()->getOperations(),
1441 unsigned parentPosInInput = invPermMap[permMap[i] - 1].second;
1442 if (i > 0 && static_cast<unsigned>(i - 1) == parentPosInInput)
1447 destBody->getOperations().splice(destBody->begin(),
1448 input[i]->getBlock()->getOperations(),
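The permutation bookkeeping above (permMap maps a loop's current position to its new position, invPermMap goes the other way) can be sketched as a small standalone helper; the name inversePermutation and the flat-vector layout are assumptions for illustration only:

  #include <iostream>
  #include <vector>

  // inv[newPos] = original position of the loop that ends up at depth newPos.
  std::vector<unsigned> inversePermutation(const std::vector<unsigned> &permMap) {
    std::vector<unsigned> inv(permMap.size());
    for (unsigned i = 0; i < permMap.size(); ++i)
      inv[permMap[i]] = i;
    return inv;
  }

  int main() {
    // permMap = {2, 0, 1}: the outermost loop moves to depth 2 (innermost).
    for (unsigned p : inversePermutation({2, 0, 1}))
      std::cout << p << " "; // prints: 1 2 0
    std::cout << "\n";
  }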
1517 auto bounds = llvm::to_vector<4>(map->getResults());
1518 bounds.push_back(b.getAffineDimExpr(map->getNumDims()) + offset);
1519 operands->insert(operands->begin() + map->getNumDims(), iv);
1520 *map = AffineMap::get(map->getNumDims() + 1, map->getNumSymbols(), bounds,
1526 // Stripmine-sink is a primitive building block for generalized tiling of
1540 OpBuilder b(forOp->getBlock(), std::next(Block::iterator(forOp)));
1542 // Lower-bound map creation.
1547 // Upper-bound map creation.
1560 auto begin = t.getBody()->begin();
1562 auto nOps = t.getBody()->getOperations().size() - 2;
1563 newForOp.getBody()->getOperations().splice(
1564 newForOp.getBody()->getOperations().begin(),
1565 t.getBody()->getOperations(), begin, std::next(begin, nOps));
1678 // iv_i = floordiv(iv_linear, product-of-loop-ranges-until-i) mod range_i.
1682 for (unsigned idx = loops.size(); idx > 0; --idx) {
1703 applyOperands.push_back(upperBoundSymbols[idx - 1]);
1711 replaceAllUsesInRegionWith(loops[idx - 1].getInductionVar(),
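A numeric sketch of the delinearization formula quoted above, for a coalesced 3-d nest with illustrative trip counts:

  #include <cstdint>
  #include <iostream>

  int main() {
    int64_t range[3] = {4, 5, 6}; // trip counts, loop 0 outermost .. loop 2 innermost
    int64_t linear = 93;          // coalesced IV, in [0, 4 * 5 * 6)
    // iv_i = floordiv(linear, product of ranges of the loops nested inside i) mod range_i
    int64_t iv2 = linear % range[2];
    int64_t iv1 = (linear / range[2]) % range[1];
    int64_t iv0 = (linear / (range[2] * range[1])) % range[0];
    std::cout << iv0 << " " << iv1 << " " << iv2 << "\n"; // 3 0 3
  }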
1715 // 4. Move the operations from the innermost just above the second-outermost
1716 // loop, delete the extra terminator and the second-outermost loop.
1718 innermost.getBody()->back().erase();
1719 outermost.getBody()->getOperations().splice(
1721 innermost.getBody()->getOperations());
1760 /// Given a memref region, determine the lowest depth at which transfers can be
1763 /// respectively. The lowest depth depends on whether the region being accessed
1773 cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols);
1783 if (llvm::is_contained(symbols, it->getInductionVar()))
1791 *copyPlacementBlock = lastInvariantIV->getBlock();
1807 /// n-dimensional region, there can be at most n-1 levels of striding
1809 // TODO: make this work with non-identity layout maps.
1818 for (int d = bufferShape.size() - 1; d >= 1; d--) {
1825 if (bufferShape[d] < dimSize && bufferShape[d - 1] > 1) {
1826 strideInfos->push_back({stride, numEltPerStride});
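To make the stride bookkeeping above concrete: for a row-major 128x256 memref and a 32x64 fast-buffer region, the copy moves 64 contiguous elements at a time, with runs starting 256 elements apart. The values are made up and this only shows the idea, not the function's exact accumulation of stride and numEltPerStride:

  #include <cstdint>
  #include <iostream>

  int main() {
    int64_t memrefShape[2] = {128, 256}; // original memref (row-major)
    int64_t bufferShape[2] = {32, 64};   // region copied into the fast buffer
    // The innermost dim of the region is smaller than the memref dim, so the
    // transfer is strided rather than one contiguous block:
    int64_t numEltPerStride = bufferShape[1]; // 64 contiguous elements per run
    int64_t stride = memrefShape[1];          // runs start 256 elements apart
    int64_t numRuns = bufferShape[0];         // 32 such runs
    std::cout << stride << " " << numEltPerStride << " " << numRuns << "\n";
  }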
1831 /// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and
1836 /// order to index the fast buffer. If `copyOut' is true, generates a copy-out;
1837 /// otherwise a copy-in. Builder `b` should be set to the point the copy nest is
1840 /// The copy-in nest is generated as follows as an example for a 2-d region:
1843 /// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
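The 2-d copy-in nest sketched in the comment above, rendered with plain loops and vectors; the real generator emits affine.for with affine.load/affine.store, and the bounds, offsets, and the name copyIn here are illustrative:

  #include <cstdint>
  #include <vector>

  void copyIn(const std::vector<std::vector<int64_t>> &memref,
              std::vector<std::vector<int64_t>> &fastBuf,
              int64_t lbX, int64_t ubX, int64_t lbY, int64_t ubY) {
    for (int64_t x = lbX; x < ubX; ++x)     // iterate the accessed region of memref
      for (int64_t y = lbY; y < ubY; ++y)
        fastBuf[x - lbX][y - lbY] = memref[x][y]; // fast buffer indexed by IV minus offset
  }

  int main() {
    std::vector<std::vector<int64_t>> memref(8, std::vector<int64_t>(8, 7));
    std::vector<std::vector<int64_t>> fastBuf(4, std::vector<int64_t>(4));
    copyIn(memref, fastBuf, 2, 6, 3, 7); // copy the 4x4 region [2,6) x [3,7)
  }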
1880 // x - offset_x.
1881 fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
1919 return block.getParentOp()->emitRemark();
1945 auto f = begin->getParentOfType<FunctionOpInterface>();
1966 copyPlacementBlock->getParent()->getParentOfType<FunctionOpInterface>();
1974 LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
1994 LLVM_DEBUG(llvm::dbgs() << "Non-constant region size not supported\n");
2012 cst->getValues(rank, cst->getNumVars(), &regionSymbols);
2023 assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");
2026 for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++)
2030 (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
2046 cst->getNumDimVars() + cst->getNumSymbolVars() - rank, 0, offset);
2097 // multi-level strides.
2124 // Point-wise copy generation.
2140 // Create a tag (single element 1-d memref) for the DMA.
2149 // DMA non-blocking read from original buffer to fast buffer.
2155 // DMA non-blocking write from fast buffer to the original memref.
2189 // Ex: to replace load %A[%i, %j] with load %Abuf[%i - %iT, %j - %jT],
2190 // index remap will be (%i, %j) -> (%i - %iT, %j - %jT),
2191 // i.e., affine.apply (d0, d1, d2, d3) -> (d2-d0, d3-d1) (%iT, %jT, %i, %j),
2201 remapExprs.push_back(dimExpr - fastBufOffsets[i]);
2208 bool isBeginAtStartOfBlock = (begin == block->begin());
2220 *nBegin = isBeginAtStartOfBlock ? block->begin() : std::next(prevOfBegin);
2234 region->memref = loadOp.getMemRef();
2235 region->setWrite(false);
2238 region->memref = storeOp.getMemRef();
2239 region->setWrite(true);
2244 auto memRefType = cast<MemRefType>(region->memref.getType());
2248 auto *regionCst = region->getConstraints();
2258 regionCst->setValues(rank, rank + numParamLoopIVs, symbols);
2264 regionCst->addBound(BoundType::LB, d, 0);
2265 regionCst->addBound(BoundType::UB, d, dimSize - 1);
2278 assert(begin->getBlock() == std::prev(end)->getBlock() &&
2280 assert(end != end->getBlock()->end() && "end can't be the block terminator");
2282 Block *block = begin->getBlock();
2284 // Copies will be generated for this depth, i.e., symbolic in all loops
2288 LLVM_DEBUG(llvm::dbgs() << "Generating copies at depth " << copyDepth
2294 // guaranteed iteration order to write test cases. CHECK-DAG doesn't help here
2307 block->walk(begin, end, [&](Operation *opInst) {
2330 auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
2331 if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
2334 << "Error obtaining memory region: semi-affine maps?\n");
2335 LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
2338 opInst->emitError("non-constant memref sizes not yet supported"));
2352 // Note that a memref may have both read and write regions - so update the
2361 const auto *const it = targetRegions.find(region->memref);
2366 if (failed(it->second->unionBoundingBox(*region))) {
2369 "over-approximating to the entire memref\n");
2372 LLVM_DEBUG(opInst->emitError(
2373 "non-constant memref sizes not yet supported"));
2377 it->second->getConstraints()->clearAndCopyFrom(
2378 *region->getConstraints());
2380 // Union was computed and stored in 'it->second': copy to 'region'.
2381 region->getConstraints()->clearAndCopyFrom(
2382 *it->second->getConstraints());
2395 if (region->isWrite() && !existsInWrite) {
2396 writeRegions[region->memref] = std::move(region);
2397 } else if (!region->isWrite() && !existsInRead) {
2398 readRegions[region->memref] = std::move(region);
2403 LLVM_DEBUG(begin->emitError(
2441 LLVM_DEBUG(begin->emitError(
2455 block->getParentOp()->emitWarning(
2468 return affineDataCopyGenerate(forOp.getBody()->begin(),
2469 std::prev(forOp.getBody()->end()), copyOptions,
2476 Block *block = analyzedOp->getBlock();
2477 auto begin = analyzedOp->getIterator();
2492 result.alloc = en->second.getDefiningOp();
2516 /// Gathers all AffineForOps in 'func.func' grouped by loop depth.
2549 /// has the intra-tile loops. The affine.if op is inserted at the builder
2572 // TODO: Non-unit stride should be straightforward to generalize to.
2575 cst.setDimSymbolSeparation(/*newSymbolCount=*/cst.getNumDimAndSymbolVars() -
2597 cst.atIneq(lbIndex, i) = fullTileLb[i] - cst.atIneq(lbIndex, i);
2602 cst.atIneq(ubIndex, i) -= fullTileUb[i];
2617 // ifCondSet can be null if cst was empty -- this can happen if all loops
2639 // TODO: straightforward to generalize to a non-unit stride.
2642 << "[tile separation] non-unit stride not implemented\n");
2649 cst.setDimSymbolSeparation(cst.getNumDimAndSymbolVars() - 1);
2680 for (auto &op : inputNest.back().getBody()->without_terminator())
2696 assert(loop->getParentOp() == prevLoop && "input not contiguously nested");
2722 thenBlock->getOperations().splice(
2723 std::prev(thenBlock->end()),
2724 outermostFullTileLoop->getBlock()->getOperations(),
2730 elseBlock->getOperations().splice(std::prev(elseBlock->end()),
2731 firstLoop->getBlock()->getOperations(),
2763 // defined above the first loop in the band. Traverse the nest bottom-up
2765 for (unsigned end = loops.size(); end > 0; --end) {
2767 for (; start < end - 1; ++start) {
2775 auto band = llvm::MutableArrayRef(loops.data() + start, end - start);
2782 if (start != end - 1)
2791 while (auto loopOp = currentOp->getParentOfType<LoopLikeOpInterface>()) {