1*31087c5eSjeanPerier //===- MemoryUtils.cpp ----------------------------------------------------===// 2*31087c5eSjeanPerier // 3*31087c5eSjeanPerier // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*31087c5eSjeanPerier // See https://llvm.org/LICENSE.txt for license information. 5*31087c5eSjeanPerier // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*31087c5eSjeanPerier // 7*31087c5eSjeanPerier //===----------------------------------------------------------------------===// 8*31087c5eSjeanPerier 9*31087c5eSjeanPerier #include "flang/Optimizer/Transforms/MemoryUtils.h" 10*31087c5eSjeanPerier #include "flang/Optimizer/Builder/FIRBuilder.h" 11*31087c5eSjeanPerier #include "flang/Optimizer/Builder/Todo.h" 12*31087c5eSjeanPerier #include "mlir/Dialect/OpenACC/OpenACC.h" 13*31087c5eSjeanPerier #include "mlir/IR/Dominance.h" 14*31087c5eSjeanPerier #include "llvm/ADT/STLExtras.h" 15*31087c5eSjeanPerier 16*31087c5eSjeanPerier namespace { 17*31087c5eSjeanPerier /// Helper class to detect if an alloca is inside an mlir::Block that can be 18*31087c5eSjeanPerier /// reached again before its deallocation points via block successors. This 19*31087c5eSjeanPerier /// analysis is only valid if the deallocation points are inside (or nested 20*31087c5eSjeanPerier /// inside) the same region as alloca because it does not consider region CFG 21*31087c5eSjeanPerier /// (for instance, the block inside a fir.do_loop is obviously inside a loop, 22*31087c5eSjeanPerier /// but is not a loop formed by blocks). The dominance of the alloca on its 23*31087c5eSjeanPerier /// deallocation points implies this pre-condition (although it is more 24*31087c5eSjeanPerier /// restrictive). 25*31087c5eSjeanPerier class BlockCycleDetector { 26*31087c5eSjeanPerier public: 27*31087c5eSjeanPerier bool allocaIsInCycle(fir::AllocaOp alloca, 28*31087c5eSjeanPerier llvm::ArrayRef<mlir::Operation *> deallocationPoints); 29*31087c5eSjeanPerier 30*31087c5eSjeanPerier private: 31*31087c5eSjeanPerier // Cache for blocks owning alloca that have been analyzed. In many Fortran 32*31087c5eSjeanPerier // programs, allocas are usually made in the same blocks with no block cycles. 33*31087c5eSjeanPerier // So getting a fast "no" is beneficial. 34*31087c5eSjeanPerier llvm::DenseMap<mlir::Block *, /*isInCycle*/ bool> analyzed; 35*31087c5eSjeanPerier }; 36*31087c5eSjeanPerier } // namespace 37*31087c5eSjeanPerier 38*31087c5eSjeanPerier namespace { 39*31087c5eSjeanPerier class AllocaReplaceImpl { 40*31087c5eSjeanPerier public: 41*31087c5eSjeanPerier AllocaReplaceImpl(fir::AllocaRewriterCallBack allocaRewriter, 42*31087c5eSjeanPerier fir::DeallocCallBack deallocGenerator) 43*31087c5eSjeanPerier : allocaRewriter{allocaRewriter}, deallocGenerator{deallocGenerator} {} 44*31087c5eSjeanPerier bool replace(mlir::RewriterBase &, fir::AllocaOp); 45*31087c5eSjeanPerier 46*31087c5eSjeanPerier private: 47*31087c5eSjeanPerier mlir::Region *findDeallocationPointsAndOwner( 48*31087c5eSjeanPerier fir::AllocaOp alloca, 49*31087c5eSjeanPerier llvm::SmallVectorImpl<mlir::Operation *> &deallocationPoints); 50*31087c5eSjeanPerier bool 51*31087c5eSjeanPerier allocDominatesDealloc(fir::AllocaOp alloca, 52*31087c5eSjeanPerier llvm::ArrayRef<mlir::Operation *> deallocationPoints) { 53*31087c5eSjeanPerier return llvm::all_of(deallocationPoints, [&](mlir::Operation *deallocPoint) { 54*31087c5eSjeanPerier return this->dominanceInfo.properlyDominates(alloca.getOperation(), 55*31087c5eSjeanPerier deallocPoint); 56*31087c5eSjeanPerier }); 57*31087c5eSjeanPerier } 58*31087c5eSjeanPerier void 59*31087c5eSjeanPerier genIndirectDeallocation(mlir::RewriterBase &, fir::AllocaOp, 60*31087c5eSjeanPerier llvm::ArrayRef<mlir::Operation *> deallocationPoints, 61*31087c5eSjeanPerier mlir::Value replacement, mlir::Region &owningRegion); 62*31087c5eSjeanPerier 63*31087c5eSjeanPerier private: 64*31087c5eSjeanPerier fir::AllocaRewriterCallBack allocaRewriter; 65*31087c5eSjeanPerier fir::DeallocCallBack deallocGenerator; 66*31087c5eSjeanPerier mlir::DominanceInfo dominanceInfo; 67*31087c5eSjeanPerier BlockCycleDetector blockCycleDetector; 68*31087c5eSjeanPerier }; 69*31087c5eSjeanPerier } // namespace 70*31087c5eSjeanPerier 71*31087c5eSjeanPerier static bool 72*31087c5eSjeanPerier allocaIsInCycleImpl(mlir::Block *allocaBlock, 73*31087c5eSjeanPerier llvm::ArrayRef<mlir::Operation *> deallocationPoints) { 74*31087c5eSjeanPerier llvm::DenseSet<mlir::Block *> seen; 75*31087c5eSjeanPerier // Insert the deallocation point blocks as "seen" so that the block 76*31087c5eSjeanPerier // traversal will stop at them. 77*31087c5eSjeanPerier for (mlir::Operation *deallocPoint : deallocationPoints) 78*31087c5eSjeanPerier seen.insert(deallocPoint->getBlock()); 79*31087c5eSjeanPerier if (seen.contains(allocaBlock)) 80*31087c5eSjeanPerier return false; 81*31087c5eSjeanPerier // Traverse the block successor graph starting by the alloca block. 82*31087c5eSjeanPerier llvm::SmallVector<mlir::Block *> successors{allocaBlock}; 83*31087c5eSjeanPerier while (!successors.empty()) 84*31087c5eSjeanPerier for (mlir::Block *next : successors.pop_back_val()->getSuccessors()) { 85*31087c5eSjeanPerier if (next == allocaBlock) 86*31087c5eSjeanPerier return true; 87*31087c5eSjeanPerier if (auto pair = seen.insert(next); pair.second) 88*31087c5eSjeanPerier successors.push_back(next); 89*31087c5eSjeanPerier } 90*31087c5eSjeanPerier // The traversal did not reach the alloca block again. 91*31087c5eSjeanPerier return false; 92*31087c5eSjeanPerier } 93*31087c5eSjeanPerier bool BlockCycleDetector::allocaIsInCycle( 94*31087c5eSjeanPerier fir::AllocaOp alloca, 95*31087c5eSjeanPerier llvm::ArrayRef<mlir::Operation *> deallocationPoints) { 96*31087c5eSjeanPerier mlir::Block *allocaBlock = alloca->getBlock(); 97*31087c5eSjeanPerier auto analyzedPair = analyzed.try_emplace(allocaBlock, /*isInCycle=*/false); 98*31087c5eSjeanPerier bool alreadyAnalyzed = !analyzedPair.second; 99*31087c5eSjeanPerier bool &isInCycle = analyzedPair.first->second; 100*31087c5eSjeanPerier // Fast exit if block was already analyzed and no cycle was found. 101*31087c5eSjeanPerier if (alreadyAnalyzed && !isInCycle) 102*31087c5eSjeanPerier return false; 103*31087c5eSjeanPerier // If the analysis was not done generically for this block, run it and 104*31087c5eSjeanPerier // save the result. 105*31087c5eSjeanPerier if (!alreadyAnalyzed) 106*31087c5eSjeanPerier isInCycle = allocaIsInCycleImpl(allocaBlock, /*deallocationPoints*/ {}); 107*31087c5eSjeanPerier if (!isInCycle) 108*31087c5eSjeanPerier return false; 109*31087c5eSjeanPerier // If the generic analysis found a block loop, see if the deallocation 110*31087c5eSjeanPerier // point would be reached before reaching the block again. Do not 111*31087c5eSjeanPerier // cache that analysis that is specific to the deallocation points 112*31087c5eSjeanPerier // found for this alloca. 113*31087c5eSjeanPerier return allocaIsInCycleImpl(allocaBlock, deallocationPoints); 114*31087c5eSjeanPerier } 115*31087c5eSjeanPerier 116*31087c5eSjeanPerier static bool terminatorYieldsMemory(mlir::Operation &terminator) { 117*31087c5eSjeanPerier return llvm::any_of(terminator.getResults(), [](mlir::OpResult res) { 118*31087c5eSjeanPerier return fir::conformsWithPassByRef(res.getType()); 119*31087c5eSjeanPerier }); 120*31087c5eSjeanPerier } 121*31087c5eSjeanPerier 122*31087c5eSjeanPerier static bool isRegionTerminator(mlir::Operation &terminator) { 123*31087c5eSjeanPerier // Using ReturnLike trait is tempting but it is not set on 124*31087c5eSjeanPerier // all region terminator that matters (like omp::TerminatorOp that 125*31087c5eSjeanPerier // has no results). 126*31087c5eSjeanPerier // May be true for dead code. It is not a correctness issue and dead code can 127*31087c5eSjeanPerier // be eliminated by running region simplification before this utility is 128*31087c5eSjeanPerier // used. 129*31087c5eSjeanPerier // May also be true for unreachable like terminators (e.g., after an abort 130*31087c5eSjeanPerier // call related to Fortran STOP). This is also OK, the inserted deallocation 131*31087c5eSjeanPerier // will simply never be reached. It is easier for the rest of the code here 132*31087c5eSjeanPerier // to assume there is always at least one deallocation point, so keep 133*31087c5eSjeanPerier // unreachable terminators. 134*31087c5eSjeanPerier return !terminator.hasSuccessors(); 135*31087c5eSjeanPerier } 136*31087c5eSjeanPerier 137*31087c5eSjeanPerier mlir::Region *AllocaReplaceImpl::findDeallocationPointsAndOwner( 138*31087c5eSjeanPerier fir::AllocaOp alloca, 139*31087c5eSjeanPerier llvm::SmallVectorImpl<mlir::Operation *> &deallocationPoints) { 140*31087c5eSjeanPerier // Step 1: Identify the operation and region owning the alloca. 141*31087c5eSjeanPerier mlir::Region *owningRegion = alloca.getOwnerRegion(); 142*31087c5eSjeanPerier if (!owningRegion) 143*31087c5eSjeanPerier return nullptr; 144*31087c5eSjeanPerier mlir::Operation *owningOp = owningRegion->getParentOp(); 145*31087c5eSjeanPerier assert(owningOp && "region expected to be owned"); 146*31087c5eSjeanPerier // Step 2: Identify the exit points of the owning region, they are the default 147*31087c5eSjeanPerier // deallocation points. TODO: detect and use lifetime markers to get earlier 148*31087c5eSjeanPerier // deallocation points. 149*31087c5eSjeanPerier bool isOpenACCMPRecipe = mlir::isa<mlir::accomp::RecipeInterface>(owningOp); 150*31087c5eSjeanPerier for (mlir::Block &block : owningRegion->getBlocks()) 151*31087c5eSjeanPerier if (mlir::Operation *terminator = block.getTerminator(); 152*31087c5eSjeanPerier isRegionTerminator(*terminator)) { 153*31087c5eSjeanPerier // FIXME: OpenACC and OpenMP privatization recipe are stand alone 154*31087c5eSjeanPerier // operation meant to be later "inlined", the value they return may 155*31087c5eSjeanPerier // be the address of a local alloca. It would be incorrect to insert 156*31087c5eSjeanPerier // deallocation before the terminator (this would introduce use after 157*31087c5eSjeanPerier // free once the recipe is inlined. 158*31087c5eSjeanPerier // This probably require redesign or special handling on the OpenACC/MP 159*31087c5eSjeanPerier // side. 160*31087c5eSjeanPerier if (isOpenACCMPRecipe && terminatorYieldsMemory(*terminator)) 161*31087c5eSjeanPerier return nullptr; 162*31087c5eSjeanPerier deallocationPoints.push_back(terminator); 163*31087c5eSjeanPerier } 164*31087c5eSjeanPerier // If no block terminators without successors have been found, this is 165*31087c5eSjeanPerier // an odd region we cannot reason about (never seen yet in FIR and 166*31087c5eSjeanPerier // mainstream dialects, but MLIR does not really prevent it). 167*31087c5eSjeanPerier if (deallocationPoints.empty()) 168*31087c5eSjeanPerier return nullptr; 169*31087c5eSjeanPerier 170*31087c5eSjeanPerier // Step 3: detect block based loops between the allocation and deallocation 171*31087c5eSjeanPerier // points, and add a deallocation point on the back edge to avoid memory 172*31087c5eSjeanPerier // leaks. 173*31087c5eSjeanPerier // The detection avoids doing region CFG analysis by assuming that there may 174*31087c5eSjeanPerier // be cycles if deallocation points are not dominated by the alloca. 175*31087c5eSjeanPerier // This leaves the cases where the deallocation points are in the same region 176*31087c5eSjeanPerier // as the alloca (or nested inside it). In which cases there may be a back 177*31087c5eSjeanPerier // edge between the alloca and the deallocation point via block successors. An 178*31087c5eSjeanPerier // analysis is run to detect those cases. 179*31087c5eSjeanPerier // When a loop is detected, the easiest solution to deallocate on the back 180*31087c5eSjeanPerier // edge is to store the allocated memory address in a variable (that dominates 181*31087c5eSjeanPerier // the loops) and to deallocate the address in that variable if it is set 182*31087c5eSjeanPerier // before executing the allocation. This strategy still leads to correct 183*31087c5eSjeanPerier // execution in the "false positive" cases. 184*31087c5eSjeanPerier // Hence, the alloca is added as a deallocation point when there is no 185*31087c5eSjeanPerier // dominance. Note that bringing lifetime markers above will reduce the 186*31087c5eSjeanPerier // false positives. 187*31087c5eSjeanPerier if (!allocDominatesDealloc(alloca, deallocationPoints) || 188*31087c5eSjeanPerier blockCycleDetector.allocaIsInCycle(alloca, deallocationPoints)) 189*31087c5eSjeanPerier deallocationPoints.push_back(alloca.getOperation()); 190*31087c5eSjeanPerier return owningRegion; 191*31087c5eSjeanPerier } 192*31087c5eSjeanPerier 193*31087c5eSjeanPerier void AllocaReplaceImpl::genIndirectDeallocation( 194*31087c5eSjeanPerier mlir::RewriterBase &rewriter, fir::AllocaOp alloca, 195*31087c5eSjeanPerier llvm::ArrayRef<mlir::Operation *> deallocationPoints, 196*31087c5eSjeanPerier mlir::Value replacement, mlir::Region &owningRegion) { 197*31087c5eSjeanPerier mlir::Location loc = alloca.getLoc(); 198*31087c5eSjeanPerier auto replacementInsertPoint = rewriter.saveInsertionPoint(); 199*31087c5eSjeanPerier // Create C pointer variable in the entry block to store the alloc 200*31087c5eSjeanPerier // and access it indirectly in the entry points that do not dominate. 201*31087c5eSjeanPerier rewriter.setInsertionPointToStart(&owningRegion.front()); 202*31087c5eSjeanPerier mlir::Type heapType = fir::HeapType::get(alloca.getInType()); 203*31087c5eSjeanPerier mlir::Value ptrVar = rewriter.create<fir::AllocaOp>(loc, heapType); 204*31087c5eSjeanPerier mlir::Value nullPtr = rewriter.create<fir::ZeroOp>(loc, heapType); 205*31087c5eSjeanPerier rewriter.create<fir::StoreOp>(loc, nullPtr, ptrVar); 206*31087c5eSjeanPerier // TODO: introducing a pointer compare op in FIR would help 207*31087c5eSjeanPerier // generating less IR here. 208*31087c5eSjeanPerier mlir::Type intPtrTy = fir::getIntPtrType(rewriter); 209*31087c5eSjeanPerier mlir::Value c0 = rewriter.create<mlir::arith::ConstantOp>( 210*31087c5eSjeanPerier loc, intPtrTy, rewriter.getIntegerAttr(intPtrTy, 0)); 211*31087c5eSjeanPerier 212*31087c5eSjeanPerier // Store new storage address right after its creation. 213*31087c5eSjeanPerier rewriter.restoreInsertionPoint(replacementInsertPoint); 214*31087c5eSjeanPerier mlir::Value castReplacement = 215*31087c5eSjeanPerier fir::factory::createConvert(rewriter, loc, heapType, replacement); 216*31087c5eSjeanPerier rewriter.create<fir::StoreOp>(loc, castReplacement, ptrVar); 217*31087c5eSjeanPerier 218*31087c5eSjeanPerier // Generate conditional deallocation at every deallocation point. 219*31087c5eSjeanPerier auto genConditionalDealloc = [&](mlir::Location loc) { 220*31087c5eSjeanPerier mlir::Value ptrVal = rewriter.create<fir::LoadOp>(loc, ptrVar); 221*31087c5eSjeanPerier mlir::Value ptrToInt = 222*31087c5eSjeanPerier rewriter.create<fir::ConvertOp>(loc, intPtrTy, ptrVal); 223*31087c5eSjeanPerier mlir::Value isAllocated = rewriter.create<mlir::arith::CmpIOp>( 224*31087c5eSjeanPerier loc, mlir::arith::CmpIPredicate::ne, ptrToInt, c0); 225*31087c5eSjeanPerier auto ifOp = rewriter.create<fir::IfOp>(loc, std::nullopt, isAllocated, 226*31087c5eSjeanPerier /*withElseRegion=*/false); 227*31087c5eSjeanPerier rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front()); 228*31087c5eSjeanPerier mlir::Value cast = fir::factory::createConvert( 229*31087c5eSjeanPerier rewriter, loc, replacement.getType(), ptrVal); 230*31087c5eSjeanPerier deallocGenerator(loc, rewriter, cast); 231*31087c5eSjeanPerier // Currently there is no need to reset the pointer var because two 232*31087c5eSjeanPerier // deallocation points can never be reached without going through the 233*31087c5eSjeanPerier // alloca. 234*31087c5eSjeanPerier rewriter.setInsertionPointAfter(ifOp); 235*31087c5eSjeanPerier }; 236*31087c5eSjeanPerier for (mlir::Operation *deallocPoint : deallocationPoints) { 237*31087c5eSjeanPerier rewriter.setInsertionPoint(deallocPoint); 238*31087c5eSjeanPerier genConditionalDealloc(deallocPoint->getLoc()); 239*31087c5eSjeanPerier } 240*31087c5eSjeanPerier } 241*31087c5eSjeanPerier 242*31087c5eSjeanPerier bool AllocaReplaceImpl::replace(mlir::RewriterBase &rewriter, 243*31087c5eSjeanPerier fir::AllocaOp alloca) { 244*31087c5eSjeanPerier llvm::SmallVector<mlir::Operation *> deallocationPoints; 245*31087c5eSjeanPerier mlir::Region *owningRegion = 246*31087c5eSjeanPerier findDeallocationPointsAndOwner(alloca, deallocationPoints); 247*31087c5eSjeanPerier if (!owningRegion) 248*31087c5eSjeanPerier return false; 249*31087c5eSjeanPerier rewriter.setInsertionPointAfter(alloca.getOperation()); 250*31087c5eSjeanPerier bool deallocPointsDominateAlloc = 251*31087c5eSjeanPerier allocDominatesDealloc(alloca, deallocationPoints); 252*31087c5eSjeanPerier if (mlir::Value replacement = 253*31087c5eSjeanPerier allocaRewriter(rewriter, alloca, deallocPointsDominateAlloc)) { 254*31087c5eSjeanPerier mlir::Value castReplacement = fir::factory::createConvert( 255*31087c5eSjeanPerier rewriter, alloca.getLoc(), alloca.getType(), replacement); 256*31087c5eSjeanPerier if (deallocPointsDominateAlloc) 257*31087c5eSjeanPerier for (mlir::Operation *deallocPoint : deallocationPoints) { 258*31087c5eSjeanPerier rewriter.setInsertionPoint(deallocPoint); 259*31087c5eSjeanPerier deallocGenerator(deallocPoint->getLoc(), rewriter, replacement); 260*31087c5eSjeanPerier } 261*31087c5eSjeanPerier else 262*31087c5eSjeanPerier genIndirectDeallocation(rewriter, alloca, deallocationPoints, replacement, 263*31087c5eSjeanPerier *owningRegion); 264*31087c5eSjeanPerier rewriter.replaceOp(alloca, castReplacement); 265*31087c5eSjeanPerier } 266*31087c5eSjeanPerier return true; 267*31087c5eSjeanPerier } 268*31087c5eSjeanPerier 269*31087c5eSjeanPerier bool fir::replaceAllocas(mlir::RewriterBase &rewriter, 270*31087c5eSjeanPerier mlir::Operation *parentOp, 271*31087c5eSjeanPerier MustRewriteCallBack mustReplace, 272*31087c5eSjeanPerier AllocaRewriterCallBack allocaRewriter, 273*31087c5eSjeanPerier DeallocCallBack deallocGenerator) { 274*31087c5eSjeanPerier // If the parent operation is not an alloca owner, the code below would risk 275*31087c5eSjeanPerier // modifying IR outside of parentOp. 276*31087c5eSjeanPerier if (!fir::AllocaOp::ownsNestedAlloca(parentOp)) 277*31087c5eSjeanPerier return false; 278*31087c5eSjeanPerier auto insertPoint = rewriter.saveInsertionPoint(); 279*31087c5eSjeanPerier bool replacedAllRequestedAlloca = true; 280*31087c5eSjeanPerier AllocaReplaceImpl impl(allocaRewriter, deallocGenerator); 281*31087c5eSjeanPerier parentOp->walk([&](fir::AllocaOp alloca) { 282*31087c5eSjeanPerier if (mustReplace(alloca)) 283*31087c5eSjeanPerier replacedAllRequestedAlloca &= impl.replace(rewriter, alloca); 284*31087c5eSjeanPerier }); 285*31087c5eSjeanPerier rewriter.restoreInsertionPoint(insertPoint); 286*31087c5eSjeanPerier return replacedAllRequestedAlloca; 287*31087c5eSjeanPerier } 288