xref: /llvm-project/flang/lib/Optimizer/Transforms/LoopVersioning.cpp (revision 711419e3025678511e3d26c4c30d757f9029d598)
1 //===- LoopVersioning.cpp -------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 //===----------------------------------------------------------------------===//
10 /// \file
11 /// This pass looks for loops iterating over assumed-shape arrays, that can
12 /// be optimized by "guessing" that the stride is element-sized.
13 ///
14 /// This is done by creating two versions of the same loop: one which assumes
15 /// that the elements are contiguous (stride == size of element), and one that
16 /// is the original generic loop.
17 ///
18 /// As a side-effect of the assumed element size stride, the array is also
19 /// flattened to make it a 1D array - this is because the internal array
20 /// structure must be either 1D or have known sizes in all dimensions - and at
21 /// least one of the dimensions here is already unknown.
22 ///
23 /// There are two distinct benefits here:
24 /// 1. The loop that iterates over the elements is somewhat simplified by the
25 ///    constant stride calculation.
26 /// 2. Since the compiler can understand the size of the stride, it can use
27 ///    vector instructions, where an unknown (at compile time) stride does often
28 ///    prevent vector operations from being used.
29 ///
/// A known drawback is that the code-size is increased, in some cases that can
/// be quite substantial - 3-4x is quite plausible (this includes that the loop
/// gets vectorized, which in itself often more than doubles the size of the
/// code, because unless the loop size is known, there will be a modulo
/// vector-size remainder to deal with).
35 ///
/// TODO: Do we need some size limit where loops no longer get duplicated?
///       Maybe some sort of cost analysis.
38 /// TODO: Should some loop content - for example calls to functions and
39 ///       subroutines inhibit the versioning of the loops. Plausibly, this
40 ///       could be part of the cost analysis above.
41 //===----------------------------------------------------------------------===//
42 
43 #include "flang/ISO_Fortran_binding_wrapper.h"
44 #include "flang/Optimizer/Builder/BoxValue.h"
45 #include "flang/Optimizer/Builder/FIRBuilder.h"
46 #include "flang/Optimizer/Builder/Runtime/Inquiry.h"
47 #include "flang/Optimizer/Dialect/FIRDialect.h"
48 #include "flang/Optimizer/Dialect/FIROps.h"
49 #include "flang/Optimizer/Dialect/FIRType.h"
50 #include "flang/Optimizer/Dialect/Support/FIRContext.h"
51 #include "flang/Optimizer/Dialect/Support/KindMapping.h"
52 #include "flang/Optimizer/Support/DataLayout.h"
53 #include "flang/Optimizer/Transforms/Passes.h"
54 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
55 #include "mlir/IR/Dominance.h"
56 #include "mlir/IR/Matchers.h"
57 #include "mlir/IR/TypeUtilities.h"
58 #include "mlir/Pass/Pass.h"
59 #include "mlir/Transforms/DialectConversion.h"
60 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
61 #include "mlir/Transforms/RegionUtils.h"
62 #include "llvm/Support/Debug.h"
63 #include "llvm/Support/raw_ostream.h"
64 
65 #include <algorithm>
66 
67 namespace fir {
68 #define GEN_PASS_DEF_LOOPVERSIONING
69 #include "flang/Optimizer/Transforms/Passes.h.inc"
70 } // namespace fir
71 
72 #define DEBUG_TYPE "flang-loop-versioning"
73 
74 namespace {
75 
76 class LoopVersioningPass
77     : public fir::impl::LoopVersioningBase<LoopVersioningPass> {
78 public:
79   void runOnOperation() override;
80 };
81 
82 /// @struct ArgInfo
83 /// A structure to hold an argument, the size of the argument and dimension
84 /// information.
85 struct ArgInfo {
86   mlir::Value arg;
87   size_t size;
88   unsigned rank;
89   fir::BoxDimsOp dims[CFI_MAX_RANK];
90 };
91 
92 /// @struct ArgsUsageInLoop
93 /// A structure providing information about the function arguments
94 /// usage by the instructions immediately nested in a loop.
95 struct ArgsUsageInLoop {
96   /// Mapping between the memref operand of an array indexing
97   /// operation (e.g. fir.coordinate_of) and the argument information.
98   llvm::DenseMap<mlir::Value, ArgInfo> usageInfo;
99   /// Some array indexing operations inside a loop cannot be transformed.
100   /// This vector holds the memref operands of such operations.
101   /// The vector is used to make sure that we do not try to transform
102   /// any outer loop, since this will imply the operation rewrite
103   /// in this loop.
104   llvm::SetVector<mlir::Value> cannotTransform;
105 
106   // Debug dump of the structure members assuming that
107   // the information has been collected for the given loop.
108   void dump(fir::DoLoopOp loop) const {
109     LLVM_DEBUG({
110       mlir::OpPrintingFlags printFlags;
111       printFlags.skipRegions();
112       llvm::dbgs() << "Arguments usage info for loop:\n";
113       loop.print(llvm::dbgs(), printFlags);
114       llvm::dbgs() << "\nUsed args:\n";
115       for (auto &use : usageInfo) {
116         mlir::Value v = use.first;
117         v.print(llvm::dbgs(), printFlags);
118         llvm::dbgs() << "\n";
119       }
120       llvm::dbgs() << "\nCannot transform args:\n";
121       for (mlir::Value arg : cannotTransform) {
122         arg.print(llvm::dbgs(), printFlags);
123         llvm::dbgs() << "\n";
124       }
125       llvm::dbgs() << "====\n";
126     });
127   }
128 
129   // Erase usageInfo and cannotTransform entries for a set
130   // of given arguments.
131   void eraseUsage(const llvm::SetVector<mlir::Value> &args) {
132     for (auto &arg : args)
133       usageInfo.erase(arg);
134     cannotTransform.set_subtract(args);
135   }
136 
137   // Erase usageInfo and cannotTransform entries for a set
138   // of given arguments provided in the form of usageInfo map.
139   void eraseUsage(const llvm::DenseMap<mlir::Value, ArgInfo> &args) {
140     for (auto &arg : args) {
141       usageInfo.erase(arg.first);
142       cannotTransform.remove(arg.first);
143     }
144   }
145 };
146 } // namespace
147 
148 static fir::SequenceType getAsSequenceType(mlir::Value v) {
149   mlir::Type argTy = fir::unwrapPassByRefType(fir::unwrapRefType(v.getType()));
150   return mlir::dyn_cast<fir::SequenceType>(argTy);
151 }
152 
153 /// Return the rank and the element size (in bytes) of the given
154 /// value \p v. If it is not an array or the element type is not
155 /// supported, then return <0, 0>. Only trivial data types
156 /// are currently supported.
157 /// When \p isArgument is true, \p v is assumed to be a function
158 /// argument. If \p v's type does not look like a type of an assumed
159 /// shape array, then the function returns <0, 0>.
160 /// When \p isArgument is false, array types with known innermost
161 /// dimension are allowed to proceed.
162 static std::pair<unsigned, size_t>
163 getRankAndElementSize(const fir::KindMapping &kindMap,
164                       const mlir::DataLayout &dl, mlir::Value v,
165                       bool isArgument = false) {
166   if (auto seqTy = getAsSequenceType(v)) {
167     unsigned rank = seqTy.getDimension();
168     if (rank > 0 &&
169         (!isArgument ||
170          seqTy.getShape()[0] == fir::SequenceType::getUnknownExtent())) {
171       size_t typeSize = 0;
172       mlir::Type elementType = fir::unwrapSeqOrBoxedSeqType(v.getType());
173       if (fir::isa_trivial(elementType)) {
174         auto [eleSize, eleAlign] = fir::getTypeSizeAndAlignmentOrCrash(
175             v.getLoc(), elementType, dl, kindMap);
176         typeSize = llvm::alignTo(eleSize, eleAlign);
177       }
178       if (typeSize)
179         return {rank, typeSize};
180     }
181   }
182 
183   LLVM_DEBUG(llvm::dbgs() << "Unsupported rank/type: " << v << '\n');
184   return {0, 0};
185 }
186 
187 /// if a value comes from a fir.declare, follow it to the original source,
188 /// otherwise return the value
189 static mlir::Value unwrapFirDeclare(mlir::Value val) {
190   // fir.declare is for source code variables. We don't have declares of
191   // declares
192   if (fir::DeclareOp declare = val.getDefiningOp<fir::DeclareOp>())
193     return declare.getMemref();
194   return val;
195 }
196 
197 /// Return true, if \p rebox operation keeps the input array
198 /// continuous in the innermost dimension, if it is initially continuous
199 /// in the innermost dimension.
200 static bool reboxPreservesContinuity(fir::ReboxOp rebox) {
201   // If slicing is not involved, then the rebox does not affect
202   // the continuity of the array.
203   auto sliceArg = rebox.getSlice();
204   if (!sliceArg)
205     return true;
206 
207   // A slice with step=1 in the innermost dimension preserves
208   // the continuity of the array in the innermost dimension.
209   if (auto sliceOp =
210           mlir::dyn_cast_or_null<fir::SliceOp>(sliceArg.getDefiningOp())) {
211     if (sliceOp.getFields().empty() && sliceOp.getSubstr().empty()) {
212       auto triples = sliceOp.getTriples();
213       if (triples.size() > 2)
214         if (auto innermostStep = fir::getIntIfConstant(triples[2]))
215           if (*innermostStep == 1)
216             return true;
217     }
218 
219     LLVM_DEBUG(llvm::dbgs()
220                << "REBOX with slicing may produce non-contiguous array: "
221                << sliceOp << '\n'
222                << rebox << '\n');
223     return false;
224   }
225 
226   LLVM_DEBUG(llvm::dbgs() << "REBOX with unknown slice" << sliceArg << '\n'
227                           << rebox << '\n');
228   return false;
229 }
230 
231 /// if a value comes from a fir.rebox, follow the rebox to the original source,
232 /// of the value, otherwise return the value
233 static mlir::Value unwrapReboxOp(mlir::Value val) {
234   while (fir::ReboxOp rebox = val.getDefiningOp<fir::ReboxOp>()) {
235     if (!reboxPreservesContinuity(rebox))
236       break;
237     val = rebox.getBox();
238   }
239   return val;
240 }
241 
242 /// normalize a value (removing fir.declare and fir.rebox) so that we can
243 /// more conveniently spot values which came from function arguments
244 static mlir::Value normaliseVal(mlir::Value val) {
245   return unwrapFirDeclare(unwrapReboxOp(val));
246 }
247 
248 /// some FIR operations accept a fir.shape, a fir.shift or a fir.shapeshift.
249 /// fir.shift and fir.shapeshift allow us to extract lower bounds
250 /// if lowerbounds cannot be found, return nullptr
251 static mlir::Value tryGetLowerBoundsFromShapeLike(mlir::Value shapeLike,
252                                                   unsigned dim) {
253   mlir::Value lowerBound{nullptr};
254   if (auto shift = shapeLike.getDefiningOp<fir::ShiftOp>())
255     lowerBound = shift.getOrigins()[dim];
256   if (auto shapeShift = shapeLike.getDefiningOp<fir::ShapeShiftOp>())
257     lowerBound = shapeShift.getOrigins()[dim];
258   return lowerBound;
259 }
260 
261 /// attempt to get the array lower bounds of dimension dim of the memref
262 /// argument to a fir.array_coor op
263 /// 0 <= dim < rank
264 /// May return nullptr if no lower bounds can be determined
265 static mlir::Value getLowerBound(fir::ArrayCoorOp coop, unsigned dim) {
266   // 1) try to get from the shape argument to fir.array_coor
267   if (mlir::Value shapeLike = coop.getShape())
268     if (mlir::Value lb = tryGetLowerBoundsFromShapeLike(shapeLike, dim))
269       return lb;
270 
271   // It is important not to try to read the lower bound from the box, because
272   // in the FIR lowering, boxes will sometimes contain incorrect lower bound
273   // information
274 
275   // out of ideas
276   return {};
277 }
278 
279 /// gets the i'th index from array coordinate operation op
280 /// dim should range between 0 and rank - 1
281 static mlir::Value getIndex(fir::FirOpBuilder &builder, mlir::Operation *op,
282                             unsigned dim) {
283   if (fir::CoordinateOp coop = mlir::dyn_cast<fir::CoordinateOp>(op))
284     return coop.getCoor()[dim];
285 
286   fir::ArrayCoorOp coop = mlir::dyn_cast<fir::ArrayCoorOp>(op);
287   assert(coop &&
288          "operation must be either fir.coordiante_of or fir.array_coor");
289 
290   // fir.coordinate_of indices start at 0: adjust these indices to match by
291   // subtracting the lower bound
292   mlir::Value index = coop.getIndices()[dim];
293   mlir::Value lb = getLowerBound(coop, dim);
294   if (!lb)
295     // assume a default lower bound of one
296     lb = builder.createIntegerConstant(coop.getLoc(), index.getType(), 1);
297 
298   // index_0 = index - lb;
299   if (lb.getType() != index.getType())
300     lb = builder.createConvert(coop.getLoc(), index.getType(), lb);
301   return builder.create<mlir::arith::SubIOp>(coop.getLoc(), index, lb);
302 }
303 
304 void LoopVersioningPass::runOnOperation() {
305   LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n");
306   mlir::func::FuncOp func = getOperation();
307 
308   // First look for arguments with assumed shape = unknown extent in the lowest
309   // dimension.
310   LLVM_DEBUG(llvm::dbgs() << "Func-name:" << func.getSymName() << "\n");
311   mlir::Block::BlockArgListType args = func.getArguments();
312   mlir::ModuleOp module = func->getParentOfType<mlir::ModuleOp>();
313   fir::KindMapping kindMap = fir::getKindMapping(module);
314   mlir::SmallVector<ArgInfo, 4> argsOfInterest;
315   std::optional<mlir::DataLayout> dl =
316       fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
317   if (!dl)
318     mlir::emitError(module.getLoc(),
319                     "data layout attribute is required to perform " DEBUG_TYPE
320                     "pass");
321   for (auto &arg : args) {
322     // Optional arguments must be checked for IsPresent before
323     // looking for the bounds. They are unsupported for the time being.
324     if (func.getArgAttrOfType<mlir::UnitAttr>(arg.getArgNumber(),
325                                               fir::getOptionalAttrName())) {
326       LLVM_DEBUG(llvm::dbgs() << "OPTIONAL is not supported\n");
327       continue;
328     }
329 
330     auto [rank, typeSize] =
331         getRankAndElementSize(kindMap, *dl, arg, /*isArgument=*/true);
332     if (rank != 0 && typeSize != 0)
333       argsOfInterest.push_back({arg, typeSize, rank, {}});
334   }
335 
336   if (argsOfInterest.empty()) {
337     LLVM_DEBUG(llvm::dbgs()
338                << "No suitable arguments.\n=== End " DEBUG_TYPE " ===\n");
339     return;
340   }
341 
342   // A list of all loops in the function in post-order.
343   mlir::SmallVector<fir::DoLoopOp> originalLoops;
344   // Information about the arguments usage by the instructions
345   // immediately nested in a loop.
346   llvm::DenseMap<fir::DoLoopOp, ArgsUsageInLoop> argsInLoops;
347 
348   auto &domInfo = getAnalysis<mlir::DominanceInfo>();
349 
350   // Traverse the loops in post-order and see
351   // if those arguments are used inside any loop.
352   func.walk([&](fir::DoLoopOp loop) {
353     mlir::Block &body = *loop.getBody();
354     auto &argsInLoop = argsInLoops[loop];
355     originalLoops.push_back(loop);
356     body.walk([&](mlir::Operation *op) {
357       // Support either fir.array_coor or fir.coordinate_of.
358       if (!mlir::isa<fir::ArrayCoorOp, fir::CoordinateOp>(op))
359         return;
360       // Process only operations immediately nested in the current loop.
361       if (op->getParentOfType<fir::DoLoopOp>() != loop)
362         return;
363       mlir::Value operand = op->getOperand(0);
364       for (auto a : argsOfInterest) {
365         if (a.arg == normaliseVal(operand)) {
366           // Use the reboxed value, not the block arg when re-creating the loop.
367           a.arg = operand;
368 
369           // Check that the operand dominates the loop?
370           // If this is the case, record such operands in argsInLoop.cannot-
371           // Transform, so that they disable the transformation for the parent
372           /// loops as well.
373           if (!domInfo.dominates(a.arg, loop))
374             argsInLoop.cannotTransform.insert(a.arg);
375 
376           // No support currently for sliced arrays.
377           // This means that we cannot transform properly
378           // instructions referencing a.arg in the whole loop
379           // nest this loop is located in.
380           if (auto arrayCoor = mlir::dyn_cast<fir::ArrayCoorOp>(op))
381             if (arrayCoor.getSlice())
382               argsInLoop.cannotTransform.insert(a.arg);
383 
384           // We need to compute the rank and element size
385           // based on the operand, not the original argument,
386           // because array slicing may affect it.
387           std::tie(a.rank, a.size) = getRankAndElementSize(kindMap, *dl, a.arg);
388           if (a.rank == 0 || a.size == 0)
389             argsInLoop.cannotTransform.insert(a.arg);
390 
391           if (argsInLoop.cannotTransform.contains(a.arg)) {
392             // Remove any previously recorded usage, if any.
393             argsInLoop.usageInfo.erase(a.arg);
394             break;
395           }
396 
397           // Record the a.arg usage, if not recorded yet.
398           argsInLoop.usageInfo.try_emplace(a.arg, a);
399           break;
400         }
401       }
402     });
403   });
404 
405   // Dump loops info after initial collection.
406   LLVM_DEBUG({
407     llvm::dbgs() << "Initial usage info:\n";
408     for (fir::DoLoopOp loop : originalLoops) {
409       auto &argsInLoop = argsInLoops[loop];
410       argsInLoop.dump(loop);
411     }
412   });
413 
414   // Clear argument usage for parent loops if an inner loop
415   // contains a non-transformable usage.
416   for (fir::DoLoopOp loop : originalLoops) {
417     auto &argsInLoop = argsInLoops[loop];
418     if (argsInLoop.cannotTransform.empty())
419       continue;
420 
421     fir::DoLoopOp parent = loop;
422     while ((parent = parent->getParentOfType<fir::DoLoopOp>()))
423       argsInLoops[parent].eraseUsage(argsInLoop.cannotTransform);
424   }
425 
426   // If an argument access can be optimized in a loop and
427   // its descendant loop, then it does not make sense to
428   // generate the contiguity check for the descendant loop.
429   // The check will be produced as part of the ancestor
430   // loop's transformation. So we can clear the argument
431   // usage for all descendant loops.
432   for (fir::DoLoopOp loop : originalLoops) {
433     auto &argsInLoop = argsInLoops[loop];
434     if (argsInLoop.usageInfo.empty())
435       continue;
436 
437     loop.getBody()->walk([&](fir::DoLoopOp dloop) {
438       argsInLoops[dloop].eraseUsage(argsInLoop.usageInfo);
439     });
440   }
441 
442   LLVM_DEBUG({
443     llvm::dbgs() << "Final usage info:\n";
444     for (fir::DoLoopOp loop : originalLoops) {
445       auto &argsInLoop = argsInLoops[loop];
446       argsInLoop.dump(loop);
447     }
448   });
449 
450   // Reduce the collected information to a list of loops
451   // with attached arguments usage information.
452   // The list must hold the loops in post order, so that
453   // the inner loops are transformed before the outer loops.
454   struct OpsWithArgs {
455     mlir::Operation *op;
456     mlir::SmallVector<ArgInfo, 4> argsAndDims;
457   };
458   mlir::SmallVector<OpsWithArgs, 4> loopsOfInterest;
459   for (fir::DoLoopOp loop : originalLoops) {
460     auto &argsInLoop = argsInLoops[loop];
461     if (argsInLoop.usageInfo.empty())
462       continue;
463     OpsWithArgs info;
464     info.op = loop;
465     for (auto &arg : argsInLoop.usageInfo)
466       info.argsAndDims.push_back(arg.second);
467     loopsOfInterest.emplace_back(std::move(info));
468   }
469 
470   if (loopsOfInterest.empty()) {
471     LLVM_DEBUG(llvm::dbgs()
472                << "No loops to transform.\n=== End " DEBUG_TYPE " ===\n");
473     return;
474   }
475 
476   // If we get here, there are loops to process.
477   fir::FirOpBuilder builder{module, std::move(kindMap)};
478   mlir::Location loc = builder.getUnknownLoc();
479   mlir::IndexType idxTy = builder.getIndexType();
480 
481   LLVM_DEBUG(llvm::dbgs() << "Func Before transformation:\n");
482   LLVM_DEBUG(func->dump());
483 
484   LLVM_DEBUG(llvm::dbgs() << "loopsOfInterest: " << loopsOfInterest.size()
485                           << "\n");
486   for (auto op : loopsOfInterest) {
487     LLVM_DEBUG(op.op->dump());
488     builder.setInsertionPoint(op.op);
489 
490     mlir::Value allCompares = nullptr;
491     // Ensure all of the arrays are unit-stride.
492     for (auto &arg : op.argsAndDims) {
493       // Fetch all the dimensions of the array, except the last dimension.
494       // Always fetch the first dimension, however, so set ndims = 1 if
495       // we have one dim
496       unsigned ndims = arg.rank;
497       for (unsigned i = 0; i < ndims; i++) {
498         mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i);
499         arg.dims[i] = builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy,
500                                                      arg.arg, dimIdx);
501       }
502       // We only care about lowest order dimension, here.
503       mlir::Value elemSize =
504           builder.createIntegerConstant(loc, idxTy, arg.size);
505       mlir::Value cmp = builder.create<mlir::arith::CmpIOp>(
506           loc, mlir::arith::CmpIPredicate::eq, arg.dims[0].getResult(2),
507           elemSize);
508       if (!allCompares) {
509         allCompares = cmp;
510       } else {
511         allCompares =
512             builder.create<mlir::arith::AndIOp>(loc, cmp, allCompares);
513       }
514     }
515 
516     auto ifOp =
517         builder.create<fir::IfOp>(loc, op.op->getResultTypes(), allCompares,
518                                   /*withElse=*/true);
519     builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
520 
521     LLVM_DEBUG(llvm::dbgs() << "Creating cloned loop\n");
522     mlir::Operation *clonedLoop = op.op->clone();
523     bool changed = false;
524     for (auto &arg : op.argsAndDims) {
525       fir::SequenceType::Shape newShape;
526       newShape.push_back(fir::SequenceType::getUnknownExtent());
527       auto elementType = fir::unwrapSeqOrBoxedSeqType(arg.arg.getType());
528       mlir::Type arrTy = fir::SequenceType::get(newShape, elementType);
529       mlir::Type boxArrTy = fir::BoxType::get(arrTy);
530       mlir::Type refArrTy = builder.getRefType(arrTy);
531       auto carg = builder.create<fir::ConvertOp>(loc, boxArrTy, arg.arg);
532       auto caddr = builder.create<fir::BoxAddrOp>(loc, refArrTy, carg);
533       auto insPt = builder.saveInsertionPoint();
534       // Use caddr instead of arg.
535       clonedLoop->walk([&](mlir::Operation *coop) {
536         if (!mlir::isa<fir::CoordinateOp, fir::ArrayCoorOp>(coop))
537           return;
538         // Reduce the multi-dimensioned index to a single index.
539         // This is required becase fir arrays do not support multiple dimensions
540         // with unknown dimensions at compile time.
541         // We then calculate the multidimensional array like this:
542         // arr(x, y, z) bedcomes arr(z * stride(2) + y * stride(1) + x)
543         // where stride is the distance between elements in the dimensions
544         // 0, 1 and 2 or x, y and z.
545         if (coop->getOperand(0) == arg.arg && coop->getOperands().size() >= 2) {
546           builder.setInsertionPoint(coop);
547           mlir::Value totalIndex;
548           for (unsigned i = arg.rank - 1; i > 0; i--) {
549             mlir::Value curIndex =
550                 builder.createConvert(loc, idxTy, getIndex(builder, coop, i));
551             // Multiply by the stride of this array. Later we'll divide by the
552             // element size.
553             mlir::Value scale =
554                 builder.createConvert(loc, idxTy, arg.dims[i].getResult(2));
555             curIndex =
556                 builder.create<mlir::arith::MulIOp>(loc, scale, curIndex);
557             totalIndex = (totalIndex) ? builder.create<mlir::arith::AddIOp>(
558                                             loc, curIndex, totalIndex)
559                                       : curIndex;
560           }
561           // This is the lowest dimension - which doesn't need scaling
562           mlir::Value finalIndex =
563               builder.createConvert(loc, idxTy, getIndex(builder, coop, 0));
564           if (totalIndex) {
565             assert(llvm::isPowerOf2_32(arg.size) &&
566                    "Expected power of two here");
567             unsigned bits = llvm::Log2_32(arg.size);
568             mlir::Value elemShift =
569                 builder.createIntegerConstant(loc, idxTy, bits);
570             totalIndex = builder.create<mlir::arith::AddIOp>(
571                 loc,
572                 builder.create<mlir::arith::ShRSIOp>(loc, totalIndex,
573                                                      elemShift),
574                 finalIndex);
575           } else {
576             totalIndex = finalIndex;
577           }
578           auto newOp = builder.create<fir::CoordinateOp>(
579               loc, builder.getRefType(elementType), caddr,
580               mlir::ValueRange{totalIndex});
581           LLVM_DEBUG(newOp->dump());
582           coop->getResult(0).replaceAllUsesWith(newOp->getResult(0));
583           coop->erase();
584           changed = true;
585         }
586       });
587 
588       builder.restoreInsertionPoint(insPt);
589     }
590     assert(changed && "Expected operations to have changed");
591 
592     builder.insert(clonedLoop);
593     // Forward the result(s), if any, from the loop operation to the
594     //
595     mlir::ResultRange results = clonedLoop->getResults();
596     bool hasResults = (results.size() > 0);
597     if (hasResults)
598       builder.create<fir::ResultOp>(loc, results);
599 
600     // Add the original loop in the else-side of the if operation.
601     builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
602     op.op->replaceAllUsesWith(ifOp);
603     op.op->remove();
604     builder.insert(op.op);
605     // Rely on "cloned loop has results, so original loop also has results".
606     if (hasResults) {
607       builder.create<fir::ResultOp>(loc, op.op->getResults());
608     } else {
609       // Use an assert to check this.
610       assert(op.op->getResults().size() == 0 &&
611              "Weird, the cloned loop doesn't have results, but the original "
612              "does?");
613     }
614   }
615 
616   LLVM_DEBUG(llvm::dbgs() << "Func After transform:\n");
617   LLVM_DEBUG(func->dump());
618 
619   LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n");
620 }
621