//===- MVETailPredication.cpp - MVE Tail Predication ------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Armv8.1m introduced MVE, M-Profile Vector Extension, and low-overhead
/// branches to help accelerate DSP applications. These two extensions,
/// combined with a new form of predication called tail-predication, can be used
/// to provide implicit vector predication within a low-overhead loop.
/// This is implicit because the predicate of active/inactive lanes is
/// calculated by hardware, and thus does not need to be explicitly passed
/// to vector instructions. The instructions responsible for this are the
/// DLSTP and WLSTP instructions, which set up a tail-predicated loop and the
/// total number of data elements processed by the loop. The loop-end
/// LETP instruction is responsible for decrementing and setting the remaining
/// elements to be processed and generating the mask of active lanes.
///
/// The HardwareLoops pass inserts intrinsics identifying loops that the
/// backend will attempt to convert into a low-overhead loop. The vectorizer is
/// responsible for generating a vectorized loop in which the lanes are
/// predicated upon a get.active.lane.mask intrinsic. This pass looks at these
/// get.active.lane.mask intrinsics and attempts to convert them to VCTP
/// instructions. This will be picked up by the ARM Low-overhead loop pass later
/// in the backend, which performs the final transformation to a DLSTP or WLSTP
/// tail-predicated loop.
30 // 31 //===----------------------------------------------------------------------===// 32 33 #include "ARM.h" 34 #include "ARMSubtarget.h" 35 #include "ARMTargetTransformInfo.h" 36 #include "llvm/Analysis/LoopInfo.h" 37 #include "llvm/Analysis/LoopPass.h" 38 #include "llvm/Analysis/ScalarEvolution.h" 39 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 40 #include "llvm/Analysis/TargetLibraryInfo.h" 41 #include "llvm/Analysis/TargetTransformInfo.h" 42 #include "llvm/Analysis/ValueTracking.h" 43 #include "llvm/CodeGen/TargetPassConfig.h" 44 #include "llvm/IR/IRBuilder.h" 45 #include "llvm/IR/Instructions.h" 46 #include "llvm/IR/IntrinsicsARM.h" 47 #include "llvm/Support/Debug.h" 48 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 49 #include "llvm/Transforms/Utils/Local.h" 50 #include "llvm/Transforms/Utils/LoopUtils.h" 51 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" 52 53 using namespace llvm; 54 55 #define DEBUG_TYPE "mve-tail-predication" 56 #define DESC "Transform predicated vector loops to use MVE tail predication" 57 58 cl::opt<TailPredication::Mode> EnableTailPredication( 59 "tail-predication", cl::desc("MVE tail-predication pass options"), 60 cl::init(TailPredication::Enabled), 61 cl::values(clEnumValN(TailPredication::Disabled, "disabled", 62 "Don't tail-predicate loops"), 63 clEnumValN(TailPredication::EnabledNoReductions, 64 "enabled-no-reductions", 65 "Enable tail-predication, but not for reduction loops"), 66 clEnumValN(TailPredication::Enabled, 67 "enabled", 68 "Enable tail-predication, including reduction loops"), 69 clEnumValN(TailPredication::ForceEnabledNoReductions, 70 "force-enabled-no-reductions", 71 "Enable tail-predication, but not for reduction loops, " 72 "and force this which might be unsafe"), 73 clEnumValN(TailPredication::ForceEnabled, 74 "force-enabled", 75 "Enable tail-predication, including reduction loops, " 76 "and force this which might be unsafe"))); 77 78 79 namespace { 80 81 class MVETailPredication : 
public LoopPass { 82 SmallVector<IntrinsicInst*, 4> MaskedInsts; 83 Loop *L = nullptr; 84 ScalarEvolution *SE = nullptr; 85 TargetTransformInfo *TTI = nullptr; 86 const ARMSubtarget *ST = nullptr; 87 88 public: 89 static char ID; 90 91 MVETailPredication() : LoopPass(ID) { } 92 93 void getAnalysisUsage(AnalysisUsage &AU) const override { 94 AU.addRequired<ScalarEvolutionWrapperPass>(); 95 AU.addRequired<LoopInfoWrapperPass>(); 96 AU.addRequired<TargetPassConfig>(); 97 AU.addRequired<TargetTransformInfoWrapperPass>(); 98 AU.addPreserved<LoopInfoWrapperPass>(); 99 AU.setPreservesCFG(); 100 } 101 102 bool runOnLoop(Loop *L, LPPassManager&) override; 103 104 private: 105 /// Perform the relevant checks on the loop and convert active lane masks if 106 /// possible. 107 bool TryConvertActiveLaneMask(Value *TripCount); 108 109 /// Perform several checks on the arguments of @llvm.get.active.lane.mask 110 /// intrinsic. E.g., check that the loop induction variable and the element 111 /// count are of the form we expect, and also perform overflow checks for 112 /// the new expressions that are created. 113 const SCEV *IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount); 114 115 /// Insert the intrinsic to represent the effect of tail predication. 
116 void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *Start); 117 }; 118 119 } // end namespace 120 121 bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { 122 if (skipLoop(L) || !EnableTailPredication) 123 return false; 124 125 MaskedInsts.clear(); 126 Function &F = *L->getHeader()->getParent(); 127 auto &TPC = getAnalysis<TargetPassConfig>(); 128 auto &TM = TPC.getTM<TargetMachine>(); 129 ST = &TM.getSubtarget<ARMSubtarget>(F); 130 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 131 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 132 this->L = L; 133 134 // The MVE and LOB extensions are combined to enable tail-predication, but 135 // there's nothing preventing us from generating VCTP instructions for v8.1m. 136 if (!ST->hasMVEIntegerOps() || !ST->hasV8_1MMainlineOps()) { 137 LLVM_DEBUG(dbgs() << "ARM TP: Not a v8.1m.main+mve target.\n"); 138 return false; 139 } 140 141 BasicBlock *Preheader = L->getLoopPreheader(); 142 if (!Preheader) 143 return false; 144 145 auto FindLoopIterations = [](BasicBlock *BB) -> IntrinsicInst* { 146 for (auto &I : *BB) { 147 auto *Call = dyn_cast<IntrinsicInst>(&I); 148 if (!Call) 149 continue; 150 151 Intrinsic::ID ID = Call->getIntrinsicID(); 152 if (ID == Intrinsic::start_loop_iterations || 153 ID == Intrinsic::test_start_loop_iterations) 154 return cast<IntrinsicInst>(&I); 155 } 156 return nullptr; 157 }; 158 159 // Look for the hardware loop intrinsic that sets the iteration count. 160 IntrinsicInst *Setup = FindLoopIterations(Preheader); 161 162 // The test.set iteration could live in the pre-preheader. 
163 if (!Setup) { 164 if (!Preheader->getSinglePredecessor()) 165 return false; 166 Setup = FindLoopIterations(Preheader->getSinglePredecessor()); 167 if (!Setup) 168 return false; 169 } 170 171 LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"); 172 173 bool Changed = TryConvertActiveLaneMask(Setup->getArgOperand(0)); 174 175 return Changed; 176 } 177 178 // The active lane intrinsic has this form: 179 // 180 // @llvm.get.active.lane.mask(IV, TC) 181 // 182 // Here we perform checks that this intrinsic behaves as expected, 183 // which means: 184 // 185 // 1) Check that the TripCount (TC) belongs to this loop (originally). 186 // 2) The element count (TC) needs to be sufficiently large that the decrement 187 // of element counter doesn't overflow, which means that we need to prove: 188 // ceil(ElementCount / VectorWidth) >= TripCount 189 // by rounding up ElementCount up: 190 // ((ElementCount + (VectorWidth - 1)) / VectorWidth 191 // and evaluate if expression isKnownNonNegative: 192 // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount 193 // 3) The IV must be an induction phi with an increment equal to the 194 // vector width. 
195 const SCEV *MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, 196 Value *TripCount) { 197 bool ForceTailPredication = 198 EnableTailPredication == TailPredication::ForceEnabledNoReductions || 199 EnableTailPredication == TailPredication::ForceEnabled; 200 201 Value *ElemCount = ActiveLaneMask->getOperand(1); 202 bool Changed = false; 203 if (!L->makeLoopInvariant(ElemCount, Changed)) 204 return nullptr; 205 206 const SCEV *EC = SE->getSCEV(ElemCount); 207 const SCEV *TC = SE->getSCEV(TripCount); 208 int VectorWidth = 209 cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements(); 210 if (VectorWidth != 2 && VectorWidth != 4 && VectorWidth != 8 && 211 VectorWidth != 16) 212 return nullptr; 213 ConstantInt *ConstElemCount = nullptr; 214 215 // 1) Smoke tests that the original scalar loop TripCount (TC) belongs to 216 // this loop. The scalar tripcount corresponds the number of elements 217 // processed by the loop, so we will refer to that from this point on. 218 if (!SE->isLoopInvariant(EC, L)) { 219 LLVM_DEBUG(dbgs() << "ARM TP: element count must be loop invariant.\n"); 220 return nullptr; 221 } 222 223 // 2) Find out if IV is an induction phi. Note that we can't use Loop 224 // helpers here to get the induction variable, because the hardware loop is 225 // no longer in loopsimplify form, and also the hwloop intrinsic uses a 226 // different counter. Using SCEV, we check that the induction is of the 227 // form i = i + 4, where the increment must be equal to the VectorWidth. 228 auto *IV = ActiveLaneMask->getOperand(0); 229 const SCEV *IVExpr = SE->getSCEV(IV); 230 auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr); 231 232 if (!AddExpr) { 233 LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump()); 234 return nullptr; 235 } 236 // Check that this AddRec is associated with this loop. 
237 if (AddExpr->getLoop() != L) { 238 LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n"); 239 return nullptr; 240 } 241 auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1)); 242 if (!Step) { 243 LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: "; 244 AddExpr->getOperand(1)->dump()); 245 return nullptr; 246 } 247 auto StepValue = Step->getValue()->getSExtValue(); 248 if (VectorWidth != StepValue) { 249 LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue 250 << " doesn't match vector width " << VectorWidth << "\n"); 251 return nullptr; 252 } 253 254 if ((ConstElemCount = dyn_cast<ConstantInt>(ElemCount))) { 255 ConstantInt *TC = dyn_cast<ConstantInt>(TripCount); 256 if (!TC) { 257 LLVM_DEBUG(dbgs() << "ARM TP: Constant tripcount expected in " 258 "set.loop.iterations\n"); 259 return nullptr; 260 } 261 262 // Calculate 2 tripcount values and check that they are consistent with 263 // each other. The TripCount for a predicated vector loop body is 264 // ceil(ElementCount/Width), or floor((ElementCount+Width-1)/Width) as we 265 // work it out here. 266 uint64_t TC1 = TC->getZExtValue(); 267 uint64_t TC2 = 268 (ConstElemCount->getZExtValue() + VectorWidth - 1) / VectorWidth; 269 270 // If the tripcount values are inconsistent, we can't insert the VCTP and 271 // trigger tail-predication; keep the intrinsic as a get.active.lane.mask 272 // and legalize this. 273 if (TC1 != TC2) { 274 LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: " 275 << TC1 << " from set.loop.iterations, and " 276 << TC2 << " from get.active.lane.mask\n"); 277 return nullptr; 278 } 279 } else if (!ForceTailPredication) { 280 // 3) We need to prove that the sub expression that we create in the 281 // tail-predicated loop body, which calculates the remaining elements to be 282 // processed, is non-negative, i.e. 
it doesn't overflow: 283 // 284 // ((ElementCount + VectorWidth - 1) / VectorWidth) - TripCount >= 0 285 // 286 // This is true if: 287 // 288 // TripCount == (ElementCount + VectorWidth - 1) / VectorWidth 289 // 290 // which what we will be using here. 291 // 292 const SCEV *VW = 293 SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)); 294 // ElementCount + (VW-1): 295 const SCEV *Start = AddExpr->getStart(); 296 const SCEV *ECPlusVWMinus1 = SE->getAddExpr( 297 EC, 298 SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1))); 299 300 // Ceil = ElementCount + (VW-1) / VW 301 const SCEV *Ceil = SE->getUDivExpr(ECPlusVWMinus1, VW); 302 303 // Prevent unused variable warnings with TC 304 (void)TC; 305 LLVM_DEBUG({ 306 dbgs() << "ARM TP: Analysing overflow behaviour for:\n"; 307 dbgs() << "ARM TP: - TripCount = " << *TC << "\n"; 308 dbgs() << "ARM TP: - ElemCount = " << *EC << "\n"; 309 dbgs() << "ARM TP: - Start = " << *Start << "\n"; 310 dbgs() << "ARM TP: - BETC = " << *SE->getBackedgeTakenCount(L) << "\n"; 311 dbgs() << "ARM TP: - VecWidth = " << VectorWidth << "\n"; 312 dbgs() << "ARM TP: - (ElemCount+VW-1) / VW = " << *Ceil << "\n"; 313 }); 314 315 // As an example, almost all the tripcount expressions (produced by the 316 // vectoriser) look like this: 317 // 318 // TC = ((-4 + (4 * ((3 + %N) /u 4))<nuw> - start) /u 4) 319 // 320 // and "ElementCount + (VW-1) / VW": 321 // 322 // Ceil = ((3 + %N) /u 4) 323 // 324 // Check for equality of TC and Ceil by calculating SCEV expression 325 // TC - Ceil and test it for zero. 326 // 327 const SCEV *Div = SE->getUDivExpr( 328 SE->getAddExpr(SE->getMulExpr(Ceil, VW), SE->getNegativeSCEV(VW), 329 SE->getNegativeSCEV(Start)), 330 VW); 331 const SCEV *Sub = SE->getMinusSCEV(SE->getBackedgeTakenCount(L), Div); 332 LLVM_DEBUG(dbgs() << "ARM TP: - Sub = "; Sub->dump()); 333 334 // Use context sensitive facts about the path to the loop to refine. 
This 335 // comes up as the backedge taken count can incorporate context sensitive 336 // reasoning, and our RHS just above doesn't. 337 Sub = SE->applyLoopGuards(Sub, L); 338 LLVM_DEBUG(dbgs() << "ARM TP: - (Guarded) = "; Sub->dump()); 339 340 if (!Sub->isZero()) { 341 LLVM_DEBUG(dbgs() << "ARM TP: possible overflow in sub expression.\n"); 342 return nullptr; 343 } 344 } 345 346 // Check that the start value is a multiple of the VectorWidth. 347 // TODO: This could do with a method to check if the scev is a multiple of 348 // VectorWidth. For the moment we just check for constants, muls and unknowns 349 // (which use MaskedValueIsZero and seems to be the most common). 350 if (auto *BaseC = dyn_cast<SCEVConstant>(AddExpr->getStart())) { 351 if (BaseC->getAPInt().urem(VectorWidth) == 0) 352 return SE->getMinusSCEV(EC, BaseC); 353 } else if (auto *BaseV = dyn_cast<SCEVUnknown>(AddExpr->getStart())) { 354 Type *Ty = BaseV->getType(); 355 APInt Mask = APInt::getLowBitsSet(Ty->getPrimitiveSizeInBits(), 356 Log2_64(VectorWidth)); 357 if (MaskedValueIsZero(BaseV->getValue(), Mask, 358 L->getHeader()->getDataLayout())) 359 return SE->getMinusSCEV(EC, BaseV); 360 } else if (auto *BaseMul = dyn_cast<SCEVMulExpr>(AddExpr->getStart())) { 361 if (auto *BaseC = dyn_cast<SCEVConstant>(BaseMul->getOperand(0))) 362 if (BaseC->getAPInt().urem(VectorWidth) == 0) 363 return SE->getMinusSCEV(EC, BaseC); 364 if (auto *BaseC = dyn_cast<SCEVConstant>(BaseMul->getOperand(1))) 365 if (BaseC->getAPInt().urem(VectorWidth) == 0) 366 return SE->getMinusSCEV(EC, BaseC); 367 } 368 369 LLVM_DEBUG( 370 dbgs() << "ARM TP: induction base is not know to be a multiple of VF: " 371 << *AddExpr->getOperand(0) << "\n"); 372 return nullptr; 373 } 374 375 void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, 376 Value *Start) { 377 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 378 Module *M = L->getHeader()->getModule(); 379 Type *Ty = IntegerType::get(M->getContext(), 
32); 380 unsigned VectorWidth = 381 cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements(); 382 383 // Insert a phi to count the number of elements processed by the loop. 384 Builder.SetInsertPoint(L->getHeader(), L->getHeader()->getFirstNonPHIIt()); 385 PHINode *Processed = Builder.CreatePHI(Ty, 2); 386 Processed->addIncoming(Start, L->getLoopPreheader()); 387 388 // Replace @llvm.get.active.mask() with the ARM specific VCTP intrinic, and 389 // thus represent the effect of tail predication. 390 Builder.SetInsertPoint(ActiveLaneMask); 391 ConstantInt *Factor = ConstantInt::get(cast<IntegerType>(Ty), VectorWidth); 392 393 Intrinsic::ID VCTPID; 394 switch (VectorWidth) { 395 default: 396 llvm_unreachable("unexpected number of lanes"); 397 case 2: VCTPID = Intrinsic::arm_mve_vctp64; break; 398 case 4: VCTPID = Intrinsic::arm_mve_vctp32; break; 399 case 8: VCTPID = Intrinsic::arm_mve_vctp16; break; 400 case 16: VCTPID = Intrinsic::arm_mve_vctp8; break; 401 } 402 Value *VCTPCall = Builder.CreateIntrinsic(VCTPID, {}, Processed); 403 ActiveLaneMask->replaceAllUsesWith(VCTPCall); 404 405 // Add the incoming value to the new phi. 406 // TODO: This add likely already exists in the loop. 
407 Value *Remaining = Builder.CreateSub(Processed, Factor); 408 Processed->addIncoming(Remaining, L->getLoopLatch()); 409 LLVM_DEBUG(dbgs() << "ARM TP: Insert processed elements phi: " 410 << *Processed << "\n" 411 << "ARM TP: Inserted VCTP: " << *VCTPCall << "\n"); 412 } 413 414 bool MVETailPredication::TryConvertActiveLaneMask(Value *TripCount) { 415 SmallVector<IntrinsicInst *, 4> ActiveLaneMasks; 416 for (auto *BB : L->getBlocks()) 417 for (auto &I : *BB) 418 if (auto *Int = dyn_cast<IntrinsicInst>(&I)) 419 if (Int->getIntrinsicID() == Intrinsic::get_active_lane_mask) 420 ActiveLaneMasks.push_back(Int); 421 422 if (ActiveLaneMasks.empty()) 423 return false; 424 425 LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n"); 426 427 for (auto *ActiveLaneMask : ActiveLaneMasks) { 428 LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: " 429 << *ActiveLaneMask << "\n"); 430 431 const SCEV *StartSCEV = IsSafeActiveMask(ActiveLaneMask, TripCount); 432 if (!StartSCEV) { 433 LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n"); 434 return false; 435 } 436 LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP. Start is " << *StartSCEV 437 << "\n"); 438 SCEVExpander Expander(*SE, L->getHeader()->getDataLayout(), 439 "start"); 440 Instruction *Ins = L->getLoopPreheader()->getTerminator(); 441 Value *Start = Expander.expandCodeFor(StartSCEV, StartSCEV->getType(), Ins); 442 LLVM_DEBUG(dbgs() << "ARM TP: Created start value " << *Start << "\n"); 443 InsertVCTPIntrinsic(ActiveLaneMask, Start); 444 } 445 446 // Remove dead instructions and now dead phis. 
447 for (auto *II : ActiveLaneMasks) 448 RecursivelyDeleteTriviallyDeadInstructions(II); 449 for (auto *I : L->blocks()) 450 DeleteDeadPHIs(I); 451 return true; 452 } 453 454 Pass *llvm::createMVETailPredicationPass() { 455 return new MVETailPredication(); 456 } 457 458 char MVETailPredication::ID = 0; 459 460 INITIALIZE_PASS_BEGIN(MVETailPredication, DEBUG_TYPE, DESC, false, false) 461 INITIALIZE_PASS_END(MVETailPredication, DEBUG_TYPE, DESC, false, false) 462