1 //===------ LoopGeneratorsKMP.cpp - IR helper to create loops -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains functions to create parallel loops as LLVM-IR. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "polly/CodeGen/LoopGeneratorsKMP.h" 14 #include "llvm/Analysis/LoopInfo.h" 15 #include "llvm/IR/Dominators.h" 16 #include "llvm/IR/Module.h" 17 18 using namespace llvm; 19 using namespace polly; 20 21 void ParallelLoopGeneratorKMP::createCallSpawnThreads(Value *SubFn, 22 Value *SubFnParam, 23 Value *LB, Value *UB, 24 Value *Stride) { 25 const std::string Name = "__kmpc_fork_call"; 26 Function *F = M->getFunction(Name); 27 Type *KMPCMicroTy = StructType::getTypeByName(M->getContext(), "kmpc_micro"); 28 29 if (!KMPCMicroTy) { 30 // void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...) 31 Type *MicroParams[] = {Builder.getPtrTy(0), Builder.getPtrTy(0)}; 32 33 KMPCMicroTy = FunctionType::get(Builder.getVoidTy(), MicroParams, true); 34 } 35 36 // If F is not available, declare it. 37 if (!F) { 38 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 39 Type *Params[] = {Builder.getPtrTy(0), Builder.getInt32Ty(), 40 Builder.getPtrTy(0)}; 41 42 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, true); 43 F = Function::Create(Ty, Linkage, Name, M); 44 } 45 46 Value *Task = 47 Builder.CreatePointerBitCastOrAddrSpaceCast(SubFn, Builder.getPtrTy(0)); 48 49 Value *Args[] = {SourceLocationInfo, 50 Builder.getInt32(4) /* Number of arguments (w/o Task) */, 51 Task, 52 LB, 53 UB, 54 Stride, 55 SubFnParam}; 56 57 CallInst *Call = Builder.CreateCall(F, Args); 58 Call->setDebugLoc(DLGenerated); 59 } 60 61 void ParallelLoopGeneratorKMP::deployParallelExecution(Function *SubFn, 62 Value *SubFnParam, 63 Value *LB, Value *UB, 64 Value *Stride) { 65 // Inform OpenMP runtime about the number of threads if greater than zero 66 if (PollyNumThreads > 0) { 67 Value *GlobalThreadID = createCallGlobalThreadNum(); 68 createCallPushNumThreads(GlobalThreadID, Builder.getInt32(PollyNumThreads)); 69 } 70 71 // Tell the runtime we start a parallel loop 72 createCallSpawnThreads(SubFn, SubFnParam, LB, UB, Stride); 73 } 74 75 Function *ParallelLoopGeneratorKMP::prepareSubFnDefinition(Function *F) const { 76 std::vector<Type *> Arguments = { 77 Builder.getPtrTy(0), Builder.getPtrTy(0), LongType, LongType, LongType, 78 Builder.getPtrTy()}; 79 80 FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false); 81 Function *SubFn = Function::Create(FT, Function::InternalLinkage, 82 F->getName() + "_polly_subfn", M); 83 // Name the function's arguments 84 Function::arg_iterator AI = SubFn->arg_begin(); 85 AI->setName("polly.kmpc.global_tid"); 86 std::advance(AI, 1); 87 AI->setName("polly.kmpc.bound_tid"); 88 std::advance(AI, 1); 89 AI->setName("polly.kmpc.lb"); 90 std::advance(AI, 1); 91 AI->setName("polly.kmpc.ub"); 92 std::advance(AI, 1); 93 AI->setName("polly.kmpc.inc"); 94 std::advance(AI, 1); 95 AI->setName("polly.kmpc.shared"); 96 97 return SubFn; 98 } 99 100 // Create a subfunction of the following (preliminary) structure: 101 // 102 // PrevBB 103 // | 104 // v 105 // HeaderBB 106 // / | _____ 107 // / v v | 108 // / PreHeaderBB | 109 // | | | 110 // | v | 111 // | CheckNextBB | 112 // \ | \_____/ 113 // \ | 114 // v v 115 // ExitBB 116 // 117 // HeaderBB will hold allocations, loading of variables and kmp-init calls. 118 // CheckNextBB will check for more work (dynamic / static chunked) or will be 119 // empty (static non chunked). 120 // If there is more work to do: go to PreHeaderBB, otherwise go to ExitBB. 121 // PreHeaderBB loads the new boundaries (& will lead to the loop body later on). 122 // Just like CheckNextBB: PreHeaderBB is (preliminary) empty in the static non 123 // chunked scheduling case. ExitBB marks the end of the parallel execution. 124 // The possibly empty BasicBlocks will automatically be removed. 125 std::tuple<Value *, Function *> 126 ParallelLoopGeneratorKMP::createSubFn(Value *SequentialLoopStride, 127 AllocaInst *StructData, 128 SetVector<Value *> Data, ValueMapT &Map) { 129 Function *SubFn = createSubFnDefinition(); 130 LLVMContext &Context = SubFn->getContext(); 131 132 // Create basic blocks. 133 BasicBlock *HeaderBB = BasicBlock::Create(Context, "polly.par.setup", SubFn); 134 SubFnDT = std::make_unique<DominatorTree>(*SubFn); 135 SubFnLI = std::make_unique<LoopInfo>(*SubFnDT); 136 137 BasicBlock *ExitBB = BasicBlock::Create(Context, "polly.par.exit", SubFn); 138 BasicBlock *CheckNextBB = 139 BasicBlock::Create(Context, "polly.par.checkNext", SubFn); 140 BasicBlock *PreHeaderBB = 141 BasicBlock::Create(Context, "polly.par.loadIVBounds", SubFn); 142 143 SubFnDT->addNewBlock(ExitBB, HeaderBB); 144 SubFnDT->addNewBlock(CheckNextBB, HeaderBB); 145 SubFnDT->addNewBlock(PreHeaderBB, HeaderBB); 146 147 // Fill up basic block HeaderBB. 148 Builder.SetInsertPoint(HeaderBB); 149 Value *LBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.LBPtr"); 150 Value *UBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.UBPtr"); 151 Value *IsLastPtr = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, 152 "polly.par.lastIterPtr"); 153 Value *StridePtr = 154 Builder.CreateAlloca(LongType, nullptr, "polly.par.StridePtr"); 155 156 // Get iterator for retrieving the previously defined parameters. 157 Function::arg_iterator AI = SubFn->arg_begin(); 158 // First argument holds "global thread ID". 159 Value *IDPtr = &*AI; 160 // Skip "bound thread ID" since it is not used (but had to be defined). 161 std::advance(AI, 2); 162 // Move iterator to: LB, UB, Stride, Shared variable struct. 163 Value *LB = &*AI; 164 std::advance(AI, 1); 165 Value *UB = &*AI; 166 std::advance(AI, 1); 167 Value *Stride = &*AI; 168 std::advance(AI, 1); 169 Value *Shared = &*AI; 170 171 extractValuesFromStruct(Data, StructData->getAllocatedType(), Shared, Map); 172 173 const auto Alignment = llvm::Align(is64BitArch() ? 8 : 4); 174 Value *ID = Builder.CreateAlignedLoad(Builder.getInt32Ty(), IDPtr, Alignment, 175 "polly.par.global_tid"); 176 177 Builder.CreateAlignedStore(LB, LBPtr, Alignment); 178 Builder.CreateAlignedStore(UB, UBPtr, Alignment); 179 Builder.CreateAlignedStore(Builder.getInt32(0), IsLastPtr, Alignment); 180 Builder.CreateAlignedStore(Stride, StridePtr, Alignment); 181 182 // Subtract one as the upper bound provided by openmp is a < comparison 183 // whereas the codegenForSequential function creates a <= comparison. 184 Value *AdjustedUB = Builder.CreateAdd(UB, ConstantInt::get(LongType, -1), 185 "polly.indvar.UBAdjusted"); 186 187 Value *ChunkSize = 188 ConstantInt::get(LongType, std::max<int>(PollyChunkSize, 1)); 189 190 OMPGeneralSchedulingType Scheduling = 191 getSchedType(PollyChunkSize, PollyScheduling); 192 193 switch (Scheduling) { 194 case OMPGeneralSchedulingType::Dynamic: 195 case OMPGeneralSchedulingType::Guided: 196 case OMPGeneralSchedulingType::Runtime: 197 // "DYNAMIC" scheduling types are handled below (including 'runtime') 198 { 199 UB = AdjustedUB; 200 createCallDispatchInit(ID, LB, UB, Stride, ChunkSize); 201 Value *HasWork = 202 createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr); 203 Value *HasIteration = 204 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork, 205 Builder.getInt32(1), "polly.hasIteration"); 206 Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB); 207 208 Builder.SetInsertPoint(CheckNextBB); 209 HasWork = createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr); 210 HasIteration = 211 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork, 212 Builder.getInt32(1), "polly.hasWork"); 213 Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB); 214 215 Builder.SetInsertPoint(PreHeaderBB); 216 LB = Builder.CreateAlignedLoad(LongType, LBPtr, Alignment, 217 "polly.indvar.LB"); 218 UB = Builder.CreateAlignedLoad(LongType, UBPtr, Alignment, 219 "polly.indvar.UB"); 220 } 221 break; 222 case OMPGeneralSchedulingType::StaticChunked: 223 case OMPGeneralSchedulingType::StaticNonChunked: 224 // "STATIC" scheduling types are handled below 225 { 226 Builder.CreateAlignedStore(AdjustedUB, UBPtr, Alignment); 227 createCallStaticInit(ID, IsLastPtr, LBPtr, UBPtr, StridePtr, ChunkSize); 228 229 Value *ChunkedStride = Builder.CreateAlignedLoad( 230 LongType, StridePtr, Alignment, "polly.kmpc.stride"); 231 232 LB = Builder.CreateAlignedLoad(LongType, LBPtr, Alignment, 233 "polly.indvar.LB"); 234 UB = Builder.CreateAlignedLoad(LongType, UBPtr, Alignment, 235 "polly.indvar.UB.temp"); 236 237 Value *UBInRange = 238 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLE, UB, AdjustedUB, 239 "polly.indvar.UB.inRange"); 240 UB = Builder.CreateSelect(UBInRange, UB, AdjustedUB, "polly.indvar.UB"); 241 Builder.CreateAlignedStore(UB, UBPtr, Alignment); 242 243 Value *HasIteration = Builder.CreateICmp( 244 llvm::CmpInst::Predicate::ICMP_SLE, LB, UB, "polly.hasIteration"); 245 Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB); 246 247 if (Scheduling == OMPGeneralSchedulingType::StaticChunked) { 248 Builder.SetInsertPoint(PreHeaderBB); 249 LB = Builder.CreateAlignedLoad(LongType, LBPtr, Alignment, 250 "polly.indvar.LB.entry"); 251 UB = Builder.CreateAlignedLoad(LongType, UBPtr, Alignment, 252 "polly.indvar.UB.entry"); 253 } 254 255 Builder.SetInsertPoint(CheckNextBB); 256 257 if (Scheduling == OMPGeneralSchedulingType::StaticChunked) { 258 Value *NextLB = 259 Builder.CreateAdd(LB, ChunkedStride, "polly.indvar.nextLB"); 260 Value *NextUB = Builder.CreateAdd(UB, ChunkedStride); 261 262 Value *NextUBOutOfBounds = 263 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SGT, NextUB, 264 AdjustedUB, "polly.indvar.nextUB.outOfBounds"); 265 NextUB = Builder.CreateSelect(NextUBOutOfBounds, AdjustedUB, NextUB, 266 "polly.indvar.nextUB"); 267 268 Builder.CreateAlignedStore(NextLB, LBPtr, Alignment); 269 Builder.CreateAlignedStore(NextUB, UBPtr, Alignment); 270 271 Value *HasWork = 272 Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLE, NextLB, 273 AdjustedUB, "polly.hasWork"); 274 Builder.CreateCondBr(HasWork, PreHeaderBB, ExitBB); 275 } else { 276 Builder.CreateBr(ExitBB); 277 } 278 279 Builder.SetInsertPoint(PreHeaderBB); 280 } 281 break; 282 } 283 284 Builder.CreateBr(CheckNextBB); 285 Builder.SetInsertPoint(&*--Builder.GetInsertPoint()); 286 BasicBlock *AfterBB; 287 Value *IV = createLoop(LB, UB, SequentialLoopStride, Builder, *SubFnLI, 288 *SubFnDT, AfterBB, ICmpInst::ICMP_SLE, nullptr, true, 289 /* UseGuard */ false); 290 291 BasicBlock::iterator LoopBody = Builder.GetInsertPoint(); 292 293 // Add code to terminate this subfunction. 294 Builder.SetInsertPoint(ExitBB); 295 // Static (i.e. non-dynamic) scheduling types, are terminated with a fini-call 296 if (Scheduling == OMPGeneralSchedulingType::StaticChunked || 297 Scheduling == OMPGeneralSchedulingType::StaticNonChunked) { 298 createCallStaticFini(ID); 299 } 300 Builder.CreateRetVoid(); 301 Builder.SetInsertPoint(&*LoopBody); 302 303 // FIXME: Call SubFnDT->verify() and SubFnLI->verify() to check that the 304 // DominatorTree/LoopInfo has been created correctly. Alternatively, recreate 305 // from scratch since it is not needed here directly. 306 307 return std::make_tuple(IV, SubFn); 308 } 309 310 Value *ParallelLoopGeneratorKMP::createCallGlobalThreadNum() { 311 const std::string Name = "__kmpc_global_thread_num"; 312 Function *F = M->getFunction(Name); 313 314 // If F is not available, declare it. 315 if (!F) { 316 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 317 Type *Params[] = {Builder.getPtrTy(0)}; 318 319 FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false); 320 F = Function::Create(Ty, Linkage, Name, M); 321 } 322 323 CallInst *Call = Builder.CreateCall(F, {SourceLocationInfo}); 324 Call->setDebugLoc(DLGenerated); 325 return Call; 326 } 327 328 void ParallelLoopGeneratorKMP::createCallPushNumThreads(Value *GlobalThreadID, 329 Value *NumThreads) { 330 const std::string Name = "__kmpc_push_num_threads"; 331 Function *F = M->getFunction(Name); 332 333 // If F is not available, declare it. 334 if (!F) { 335 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 336 Type *Params[] = {Builder.getPtrTy(0), Builder.getInt32Ty(), 337 Builder.getInt32Ty()}; 338 339 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 340 F = Function::Create(Ty, Linkage, Name, M); 341 } 342 343 Value *Args[] = {SourceLocationInfo, GlobalThreadID, NumThreads}; 344 345 CallInst *Call = Builder.CreateCall(F, Args); 346 Call->setDebugLoc(DLGenerated); 347 } 348 349 void ParallelLoopGeneratorKMP::createCallStaticInit(Value *GlobalThreadID, 350 Value *IsLastPtr, 351 Value *LBPtr, Value *UBPtr, 352 Value *StridePtr, 353 Value *ChunkSize) { 354 const std::string Name = 355 is64BitArch() ? "__kmpc_for_static_init_8" : "__kmpc_for_static_init_4"; 356 Function *F = M->getFunction(Name); 357 358 // If F is not available, declare it. 359 if (!F) { 360 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 361 362 Type *Params[] = {Builder.getPtrTy(0), 363 Builder.getInt32Ty(), 364 Builder.getInt32Ty(), 365 Builder.getPtrTy(0), 366 Builder.getPtrTy(0), 367 Builder.getPtrTy(0), 368 Builder.getPtrTy(0), 369 LongType, 370 LongType}; 371 372 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 373 F = Function::Create(Ty, Linkage, Name, M); 374 } 375 376 // The parameter 'ChunkSize' will hold strictly positive integer values, 377 // regardless of PollyChunkSize's value 378 Value *Args[] = { 379 SourceLocationInfo, 380 GlobalThreadID, 381 Builder.getInt32(int(getSchedType(PollyChunkSize, PollyScheduling))), 382 IsLastPtr, 383 LBPtr, 384 UBPtr, 385 StridePtr, 386 ConstantInt::get(LongType, 1), 387 ChunkSize}; 388 389 CallInst *Call = Builder.CreateCall(F, Args); 390 Call->setDebugLoc(DLGenerated); 391 } 392 393 void ParallelLoopGeneratorKMP::createCallStaticFini(Value *GlobalThreadID) { 394 const std::string Name = "__kmpc_for_static_fini"; 395 Function *F = M->getFunction(Name); 396 397 // If F is not available, declare it. 398 if (!F) { 399 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 400 Type *Params[] = {Builder.getPtrTy(0), Builder.getInt32Ty()}; 401 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 402 F = Function::Create(Ty, Linkage, Name, M); 403 } 404 405 Value *Args[] = {SourceLocationInfo, GlobalThreadID}; 406 407 CallInst *Call = Builder.CreateCall(F, Args); 408 Call->setDebugLoc(DLGenerated); 409 } 410 411 void ParallelLoopGeneratorKMP::createCallDispatchInit(Value *GlobalThreadID, 412 Value *LB, Value *UB, 413 Value *Inc, 414 Value *ChunkSize) { 415 const std::string Name = 416 is64BitArch() ? "__kmpc_dispatch_init_8" : "__kmpc_dispatch_init_4"; 417 Function *F = M->getFunction(Name); 418 419 // If F is not available, declare it. 420 if (!F) { 421 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 422 423 Type *Params[] = {Builder.getPtrTy(0), 424 Builder.getInt32Ty(), 425 Builder.getInt32Ty(), 426 LongType, 427 LongType, 428 LongType, 429 LongType}; 430 431 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); 432 F = Function::Create(Ty, Linkage, Name, M); 433 } 434 435 // The parameter 'ChunkSize' will hold strictly positive integer values, 436 // regardless of PollyChunkSize's value 437 Value *Args[] = { 438 SourceLocationInfo, 439 GlobalThreadID, 440 Builder.getInt32(int(getSchedType(PollyChunkSize, PollyScheduling))), 441 LB, 442 UB, 443 Inc, 444 ChunkSize}; 445 446 CallInst *Call = Builder.CreateCall(F, Args); 447 Call->setDebugLoc(DLGenerated); 448 } 449 450 Value *ParallelLoopGeneratorKMP::createCallDispatchNext(Value *GlobalThreadID, 451 Value *IsLastPtr, 452 Value *LBPtr, 453 Value *UBPtr, 454 Value *StridePtr) { 455 const std::string Name = 456 is64BitArch() ? "__kmpc_dispatch_next_8" : "__kmpc_dispatch_next_4"; 457 Function *F = M->getFunction(Name); 458 459 // If F is not available, declare it. 460 if (!F) { 461 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 462 463 Type *Params[] = {Builder.getPtrTy(0), Builder.getInt32Ty(), 464 Builder.getPtrTy(0), Builder.getPtrTy(0), 465 Builder.getPtrTy(0), Builder.getPtrTy(0)}; 466 467 FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false); 468 F = Function::Create(Ty, Linkage, Name, M); 469 } 470 471 Value *Args[] = {SourceLocationInfo, GlobalThreadID, IsLastPtr, LBPtr, UBPtr, 472 StridePtr}; 473 474 CallInst *Call = Builder.CreateCall(F, Args); 475 Call->setDebugLoc(DLGenerated); 476 return Call; 477 } 478 479 // TODO: This function currently creates a source location dummy. It might be 480 // necessary to (actually) provide information, in the future. 481 GlobalVariable *ParallelLoopGeneratorKMP::createSourceLocation() { 482 const std::string LocName = ".loc.dummy"; 483 GlobalVariable *SourceLocDummy = M->getGlobalVariable(LocName); 484 485 if (SourceLocDummy == nullptr) { 486 const std::string StructName = "struct.ident_t"; 487 StructType *IdentTy = 488 StructType::getTypeByName(M->getContext(), StructName); 489 490 // If the ident_t StructType is not available, declare it. 491 // in LLVM-IR: ident_t = type { i32, i32, i32, i32, i8* } 492 if (!IdentTy) { 493 Type *LocMembers[] = {Builder.getInt32Ty(), Builder.getInt32Ty(), 494 Builder.getInt32Ty(), Builder.getInt32Ty(), 495 Builder.getPtrTy()}; 496 497 IdentTy = 498 StructType::create(M->getContext(), LocMembers, StructName, false); 499 } 500 501 const auto ArrayType = 502 llvm::ArrayType::get(Builder.getInt8Ty(), /* Length */ 23); 503 504 // Global Variable Definitions 505 GlobalVariable *StrVar = 506 new GlobalVariable(*M, ArrayType, true, GlobalValue::PrivateLinkage, 507 nullptr, ".str.ident"); 508 StrVar->setAlignment(llvm::Align(1)); 509 510 SourceLocDummy = new GlobalVariable( 511 *M, IdentTy, true, GlobalValue::PrivateLinkage, nullptr, LocName); 512 SourceLocDummy->setAlignment(llvm::Align(8)); 513 514 // Constant Definitions 515 Constant *InitStr = ConstantDataArray::getString( 516 M->getContext(), "Source location dummy.", true); 517 518 Constant *StrPtr = static_cast<Constant *>(Builder.CreateInBoundsGEP( 519 ArrayType, StrVar, {Builder.getInt32(0), Builder.getInt32(0)})); 520 521 Constant *LocInitStruct = ConstantStruct::get( 522 IdentTy, {Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(0), 523 Builder.getInt32(0), StrPtr}); 524 525 // Initialize variables 526 StrVar->setInitializer(InitStr); 527 SourceLocDummy->setInitializer(LocInitStruct); 528 } 529 530 return SourceLocDummy; 531 } 532 533 bool ParallelLoopGeneratorKMP::is64BitArch() { 534 return (LongType->getIntegerBitWidth() == 64); 535 } 536 537 OMPGeneralSchedulingType ParallelLoopGeneratorKMP::getSchedType( 538 int ChunkSize, OMPGeneralSchedulingType Scheduling) const { 539 if (ChunkSize == 0 && Scheduling == OMPGeneralSchedulingType::StaticChunked) 540 return OMPGeneralSchedulingType::StaticNonChunked; 541 542 return Scheduling; 543 } 544