//===- AMDGPULibCalls.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file does AMD library function optimizations.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULibFunc.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include <cmath>

#define DEBUG_TYPE "amdgpu-simplifylib"

using namespace llvm;
using namespace llvm::PatternMatch;

static cl::opt<bool> EnablePreLink("amdgpu-prelink",
  cl::desc("Enable pre-link mode optimizations"),
  cl::init(false),
  cl::Hidden);

static cl::list<std::string> UseNative("amdgpu-use-native",
  cl::desc("Comma separated list of functions to replace with native, or all"),
  cl::CommaSeparated, cl::ValueOptional,
  cl::Hidden);

#define MATH_PI      numbers::pi
#define MATH_E       numbers::e
#define MATH_SQRT2   numbers::sqrt2
#define MATH_SQRT1_2 numbers::inv_sqrt2

namespace llvm {

class AMDGPULibCalls {
private:
  const TargetLibraryInfo *TLInfo = nullptr;
  AssumptionCache *AC = nullptr;
  DominatorTree *DT = nullptr;

  typedef llvm::AMDGPULibFunc FuncInfo;

  bool UnsafeFPMath = false;

  // -fuse-native.
  bool AllNative = false;

  bool useNativeFunc(const StringRef F) const;

  // Return a pointer (pointer expr) to the function if a function definition
  // with "FuncName" exists. It may create a new function prototype in
  // pre-link mode.
  FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);

  bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);

  bool TDOFold(CallInst *CI, const FuncInfo &FInfo);

  /* Specialized optimizations */

  // pow/powr/pown
  bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // rootn
  bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // -fuse-native for sincos
  bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);

  // Evaluate calls if the calls' arguments are constants.
  bool evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, double &Res1,
                              Constant *copr0, Constant *copr1);
  bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);

  // sqrt
  bool fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  /// Insert a call to the sincos function \p Fsincos. Returns (value of sin,
  /// value of cos, sincos call).
  std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg,
                                                     FastMathFlags FMF,
                                                     IRBuilder<> &B,
                                                     FunctionCallee Fsincos);

  // sin/cos
  bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // __read_pipe/__write_pipe
  bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                            const FuncInfo &FInfo);

  // Get a scalar native builtin single argument FP function
  FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);

  /// Substitute a call to a known libcall with an intrinsic call. If \p
  /// AllowMinSizeF32 is true, allow the replacement in a minsize function.
  bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
                                         bool AllowMinSizeF32 = false,
                                         bool AllowF64 = false,
                                         bool AllowStrictFP = false);
  void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                         Intrinsic::ID IntrID);

  bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                            Intrinsic::ID IntrID,
                                            bool AllowMinSizeF32 = false,
                                            bool AllowF64 = false,
                                            bool AllowStrictFP = false);

protected:
  bool isUnsafeMath(const FPMathOperator *FPOp) const;
  bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;

  bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;

  static void replaceCall(Instruction *I, Value *With) {
    I->replaceAllUsesWith(With);
    I->eraseFromParent();
  }

  static void replaceCall(FPMathOperator *I, Value *With) {
    replaceCall(cast<Instruction>(I), With);
  }

public:
  AMDGPULibCalls() {}

  bool fold(CallInst *CI);

  void initFunction(Function &F, FunctionAnalysisManager &FAM);
  void initNativeFuncs();

  // Replace a normal math function call with the native version.
  bool useNative(CallInst *CI);
};

} // end llvm namespace

template <typename IRB>
static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg,
                              const Twine &Name = "") {
  CallInst *R = B.CreateCall(Callee, Arg, Name);
  if (Function *F = dyn_cast<Function>(Callee.getCallee()))
    R->setCallingConv(F->getCallingConv());
  return R;
}

template <typename IRB>
static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1,
                               Value *Arg2, const Twine &Name = "") {
  CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
  if (Function *F = dyn_cast<Function>(Callee.getCallee()))
    R->setCallingConv(F->getCallingConv());
  return R;
}
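
// getPownType (below) derives the pown prototype from a pow prototype by
// replacing the exponent with i32. Illustrative examples of the mapping:
//   float(float, float)                   -> float(float, i32)
//   <2 x float>(<2 x float>, <2 x float>) -> <2 x float>(<2 x float>, <2 x i32>)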
static FunctionType *getPownType(FunctionType *FT) {
  Type *PowNExpTy = Type::getInt32Ty(FT->getContext());
  if (VectorType *VecTy = dyn_cast<VectorType>(FT->getReturnType()))
    PowNExpTy = VectorType::get(PowNExpTy, VecTy->getElementCount());

  return FunctionType::get(FT->getReturnType(),
                           {FT->getParamType(0), PowNExpTy}, false);
}

// Data structures for table-driven optimizations.
// FuncTbl works for both f32 and f64 functions with 1 input argument.

struct TableEntry {
  double result;
  double input;
};

/* a list of {result, input} */
static const TableEntry tbl_acos[] = {
  {MATH_PI / 2.0, 0.0},
  {MATH_PI / 2.0, -0.0},
  {0.0, 1.0},
  {MATH_PI, -1.0}
};
static const TableEntry tbl_acosh[] = {
  {0.0, 1.0}
};
static const TableEntry tbl_acospi[] = {
  {0.5, 0.0},
  {0.5, -0.0},
  {0.0, 1.0},
  {1.0, -1.0}
};
static const TableEntry tbl_asin[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {MATH_PI / 2.0, 1.0},
  {-MATH_PI / 2.0, -1.0}
};
static const TableEntry tbl_asinh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_asinpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {0.5, 1.0},
  {-0.5, -1.0}
};
static const TableEntry tbl_atan[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {MATH_PI / 4.0, 1.0},
  {-MATH_PI / 4.0, -1.0}
};
static const TableEntry tbl_atanh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_atanpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {0.25, 1.0},
  {-0.25, -1.0}
};
static const TableEntry tbl_cbrt[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {1.0, 1.0},
  {-1.0, -1.0},
};
static const TableEntry tbl_cos[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_cosh[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_cospi[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_erfc[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_erf[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_exp[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {MATH_E, 1.0}
};
static const TableEntry tbl_exp2[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {2.0, 1.0}
};
static const TableEntry tbl_exp10[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {10.0, 1.0}
};
static const TableEntry tbl_expm1[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_log[] = {
  {0.0, 1.0},
  {1.0, MATH_E}
};
static const TableEntry tbl_log2[] = {
  {0.0, 1.0},
  {1.0, 2.0}
};
static const TableEntry tbl_log10[] = {
  {0.0, 1.0},
  {1.0, 10.0}
};
static const TableEntry tbl_rsqrt[] = {
  {1.0, 1.0},
  {MATH_SQRT1_2, 2.0}
};
static const TableEntry tbl_sin[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sinh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sinpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sqrt[] = {
  {0.0, 0.0},
  {1.0, 1.0},
  {MATH_SQRT2, 2.0}
};
static const TableEntry tbl_tan[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tanh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tanpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tgamma[] = {
  {1.0, 1.0},
  {1.0, 2.0},
  {2.0, 3.0},
  {6.0, 4.0}
};
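
// Illustrative use of the tables above: tbl_acos records that acos(-1.0) is
// exactly MATH_PI, so TDOFold (below) can replace a call such as acos(-1.0f)
// with the constant pi without emitting any library call.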

static bool HasNative(AMDGPULibFunc::EFuncId id) {
  switch(id) {
  case AMDGPULibFunc::EI_DIVIDE:
  case AMDGPULibFunc::EI_COS:
  case AMDGPULibFunc::EI_EXP:
  case AMDGPULibFunc::EI_EXP2:
  case AMDGPULibFunc::EI_EXP10:
  case AMDGPULibFunc::EI_LOG:
  case AMDGPULibFunc::EI_LOG2:
  case AMDGPULibFunc::EI_LOG10:
  case AMDGPULibFunc::EI_POWR:
  case AMDGPULibFunc::EI_RECIP:
  case AMDGPULibFunc::EI_RSQRT:
  case AMDGPULibFunc::EI_SIN:
  case AMDGPULibFunc::EI_SINCOS:
  case AMDGPULibFunc::EI_SQRT:
  case AMDGPULibFunc::EI_TAN:
    return true;
  default:;
  }
  return false;
}

using TableRef = ArrayRef<TableEntry>;

static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
  switch(id) {
  case AMDGPULibFunc::EI_ACOS:    return TableRef(tbl_acos);
  case AMDGPULibFunc::EI_ACOSH:   return TableRef(tbl_acosh);
  case AMDGPULibFunc::EI_ACOSPI:  return TableRef(tbl_acospi);
  case AMDGPULibFunc::EI_ASIN:    return TableRef(tbl_asin);
  case AMDGPULibFunc::EI_ASINH:   return TableRef(tbl_asinh);
  case AMDGPULibFunc::EI_ASINPI:  return TableRef(tbl_asinpi);
  case AMDGPULibFunc::EI_ATAN:    return TableRef(tbl_atan);
  case AMDGPULibFunc::EI_ATANH:   return TableRef(tbl_atanh);
  case AMDGPULibFunc::EI_ATANPI:  return TableRef(tbl_atanpi);
  case AMDGPULibFunc::EI_CBRT:    return TableRef(tbl_cbrt);
  case AMDGPULibFunc::EI_NCOS:
  case AMDGPULibFunc::EI_COS:     return TableRef(tbl_cos);
  case AMDGPULibFunc::EI_COSH:    return TableRef(tbl_cosh);
  case AMDGPULibFunc::EI_COSPI:   return TableRef(tbl_cospi);
  case AMDGPULibFunc::EI_ERFC:    return TableRef(tbl_erfc);
  case AMDGPULibFunc::EI_ERF:     return TableRef(tbl_erf);
  case AMDGPULibFunc::EI_EXP:     return TableRef(tbl_exp);
  case AMDGPULibFunc::EI_NEXP2:
  case AMDGPULibFunc::EI_EXP2:    return TableRef(tbl_exp2);
  case AMDGPULibFunc::EI_EXP10:   return TableRef(tbl_exp10);
  case AMDGPULibFunc::EI_EXPM1:   return TableRef(tbl_expm1);
  case AMDGPULibFunc::EI_LOG:     return TableRef(tbl_log);
  case AMDGPULibFunc::EI_NLOG2:
  case AMDGPULibFunc::EI_LOG2:    return TableRef(tbl_log2);
  case AMDGPULibFunc::EI_LOG10:   return TableRef(tbl_log10);
  case AMDGPULibFunc::EI_NRSQRT:
  case AMDGPULibFunc::EI_RSQRT:   return TableRef(tbl_rsqrt);
  case AMDGPULibFunc::EI_NSIN:
  case AMDGPULibFunc::EI_SIN:     return TableRef(tbl_sin);
  case AMDGPULibFunc::EI_SINH:    return TableRef(tbl_sinh);
  case AMDGPULibFunc::EI_SINPI:   return TableRef(tbl_sinpi);
  case AMDGPULibFunc::EI_NSQRT:
  case AMDGPULibFunc::EI_SQRT:    return TableRef(tbl_sqrt);
  case AMDGPULibFunc::EI_TAN:     return TableRef(tbl_tan);
  case AMDGPULibFunc::EI_TANH:    return TableRef(tbl_tanh);
  case AMDGPULibFunc::EI_TANPI:   return TableRef(tbl_tanpi);
  case AMDGPULibFunc::EI_TGAMMA:  return TableRef(tbl_tgamma);
  default:;
  }
  return TableRef();
}

static inline int getVecSize(const AMDGPULibFunc& FInfo) {
  return FInfo.getLeads()[0].VectorSize;
}

static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
  return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType;
}

FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) {
  // If we are doing PreLinkOpt, the function is external. So it is safe to
  // use getOrInsertFunction() at this stage.

  return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo)
                       : AMDGPULibFunc::getFunction(M, fInfo);
}

bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
                                       FuncInfo &FInfo) {
  return AMDGPULibFunc::parse(FMangledName, FInfo);
}

bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const {
  return UnsafeFPMath || FPOp->isFast();
}

bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const {
  return UnsafeFPMath ||
         (FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs());
}

bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
    const FPMathOperator *FPOp) const {
  // TODO: Refine to approxFunc or contract
  return isUnsafeMath(FPOp);
}

void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) {
  UnsafeFPMath = F.getFnAttribute("unsafe-fp-math").getValueAsBool();
  AC = &FAM.getResult<AssumptionAnalysis>(F);
  TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
  DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
}

bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
  return AllNative || llvm::is_contained(UseNative, F);
}

void AMDGPULibCalls::initNativeFuncs() {
  AllNative = useNativeFunc("all") ||
              (UseNative.getNumOccurrences() && UseNative.size() == 1 &&
               UseNative.begin()->empty());
}

bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
  bool native_sin = useNativeFunc("sin");
  bool native_cos = useNativeFunc("cos");

  if (native_sin && native_cos) {
    Module *M = aCI->getModule();
    Value *opr0 = aCI->getArgOperand(0);

    AMDGPULibFunc nf;
    nf.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType;
    nf.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize;

    nf.setPrefix(AMDGPULibFunc::NATIVE);
    nf.setId(AMDGPULibFunc::EI_SIN);
    FunctionCallee sinExpr = getFunction(M, nf);

    nf.setPrefix(AMDGPULibFunc::NATIVE);
    nf.setId(AMDGPULibFunc::EI_COS);
    FunctionCallee cosExpr = getFunction(M, nf);
    if (sinExpr && cosExpr) {
      Value *sinval = CallInst::Create(sinExpr, opr0, "splitsin", aCI);
      Value *cosval = CallInst::Create(cosExpr, opr0, "splitcos", aCI);
      new StoreInst(cosval, aCI->getArgOperand(1), aCI);

      DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                          << " with native version of sin/cos");

      replaceCall(aCI, sinval);
      return true;
    }
  }
  return false;
}

bool AMDGPULibCalls::useNative(CallInst *aCI) {
  Function *Callee = aCI->getCalledFunction();
  if (!Callee || aCI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  if (!parseFunctionName(Callee->getName(), FInfo) || !FInfo.isMangled() ||
      FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
      getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()) ||
      !(AllNative || useNativeFunc(FInfo.getName()))) {
    return false;
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
    return sincosUseNative(aCI, FInfo);

  FInfo.setPrefix(AMDGPULibFunc::NATIVE);
  FunctionCallee F = getFunction(aCI->getModule(), FInfo);
  if (!F)
    return false;

  aCI->setCalledFunction(F);
  DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                      << " with native version");
  return true;
}

// Clang emits calls to __read_pipe_2 or __read_pipe_4 for the OpenCL read_pipe
// builtin, with appended type size and alignment arguments, where 2 or 4
// indicates the original number of arguments. The library has optimized
// versions of __read_pipe_2/__read_pipe_4 when the type size and alignment are
// the same power-of-2 value. This function transforms __read_pipe_2 to
// __read_pipe_2_N for such cases, where N is the size in bytes of the type
// (N = 1, 2, 4, 8, ..., 128). The same applies to __read_pipe_4, write_pipe_2,
// and write_pipe_4.
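// For example (illustrative IR, for a packet type with size == alignment == 4):
//   %r = call i32 @__read_pipe_2(ptr %pipe, ptr %dst, i32 4, i32 4)
// becomes
//   %r = call i32 @__read_pipe_2_4(ptr %pipe, ptr %dst)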
bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                                          const FuncInfo &FInfo) {
  auto *Callee = CI->getCalledFunction();
  if (!Callee->isDeclaration())
    return false;

  assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
  auto *M = Callee->getParent();
  std::string Name = std::string(Callee->getName());
  auto NumArg = CI->arg_size();
  if (NumArg != 4 && NumArg != 6)
    return false;
  ConstantInt *PacketSize =
      dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 2));
  ConstantInt *PacketAlign =
      dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 1));
  if (!PacketSize || !PacketAlign)
    return false;

  unsigned Size = PacketSize->getZExtValue();
  Align Alignment = PacketAlign->getAlignValue();
  if (Alignment != Size)
    return false;

  unsigned PtrArgLoc = CI->arg_size() - 3;
  Value *PtrArg = CI->getArgOperand(PtrArgLoc);
  Type *PtrTy = PtrArg->getType();

  SmallVector<llvm::Type *, 6> ArgTys;
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    ArgTys.push_back(CI->getArgOperand(I)->getType());
  ArgTys.push_back(PtrTy);

  Name = Name + "_" + std::to_string(Size);
  auto *FTy = FunctionType::get(Callee->getReturnType(),
                                ArrayRef<Type *>(ArgTys), false);
  AMDGPULibFunc NewLibFunc(Name, FTy);
  FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
  if (!F)
    return false;

  auto *BCast = B.CreatePointerCast(PtrArg, PtrTy);
  SmallVector<Value *, 6> Args;
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    Args.push_back(CI->getArgOperand(I));
  Args.push_back(BCast);

  auto *NCI = B.CreateCall(F, Args);
  NCI->setAttributes(CI->getAttributes());
  CI->replaceAllUsesWith(NCI);
  CI->dropAllReferences();
  CI->eraseFromParent();

  return true;
}

static bool isKnownIntegral(const Value *V, const DataLayout &DL,
                            FastMathFlags FMF) {
  if (isa<UndefValue>(V))
    return true;

  if (const ConstantFP *CF = dyn_cast<ConstantFP>(V))
    return CF->getValueAPF().isInteger();

  if (const ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(V)) {
    for (unsigned i = 0, e = CDV->getNumElements(); i != e; ++i) {
      Constant *ConstElt = CDV->getElementAsConstant(i);
      if (isa<UndefValue>(ConstElt))
        continue;
      const ConstantFP *CFP = dyn_cast<ConstantFP>(ConstElt);
      if (!CFP || !CFP->getValue().isInteger())
        return false;
    }

    return true;
  }

  const Instruction *I = dyn_cast<Instruction>(V);
  if (!I)
    return false;

  switch (I->getOpcode()) {
  case Instruction::SIToFP:
  case Instruction::UIToFP:
    // TODO: Could check nofpclass(inf) on incoming argument
    if (FMF.noInfs())
      return true;

    // Need to check int size cannot produce infinity, which computeKnownFPClass
    // knows how to do already.
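    // (For example, a 32-bit integer converted to half can exceed half's
    // finite range and round to +inf, in which case the result is no longer
    // an integer.)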
    return isKnownNeverInfinity(I, DL);
  case Instruction::Call: {
    const CallInst *CI = cast<CallInst>(I);
    switch (CI->getIntrinsicID()) {
    case Intrinsic::trunc:
    case Intrinsic::floor:
    case Intrinsic::ceil:
    case Intrinsic::rint:
    case Intrinsic::nearbyint:
    case Intrinsic::round:
    case Intrinsic::roundeven:
      return (FMF.noInfs() && FMF.noNaNs()) ||
             isKnownNeverInfOrNaN(I, DL, nullptr);
    default:
      break;
    }

    break;
  }
  default:
    break;
  }

  return false;
}

// This function returns false if there is no change; true otherwise.
bool AMDGPULibCalls::fold(CallInst *CI) {
  Function *Callee = CI->getCalledFunction();
  // Ignore indirect calls.
  if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  if (!parseFunctionName(Callee->getName(), FInfo))
    return false;

  // Further check the number of arguments to see if they match.
  // TODO: Check calling convention matches too
  if (!FInfo.isCompatibleSignature(CI->getFunctionType()))
    return false;

  LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << '\n');

  if (TDOFold(CI, FInfo))
    return true;

  IRBuilder<> B(CI);

  if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(CI)) {
    // Under unsafe-math, evaluate calls if possible.
    // According to Brian Sumner, we can do this for all f32 function calls
    // using host's double function calls.
    if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(CI, FInfo))
      return true;

    // Copy fast flags from the original call.
    FastMathFlags FMF = FPOp->getFastMathFlags();
    B.setFastMathFlags(FMF);

    // Specialized optimizations for each function call.
    //
    // TODO: Handle other simple intrinsic wrappers. Sqrt.
    //
    // TODO: Handle native functions
    switch (FInfo.getId()) {
    case AMDGPULibFunc::EI_EXP:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_EXP2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp2,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log2,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG10:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log10,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_FMIN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::minnum,
                                                  true, true);
    case AMDGPULibFunc::EI_FMAX:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::maxnum,
                                                  true, true);
    case AMDGPULibFunc::EI_FMA:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fma, true,
                                                  true);
    case AMDGPULibFunc::EI_MAD:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fmuladd,
                                                  true, true);
    case AMDGPULibFunc::EI_FABS:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fabs, true,
                                                  true, true);
    case AMDGPULibFunc::EI_COPYSIGN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::copysign,
                                                  true, true, true);
    case AMDGPULibFunc::EI_FLOOR:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::floor, true,
                                                  true);
    case AMDGPULibFunc::EI_CEIL:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::ceil, true,
                                                  true);
    case AMDGPULibFunc::EI_TRUNC:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::trunc, true,
                                                  true);
    case AMDGPULibFunc::EI_RINT:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::rint, true,
                                                  true);
    case AMDGPULibFunc::EI_ROUND:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::round, true,
                                                  true);
    case AMDGPULibFunc::EI_LDEXP: {
      if (!shouldReplaceLibcallWithIntrinsic(CI, true, true))
        return false;

      Value *Arg1 = CI->getArgOperand(1);
      if (VectorType *VecTy = dyn_cast<VectorType>(CI->getType());
          VecTy && !isa<VectorType>(Arg1->getType())) {
        Value *SplatArg1 = B.CreateVectorSplat(VecTy->getElementCount(), Arg1);
        CI->setArgOperand(1, SplatArg1);
      }

      CI->setCalledFunction(Intrinsic::getDeclaration(
          CI->getModule(), Intrinsic::ldexp,
          {CI->getType(), CI->getArgOperand(1)->getType()}));
      return true;
    }
    case AMDGPULibFunc::EI_POW: {
      Module *M = Callee->getParent();
      AMDGPULibFunc PowrInfo(AMDGPULibFunc::EI_POWR, FInfo);
      FunctionCallee PowrFunc = getFunction(M, PowrInfo);
      CallInst *Call = cast<CallInst>(FPOp);

      // pow(x, y) -> powr(x, y) for x >= -0.0
      // TODO: Account for flags on current call
      if (PowrFunc &&
          cannotBeOrderedLessThanZero(FPOp->getOperand(0), M->getDataLayout(),
                                      TLInfo, 0, AC, Call, DT)) {
        Call->setCalledFunction(PowrFunc);
        return fold_pow(FPOp, B, PowrInfo) || true;
      }

      // pow(x, y) -> pown(x, y) for known integral y
      if (isKnownIntegral(FPOp->getOperand(1), M->getDataLayout(),
                          FPOp->getFastMathFlags())) {
        FunctionType *PownType = getPownType(CI->getFunctionType());
        AMDGPULibFunc PownInfo(AMDGPULibFunc::EI_POWN, PownType, true);
        FunctionCallee PownFunc = getFunction(M, PownInfo);
        if (PownFunc) {
          // TODO: If the incoming integral value is an sitofp/uitofp, it won't
          // fold out without a known range. We can probably take the source
          // value directly.
          Value *CastedArg =
              B.CreateFPToSI(FPOp->getOperand(1), PownType->getParamType(1));
          // Have to drop any nofpclass attributes on the original call site.
          Call->removeParamAttrs(
              1, AttributeFuncs::typeIncompatible(CastedArg->getType()));
          Call->setCalledFunction(PownFunc);
          Call->setArgOperand(1, CastedArg);
          return fold_pow(FPOp, B, PownInfo) || true;
        }
      }

      return fold_pow(FPOp, B, FInfo);
    }
    case AMDGPULibFunc::EI_POWR:
    case AMDGPULibFunc::EI_POWN:
      return fold_pow(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_ROOTN:
      return fold_rootn(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_SQRT:
      return fold_sqrt(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_COS:
    case AMDGPULibFunc::EI_SIN:
      return fold_sincos(FPOp, B, FInfo);
    default:
      break;
    }
  } else {
    // Specialized optimizations for each function call
    switch (FInfo.getId()) {
    case AMDGPULibFunc::EI_READ_PIPE_2:
    case AMDGPULibFunc::EI_READ_PIPE_4:
    case AMDGPULibFunc::EI_WRITE_PIPE_2:
    case AMDGPULibFunc::EI_WRITE_PIPE_4:
      return fold_read_write_pipe(CI, B, FInfo);
    default:
      break;
    }
  }

  return false;
}

bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
  // Table-driven optimization.
  const TableRef tr = getOptTable(FInfo.getId());
  if (tr.empty())
    return false;

  int const sz = (int)tr.size();
  Value *opr0 = CI->getArgOperand(0);

  if (getVecSize(FInfo) > 1) {
    if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(opr0)) {
      SmallVector<double, 0> DVal;
      for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
        ConstantFP *eltval = dyn_cast<ConstantFP>(
            CV->getElementAsConstant((unsigned)eltNo));
        assert(eltval && "Non-FP arguments in math function!");
        bool found = false;
        for (int i = 0; i < sz; ++i) {
          if (eltval->isExactlyValue(tr[i].input)) {
            DVal.push_back(tr[i].result);
            found = true;
            break;
          }
        }
        if (!found) {
          // This vector constant is not handled yet.
          return false;
        }
      }
      LLVMContext &context = CI->getParent()->getParent()->getContext();
      Constant *nval;
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        SmallVector<float, 0> FVal;
        for (unsigned i = 0; i < DVal.size(); ++i) {
          FVal.push_back((float)DVal[i]);
        }
        ArrayRef<float> tmp(FVal);
        nval = ConstantDataVector::get(context, tmp);
      } else { // F64
        ArrayRef<double> tmp(DVal);
        nval = ConstantDataVector::get(context, tmp);
      }
      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
      replaceCall(CI, nval);
      return true;
    }
  } else {
    // Scalar version
    if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
      for (int i = 0; i < sz; ++i) {
        if (CF->isExactlyValue(tr[i].input)) {
          Value *nval = ConstantFP::get(CF->getType(), tr[i].result);
          LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
          replaceCall(CI, nval);
          return true;
        }
      }
    }
  }

  return false;
}

namespace llvm {
static double log2(double V) {
#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
  return ::log2(V);
#else
  return log(V) / numbers::ln2;
#endif
}
}

bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
                              const FuncInfo &FInfo) {
  assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
          FInfo.getId() == AMDGPULibFunc::EI_POWR ||
          FInfo.getId() == AMDGPULibFunc::EI_POWN) &&
         "fold_pow: encounter a wrong function call");

  Module *M = B.GetInsertBlock()->getModule();
  Type *eltType = FPOp->getType()->getScalarType();
  Value *opr0 = FPOp->getOperand(0);
  Value *opr1 = FPOp->getOperand(1);

  const APFloat *CF = nullptr;
  const APInt *CINT = nullptr;
  if (!match(opr1, m_APFloatAllowUndef(CF)))
    match(opr1, m_APIntAllowUndef(CINT));

  // 0x1111111 means that we don't do anything for this call.
  int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111);

  if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0)) {
    // pow/powr/pown(x, 0) == 1
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n");
    Constant *cnval = ConstantFP::get(eltType, 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
    }
    replaceCall(FPOp, cnval);
    return true;
  }
  if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) {
    // pow/powr/pown(x, 1.0) = x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n");
    replaceCall(FPOp, opr0);
    return true;
  }
  if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) {
    // pow/powr/pown(x, 2.0) = x*x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << " * "
                      << *opr0 << "\n");
    Value *nval = B.CreateFMul(opr0, opr0, "__pow2");
    replaceCall(FPOp, nval);
    return true;
  }
  if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) {
    // pow/powr/pown(x, -1.0) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1 / " << *opr0 << "\n");
    Constant *cnval = ConstantFP::get(eltType, 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
    }
    Value *nval = B.CreateFDiv(cnval, opr0, "__powrecip");
    replaceCall(FPOp, nval);
    return true;
  }

  if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) {
    // pow[r](x, [-]0.5) = [r]sqrt(x)
    bool issqrt = CF->isExactlyValue(0.5);
    if (FunctionCallee FPExpr =
            getFunction(M, AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
                                                : AMDGPULibFunc::EI_RSQRT,
                                         FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName()
                        << '(' << *opr0 << ")\n");
      Value *nval = CreateCallEx(B, FPExpr, opr0,
                                 issqrt ? "__pow2sqrt" : "__pow2rsqrt");
      replaceCall(FPOp, nval);
      return true;
    }
  }

  if (!isUnsafeFiniteOnlyMath(FPOp))
    return false;

  // Unsafe math optimizations below.

  // Remember that ci_opr1 is set if opr1 is integral.
  if (CF) {
    double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
                      ? (double)CF->convertToFloat()
                      : CF->convertToDouble();
    int ival = (int)dval;
    if ((double)ival == dval) {
      ci_opr1 = ival;
    } else
      ci_opr1 = 0x11111111;
  }

  // pow/powr/pown(x, c) = [1/](x*x*..x); where
  // trunc(c) == c && the number of x == c && |c| <= 12
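  // (Illustrative: for c = 11 = 0b1011, the loop below forms x^11 as
  // x * x^2 * x^8 by squaring valx2 each round and multiplying it into the
  // running product for each set bit of |c|.)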
  unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
  if (abs_opr1 <= 12) {
    Constant *cnval;
    Value *nval;
    if (abs_opr1 == 0) {
      cnval = ConstantFP::get(eltType, 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
      }
      nval = cnval;
    } else {
      Value *valx2 = nullptr;
      nval = nullptr;
      while (abs_opr1 > 0) {
        valx2 = valx2 ? B.CreateFMul(valx2, valx2, "__powx2") : opr0;
        if (abs_opr1 & 1) {
          nval = nval ? B.CreateFMul(nval, valx2, "__powprod") : valx2;
        }
        abs_opr1 >>= 1;
      }
    }

    if (ci_opr1 < 0) {
      cnval = ConstantFP::get(eltType, 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
      }
      nval = B.CreateFDiv(cnval, nval, "__1powprod");
    }
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                      << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
                      << ")\n");
    replaceCall(FPOp, nval);
    return true;
  }

  // If we should use the generic intrinsic instead of emitting a libcall
  const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy();

  // powr ---> exp2(y * log2(x))
  // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
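  // (Illustrative sign trick: for pown/pow with possibly negative x, the
  // result must be negated exactly when the integral y is odd. Shifting y
  // left by bitwidth-1 leaves the sign bit set only for odd y; ANDing that
  // with x's bits isolates x's sign for odd y, and ORing it into the exp2
  // result flips the result's sign.)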
  FunctionCallee ExpExpr;
  if (ShouldUseIntrinsic)
    ExpExpr = Intrinsic::getDeclaration(M, Intrinsic::exp2, {FPOp->getType()});
  else {
    ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
    if (!ExpExpr)
      return false;
  }

  bool needlog = false;
  bool needabs = false;
  bool needcopysign = false;
  Constant *cnval = nullptr;
  if (getVecSize(FInfo) == 1) {
    CF = nullptr;
    match(opr0, m_APFloatAllowUndef(CF));

    if (CF) {
      double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
                     ? (double)CF->convertToFloat()
                     : CF->convertToDouble();

      V = log2(std::abs(V));
      cnval = ConstantFP::get(eltType, V);
      needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR) &&
                     CF->isNegative();
    } else {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
                               (!CF || CF->isNegative());
    }
  } else {
    ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr0);

    if (!CDV) {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
    } else {
      assert((int)CDV->getNumElements() == getVecSize(FInfo) &&
             "Wrong vector size detected");

      SmallVector<double, 0> DVal;
      for (int i = 0; i < getVecSize(FInfo); ++i) {
        double V = CDV->getElementAsAPFloat(i).convertToDouble();
        if (V < 0.0)
          needcopysign = true;
        V = log2(std::abs(V));
        DVal.push_back(V);
      }
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        SmallVector<float, 0> FVal;
        for (unsigned i = 0; i < DVal.size(); ++i) {
          FVal.push_back((float)DVal[i]);
        }
        ArrayRef<float> tmp(FVal);
        cnval = ConstantDataVector::get(M->getContext(), tmp);
      } else {
        ArrayRef<double> tmp(DVal);
        cnval = ConstantDataVector::get(M->getContext(), tmp);
      }
    }
  }

  if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
    // We cannot handle corner cases for a general pow() function, give up
    // unless y is a constant integral value. Then proceed as if it were pown.
    if (!isKnownIntegral(opr1, M->getDataLayout(), FPOp->getFastMathFlags()))
      return false;
  }

  Value *nval;
  if (needabs) {
    nval = B.CreateUnaryIntrinsic(Intrinsic::fabs, opr0, nullptr, "__fabs");
  } else {
    nval = cnval ? cnval : opr0;
  }

  if (needlog) {
    FunctionCallee LogExpr;
    if (ShouldUseIntrinsic) {
      LogExpr =
          Intrinsic::getDeclaration(M, Intrinsic::log2, {FPOp->getType()});
    } else {
      LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
      if (!LogExpr)
        return false;
    }

    nval = CreateCallEx(B, LogExpr, nval, "__log2");
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_POWN) {
    // convert int(32) to fp(f32 or f64)
    opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F");
  }
  nval = B.CreateFMul(opr1, nval, "__ylogx");
  nval = CreateCallEx(B, ExpExpr, nval, "__exp2");

  if (needcopysign) {
    Value *opr_n;
    Type *rTy = opr0->getType();
    Type *nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
    Type *nTy = nTyS;
    if (const auto *vTy = dyn_cast<FixedVectorType>(rTy))
      nTy = FixedVectorType::get(nTyS, vTy);
    unsigned size = nTy->getScalarSizeInBits();
    opr_n = FPOp->getOperand(1);
    if (opr_n->getType()->isIntegerTy())
      opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
    else
      opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");

    Value *sign = B.CreateShl(opr_n, size - 1, "__yeven");
    sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
    nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
    nval = B.CreateBitCast(nval, opr0->getType());
  }

  LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                    << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
  replaceCall(FPOp, nval);

  return true;
}

bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
                                const FuncInfo &FInfo) {
  // skip vector function
  if (getVecSize(FInfo) != 1)
    return false;

  Value *opr0 = FPOp->getOperand(0);
  Value *opr1 = FPOp->getOperand(1);

  ConstantInt *CINT = dyn_cast<ConstantInt>(opr1);
  if (!CINT) {
    return false;
  }
  int ci_opr1 = (int)CINT->getSExtValue();
  if (ci_opr1 == 1) { // rootn(x, 1) = x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n");
    replaceCall(FPOp, opr0);
    return true;
  }

  Module *M = B.GetInsertBlock()->getModule();
  if (ci_opr1 == 2) { // rootn(x, 2) = sqrt(x)
    if (FunctionCallee FPExpr =
            getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> sqrt(" << *opr0
                        << ")\n");
      Value *nval = CreateCallEx(B, FPExpr, opr0, "__rootn2sqrt");
      replaceCall(FPOp, nval);
      return true;
    }
  } else if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
    if (FunctionCallee FPExpr =
            getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> cbrt(" << *opr0
                        << ")\n");
      Value *nval = CreateCallEx(B, FPExpr, opr0, "__rootn2cbrt");
      replaceCall(FPOp, nval);
      return true;
    }
  } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1.0 / " << *opr0 << "\n");
    Value *nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0), opr0,
                               "__rootn2div");
    replaceCall(FPOp, nval);
    return true;
  } else if (ci_opr1 == -2) { // rootn(x, -2) = rsqrt(x)
    if (FunctionCallee FPExpr =
            getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT, FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0
                        << ")\n");
      Value *nval = CreateCallEx(B, FPExpr, opr0, "__rootn2rsqrt");
      replaceCall(FPOp, nval);
      return true;
    }
  }
  return false;
}

// Get a scalar native builtin single argument FP function
FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
                                                 const FuncInfo &FInfo) {
  if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()))
    return nullptr;
  FuncInfo nf = FInfo;
  nf.setPrefix(AMDGPULibFunc::NATIVE);
  return getFunction(M, nf);
}

// Some library calls are just wrappers around llvm intrinsics, but compiled
// conservatively. Preserve the flags from the original call site by
// substituting them with direct calls with all the flags.
bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
                                                       bool AllowMinSizeF32,
                                                       bool AllowF64,
                                                       bool AllowStrictFP) {
  Type *FltTy = CI->getType()->getScalarType();
  const bool IsF32 = FltTy->isFloatTy();

  // f64 intrinsics aren't implemented for most operations.
  if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 || !FltTy->isDoubleTy()))
    return false;

  // We're implicitly inlining by replacing the libcall with the intrinsic, so
  // don't do it for noinline call sites.
  if (CI->isNoInline())
    return false;

  const Function *ParentF = CI->getFunction();
  // TODO: Handle strictfp
  if (!AllowStrictFP && ParentF->hasFnAttribute(Attribute::StrictFP))
    return false;

  if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize())
    return false;
  return true;
}

void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B,
                                                       CallInst *CI,
                                                       Intrinsic::ID IntrID) {
  if (CI->arg_size() == 2) {
    Value *Arg0 = CI->getArgOperand(0);
    Value *Arg1 = CI->getArgOperand(1);
    VectorType *Arg0VecTy = dyn_cast<VectorType>(Arg0->getType());
    VectorType *Arg1VecTy = dyn_cast<VectorType>(Arg1->getType());
    if (Arg0VecTy && !Arg1VecTy) {
      Value *SplatRHS = B.CreateVectorSplat(Arg0VecTy->getElementCount(), Arg1);
      CI->setArgOperand(1, SplatRHS);
    } else if (!Arg0VecTy && Arg1VecTy) {
      Value *SplatLHS = B.CreateVectorSplat(Arg1VecTy->getElementCount(), Arg0);
      CI->setArgOperand(0, SplatLHS);
    }
  }

  CI->setCalledFunction(
      Intrinsic::getDeclaration(CI->getModule(), IntrID, {CI->getType()}));
}

bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic(
    IRBuilder<> &B, CallInst *CI, Intrinsic::ID IntrID, bool AllowMinSizeF32,
    bool AllowF64, bool AllowStrictFP) {
  if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32, AllowF64,
                                         AllowStrictFP))
    return false;
  replaceLibCallWithSimpleIntrinsic(B, CI, IntrID);
  return true;
}

// fold sqrt -> native_sqrt(x)
bool AMDGPULibCalls::fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B,
                               const FuncInfo &FInfo) {
  if (!isUnsafeMath(FPOp))
    return false;

  if (getArgType(FInfo) == AMDGPULibFunc::F32 && (getVecSize(FInfo) == 1) &&
      (FInfo.getPrefix() != AMDGPULibFunc::NATIVE)) {
    Module *M = B.GetInsertBlock()->getModule();

    if (FunctionCallee FPExpr = getNativeFunction(
            M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
      Value *opr0 = FPOp->getOperand(0);
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                        << "sqrt(" << *opr0 << ")\n");
      Value *nval = CreateCallEx(B, FPExpr, opr0, "__sqrt");
      replaceCall(FPOp, nval);
      return true;
    }
  }
  return false;
}

std::tuple<Value *, Value *, Value *>
AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
                             FunctionCallee Fsincos) {
  DebugLoc DL = B.getCurrentDebugLocation();
  Function *F = B.GetInsertBlock()->getParent();
  B.SetInsertPointPastAllocas(F);

  AllocaInst *Alloc = B.CreateAlloca(Arg->getType(), nullptr, "__sincos_");

  if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
    // If the argument is an instruction, it must dominate all uses so put our
    // sincos call there. Otherwise, right after the allocas works well enough
    // if it's an argument or constant.

    B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());

    // SetInsertPoint unwelcomely always tries to set the debug loc.
    B.SetCurrentDebugLocation(DL);
  }

  Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(1);

  // The AllocaInst allocates the memory in the private address space. This
  // needs to be addrspacecasted to point to the address space of the cos
  // pointer type. In OpenCL 2.0 that is generic, while in 1.2 it is private.
  Value *CastAlloc = B.CreateAddrSpaceCast(Alloc, CosPtrTy);

  CallInst *SinCos = CreateCallEx2(B, Fsincos, Arg, CastAlloc);

  // TODO: Is it worth trying to preserve the location for the cos calls for
  // the load?

  LoadInst *LoadCos = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
  return {SinCos, LoadCos, SinCos};
}

// fold sin, cos -> sincos.
bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
                                 const FuncInfo &fInfo) {
  assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
         fInfo.getId() == AMDGPULibFunc::EI_COS);

  if ((getArgType(fInfo) != AMDGPULibFunc::F32 &&
       getArgType(fInfo) != AMDGPULibFunc::F64) ||
      fInfo.getPrefix() != AMDGPULibFunc::NOPFX)
    return false;

  bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;

  Value *CArgVal = FPOp->getOperand(0);
  CallInst *CI = cast<CallInst>(FPOp);

  Function *F = B.GetInsertBlock()->getParent();
  Module *M = F->getParent();

  // Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
  // implementation. Prefer the private form if available.
  AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncPrivate.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::PRIVATE_ADDRESS);

  AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncGeneric.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);

  FunctionCallee FSinCosPrivate = getFunction(M, SinCosLibFuncPrivate);
  FunctionCallee FSinCosGeneric = getFunction(M, SinCosLibFuncGeneric);
  FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
  if (!FSinCos)
    return false;

  SmallVector<CallInst *> SinCalls;
  SmallVector<CallInst *> CosCalls;
  SmallVector<CallInst *> SinCosCalls;
  FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
                       fInfo);
  const std::string PairName = PartnerInfo.mangle();

  StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
  StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
  const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
  const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();

  // Intersect the two sets of flags.
  FastMathFlags FMF = FPOp->getFastMathFlags();
  MDNode *FPMath = CI->getMetadata(LLVMContext::MD_fpmath);

  SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};
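
  // Scan the other uses of the argument in this function and bucket the sin,
  // cos, and existing sincos calls that can all share one sincos result.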
  for (User *U : CArgVal->users()) {
    CallInst *XI = dyn_cast<CallInst>(U);
    if (!XI || XI->getFunction() != F || XI->isNoBuiltin())
      continue;

    Function *UCallee = XI->getCalledFunction();
    if (!UCallee)
      continue;

    bool Handled = true;

    if (UCallee->getName() == SinName)
      SinCalls.push_back(XI);
    else if (UCallee->getName() == CosName)
      CosCalls.push_back(XI);
    else if (UCallee->getName() == SinCosPrivateName ||
             UCallee->getName() == SinCosGenericName)
      SinCosCalls.push_back(XI);
    else
      Handled = false;

    if (Handled) {
      MergeDbgLocs.push_back(XI->getDebugLoc());
      auto *OtherOp = cast<FPMathOperator>(XI);
      FMF &= OtherOp->getFastMathFlags();
      FPMath = MDNode::getMostGenericFPMath(
          FPMath, XI->getMetadata(LLVMContext::MD_fpmath));
    }
  }

  if (SinCalls.empty() || CosCalls.empty())
    return false;

  B.setFastMathFlags(FMF);
  B.setDefaultFPMathTag(FPMath);
  DILocation *DbgLoc = DILocation::getMergedLocations(MergeDbgLocs);
  B.SetCurrentDebugLocation(DbgLoc);

  auto [Sin, Cos, SinCos] = insertSinCos(CArgVal, FMF, B, FSinCos);

  auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) {
    for (CallInst *C : Calls)
      C->replaceAllUsesWith(Res);

    // Leave the other dead instructions to avoid clobbering iterators.
  };

  replaceTrigInsts(SinCalls, Sin);
  replaceTrigInsts(CosCalls, Cos);
  replaceTrigInsts(SinCosCalls, SinCos);

  // It's safe to delete the original now.
  CI->eraseFromParent();
  return true;
}
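
// For example (illustrative): under unsafe-math, a call acospi(0.5f) with a
// constant argument is evaluated here on the host as acos(0.5)/pi == 1/3, and
// evaluateCall replaces the call with that f32 constant.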
bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0,
                                            double &Res1, Constant *copr0,
                                            Constant *copr1) {
  // By default, opr0/opr1 hold values of float/double type.
  // If they are not float/double, each function has to handle its
  // operand separately.
  double opr0 = 0.0, opr1 = 0.0;
  ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(copr0);
  ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(copr1);
  if (fpopr0) {
    opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
               ? fpopr0->getValueAPF().convertToDouble()
               : (double)fpopr0->getValueAPF().convertToFloat();
  }

  if (fpopr1) {
    opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64)
               ? fpopr1->getValueAPF().convertToDouble()
               : (double)fpopr1->getValueAPF().convertToFloat();
  }

  switch (FInfo.getId()) {
  default:
    return false;

  case AMDGPULibFunc::EI_ACOS:
    Res0 = acos(opr0);
    return true;

  case AMDGPULibFunc::EI_ACOSH:
    // acosh(x) == log(x + sqrt(x*x - 1))
    Res0 = log(opr0 + sqrt(opr0*opr0 - 1.0));
    return true;

  case AMDGPULibFunc::EI_ACOSPI:
    Res0 = acos(opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_ASIN:
    Res0 = asin(opr0);
    return true;

  case AMDGPULibFunc::EI_ASINH:
    // asinh(x) == log(x + sqrt(x*x + 1))
    Res0 = log(opr0 + sqrt(opr0*opr0 + 1.0));
    return true;

  case AMDGPULibFunc::EI_ASINPI:
    Res0 = asin(opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_ATAN:
    Res0 = atan(opr0);
    return true;

  case AMDGPULibFunc::EI_ATANH:
    // atanh(x) == (log(x+1) - log(x-1))/2;
    Res0 = (log(opr0 + 1.0) - log(opr0 - 1.0))/2.0;
    return true;

  case AMDGPULibFunc::EI_ATANPI:
    Res0 = atan(opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_CBRT:
    Res0 = (opr0 < 0.0) ? -pow(-opr0, 1.0/3.0) : pow(opr0, 1.0/3.0);
    return true;

  case AMDGPULibFunc::EI_COS:
    Res0 = cos(opr0);
    return true;

  case AMDGPULibFunc::EI_COSH:
    Res0 = cosh(opr0);
    return true;

  case AMDGPULibFunc::EI_COSPI:
    Res0 = cos(MATH_PI * opr0);
    return true;

  case AMDGPULibFunc::EI_EXP:
    Res0 = exp(opr0);
    return true;

  case AMDGPULibFunc::EI_EXP2:
    Res0 = pow(2.0, opr0);
    return true;

  case AMDGPULibFunc::EI_EXP10:
    Res0 = pow(10.0, opr0);
    return true;

  case AMDGPULibFunc::EI_LOG:
    Res0 = log(opr0);
    return true;

  case AMDGPULibFunc::EI_LOG2:
    Res0 = log(opr0) / log(2.0);
    return true;

  case AMDGPULibFunc::EI_LOG10:
    Res0 = log(opr0) / log(10.0);
    return true;

  case AMDGPULibFunc::EI_RSQRT:
    Res0 = 1.0 / sqrt(opr0);
    return true;

  case AMDGPULibFunc::EI_SIN:
    Res0 = sin(opr0);
    return true;

  case AMDGPULibFunc::EI_SINH:
    Res0 = sinh(opr0);
    return true;

  case AMDGPULibFunc::EI_SINPI:
    Res0 = sin(MATH_PI * opr0);
    return true;

  case AMDGPULibFunc::EI_TAN:
    Res0 = tan(opr0);
    return true;

  case AMDGPULibFunc::EI_TANH:
    Res0 = tanh(opr0);
    return true;

  case AMDGPULibFunc::EI_TANPI:
    Res0 = tan(MATH_PI * opr0);
    return true;

  // two-arg functions
  case AMDGPULibFunc::EI_POW:
  case AMDGPULibFunc::EI_POWR:
    Res0 = pow(opr0, opr1);
    return true;

  case AMDGPULibFunc::EI_POWN: {
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
      double val = (double)iopr1->getSExtValue();
      Res0 = pow(opr0, val);
      return true;
    }
    return false;
  }

  case AMDGPULibFunc::EI_ROOTN: {
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
      double val = (double)iopr1->getSExtValue();
      Res0 = pow(opr0, 1.0 / val);
      return true;
    }
    return false;
  }

  // with ptr arg
  case AMDGPULibFunc::EI_SINCOS:
    Res0 = sin(opr0);
    Res1 = cos(opr0);
    return true;
  }

  return false;
}

bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
  int numArgs = (int)aCI->arg_size();
  if (numArgs > 3)
    return false;

  Constant *copr0 = nullptr;
  Constant *copr1 = nullptr;
  if (numArgs > 0) {
    if ((copr0 = dyn_cast<Constant>(aCI->getArgOperand(0))) == nullptr)
      return false;
  }

  if (numArgs > 1) {
    if ((copr1 = dyn_cast<Constant>(aCI->getArgOperand(1))) == nullptr) {
      if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS)
        return false;
    }
  }

  // At this point, all arguments to aCI are constants.

  // max vector size is 16, and sincos will generate two results.
  double DVal0[16], DVal1[16];
  int FuncVecSize = getVecSize(FInfo);
  bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
  if (FuncVecSize == 1) {
    if (!evaluateScalarMathFunc(FInfo, DVal0[0], DVal1[0], copr0, copr1)) {
      return false;
    }
  } else {
    ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(copr0);
    ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(copr1);
    for (int i = 0; i < FuncVecSize; ++i) {
      Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr;
      Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr;
      if (!evaluateScalarMathFunc(FInfo, DVal0[i], DVal1[i], celt0, celt1)) {
        return false;
      }
    }
  }

  LLVMContext &context = aCI->getContext();
  Constant *nval0, *nval1;
  if (FuncVecSize == 1) {
    nval0 = ConstantFP::get(aCI->getType(), DVal0[0]);
    if (hasTwoResults)
      nval1 = ConstantFP::get(aCI->getType(), DVal1[0]);
  } else {
    if (getArgType(FInfo) == AMDGPULibFunc::F32) {
      SmallVector<float, 0> FVal0, FVal1;
      for (int i = 0; i < FuncVecSize; ++i)
        FVal0.push_back((float)DVal0[i]);
      ArrayRef<float> tmp0(FVal0);
      nval0 = ConstantDataVector::get(context, tmp0);
      if (hasTwoResults) {
        for (int i = 0; i < FuncVecSize; ++i)
          FVal1.push_back((float)DVal1[i]);
        ArrayRef<float> tmp1(FVal1);
        nval1 = ConstantDataVector::get(context, tmp1);
      }
    } else {
      ArrayRef<double> tmp0(DVal0);
      nval0 = ConstantDataVector::get(context, tmp0);
      if (hasTwoResults) {
        ArrayRef<double> tmp1(DVal1);
        nval1 = ConstantDataVector::get(context, tmp1);
      }
    }
  }

  if (hasTwoResults) {
    // sincos
    assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
           "math function with ptr arg not supported yet");
    new StoreInst(nval1, aCI->getArgOperand(1), aCI);
  }

  replaceCall(aCI, nval0);
  return true;
}

PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
                                                  FunctionAnalysisManager &AM) {
  AMDGPULibCalls Simplifier;
  Simplifier.initNativeFuncs();
  Simplifier.initFunction(F, AM);

  bool Changed = false;

  LLVM_DEBUG(dbgs() << "AMDIC: process function ";
             F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);

  for (auto &BB : F) {
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
      // Ignore non-calls.
      CallInst *CI = dyn_cast<CallInst>(I);
      ++I;

      if (CI) {
        if (Simplifier.fold(CI))
          Changed = true;
      }
    }
  }
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
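
// Illustrative invocation (assuming the usual registrations of these passes
// as "amdgpu-simplifylib" and "amdgpu-usenative" in the AMDGPU pass registry):
//   opt -passes=amdgpu-usenative -amdgpu-use-native=sin,cos ...
// replaces eligible sin/cos calls with their native_ counterparts.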
PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
                                                FunctionAnalysisManager &AM) {
  if (UseNative.empty())
    return PreservedAnalyses::all();

  AMDGPULibCalls Simplifier;
  Simplifier.initNativeFuncs();
  Simplifier.initFunction(F, AM);

  bool Changed = false;
  for (auto &BB : F) {
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
      // Ignore non-calls.
      CallInst *CI = dyn_cast<CallInst>(I);
      ++I;
      if (CI && Simplifier.useNative(CI))
        Changed = true;
    }
  }
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}