//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/ErrorHandling.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);

  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (!Idx->isOne())
        return false;
      Out = In.getOperand(0);
      return true;
    }
  }

  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}

// Look through operations that obscure just looking at the low 16-bits of the
// same register.
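// For example, (i16 (extract_vector_elt (v2i16 x), 0)) and (i16 (trunc
// (i32 y))) both read only the low 16 bits, so we can look through them to
// the underlying 32-bit source value.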
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Idx = In.getOperand(1);
    if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
      return In.getOperand(0);
  }

  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                    false)

/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
}

AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
                                       CodeGenOptLevel OptLevel)
    : SelectionDAGISel(TM, OptLevel) {}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}

bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  return SelectionDAGISelLegacy::runOnMachineFunction(MF);
}

void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISelLegacy::getAnalysisUsage(AU);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      // TODO: Match load d16 from shl (extload:i16), 16
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
    return TII->isInlineConstant(C->getAPIntValue());

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
    return TII->isInlineConstant(C->getValueAPF());

  return false;
}

/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.operands()[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = N->getConstantOperandVal(0);
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = SubRegOp->getAsZExtVal();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector <SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(
          N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
      glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
  SDNode *Hi = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}

void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
               Triple::amdgcn;
  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // TODO: Handle 16-bit element vectors with even aligned masks.
  if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
      VT.getVectorNumElements() != 2) {
    SelectCode(N);
    return;
  }

  auto *SVN = cast<ShuffleVectorSDNode>(N);

  SDValue Src0 = SVN->getOperand(0);
  SDValue Src1 = SVN->getOperand(1);
  ArrayRef<int> Mask = SVN->getMask();
  SDLoc DL(N);

  assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
         Mask[0] < 4 && Mask[1] < 4);

  SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
  SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
  unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
  unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;

  if (Mask[0] < 0) {
    Src0SubReg = Src1SubReg;
    MachineSDNode *ImpDef =
        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
    VSrc0 = SDValue(ImpDef, 0);
  }

  if (Mask[1] < 0) {
    Src1SubReg = Src0SubReg;
    MachineSDNode *ImpDef =
        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
    VSrc1 = SDValue(ImpDef, 0);
  }

  // SGPR case needs to lower to copies.
  //
  // Also use subregister extract when we can directly blend the registers with
  // a simple subregister copy.
  //
  // TODO: Maybe we should fold this out earlier
  if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
      Src1SubReg == AMDGPU::sub0) {
    // The low element of the result always comes from src0.
    // The high element of the result always comes from src1.
    // op_sel selects the high half of src0.
    // op_sel_hi selects the high half of src1.

    unsigned Src0OpSel =
        Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
    unsigned Src1OpSel =
        Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;

    // Enable op_sel_hi to avoid printing it. This should have no effect on the
    // result.
    Src0OpSel |= SISrcMods::OP_SEL_1;
    Src1OpSel |= SISrcMods::OP_SEL_1;

    SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
    SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
    SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);

    CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
                         {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
                          ZeroMods,    // clamp
                          ZeroMods,    // op_sel
                          ZeroMods,    // op_sel_hi
                          ZeroMods,    // neg_lo
                          ZeroMods});  // neg_hi
    return;
  }

  SDValue ResultElt0 =
      CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
  SDValue ResultElt1 =
      CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);

  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
  CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
}

void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom-lowering it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::VECTOR_SHUFFLE:
    SelectVectorShuffle(N);
    return;
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 =
          CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, true))
        break;
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, false))
        break;
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has separate operands for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ?
          MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
                                             unsigned ShAmtBits) const {
  assert(N->getOpcode() == ISD::AND);

  const APInt &RHS = N->getConstantOperandAPInt(1);
  if (RHS.countr_one() >= ShAmtBits)
    return true;

  const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
  return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
}

static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split the 64-bit `or` earlier, it's a complicated pattern to
    // match, i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that the split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}

bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
    assert(LHS && RHS && isa<ConstantSDNode>(RHS));
    return true;
  }

  return false;
}

StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}

PreservedAnalyses
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM) {
#ifdef EXPENSIVE_CHECKS
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto &F = MF.getFunction();
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
  for (auto &L : LI.getLoopsInPreorder())
    assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
#endif
  return SelectionDAGISelPass::run(MF, MFAM);
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle uaddo_carry/usubo_carry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode ==
               ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  // Opcode table indexed by [consumes carry-in][divergent][is add].
  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}

void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
                                                      : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {LHS, RHS, CI,
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                      : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
  }
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading: v_add_i32/v_sub_i32 produce an
  // unsigned carry out despite the _i32 name. These were renamed in VI to
  // _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

  for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
          (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
        IsVALU = true;
        break;
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ?
        AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {N->getOperand(0), N->getOperand(1),
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
                                                : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                         {N->getOperand(0), N->getOperand(1)});
  }
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(
      Opc, SL, CurDAG->getVTList(MVT::i64, MVT::i1), Ops);
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}

bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
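          // On subtargets with a no-carry add/sub, use the VOP3 V_SUB_U32
          // form, which takes an explicit clamp operand; otherwise fall back
          // to the carry-writing VOP2 subtract.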
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

// Return whether the operation has the NoUnsignedWrap property.
static bool isNoUnsignedWrap(SDValue Addr) {
  return (Addr.getOpcode() == ISD::ADD &&
          Addr->getFlags().hasNoUnsignedWrap()) ||
         Addr->getOpcode() == ISD::OR;
}

// Check that the base address of a flat scratch load/store in the form of
// `base + offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within a certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  return CurDAG->SignBitIsZero(LHS);
}

// Check that the address values in SGPR/VGPR are legal for flat scratch in the
// form of: SGPR + VGPR.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// Check that the address values in SGPR/VGPR are legal for flat scratch in the
// form of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(*Subtarget))
    return true;

  auto Base = Addr.getOperand(0);
  auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
  // If the immediate offset is negative and within a certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  auto LHS = Base.getOperand(0);
  auto RHS = Base.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}

bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}

bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffset2Legal. We need to emit the selected
        // node here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 =
              CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
          Offset1 =
              CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instructions.
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                : CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent.
    // Use it as the addr64, and construct the resource from a 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  SDLoc DL(N);

  auto *FI = dyn_cast<FrameIndexSDNode>(N);
  SDValue TFI =
      FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;

  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until
  // frame elimination and eliminateFrameIndex will choose the appropriate
  // frame register if need be.
  return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
      SDValue HighBits =
          CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    uint64_t C1 = Addr.getConstantOperandVal(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    if (TII->isLegalMUBUFImmOffset(C1) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
  if (Val.getOpcode() != ISD::CopyFromReg)
    return false;
  auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
  if (!Reg.isPhysical())
    return false;
  const auto *RC = TRI.getPhysRegBaseClass(Reg);
  return RC && TRI.isSGPRClass(RC);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1666 TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) { 1667 // <constant> 1668 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); 1669 } else { 1670 return false; 1671 } 1672 1673 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); 1674 1675 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32); 1676 return true; 1677 } 1678 1679 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, 1680 SDValue &SOffset, SDValue &Offset 1681 ) const { 1682 SDValue Ptr, VAddr, Offen, Idxen, Addr64; 1683 const SIInstrInfo *TII = Subtarget->getInstrInfo(); 1684 1685 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64)) 1686 return false; 1687 1688 if (!cast<ConstantSDNode>(Offen)->getSExtValue() && 1689 !cast<ConstantSDNode>(Idxen)->getSExtValue() && 1690 !cast<ConstantSDNode>(Addr64)->getSExtValue()) { 1691 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | 1692 maskTrailingOnes<uint64_t>(32); // Size 1693 SDLoc DL(Addr); 1694 1695 const SITargetLowering& Lowering = 1696 *static_cast<const SITargetLowering*>(getTargetLowering()); 1697 1698 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0); 1699 return true; 1700 } 1701 return false; 1702 } 1703 1704 bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode, 1705 SDValue &SOffset) const { 1706 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) { 1707 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32); 1708 return true; 1709 } 1710 1711 SOffset = ByteOffsetNode; 1712 return true; 1713 } 1714 1715 // Find a load or store from corresponding pattern root. 1716 // Roots may be build_vector, bitconvert or their combinations. 1717 static MemSDNode* findMemSDNode(SDNode *N) { 1718 N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode(); 1719 if (MemSDNode *MN = dyn_cast<MemSDNode>(N)) 1720 return MN; 1721 assert(isa<BuildVectorSDNode>(N)); 1722 for (SDValue V : N->op_values()) 1723 if (MemSDNode *MN = 1724 dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V))) 1725 return MN; 1726 llvm_unreachable("cannot find MemSDNode in the pattern!"); 1727 } 1728 1729 bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, 1730 SDValue &VAddr, SDValue &Offset, 1731 uint64_t FlatVariant) const { 1732 int64_t OffsetVal = 0; 1733 1734 unsigned AS = findMemSDNode(N)->getAddressSpace(); 1735 1736 bool CanHaveFlatSegmentOffsetBug = 1737 Subtarget->hasFlatSegmentOffsetBug() && 1738 FlatVariant == SIInstrFlags::FLAT && 1739 (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS); 1740 1741 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) { 1742 SDValue N0, N1; 1743 if (isBaseWithConstantOffset64(Addr, N0, N1) && 1744 (FlatVariant != SIInstrFlags::FlatScratch || 1745 isFlatScratchBaseLegal(Addr))) { 1746 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); 1747 1748 const SIInstrInfo *TII = Subtarget->getInstrInfo(); 1749 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { 1750 Addr = N0; 1751 OffsetVal = COffsetVal; 1752 } else { 1753 // If the offset doesn't fit, put the low bits into the offset field and 1754 // add the rest. 1755 // 1756 // For a FLAT instruction the hardware decides whether to access 1757 // global/scratch/shared memory based on the high bits of vaddr, 1758 // ignoring the offset field, so we have to ensure that when we add 1759 // remainder to vaddr it still points into the same underlying object. 
1760 // The easiest way to do that is to make sure that we split the offset 1761 // into two pieces that are both >= 0 or both <= 0. 1762 1763 SDLoc DL(N); 1764 uint64_t RemainderOffset; 1765 1766 std::tie(OffsetVal, RemainderOffset) = 1767 TII->splitFlatOffset(COffsetVal, AS, FlatVariant); 1768 1769 SDValue AddOffsetLo = 1770 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); 1771 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); 1772 1773 if (Addr.getValueType().getSizeInBits() == 32) { 1774 SmallVector<SDValue, 3> Opnds; 1775 Opnds.push_back(N0); 1776 Opnds.push_back(AddOffsetLo); 1777 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; 1778 if (Subtarget->hasAddNoCarry()) { 1779 AddOp = AMDGPU::V_ADD_U32_e64; 1780 Opnds.push_back(Clamp); 1781 } 1782 Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); 1783 } else { 1784 // TODO: Should this try to use a scalar add pseudo if the base address 1785 // is uniform and saddr is usable? 1786 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); 1787 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); 1788 1789 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, 1790 DL, MVT::i32, N0, Sub0); 1791 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, 1792 DL, MVT::i32, N0, Sub1); 1793 1794 SDValue AddOffsetHi = 1795 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); 1796 1797 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); 1798 1799 SDNode *Add = 1800 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, 1801 {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); 1802 1803 SDNode *Addc = CurDAG->getMachineNode( 1804 AMDGPU::V_ADDC_U32_e64, DL, VTs, 1805 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); 1806 1807 SDValue RegSequenceArgs[] = { 1808 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), 1809 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; 1810 1811 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, 1812 MVT::i64, RegSequenceArgs), 1813 0); 1814 } 1815 } 1816 } 1817 } 1818 1819 VAddr = Addr; 1820 Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32); 1821 return true; 1822 } 1823 1824 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr, 1825 SDValue &VAddr, 1826 SDValue &Offset) const { 1827 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT); 1828 } 1829 1830 bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr, 1831 SDValue &VAddr, 1832 SDValue &Offset) const { 1833 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal); 1834 } 1835 1836 bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr, 1837 SDValue &VAddr, 1838 SDValue &Offset) const { 1839 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, 1840 SIInstrFlags::FlatScratch); 1841 } 1842 1843 // If this matches zero_extend i32:x, return x 1844 static SDValue matchZExtFromI32(SDValue Op) { 1845 if (Op.getOpcode() != ISD::ZERO_EXTEND) 1846 return SDValue(); 1847 1848 SDValue ExtSrc = Op.getOperand(0); 1849 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue(); 1850 } 1851 1852 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) 1853 bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, 1854 SDValue Addr, 1855 SDValue &SAddr, 1856 SDValue &VOffset, 1857 SDValue &Offset) const { 1858 int64_t ImmOffset = 0; 1859 1860 // Match the immediate offset first, which canonically is moved as low as 1861 // possible. 
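  // As a purely illustrative example, an address of the form
  //   (add (add i64:$sgpr_base, (zext i32:$vgpr_off)), -8)
  // is matched as SAddr = $sgpr_base, VOffset = $vgpr_off and Offset = -8,
  // provided -8 is a legal global FLAT offset for the subtarget.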
1862 1863 SDValue LHS, RHS; 1864 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) { 1865 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue(); 1866 const SIInstrInfo *TII = Subtarget->getInstrInfo(); 1867 1868 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, 1869 SIInstrFlags::FlatGlobal)) { 1870 Addr = LHS; 1871 ImmOffset = COffsetVal; 1872 } else if (!LHS->isDivergent()) { 1873 if (COffsetVal > 0) { 1874 SDLoc SL(N); 1875 // saddr + large_offset -> saddr + 1876 // (voffset = large_offset & ~MaxOffset) + 1877 // (large_offset & MaxOffset); 1878 int64_t SplitImmOffset, RemainderOffset; 1879 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( 1880 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); 1881 1882 if (isUInt<32>(RemainderOffset)) { 1883 SDNode *VMov = CurDAG->getMachineNode( 1884 AMDGPU::V_MOV_B32_e32, SL, MVT::i32, 1885 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); 1886 VOffset = SDValue(VMov, 0); 1887 SAddr = LHS; 1888 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32); 1889 return true; 1890 } 1891 } 1892 1893 // We are adding a 64 bit SGPR and a constant. If constant bus limit 1894 // is 1 we would need to perform 1 or 2 extra moves for each half of 1895 // the constant and it is better to do a scalar add and then issue a 1896 // single VALU instruction to materialize zero. Otherwise it is less 1897 // instructions to perform VALU adds with immediates or inline literals. 1898 unsigned NumLiterals = 1899 !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) + 1900 !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal))); 1901 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals) 1902 return false; 1903 } 1904 } 1905 1906 // Match the variable offset. 1907 if (Addr.getOpcode() == ISD::ADD) { 1908 LHS = Addr.getOperand(0); 1909 RHS = Addr.getOperand(1); 1910 1911 if (!LHS->isDivergent()) { 1912 // add (i64 sgpr), (zero_extend (i32 vgpr)) 1913 if (SDValue ZextRHS = matchZExtFromI32(RHS)) { 1914 SAddr = LHS; 1915 VOffset = ZextRHS; 1916 } 1917 } 1918 1919 if (!SAddr && !RHS->isDivergent()) { 1920 // add (zero_extend (i32 vgpr)), (i64 sgpr) 1921 if (SDValue ZextLHS = matchZExtFromI32(LHS)) { 1922 SAddr = RHS; 1923 VOffset = ZextLHS; 1924 } 1925 } 1926 1927 if (SAddr) { 1928 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32); 1929 return true; 1930 } 1931 } 1932 1933 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF || 1934 isa<ConstantSDNode>(Addr)) 1935 return false; 1936 1937 // It's cheaper to materialize a single 32-bit zero for vaddr than the two 1938 // moves required to copy a 64-bit SGPR to VGPR. 1939 SAddr = Addr; 1940 SDNode *VMov = 1941 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32, 1942 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32)); 1943 VOffset = SDValue(VMov, 0); 1944 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32); 1945 return true; 1946 } 1947 1948 static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) { 1949 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) { 1950 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); 1951 } else if (SAddr.getOpcode() == ISD::ADD && 1952 isa<FrameIndexSDNode>(SAddr.getOperand(0))) { 1953 // Materialize this into a scalar move for scalar address to avoid 1954 // readfirstlane. 
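    // That is, (add (frameindex), x) becomes an S_ADD_I32 of the target frame
    // index and x, so the result is produced directly in an SGPR.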
1955 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0)); 1956 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(), 1957 FI->getValueType(0)); 1958 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr), 1959 MVT::i32, TFI, SAddr.getOperand(1)), 1960 0); 1961 } 1962 1963 return SAddr; 1964 } 1965 1966 // Match (32-bit SGPR base) + sext(imm offset) 1967 bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, 1968 SDValue &SAddr, 1969 SDValue &Offset) const { 1970 if (Addr->isDivergent()) 1971 return false; 1972 1973 SDLoc DL(Addr); 1974 1975 int64_t COffsetVal = 0; 1976 1977 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) { 1978 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue(); 1979 SAddr = Addr.getOperand(0); 1980 } else { 1981 SAddr = Addr; 1982 } 1983 1984 SAddr = SelectSAddrFI(CurDAG, SAddr); 1985 1986 const SIInstrInfo *TII = Subtarget->getInstrInfo(); 1987 1988 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, 1989 SIInstrFlags::FlatScratch)) { 1990 int64_t SplitImmOffset, RemainderOffset; 1991 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( 1992 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch); 1993 1994 COffsetVal = SplitImmOffset; 1995 1996 SDValue AddOffset = 1997 SAddr.getOpcode() == ISD::TargetFrameIndex 1998 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL) 1999 : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32); 2000 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32, 2001 SAddr, AddOffset), 2002 0); 2003 } 2004 2005 Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32); 2006 2007 return true; 2008 } 2009 2010 // Check whether the flat scratch SVS swizzle bug affects this access. 2011 bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug( 2012 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const { 2013 if (!Subtarget->hasFlatScratchSVSSwizzleBug()) 2014 return false; 2015 2016 // The bug affects the swizzling of SVS accesses if there is any carry out 2017 // from the two low order bits (i.e. from bit 1 into bit 2) when adding 2018 // voffset to (soffset + inst_offset). 
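  // The check below is conservative: using known bits, if the maximum possible
  // low two bits of VAddr and of (SAddr + ImmOffset) can sum to 4 or more, a
  // carry out of bit 1 cannot be ruled out and the fold is rejected by the
  // callers.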
2019 KnownBits VKnown = CurDAG->computeKnownBits(VAddr); 2020 KnownBits SKnown = 2021 KnownBits::add(CurDAG->computeKnownBits(SAddr), 2022 KnownBits::makeConstant(APInt(32, ImmOffset, 2023 /*isSigned=*/true))); 2024 uint64_t VMax = VKnown.getMaxValue().getZExtValue(); 2025 uint64_t SMax = SKnown.getMaxValue().getZExtValue(); 2026 return (VMax & 3) + (SMax & 3) >= 4; 2027 } 2028 2029 bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, 2030 SDValue &VAddr, SDValue &SAddr, 2031 SDValue &Offset) const { 2032 int64_t ImmOffset = 0; 2033 2034 SDValue LHS, RHS; 2035 SDValue OrigAddr = Addr; 2036 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) { 2037 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue(); 2038 const SIInstrInfo *TII = Subtarget->getInstrInfo(); 2039 2040 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) { 2041 Addr = LHS; 2042 ImmOffset = COffsetVal; 2043 } else if (!LHS->isDivergent() && COffsetVal > 0) { 2044 SDLoc SL(N); 2045 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) + 2046 // (large_offset & MaxOffset); 2047 int64_t SplitImmOffset, RemainderOffset; 2048 std::tie(SplitImmOffset, RemainderOffset) 2049 = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true); 2050 2051 if (isUInt<32>(RemainderOffset)) { 2052 SDNode *VMov = CurDAG->getMachineNode( 2053 AMDGPU::V_MOV_B32_e32, SL, MVT::i32, 2054 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); 2055 VAddr = SDValue(VMov, 0); 2056 SAddr = LHS; 2057 if (!isFlatScratchBaseLegal(Addr)) 2058 return false; 2059 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) 2060 return false; 2061 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32); 2062 return true; 2063 } 2064 } 2065 } 2066 2067 if (Addr.getOpcode() != ISD::ADD) 2068 return false; 2069 2070 LHS = Addr.getOperand(0); 2071 RHS = Addr.getOperand(1); 2072 2073 if (!LHS->isDivergent() && RHS->isDivergent()) { 2074 SAddr = LHS; 2075 VAddr = RHS; 2076 } else if (!RHS->isDivergent() && LHS->isDivergent()) { 2077 SAddr = RHS; 2078 VAddr = LHS; 2079 } else { 2080 return false; 2081 } 2082 2083 if (OrigAddr != Addr) { 2084 if (!isFlatScratchBaseLegalSVImm(OrigAddr)) 2085 return false; 2086 } else { 2087 if (!isFlatScratchBaseLegalSV(OrigAddr)) 2088 return false; 2089 } 2090 2091 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) 2092 return false; 2093 SAddr = SelectSAddrFI(CurDAG, SAddr); 2094 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32); 2095 return true; 2096 } 2097 2098 // For unbuffered smem loads, it is illegal for the Immediate Offset to be 2099 // negative if the resulting (Offset + (M0 or SOffset or zero) is negative. 2100 // Handle the case where the Immediate Offset + SOffset is negative. 2101 bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset, 2102 bool Imm32Only, 2103 bool IsBuffer, 2104 int64_t ImmOffset) const { 2105 if (!IsBuffer && !Imm32Only && ImmOffset < 0 && 2106 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) { 2107 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset); 2108 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0) 2109 return false; 2110 } 2111 2112 return true; 2113 } 2114 2115 // Match an immediate (if Offset is not null) or an SGPR (if SOffset is 2116 // not null) offset. If Imm32Only is true, match only 32-bit immediate 2117 // offsets available on CI. 
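// At most one of SOffset and Offset may be requested at a time; the assert
// below enforces that they are not matched simultaneously.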
2118 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, 2119 SDValue *SOffset, SDValue *Offset, 2120 bool Imm32Only, bool IsBuffer, 2121 bool HasSOffset, 2122 int64_t ImmOffset) const { 2123 assert((!SOffset || !Offset) && 2124 "Cannot match both soffset and offset at the same time!"); 2125 2126 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); 2127 if (!C) { 2128 if (!SOffset) 2129 return false; 2130 2131 if (ByteOffsetNode.getValueType().isScalarInteger() && 2132 ByteOffsetNode.getValueType().getSizeInBits() == 32) { 2133 *SOffset = ByteOffsetNode; 2134 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer, 2135 ImmOffset); 2136 } 2137 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) { 2138 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) { 2139 *SOffset = ByteOffsetNode.getOperand(0); 2140 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer, 2141 ImmOffset); 2142 } 2143 } 2144 return false; 2145 } 2146 2147 SDLoc SL(ByteOffsetNode); 2148 2149 // GFX9 and GFX10 have signed byte immediate offsets. The immediate 2150 // offset for S_BUFFER instructions is unsigned. 2151 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue(); 2152 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset( 2153 *Subtarget, ByteOffset, IsBuffer, HasSOffset); 2154 if (EncodedOffset && Offset && !Imm32Only) { 2155 *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32); 2156 return true; 2157 } 2158 2159 // SGPR and literal offsets are unsigned. 2160 if (ByteOffset < 0) 2161 return false; 2162 2163 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset); 2164 if (EncodedOffset && Offset && Imm32Only) { 2165 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32); 2166 return true; 2167 } 2168 2169 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset)) 2170 return false; 2171 2172 if (SOffset) { 2173 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); 2174 *SOffset = SDValue( 2175 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0); 2176 return true; 2177 } 2178 2179 return false; 2180 } 2181 2182 SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { 2183 if (Addr.getValueType() != MVT::i32) 2184 return Addr; 2185 2186 // Zero-extend a 32-bit address. 2187 SDLoc SL(Addr); 2188 2189 const MachineFunction &MF = CurDAG->getMachineFunction(); 2190 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2191 unsigned AddrHiVal = Info->get32BitAddressHighBits(); 2192 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32); 2193 2194 const SDValue Ops[] = { 2195 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32), 2196 Addr, 2197 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32), 2198 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi), 2199 0), 2200 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32), 2201 }; 2202 2203 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64, 2204 Ops), 0); 2205 } 2206 2207 // Match a base and an immediate (if Offset is not null) or an SGPR (if 2208 // SOffset is not null) or an immediate+SGPR offset. If Imm32Only is 2209 // true, match only 32-bit immediate offsets available on CI. 
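// For example (illustrative only), (add $sbase, imm) can yield SBase = $sbase
// with the encoded immediate in *Offset, while (add $sbase, $sgpr_off) can
// yield SBase = $sbase with *SOffset = $sgpr_off, depending on which outputs
// the caller requested.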
2210 bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, 2211 SDValue *SOffset, SDValue *Offset, 2212 bool Imm32Only, bool IsBuffer, 2213 bool HasSOffset, 2214 int64_t ImmOffset) const { 2215 if (SOffset && Offset) { 2216 assert(!Imm32Only && !IsBuffer); 2217 SDValue B; 2218 2219 if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true)) 2220 return false; 2221 2222 int64_t ImmOff = 0; 2223 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset)) 2224 ImmOff = C->getSExtValue(); 2225 2226 return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true, 2227 ImmOff); 2228 } 2229 2230 // A 32-bit (address + offset) should not cause unsigned 32-bit integer 2231 // wraparound, because s_load instructions perform the addition in 64 bits. 2232 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD && 2233 !Addr->getFlags().hasNoUnsignedWrap()) 2234 return false; 2235 2236 SDValue N0, N1; 2237 // Extract the base and offset if possible. 2238 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) { 2239 N0 = Addr.getOperand(0); 2240 N1 = Addr.getOperand(1); 2241 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) { 2242 assert(N0 && N1 && isa<ConstantSDNode>(N1)); 2243 } 2244 if (!N0 || !N1) 2245 return false; 2246 2247 if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, 2248 ImmOffset)) { 2249 SBase = N0; 2250 return true; 2251 } 2252 if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset, 2253 ImmOffset)) { 2254 SBase = N1; 2255 return true; 2256 } 2257 return false; 2258 } 2259 2260 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, 2261 SDValue *SOffset, SDValue *Offset, 2262 bool Imm32Only) const { 2263 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) { 2264 SBase = Expand32BitAddress(SBase); 2265 return true; 2266 } 2267 2268 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) { 2269 SBase = Expand32BitAddress(Addr); 2270 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); 2271 return true; 2272 } 2273 2274 return false; 2275 } 2276 2277 bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, 2278 SDValue &Offset) const { 2279 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset); 2280 } 2281 2282 bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, 2283 SDValue &Offset) const { 2284 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); 2285 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset, 2286 /* Imm32Only */ true); 2287 } 2288 2289 bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase, 2290 SDValue &SOffset) const { 2291 return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr); 2292 } 2293 2294 bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, 2295 SDValue &SOffset, 2296 SDValue &Offset) const { 2297 return SelectSMRD(Addr, SBase, &SOffset, &Offset); 2298 } 2299 2300 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const { 2301 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset, 2302 /* Imm32Only */ false, /* IsBuffer */ true); 2303 } 2304 2305 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N, 2306 SDValue &Offset) const { 2307 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); 2308 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset, 2309 /* Imm32Only */ true, /* IsBuffer */ true); 2310 } 2311 2312 bool 
AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset, 2313 SDValue &Offset) const { 2314 // Match the (soffset + offset) pair as a 32-bit register base and 2315 // an immediate offset. 2316 return N.getValueType() == MVT::i32 && 2317 SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr, 2318 &Offset, /* Imm32Only */ false, 2319 /* IsBuffer */ true); 2320 } 2321 2322 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, 2323 SDValue &Base, 2324 SDValue &Offset) const { 2325 SDLoc DL(Index); 2326 2327 if (CurDAG->isBaseWithConstantOffset(Index)) { 2328 SDValue N0 = Index.getOperand(0); 2329 SDValue N1 = Index.getOperand(1); 2330 ConstantSDNode *C1 = cast<ConstantSDNode>(N1); 2331 2332 // (add n0, c0) 2333 // Don't peel off the offset (c0) if doing so could possibly lead 2334 // the base (n0) to be negative. 2335 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset. 2336 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) || 2337 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) { 2338 Base = N0; 2339 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32); 2340 return true; 2341 } 2342 } 2343 2344 if (isa<ConstantSDNode>(Index)) 2345 return false; 2346 2347 Base = Index; 2348 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); 2349 return true; 2350 } 2351 2352 SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL, 2353 SDValue Val, uint32_t Offset, 2354 uint32_t Width) { 2355 if (Val->isDivergent()) { 2356 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64; 2357 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32); 2358 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32); 2359 2360 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W); 2361 } 2362 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 2363 // Transformation function, pack the offset and width of a BFE into 2364 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second 2365 // source, bits [5:0] contain the offset and bits [22:16] the width. 
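  // For example, Offset = 8 and Width = 16 pack to 0x00100008.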
2366 uint32_t PackedVal = Offset | (Width << 16); 2367 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32); 2368 2369 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst); 2370 } 2371 2372 void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { 2373 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c) 2374 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c) 2375 // Predicate: 0 < b <= c < 32 2376 2377 const SDValue &Shl = N->getOperand(0); 2378 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1)); 2379 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 2380 2381 if (B && C) { 2382 uint32_t BVal = B->getZExtValue(); 2383 uint32_t CVal = C->getZExtValue(); 2384 2385 if (0 < BVal && BVal <= CVal && CVal < 32) { 2386 bool Signed = N->getOpcode() == ISD::SRA; 2387 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal, 2388 32 - CVal)); 2389 return; 2390 } 2391 } 2392 SelectCode(N); 2393 } 2394 2395 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { 2396 switch (N->getOpcode()) { 2397 case ISD::AND: 2398 if (N->getOperand(0).getOpcode() == ISD::SRL) { 2399 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)" 2400 // Predicate: isMask(mask) 2401 const SDValue &Srl = N->getOperand(0); 2402 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1)); 2403 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1)); 2404 2405 if (Shift && Mask) { 2406 uint32_t ShiftVal = Shift->getZExtValue(); 2407 uint32_t MaskVal = Mask->getZExtValue(); 2408 2409 if (isMask_32(MaskVal)) { 2410 uint32_t WidthVal = llvm::popcount(MaskVal); 2411 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal, 2412 WidthVal)); 2413 return; 2414 } 2415 } 2416 } 2417 break; 2418 case ISD::SRL: 2419 if (N->getOperand(0).getOpcode() == ISD::AND) { 2420 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)" 2421 // Predicate: isMask(mask >> b) 2422 const SDValue &And = N->getOperand(0); 2423 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1)); 2424 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1)); 2425 2426 if (Shift && Mask) { 2427 uint32_t ShiftVal = Shift->getZExtValue(); 2428 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal; 2429 2430 if (isMask_32(MaskVal)) { 2431 uint32_t WidthVal = llvm::popcount(MaskVal); 2432 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal, 2433 WidthVal)); 2434 return; 2435 } 2436 } 2437 } else if (N->getOperand(0).getOpcode() == ISD::SHL) { 2438 SelectS_BFEFromShifts(N); 2439 return; 2440 } 2441 break; 2442 case ISD::SRA: 2443 if (N->getOperand(0).getOpcode() == ISD::SHL) { 2444 SelectS_BFEFromShifts(N); 2445 return; 2446 } 2447 break; 2448 2449 case ISD::SIGN_EXTEND_INREG: { 2450 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8 2451 SDValue Src = N->getOperand(0); 2452 if (Src.getOpcode() != ISD::SRL) 2453 break; 2454 2455 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1)); 2456 if (!Amt) 2457 break; 2458 2459 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits(); 2460 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0), 2461 Amt->getZExtValue(), Width)); 2462 return; 2463 } 2464 } 2465 2466 SelectCode(N); 2467 } 2468 2469 bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const { 2470 assert(N->getOpcode() == ISD::BRCOND); 2471 if (!N->hasOneUse()) 2472 return false; 2473 2474 SDValue Cond = N->getOperand(1); 2475 if (Cond.getOpcode() == ISD::CopyToReg) 2476 Cond = 
Cond.getOperand(2); 2477 2478 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse()) 2479 return false; 2480 2481 MVT VT = Cond.getOperand(0).getSimpleValueType(); 2482 if (VT == MVT::i32) 2483 return true; 2484 2485 if (VT == MVT::i64) { 2486 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 2487 return (CC == ISD::SETEQ || CC == ISD::SETNE) && 2488 Subtarget->hasScalarCompareEq64(); 2489 } 2490 2491 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts()) 2492 return true; 2493 2494 return false; 2495 } 2496 2497 static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) { 2498 assert(VCMP->getOpcode() == AMDGPUISD::SETCC); 2499 // Special case for amdgcn.ballot: 2500 // %Cond = i1 (and/or combination of i1 ISD::SETCCs) 2501 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq 2502 // => 2503 // Use i1 %Cond value instead of i(WaveSize) %VCMP. 2504 // This is possible because divergent ISD::SETCC is selected as V_CMP and 2505 // Cond becomes a i(WaveSize) full mask value. 2506 // Note that ballot doesn't use SETEQ condition but its easy to support it 2507 // here for completeness, so in this case Negate is set true on return. 2508 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get(); 2509 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) && 2510 isNullConstant(VCMP.getOperand(1))) { 2511 2512 auto Cond = VCMP.getOperand(0); 2513 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension. 2514 Cond = Cond.getOperand(0); 2515 2516 if (isBoolSGPR(Cond)) { 2517 Negate = VCMP_CC == ISD::SETEQ; 2518 return Cond; 2519 } 2520 } 2521 return SDValue(); 2522 } 2523 2524 void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { 2525 SDValue Cond = N->getOperand(1); 2526 2527 if (Cond.isUndef()) { 2528 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other, 2529 N->getOperand(2), N->getOperand(0)); 2530 return; 2531 } 2532 2533 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2534 2535 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N); 2536 bool AndExec = !UseSCCBr; 2537 bool Negate = false; 2538 2539 if (Cond.getOpcode() == ISD::SETCC && 2540 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) { 2541 SDValue VCMP = Cond->getOperand(0); 2542 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get(); 2543 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && 2544 isNullConstant(Cond->getOperand(1)) && 2545 // We may encounter ballot.i64 in wave32 mode on -O0. 2546 VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) { 2547 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ... 2548 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq 2549 // BRCOND i1 %C, %BB 2550 // => 2551 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ... 2552 // VCC = COPY i(WaveSize) %VCMP 2553 // S_CBRANCH_VCCNZ/VCCZ %BB 2554 Negate = CC == ISD::SETEQ; 2555 bool NegatedBallot = false; 2556 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) { 2557 Cond = BallotCond; 2558 UseSCCBr = !BallotCond->isDivergent(); 2559 Negate = Negate ^ NegatedBallot; 2560 } else { 2561 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always 2562 // selected as V_CMP, but this may change for uniform condition. 2563 Cond = VCMP; 2564 UseSCCBr = false; 2565 } 2566 } 2567 // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of 2568 // V_CMPs resulted from ballot or ballot has uniform condition and SCC is 2569 // used. 2570 AndExec = false; 2571 } 2572 2573 unsigned BrOp = 2574 UseSCCBr ? (Negate ? 
AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1) 2575 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ); 2576 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC(); 2577 SDLoc SL(N); 2578 2579 if (AndExec) { 2580 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not 2581 // analyzed what generates the vcc value, so we do not know whether vcc 2582 // bits for disabled lanes are 0. Thus we need to mask out bits for 2583 // disabled lanes. 2584 // 2585 // For the case that we select S_CBRANCH_SCC1 and it gets 2586 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls 2587 // SIInstrInfo::moveToVALU which inserts the S_AND). 2588 // 2589 // We could add an analysis of what generates the vcc value here and omit 2590 // the S_AND when is unnecessary. But it would be better to add a separate 2591 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it 2592 // catches both cases. 2593 Cond = SDValue( 2594 CurDAG->getMachineNode( 2595 Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL, 2596 MVT::i1, 2597 CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO 2598 : AMDGPU::EXEC, 2599 MVT::i1), 2600 Cond), 2601 0); 2602 } 2603 2604 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond); 2605 CurDAG->SelectNodeTo(N, BrOp, MVT::Other, 2606 N->getOperand(2), // Basic Block 2607 VCC.getValue(0)); 2608 } 2609 2610 void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) { 2611 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 && 2612 !N->isDivergent()) { 2613 SDValue Src = N->getOperand(0); 2614 if (Src.getValueType() == MVT::f16) { 2615 if (isExtractHiElt(Src, Src)) { 2616 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(), 2617 {Src}); 2618 return; 2619 } 2620 } 2621 } 2622 2623 SelectCode(N); 2624 } 2625 2626 void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) { 2627 // The address is assumed to be uniform, so if it ends up in a VGPR, it will 2628 // be copied to an SGPR with readfirstlane. 2629 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ? 2630 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; 2631 2632 SDValue Chain = N->getOperand(0); 2633 SDValue Ptr = N->getOperand(2); 2634 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); 2635 MachineMemOperand *MMO = M->getMemOperand(); 2636 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; 2637 2638 SDValue Offset; 2639 if (CurDAG->isBaseWithConstantOffset(Ptr)) { 2640 SDValue PtrBase = Ptr.getOperand(0); 2641 SDValue PtrOffset = Ptr.getOperand(1); 2642 2643 const APInt &OffsetVal = PtrOffset->getAsAPIntVal(); 2644 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) { 2645 N = glueCopyToM0(N, PtrBase); 2646 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32); 2647 } 2648 } 2649 2650 if (!Offset) { 2651 N = glueCopyToM0(N, Ptr); 2652 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); 2653 } 2654 2655 SDValue Ops[] = { 2656 Offset, 2657 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32), 2658 Chain, 2659 N->getOperand(N->getNumOperands() - 1) // New glue 2660 }; 2661 2662 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); 2663 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); 2664 } 2665 2666 // We need to handle this here because tablegen doesn't support matching 2667 // instructions with multiple outputs. 
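// DS_BVH_STACK_RTN_B32 produces both the returned data and an updated stack
// address, so it is selected manually from the intrinsic operands below.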
2668 void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) { 2669 unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32; 2670 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4), 2671 N->getOperand(5), N->getOperand(0)}; 2672 2673 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); 2674 MachineMemOperand *MMO = M->getMemOperand(); 2675 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); 2676 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); 2677 } 2678 2679 static unsigned gwsIntrinToOpcode(unsigned IntrID) { 2680 switch (IntrID) { 2681 case Intrinsic::amdgcn_ds_gws_init: 2682 return AMDGPU::DS_GWS_INIT; 2683 case Intrinsic::amdgcn_ds_gws_barrier: 2684 return AMDGPU::DS_GWS_BARRIER; 2685 case Intrinsic::amdgcn_ds_gws_sema_v: 2686 return AMDGPU::DS_GWS_SEMA_V; 2687 case Intrinsic::amdgcn_ds_gws_sema_br: 2688 return AMDGPU::DS_GWS_SEMA_BR; 2689 case Intrinsic::amdgcn_ds_gws_sema_p: 2690 return AMDGPU::DS_GWS_SEMA_P; 2691 case Intrinsic::amdgcn_ds_gws_sema_release_all: 2692 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; 2693 default: 2694 llvm_unreachable("not a gws intrinsic"); 2695 } 2696 } 2697 2698 void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { 2699 if (!Subtarget->hasGWS() || 2700 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all && 2701 !Subtarget->hasGWSSemaReleaseAll())) { 2702 // Let this error. 2703 SelectCode(N); 2704 return; 2705 } 2706 2707 // Chain, intrinsic ID, vsrc, offset 2708 const bool HasVSrc = N->getNumOperands() == 4; 2709 assert(HasVSrc || N->getNumOperands() == 3); 2710 2711 SDLoc SL(N); 2712 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2); 2713 int ImmOffset = 0; 2714 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); 2715 MachineMemOperand *MMO = M->getMemOperand(); 2716 2717 // Don't worry if the offset ends up in a VGPR. Only one lane will have 2718 // effect, so SIFixSGPRCopies will validly insert readfirstlane. 2719 2720 // The resource id offset is computed as (<isa opaque base> + M0[21:16] + 2721 // offset field) % 64. Some versions of the programming guide omit the m0 2722 // part, or claim it's from offset 0. 2723 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) { 2724 // If we have a constant offset, try to use the 0 in m0 as the base. 2725 // TODO: Look into changing the default m0 initialization value. If the 2726 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to 2727 // the immediate offset. 2728 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32)); 2729 ImmOffset = ConstOffset->getZExtValue(); 2730 } else { 2731 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) { 2732 ImmOffset = BaseOffset.getConstantOperandVal(1); 2733 BaseOffset = BaseOffset.getOperand(0); 2734 } 2735 2736 // Prefer to do the shift in an SGPR since it should be possible to use m0 2737 // as the result directly. If it's already an SGPR, it will be eliminated 2738 // later. 
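    // The sequence built below is roughly:
    //   v_readfirstlane_b32 s_off, <base offset>
    //   s_lshl_b32 <m0 value>, s_off, 16
    // with the shifted value then glued into m0 (register names are only for
    // exposition).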
2739 SDNode *SGPROffset 2740 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32, 2741 BaseOffset); 2742 // Shift to offset in m0 2743 SDNode *M0Base 2744 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32, 2745 SDValue(SGPROffset, 0), 2746 CurDAG->getTargetConstant(16, SL, MVT::i32)); 2747 glueCopyToM0(N, SDValue(M0Base, 0)); 2748 } 2749 2750 SDValue Chain = N->getOperand(0); 2751 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32); 2752 2753 const unsigned Opc = gwsIntrinToOpcode(IntrID); 2754 SmallVector<SDValue, 5> Ops; 2755 if (HasVSrc) 2756 Ops.push_back(N->getOperand(2)); 2757 Ops.push_back(OffsetField); 2758 Ops.push_back(Chain); 2759 2760 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); 2761 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); 2762 } 2763 2764 void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) { 2765 if (Subtarget->getLDSBankCount() != 16) { 2766 // This is a single instruction with a pattern. 2767 SelectCode(N); 2768 return; 2769 } 2770 2771 SDLoc DL(N); 2772 2773 // This requires 2 instructions. It is possible to write a pattern to support 2774 // this, but the generated isel emitter doesn't correctly deal with multiple 2775 // output instructions using the same physical register input. The copy to m0 2776 // is incorrectly placed before the second instruction. 2777 // 2778 // TODO: Match source modifiers. 2779 // 2780 // def : Pat < 2781 // (int_amdgcn_interp_p1_f16 2782 // (VOP3Mods f32:$src0, i32:$src0_modifiers), 2783 // (i32 timm:$attrchan), (i32 timm:$attr), 2784 // (i1 timm:$high), M0), 2785 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr, 2786 // timm:$attrchan, 0, 2787 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> { 2788 // let Predicates = [has16BankLDS]; 2789 // } 2790 2791 // 16 bank LDS 2792 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0, 2793 N->getOperand(5), SDValue()); 2794 2795 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other); 2796 2797 SDNode *InterpMov = 2798 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, { 2799 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0 2800 N->getOperand(3), // Attr 2801 N->getOperand(2), // Attrchan 2802 ToM0.getValue(1) // In glue 2803 }); 2804 2805 SDNode *InterpP1LV = 2806 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, { 2807 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers 2808 N->getOperand(1), // Src0 2809 N->getOperand(3), // Attr 2810 N->getOperand(2), // Attrchan 2811 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers 2812 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high 2813 N->getOperand(4), // high 2814 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp 2815 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod 2816 SDValue(InterpMov, 1) 2817 }); 2818 2819 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0)); 2820 } 2821 2822 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) { 2823 unsigned IntrID = N->getConstantOperandVal(1); 2824 switch (IntrID) { 2825 case Intrinsic::amdgcn_ds_append: 2826 case Intrinsic::amdgcn_ds_consume: { 2827 if (N->getValueType(0) != MVT::i32) 2828 break; 2829 SelectDSAppendConsume(N, IntrID); 2830 return; 2831 } 2832 case Intrinsic::amdgcn_ds_bvh_stack_rtn: 2833 SelectDSBvhStackIntrinsic(N); 2834 return; 2835 case Intrinsic::amdgcn_init_whole_wave: 2836 CurDAG->getMachineFunction() 2837 .getInfo<SIMachineFunctionInfo>() 2838 
->setInitWholeWave(); 2839 break; 2840 } 2841 2842 SelectCode(N); 2843 } 2844 2845 void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { 2846 unsigned IntrID = N->getConstantOperandVal(0); 2847 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END; 2848 SDNode *ConvGlueNode = N->getGluedNode(); 2849 if (ConvGlueNode) { 2850 // FIXME: Possibly iterate over multiple glue nodes? 2851 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE); 2852 ConvGlueNode = ConvGlueNode->getOperand(0).getNode(); 2853 ConvGlueNode = 2854 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {}, 2855 MVT::Glue, SDValue(ConvGlueNode, 0)); 2856 } else { 2857 ConvGlueNode = nullptr; 2858 } 2859 switch (IntrID) { 2860 case Intrinsic::amdgcn_wqm: 2861 Opcode = AMDGPU::WQM; 2862 break; 2863 case Intrinsic::amdgcn_softwqm: 2864 Opcode = AMDGPU::SOFT_WQM; 2865 break; 2866 case Intrinsic::amdgcn_wwm: 2867 case Intrinsic::amdgcn_strict_wwm: 2868 Opcode = AMDGPU::STRICT_WWM; 2869 break; 2870 case Intrinsic::amdgcn_strict_wqm: 2871 Opcode = AMDGPU::STRICT_WQM; 2872 break; 2873 case Intrinsic::amdgcn_interp_p1_f16: 2874 SelectInterpP1F16(N); 2875 return; 2876 case Intrinsic::amdgcn_permlane16_swap: 2877 case Intrinsic::amdgcn_permlane32_swap: { 2878 if ((IntrID == Intrinsic::amdgcn_permlane16_swap && 2879 !Subtarget->hasPermlane16Swap()) || 2880 (IntrID == Intrinsic::amdgcn_permlane32_swap && 2881 !Subtarget->hasPermlane32Swap())) { 2882 SelectCode(N); // Hit the default error 2883 return; 2884 } 2885 2886 Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap 2887 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64 2888 : AMDGPU::V_PERMLANE32_SWAP_B32_e64; 2889 2890 SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end()); 2891 if (ConvGlueNode) 2892 NewOps.push_back(SDValue(ConvGlueNode, 0)); 2893 2894 bool FI = N->getConstantOperandVal(3); 2895 NewOps[2] = CurDAG->getTargetConstant( 2896 FI ? 
AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(), MVT::i32); 2897 2898 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps); 2899 return; 2900 } 2901 default: 2902 SelectCode(N); 2903 break; 2904 } 2905 2906 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) { 2907 SDValue Src = N->getOperand(1); 2908 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src}); 2909 } 2910 2911 if (ConvGlueNode) { 2912 SmallVector<SDValue, 4> NewOps(N->ops()); 2913 NewOps.push_back(SDValue(ConvGlueNode, 0)); 2914 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps); 2915 } 2916 } 2917 2918 void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { 2919 unsigned IntrID = N->getConstantOperandVal(1); 2920 switch (IntrID) { 2921 case Intrinsic::amdgcn_ds_gws_init: 2922 case Intrinsic::amdgcn_ds_gws_barrier: 2923 case Intrinsic::amdgcn_ds_gws_sema_v: 2924 case Intrinsic::amdgcn_ds_gws_sema_br: 2925 case Intrinsic::amdgcn_ds_gws_sema_p: 2926 case Intrinsic::amdgcn_ds_gws_sema_release_all: 2927 SelectDS_GWS(N, IntrID); 2928 return; 2929 default: 2930 break; 2931 } 2932 2933 SelectCode(N); 2934 } 2935 2936 void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) { 2937 SDValue Log2WaveSize = 2938 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32); 2939 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(), 2940 {N->getOperand(0), Log2WaveSize}); 2941 } 2942 2943 void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) { 2944 SDValue SrcVal = N->getOperand(1); 2945 if (SrcVal.getValueType() != MVT::i32) { 2946 SelectCode(N); // Emit default error 2947 return; 2948 } 2949 2950 SDValue CopyVal; 2951 Register SP = TLI->getStackPointerRegisterToSaveRestore(); 2952 SDLoc SL(N); 2953 2954 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) { 2955 CopyVal = SrcVal.getOperand(0); 2956 } else { 2957 SDValue Log2WaveSize = CurDAG->getTargetConstant( 2958 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32); 2959 2960 if (N->isDivergent()) { 2961 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, 2962 MVT::i32, SrcVal), 2963 0); 2964 } 2965 2966 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32, 2967 {SrcVal, Log2WaveSize}), 2968 0); 2969 } 2970 2971 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal); 2972 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP); 2973 } 2974 2975 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, 2976 unsigned &Mods, 2977 bool IsCanonicalizing, 2978 bool AllowAbs) const { 2979 Mods = SISrcMods::NONE; 2980 Src = In; 2981 2982 if (Src.getOpcode() == ISD::FNEG) { 2983 Mods |= SISrcMods::NEG; 2984 Src = Src.getOperand(0); 2985 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) { 2986 // Fold fsub [+-]0 into fneg. This may not have folded depending on the 2987 // denormal mode, but we're implicitly canonicalizing in a source operand. 
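    // That is, both (fsub 0.0, x) and (fsub -0.0, x) are selected as x with
    // the NEG source modifier set.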
2988 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0)); 2989 if (LHS && LHS->isZero()) { 2990 Mods |= SISrcMods::NEG; 2991 Src = Src.getOperand(1); 2992 } 2993 } 2994 2995 if (AllowAbs && Src.getOpcode() == ISD::FABS) { 2996 Mods |= SISrcMods::ABS; 2997 Src = Src.getOperand(0); 2998 } 2999 3000 return true; 3001 } 3002 3003 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, 3004 SDValue &SrcMods) const { 3005 unsigned Mods; 3006 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true, 3007 /*AllowAbs=*/true)) { 3008 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); 3009 return true; 3010 } 3011 3012 return false; 3013 } 3014 3015 bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing( 3016 SDValue In, SDValue &Src, SDValue &SrcMods) const { 3017 unsigned Mods; 3018 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false, 3019 /*AllowAbs=*/true)) { 3020 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); 3021 return true; 3022 } 3023 3024 return false; 3025 } 3026 3027 bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src, 3028 SDValue &SrcMods) const { 3029 unsigned Mods; 3030 if (SelectVOP3ModsImpl(In, Src, Mods, 3031 /*IsCanonicalizing=*/true, 3032 /*AllowAbs=*/false)) { 3033 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); 3034 return true; 3035 } 3036 3037 return false; 3038 } 3039 3040 bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const { 3041 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG) 3042 return false; 3043 3044 Src = In; 3045 return true; 3046 } 3047 3048 bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src, 3049 SDValue &SrcMods, 3050 bool OpSel) const { 3051 unsigned Mods; 3052 if (SelectVOP3ModsImpl(In, Src, Mods, 3053 /*IsCanonicalizing=*/true, 3054 /*AllowAbs=*/false)) { 3055 if (OpSel) 3056 Mods |= SISrcMods::OP_SEL_0; 3057 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); 3058 return true; 3059 } 3060 3061 return false; 3062 } 3063 3064 bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src, 3065 SDValue &SrcMods) const { 3066 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false); 3067 } 3068 3069 bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src, 3070 SDValue &SrcMods) const { 3071 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true); 3072 } 3073 3074 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, 3075 SDValue &SrcMods, SDValue &Clamp, 3076 SDValue &Omod) const { 3077 SDLoc DL(In); 3078 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); 3079 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); 3080 3081 return SelectVOP3Mods(In, Src, SrcMods); 3082 } 3083 3084 bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src, 3085 SDValue &SrcMods, SDValue &Clamp, 3086 SDValue &Omod) const { 3087 SDLoc DL(In); 3088 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); 3089 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); 3090 3091 return SelectVOP3BMods(In, Src, SrcMods); 3092 } 3093 3094 bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src, 3095 SDValue &Clamp, SDValue &Omod) const { 3096 Src = In; 3097 3098 SDLoc DL(In); 3099 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); 3100 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); 3101 3102 return true; 3103 } 3104 3105 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, 3106 SDValue &SrcMods, bool IsDOT) const { 3107 unsigned Mods = SISrcMods::NONE; 3108 
Src = In; 3109 3110 // TODO: Handle G_FSUB 0 as fneg 3111 if (Src.getOpcode() == ISD::FNEG) { 3112 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 3113 Src = Src.getOperand(0); 3114 } 3115 3116 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 && 3117 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) { 3118 unsigned VecMods = Mods; 3119 3120 SDValue Lo = stripBitcast(Src.getOperand(0)); 3121 SDValue Hi = stripBitcast(Src.getOperand(1)); 3122 3123 if (Lo.getOpcode() == ISD::FNEG) { 3124 Lo = stripBitcast(Lo.getOperand(0)); 3125 Mods ^= SISrcMods::NEG; 3126 } 3127 3128 if (Hi.getOpcode() == ISD::FNEG) { 3129 Hi = stripBitcast(Hi.getOperand(0)); 3130 Mods ^= SISrcMods::NEG_HI; 3131 } 3132 3133 if (isExtractHiElt(Lo, Lo)) 3134 Mods |= SISrcMods::OP_SEL_0; 3135 3136 if (isExtractHiElt(Hi, Hi)) 3137 Mods |= SISrcMods::OP_SEL_1; 3138 3139 unsigned VecSize = Src.getValueSizeInBits(); 3140 Lo = stripExtractLoElt(Lo); 3141 Hi = stripExtractLoElt(Hi); 3142 3143 if (Lo.getValueSizeInBits() > VecSize) { 3144 Lo = CurDAG->getTargetExtractSubreg( 3145 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In), 3146 MVT::getIntegerVT(VecSize), Lo); 3147 } 3148 3149 if (Hi.getValueSizeInBits() > VecSize) { 3150 Hi = CurDAG->getTargetExtractSubreg( 3151 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In), 3152 MVT::getIntegerVT(VecSize), Hi); 3153 } 3154 3155 assert(Lo.getValueSizeInBits() <= VecSize && 3156 Hi.getValueSizeInBits() <= VecSize); 3157 3158 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) { 3159 // Really a scalar input. Just select from the low half of the register to 3160 // avoid packing. 3161 3162 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) { 3163 Src = Lo; 3164 } else { 3165 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64); 3166 3167 SDLoc SL(In); 3168 SDValue Undef = SDValue( 3169 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, 3170 Lo.getValueType()), 0); 3171 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID 3172 : AMDGPU::SReg_64RegClassID; 3173 const SDValue Ops[] = { 3174 CurDAG->getTargetConstant(RC, SL, MVT::i32), 3175 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32), 3176 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) }; 3177 3178 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL, 3179 Src.getValueType(), Ops), 0); 3180 } 3181 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); 3182 return true; 3183 } 3184 3185 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) { 3186 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF() 3187 .bitcastToAPInt().getZExtValue(); 3188 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) { 3189 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64); 3190 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); 3191 return true; 3192 } 3193 } 3194 3195 Mods = VecMods; 3196 } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE && 3197 Src.getNumOperands() == 2) { 3198 3199 // TODO: We should repeat the build_vector source check above for the 3200 // vector_shuffle for negates and casts of individual elements. 3201 3202 auto *SVN = cast<ShuffleVectorSDNode>(Src); 3203 ArrayRef<int> Mask = SVN->getMask(); 3204 3205 if (Mask[0] < 2 && Mask[1] < 2) { 3206 // src1 should be undef. 
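      // A mask element of 1 selects the high half of src0, so e.g. a shuffle
      // mask of <1, 1> sets both OP_SEL_0 and OP_SEL_1 on ShuffleSrc.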
3207 SDValue ShuffleSrc = SVN->getOperand(0); 3208 3209 if (ShuffleSrc.getOpcode() == ISD::FNEG) { 3210 ShuffleSrc = ShuffleSrc.getOperand(0); 3211 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 3212 } 3213 3214 if (Mask[0] == 1) 3215 Mods |= SISrcMods::OP_SEL_0; 3216 if (Mask[1] == 1) 3217 Mods |= SISrcMods::OP_SEL_1; 3218 3219 Src = ShuffleSrc; 3220 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); 3221 return true; 3222 } 3223 } 3224 3225 // Packed instructions do not have abs modifiers. 3226 Mods |= SISrcMods::OP_SEL_1; 3227 3228 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); 3229 return true; 3230 } 3231 3232 bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src, 3233 SDValue &SrcMods) const { 3234 return SelectVOP3PMods(In, Src, SrcMods, true); 3235 } 3236 3237 bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const { 3238 const ConstantSDNode *C = cast<ConstantSDNode>(In); 3239 // Literal i1 value set in intrinsic, represents SrcMods for the next operand. 3240 // 1 promotes packed values to signed, 0 treats them as unsigned. 3241 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); 3242 3243 unsigned Mods = SISrcMods::OP_SEL_1; 3244 unsigned SrcSign = C->getZExtValue(); 3245 if (SrcSign == 1) 3246 Mods ^= SISrcMods::NEG; 3247 3248 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); 3249 return true; 3250 } 3251 3252 bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, 3253 SDValue &Src) const { 3254 const ConstantSDNode *C = cast<ConstantSDNode>(In); 3255 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); 3256 3257 unsigned Mods = SISrcMods::OP_SEL_1; 3258 unsigned SrcVal = C->getZExtValue(); 3259 if (SrcVal == 1) 3260 Mods |= SISrcMods::OP_SEL_0; 3261 3262 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); 3263 return true; 3264 } 3265 3266 static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts, 3267 llvm::SelectionDAG *CurDAG, 3268 const SDLoc &DL) { 3269 unsigned DstRegClass; 3270 EVT DstTy; 3271 switch (Elts.size()) { 3272 case 8: 3273 DstRegClass = AMDGPU::VReg_256RegClassID; 3274 DstTy = MVT::v8i32; 3275 break; 3276 case 4: 3277 DstRegClass = AMDGPU::VReg_128RegClassID; 3278 DstTy = MVT::v4i32; 3279 break; 3280 case 2: 3281 DstRegClass = AMDGPU::VReg_64RegClassID; 3282 DstTy = MVT::v2i32; 3283 break; 3284 default: 3285 llvm_unreachable("unhandled Reg sequence size"); 3286 } 3287 3288 SmallVector<SDValue, 17> Ops; 3289 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32)); 3290 for (unsigned i = 0; i < Elts.size(); ++i) { 3291 Ops.push_back(Elts[i]); 3292 Ops.push_back(CurDAG->getTargetConstant( 3293 SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32)); 3294 } 3295 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops); 3296 } 3297 3298 static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts, 3299 llvm::SelectionDAG *CurDAG, 3300 const SDLoc &DL) { 3301 SmallVector<SDValue, 8> PackedElts; 3302 assert("unhandled Reg sequence size" && 3303 (Elts.size() == 8 || Elts.size() == 16)); 3304 3305 // Pack 16-bit elements in pairs into 32-bit register. If both elements are 3306 // unpacked from 32-bit source use it, otherwise pack them using v_perm. 
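  // The V_PERM_B32 selector 0x05040100 used below places the low 16 bits of
  // Elts[i] in the low half of the packed dword and the low 16 bits of
  // Elts[i + 1] in the high half.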
  for (unsigned i = 0; i < Elts.size(); i += 2) {
    SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
    SDValue HiSrc;
    if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
      PackedElts.push_back(HiSrc);
    } else {
      SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
      MachineSDNode *Packed =
          CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
                                 {Elts[i + 1], Elts[i], PackLoLo});
      PackedElts.push_back(SDValue(Packed, 0));
    }
  }

  return buildRegSequence32(PackedElts, CurDAG, DL);
}

static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
                                       llvm::SelectionDAG *CurDAG,
                                       const SDLoc &DL, unsigned ElementSize) {
  if (ElementSize == 16)
    return buildRegSequence16(Elts, CurDAG, DL);
  if (ElementSize == 32)
    return buildRegSequence32(Elts, CurDAG, DL);
  llvm_unreachable("Unhandled element size");
}

static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
                                 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
                                 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
                                 unsigned ElementSize) {
  if (ModOpcode == ISD::FNEG) {
    Mods |= SISrcMods::NEG;
    // Check if all elements also have an abs modifier.
    SmallVector<SDValue, 8> NegAbsElts;
    for (auto El : Elts) {
      if (El.getOpcode() != ISD::FABS)
        break;
      NegAbsElts.push_back(El->getOperand(0));
    }
    if (Elts.size() != NegAbsElts.size()) {
      // Neg
      Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
    } else {
      // Neg and Abs
      Mods |= SISrcMods::NEG_HI;
      Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
    }
  } else {
    assert(ModOpcode == ISD::FABS);
    // Abs
    Mods |= SISrcMods::NEG_HI;
    Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
  }
}

// Check all f16 elements for modifiers while looking through b32 and v2b16
// build vectors; stop if an element does not satisfy ModifierCheck.
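// For example, a v4f16 operand is expected to arrive here as something like
//   v2i32 build_vector (i32 bitcast (v2f16 build_vector a, b)),
//                      (i32 bitcast (v2f16 build_vector c, d))
// in which case ModifierCheck is invoked on the f16 elements a, b, c and d.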
static void
checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
                              std::function<bool(SDValue)> ModifierCheck) {
  for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
    if (auto *F16Pair =
            dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
      for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
        SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
        if (!ModifierCheck(ElF16))
          break;
      }
    }
  }
}

bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
                                              SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;

  // mods are on f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsF16;

    checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
      if (Element.getOpcode() != ISD::FNEG)
        return false;
      EltsF16.push_back(Element.getOperand(0));
      return true;
    });

    // All elements have neg modifier
    if (BV->getNumOperands() * 2 == EltsF16.size()) {
      Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  // mods are on v2f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsV2F16;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
      // Stop at the first element without a neg modifier.
      if (ElV2f16.getOpcode() != ISD::FNEG)
        break;
      EltsV2F16.push_back(ElV2f16.getOperand(0));
    }

    // All pairs of elements have neg modifier
    if (BV->getNumOperands() == EltsV2F16.size()) {
      Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
      Mods |= SISrcMods::NEG;
      Mods |= SISrcMods::NEG_HI;
    }
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;
  unsigned ModOpcode;

  // mods are on f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsF16;
    checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
      // Based on the first element decide which mod we match, neg or abs.
      if (EltsF16.empty())
        ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
      if (ElF16.getOpcode() != ModOpcode)
        return false;
      EltsF16.push_back(ElF16.getOperand(0));
      return true;
    });

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() * 2 == EltsF16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
                           16);
  }

  // mods are on v2f16 elements
  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    SmallVector<SDValue, 8> EltsV2F16;

    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
      // Based on the first element decide which mod we match, neg or abs.
      if (EltsV2F16.empty())
        ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ?
                        ISD::FNEG : ISD::FABS;
      if (ElV2f16->getOpcode() != ModOpcode)
        break;
      EltsV2F16.push_back(ElV2f16->getOperand(0));
    }

    // All elements have ModOpcode modifier
    if (BV->getNumOperands() == EltsV2F16.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
                           32);
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
                                                 SDValue &SrcMods) const {
  Src = In;
  unsigned Mods = SISrcMods::OP_SEL_1;
  SmallVector<SDValue, 8> EltsF32;

  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
    assert(BV->getNumOperands() > 0);
    // Based on the first element decide which mod we match, neg or abs.
    SDValue ElF32 = stripBitcast(BV->getOperand(0));
    unsigned ModOpcode =
        (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
    for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
      SDValue ElF32 = stripBitcast(BV->getOperand(i));
      if (ElF32.getOpcode() != ModOpcode)
        break;
      EltsF32.push_back(ElF32.getOperand(0));
    }

    // All elements had ModOpcode modifier
    if (BV->getNumOperands() == EltsF32.size())
      selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
                           32);
  }

  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
  if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
    BitVector UndefElements;
    if (SDValue Splat = BV->getSplatValue(&UndefElements))
      if (isInlineImmediate(Splat.getNode())) {
        if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
          unsigned Imm = C->getAPIntValue().getSExtValue();
          Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
          return true;
        }
        if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
          unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
          Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
          return true;
        }
        llvm_unreachable("unhandled Constant node");
      }
  }

  // 16 bit splat
  SDValue SplatSrc32 = stripBitcast(In);
  if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
    if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
      SDValue SplatSrc16 = stripBitcast(Splat32);
      if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
        if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
          const SIInstrInfo *TII = Subtarget->getInstrInfo();
          std::optional<APInt> RawValue;
          if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
            RawValue = C->getValueAPF().bitcastToAPInt();
          else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
            RawValue = C->getAPIntValue();

          if (RawValue.has_value()) {
            EVT VT = In.getValueType().getScalarType();
            if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
              APFloat FloatVal(VT.getSimpleVT() == MVT::f16 ?
                                   APFloatBase::IEEEhalf()
                                 : APFloatBase::BFloat(),
                               RawValue.value());
              if (TII->isInlineConstant(FloatVal)) {
                Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
                                                MVT::i16);
                return true;
              }
            } else if (VT.getSimpleVT() == MVT::i16) {
              if (TII->isInlineConstant(RawValue.value())) {
                Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
                                                MVT::i16);
                return true;
              }
            } else
              llvm_unreachable("unknown 16-bit type");
          }
        }
    }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
                                            SDValue &IndexKey) const {
  unsigned Key = 0;
  Src = In;

  if (In.getOpcode() == ISD::SRL) {
    const llvm::SDValue &ShiftSrc = In.getOperand(0);
    ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
    if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
        ShiftAmt->getZExtValue() % 8 == 0) {
      Key = ShiftAmt->getZExtValue() / 8;
      Src = ShiftSrc;
    }
  }

  IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
                                             SDValue &IndexKey) const {
  unsigned Key = 0;
  Src = In;

  if (In.getOpcode() == ISD::SRL) {
    const llvm::SDValue &ShiftSrc = In.getOperand(0);
    ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
    if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
        ShiftAmt->getZExtValue() == 16) {
      Key = 1;
      Src = ShiftSrc;
    }
  }

  IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
  Src = In;
  // FIXME: Handle op_sel
  SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  // FIXME: Handle op_sel
  return SelectVOP3Mods(In, Src, SrcMods);
}

// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is actually used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods) const {
  Mods = 0;
  SelectVOP3ModsImpl(In, Src, Mods);

  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
    assert(Src.getValueType() == MVT::f16);
    Src = stripBitcast(Src);

    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    if ((Mods & SISrcMods::ABS) == 0) {
      unsigned ModsTmp;
      SelectVOP3ModsImpl(Src, Src, ModsTmp);

      if ((ModsTmp & SISrcMods::NEG) != 0)
        Mods ^= SISrcMods::NEG;

      if ((ModsTmp & SISrcMods::ABS) != 0)
        Mods |= SISrcMods::ABS;
    }

    // op_sel/op_sel_hi decide the source type and source.
    // If the source's op_sel_hi is set, it indicates to do a conversion from
    // fp16. If the source's op_sel is set, it picks the high half of the
    // source register.

    Mods |= SISrcMods::OP_SEL_1;
    if (isExtractHiElt(Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
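      // For example, f32 (fpext (extract_vector_elt v2f16:v, 1)) selects v
      // with both OP_SEL_1 (convert from f16) and OP_SEL_0 (take the high
      // half of the register) set.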
    }

    return true;
  }

  return false;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
                                                  SDValue &SrcMods) const {
  unsigned Mods = 0;
  if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
    return false;
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
                                               SDValue &SrcMods) const {
  unsigned Mods = 0;
  SelectVOP3PMadMixModsImpl(In, Src, Mods);
  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
  return true;
}

// Match a BITOP3 operation and return the number of matched instructions plus
// the truth table.
static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
                                              SmallVectorImpl<SDValue> &Src) {
  unsigned NumOpcodes = 0;
  uint8_t LHSBits, RHSBits;

  auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
    // Define the truth table given Src0, Src1, Src2 bit permutations:
    //   0 0 0
    //   0 0 1
    //   0 1 0
    //   0 1 1
    //   1 0 0
    //   1 0 1
    //   1 1 0
    //   1 1 1
    const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };

    if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->isAllOnes()) {
        Bits = 0xff;
        return true;
      }
      if (C->isZero()) {
        Bits = 0;
        return true;
      }
    }

    for (unsigned I = 0; I < Src.size(); ++I) {
      // Try to find an existing reused operand.
      if (Src[I] == Op) {
        Bits = SrcBits[I];
        return true;
      }
      // Try to replace the parent operator.
      if (Src[I] == In) {
        Bits = SrcBits[I];
        Src[I] = Op;
        return true;
      }
    }

    if (Src.size() == 3) {
      // No room left for operands. Try one last time: there can be a 'not' of
      // one of our source operands. In this case we can compute the bits
      // without growing the Src vector.
      if (Op.getOpcode() == ISD::XOR) {
        if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          if (C->isAllOnes()) {
            SDValue LHS = Op.getOperand(0);
            for (unsigned I = 0; I < Src.size(); ++I) {
              if (Src[I] == LHS) {
                Bits = ~SrcBits[I];
                return true;
              }
            }
          }
        }
      }

      return false;
    }

    Bits = SrcBits[Src.size()];
    Src.push_back(Op);
    return true;
  };

  switch (In.getOpcode()) {
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR: {
    SDValue LHS = In.getOperand(0);
    SDValue RHS = In.getOperand(1);

    SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
    if (!getOperandBits(LHS, LHSBits) ||
        !getOperandBits(RHS, RHSBits)) {
      Src = Backup;
      return std::make_pair(0, 0);
    }

    // Recursion is naturally limited by the size of the operand vector.
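    // Leaves contribute the fixed column patterns 0xf0, 0xcc and 0xaa from
    // SrcBits; the recursive calls below refine LHSBits/RHSBits for nested
    // AND/OR/XOR operands. E.g. for (xor a, b) with an empty Src the leaves
    // yield LHSBits = 0xf0 and RHSBits = 0xcc, so TTbl = 0xf0 ^ 0xcc = 0x3c.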
    auto Op = BitOp3_Op(LHS, Src);
    if (Op.first) {
      NumOpcodes += Op.first;
      LHSBits = Op.second;
    }

    Op = BitOp3_Op(RHS, Src);
    if (Op.first) {
      NumOpcodes += Op.first;
      RHSBits = Op.second;
    }
    break;
  }
  default:
    return std::make_pair(0, 0);
  }

  uint8_t TTbl;
  switch (In.getOpcode()) {
  case ISD::AND:
    TTbl = LHSBits & RHSBits;
    break;
  case ISD::OR:
    TTbl = LHSBits | RHSBits;
    break;
  case ISD::XOR:
    TTbl = LHSBits ^ RHSBits;
    break;
  default:
    break;
  }

  return std::make_pair(NumOpcodes + 1, TTbl);
}

bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
                                      SDValue &Src2, SDValue &Tbl) const {
  SmallVector<SDValue, 3> Src;
  uint8_t TTbl;
  unsigned NumOpcodes;

  std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);

  // The Src.empty() case can happen if the operands are all zeros or all
  // ones. Normally this is optimized out before reaching this point.
  if (NumOpcodes < 2 || Src.empty())
    return false;

  // For the uniform case the threshold should be higher to account for moves
  // between VGPRs and SGPRs: one operand needs to be in a VGPR, the other two
  // can be in SGPRs, and a readfirstlane is needed afterwards.
  if (NumOpcodes < 4 && !In->isDivergent())
    return false;

  if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
    // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
    // the asm more readable. This cannot be modeled with AddedComplexity
    // because the selector does not know how many operations we matched.
    if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
        (In.getOperand(0).getOpcode() == In.getOpcode() ||
         In.getOperand(1).getOpcode() == In.getOpcode()))
      return false;

    if (In.getOpcode() == ISD::OR &&
        (In.getOperand(0).getOpcode() == ISD::AND ||
         In.getOperand(1).getOpcode() == ISD::AND))
      return false;
  }

  // The last operand can be ignored, turning a ternary operation into a
  // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
  // replace 'c' with 'a' here without changing the answer. In some
  // pathological cases it should even be possible to end up with a
  // single-operand operation, if the optimizer did not catch it earlier.
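  // Padding with copies of Src[0] is safe: TTbl was computed before padding,
  // and a truth-table column that never participated in the matched
  // expression cannot affect the result, so the duplicated operand's value is
  // irrelevant.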
  while (Src.size() < 3)
    Src.push_back(Src[0]);

  Src0 = Src[0];
  Src1 = Src[1];
  Src2 = Src[2];

  Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
  return true;
}

SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
  if (In.isUndef())
    return CurDAG->getUNDEF(MVT::i32);

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
  }

  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
    SDLoc SL(In);
    return CurDAG->getConstant(
        C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
  }

  SDValue Src;
  if (isExtractHiElt(In, Src))
    return Src;

  return SDValue();
}

bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode *N) const {
  assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);

  const SIRegisterInfo *SIRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *SII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
       Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC =
        getOperandRegClass(U->getUser(), U->getOperandNo());

    // If the register class is unknown, it could be one that needs to be an
    // SGPR, e.g. an inline asm constraint.
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
      AllUsesAcceptSReg = false;
      SDNode *User = U->getUser();
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        const MCInstrDesc &Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC =
                getOperandRegClass(U->getUser(), CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass ||
                CommutedRC == &AMDGPU::VS_64RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If AllUsesAcceptSReg is still false, we have not succeeded in
      // commuting the current user. This means there is at least one use that
      // strictly requires a VGPR, so we will not attempt to commute other
      // user instructions.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  return !AllUsesAcceptSReg && (Limit < 10);
}

bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
  const auto *Ld = cast<LoadSDNode>(N);

  const MachineMemOperand *MMO = Ld->getMemOperand();
  if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
    return false;

  return MMO->getSize().hasValue() &&
         Ld->getAlign() >=
             Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
                            uint64_t(4))) &&
         ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
           Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
          (Subtarget->getScalarizeGlobalBehavior() &&
           Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
           Ld->isSimple() &&
           static_cast<const SITargetLowering *>(getTargetLowering())
               ->isMemOpHasNoClobberedMemOperand(N)));
}

void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
  const AMDGPUTargetLowering &Lowering =
      *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
  bool IsModified = false;
  do {
    IsModified = false;

    // Go over all selected nodes and try to fold them a bit more.
    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
    while (Position != CurDAG->allnodes_end()) {
      SDNode *Node = &*Position++;
      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
      if (!MachineNode)
        continue;

      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
      if (ResNode != Node) {
        if (ResNode)
          ReplaceUses(Node, ResNode);
        IsModified = true;
      }
    }
    CurDAG->RemoveDeadNodes();
  } while (IsModified);
}

AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
                                                   CodeGenOptLevel OptLevel)
    : SelectionDAGISelLegacy(
          ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}

char AMDGPUDAGToDAGISelLegacy::ID = 0;