1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Custom DAG lowering for SI 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "SIISelLowering.h" 15 #include "AMDGPU.h" 16 #include "AMDGPUInstrInfo.h" 17 #include "AMDGPUTargetMachine.h" 18 #include "GCNSubtarget.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "SIRegisterInfo.h" 22 #include "llvm/ADT/APInt.h" 23 #include "llvm/ADT/FloatingPointMode.h" 24 #include "llvm/ADT/Statistic.h" 25 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 26 #include "llvm/Analysis/UniformityAnalysis.h" 27 #include "llvm/CodeGen/Analysis.h" 28 #include "llvm/CodeGen/ByteProvider.h" 29 #include "llvm/CodeGen/FunctionLoweringInfo.h" 30 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 31 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" 32 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 33 #include "llvm/CodeGen/MachineFrameInfo.h" 34 #include "llvm/CodeGen/MachineFunction.h" 35 #include "llvm/CodeGen/MachineLoopInfo.h" 36 #include "llvm/IR/DiagnosticInfo.h" 37 #include "llvm/IR/IRBuilder.h" 38 #include "llvm/IR/IntrinsicInst.h" 39 #include "llvm/IR/IntrinsicsAMDGPU.h" 40 #include "llvm/IR/IntrinsicsR600.h" 41 #include "llvm/IR/MDBuilder.h" 42 #include "llvm/Support/CommandLine.h" 43 #include "llvm/Support/KnownBits.h" 44 #include "llvm/Support/ModRef.h" 45 #include "llvm/Transforms/Utils/LowerAtomic.h" 46 #include <optional> 47 48 using namespace llvm; 49 50 #define DEBUG_TYPE "si-lower" 51 52 STATISTIC(NumTailCalls, "Number of tail calls"); 53 54 static cl::opt<bool> 55 DisableLoopAlignment("amdgpu-disable-loop-alignment", 56 cl::desc("Do not align and prefetch loops"), 57 cl::init(false)); 58 59 static cl::opt<bool> UseDivergentRegisterIndexing( 60 "amdgpu-use-divergent-register-indexing", cl::Hidden, 61 cl::desc("Use indirect register addressing for divergent indexes"), 62 cl::init(false)); 63 64 static bool denormalModeIsFlushAllF32(const MachineFunction &MF) { 65 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign(); 67 } 68 69 static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) { 70 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign(); 72 } 73 74 static unsigned findFirstFreeSGPR(CCState &CCInfo) { 75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); 76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { 77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) { 78 return AMDGPU::SGPR0 + Reg; 79 } 80 } 81 llvm_unreachable("Cannot allocate sgpr"); 82 } 83 84 SITargetLowering::SITargetLowering(const TargetMachine &TM, 85 const GCNSubtarget &STI) 86 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) { 87 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); 88 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); 89 90 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); 91 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); 92 93 addRegisterClass(MVT::v2i32, 
&AMDGPU::SReg_64RegClass); 94 95 const SIRegisterInfo *TRI = STI.getRegisterInfo(); 96 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class(); 97 98 addRegisterClass(MVT::f64, V64RegClass); 99 addRegisterClass(MVT::v2f32, V64RegClass); 100 addRegisterClass(MVT::Untyped, V64RegClass); 101 102 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); 103 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96)); 104 105 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); 106 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); 107 108 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); 109 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128)); 110 111 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); 112 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160)); 113 114 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass); 115 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192)); 116 117 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass); 118 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192)); 119 120 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass); 121 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224)); 122 123 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); 124 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256)); 125 126 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); 127 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256)); 128 129 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass); 130 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288)); 131 132 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass); 133 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320)); 134 135 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass); 136 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352)); 137 138 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass); 139 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384)); 140 141 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); 142 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512)); 143 144 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); 145 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512)); 146 147 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); 148 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024)); 149 150 if (Subtarget->has16BitInsts()) { 151 if (Subtarget->useRealTrue16Insts()) { 152 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass); 153 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass); 154 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass); 155 } else { 156 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); 157 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); 158 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass); 159 } 160 161 // Unless there are also VOP3P operations, not operations are really legal. 
162 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass); 163 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass); 164 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass); 165 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); 166 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); 167 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass); 168 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass); 169 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass); 170 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass); 171 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass); 172 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass); 173 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass); 174 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass); 175 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass); 176 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass); 177 } 178 179 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); 180 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024)); 181 182 computeRegisterProperties(Subtarget->getRegisterInfo()); 183 184 // The boolean content concept here is too inflexible. Compares only ever 185 // really produce a 1-bit result. Any copy/extend from these will turn into a 186 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as 187 // it's what most targets use. 188 setBooleanContents(ZeroOrOneBooleanContent); 189 setBooleanVectorContents(ZeroOrOneBooleanContent); 190 191 // We need to custom lower vector stores from local memory 192 setOperationAction(ISD::LOAD, 193 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, 194 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32, 195 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32, 196 MVT::i1, MVT::v32i32}, 197 Custom); 198 199 setOperationAction(ISD::STORE, 200 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, 201 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32, 202 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32, 203 MVT::i1, MVT::v32i32}, 204 Custom); 205 206 if (isTypeLegal(MVT::bf16)) { 207 for (unsigned Opc : 208 {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV, 209 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM, 210 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT, 211 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI, 212 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2, 213 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10, 214 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT, 215 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE, 216 ISD::SETCC}) { 217 // FIXME: The promoted to type shouldn't need to be explicit 218 setOperationAction(Opc, MVT::bf16, Promote); 219 AddPromotedToType(Opc, MVT::bf16, MVT::f32); 220 } 221 222 setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand); 223 224 setOperationAction(ISD::SELECT, MVT::bf16, Promote); 225 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16); 226 227 setOperationAction(ISD::FABS, MVT::bf16, Legal); 228 setOperationAction(ISD::FNEG, MVT::bf16, Legal); 229 setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Legal); 230 231 // We only need to custom lower because we can't specify an action for bf16 232 // sources. 
233 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 234 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 235 } 236 237 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); 238 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand); 239 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); 240 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); 241 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); 242 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand); 243 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand); 244 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand); 245 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand); 246 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); 247 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand); 248 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); 249 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); 250 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand); 251 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand); 252 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand); 253 254 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand); 255 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand); 256 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand); 257 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand); 258 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand); 259 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand); 260 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand); 261 262 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom); 263 264 setOperationAction(ISD::SELECT, MVT::i1, Promote); 265 setOperationAction(ISD::SELECT, MVT::i64, Custom); 266 setOperationAction(ISD::SELECT, MVT::f64, Promote); 267 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); 268 269 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom); 270 271 setOperationAction(ISD::SELECT_CC, 272 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand); 273 274 setOperationAction(ISD::SETCC, MVT::i1, Promote); 275 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand); 276 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32); 277 278 setOperationAction(ISD::TRUNCATE, 279 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, 280 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32, 281 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32}, 282 Expand); 283 setOperationAction(ISD::FP_ROUND, 284 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, 285 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32, 286 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32}, 287 Expand); 288 289 setOperationAction(ISD::SIGN_EXTEND_INREG, 290 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16, 291 MVT::v3i16, MVT::v4i16, MVT::Other}, 292 Custom); 293 294 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 295 setOperationAction(ISD::BR_CC, 296 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand); 297 298 setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal); 299 300 setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal); 301 302 setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64, 303 Expand); 304 305 #if 0 306 setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal); 307 #endif 308 309 // We only support LOAD/STORE and vector manipulation ops for vectors 310 // with > 4 elements. 
311 for (MVT VT : 312 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32, 313 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, 314 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, 315 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32, 316 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, 317 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16, 318 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32, 319 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) { 320 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { 321 switch (Op) { 322 case ISD::LOAD: 323 case ISD::STORE: 324 case ISD::BUILD_VECTOR: 325 case ISD::BITCAST: 326 case ISD::UNDEF: 327 case ISD::EXTRACT_VECTOR_ELT: 328 case ISD::INSERT_VECTOR_ELT: 329 case ISD::SCALAR_TO_VECTOR: 330 case ISD::IS_FPCLASS: 331 break; 332 case ISD::EXTRACT_SUBVECTOR: 333 case ISD::INSERT_SUBVECTOR: 334 case ISD::CONCAT_VECTORS: 335 setOperationAction(Op, VT, Custom); 336 break; 337 default: 338 setOperationAction(Op, VT, Expand); 339 break; 340 } 341 } 342 } 343 344 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand); 345 346 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that 347 // is expanded to avoid having two separate loops in case the index is a VGPR. 348 349 // Most operations are naturally 32-bit vector operations. We only support 350 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. 351 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) { 352 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); 353 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); 354 355 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); 356 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32); 357 358 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); 359 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32); 360 361 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); 362 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); 363 } 364 365 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) { 366 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); 367 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32); 368 369 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); 370 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32); 371 372 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); 373 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32); 374 375 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); 376 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32); 377 } 378 379 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) { 380 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); 381 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32); 382 383 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); 384 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32); 385 386 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); 387 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32); 388 389 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); 390 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32); 391 } 392 393 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) { 394 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); 395 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32); 396 397 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); 398 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32); 399 400 
setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); 401 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32); 402 403 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); 404 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32); 405 } 406 407 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) { 408 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); 409 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32); 410 411 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); 412 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32); 413 414 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); 415 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32); 416 417 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); 418 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32); 419 } 420 421 setOperationAction(ISD::VECTOR_SHUFFLE, 422 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32, 423 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32}, 424 Custom); 425 426 if (Subtarget->hasPkMovB32()) { 427 // TODO: 16-bit element vectors should be legal with even aligned elements. 428 // TODO: Can be legal with wider source types than the result with 429 // subregister extracts. 430 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal); 431 } 432 433 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16}, 434 Custom); 435 436 // Avoid stack access for these. 437 // TODO: Generalize to more vector types. 438 setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}, 439 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8, 440 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16}, 441 Custom); 442 443 // Deal with vec3 vector operations when widened to vec4. 444 setOperationAction(ISD::INSERT_SUBVECTOR, 445 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom); 446 447 // Deal with vec5/6/7 vector operations when widened to vec8. 448 setOperationAction(ISD::INSERT_SUBVECTOR, 449 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32, 450 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32, 451 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32, 452 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32}, 453 Custom); 454 455 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, 456 // and output demarshalling 457 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom); 458 459 // We can't return success/failure, only the old value, 460 // let LLVM add the comparison 461 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64}, 462 Expand); 463 464 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom); 465 466 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal); 467 468 // FIXME: This should be narrowed to i32, but that only happens if i64 is 469 // illegal. 470 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32. 471 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal); 472 473 // On SI this is s_memtime and s_memrealtime on VI. 
474 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); 475 476 if (Subtarget->hasSMemRealTime() || 477 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11) 478 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal); 479 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom); 480 481 if (Subtarget->has16BitInsts()) { 482 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote); 483 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom); 484 } else { 485 setOperationAction(ISD::FSQRT, MVT::f16, Custom); 486 } 487 488 if (Subtarget->hasMadMacF32Insts()) 489 setOperationAction(ISD::FMAD, MVT::f32, Legal); 490 491 if (!Subtarget->hasBFI()) 492 // fcopysign can be done in a single instruction with BFI. 493 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand); 494 495 if (!Subtarget->hasBCNT(32)) 496 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 497 498 if (!Subtarget->hasBCNT(64)) 499 setOperationAction(ISD::CTPOP, MVT::i64, Expand); 500 501 if (Subtarget->hasFFBH()) 502 setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom); 503 504 if (Subtarget->hasFFBL()) 505 setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom); 506 507 // We only really have 32-bit BFE instructions (and 16-bit on VI). 508 // 509 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any 510 // effort to match them now. We want this to be false for i64 cases when the 511 // extraction isn't restricted to the upper or lower half. Ideally we would 512 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that 513 // span the midpoint are probably relatively rare, so don't worry about them 514 // for now. 515 if (Subtarget->hasBFE()) 516 setHasExtractBitsInsn(true); 517 518 // Clamp modifier on add/sub 519 if (Subtarget->hasIntClamp()) 520 setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal); 521 522 if (Subtarget->hasAddNoCarry()) 523 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32}, 524 Legal); 525 526 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64}, 527 Custom); 528 529 // These are really only legal for ieee_mode functions. We should be avoiding 530 // them for functions that don't have ieee_mode enabled, so just say they are 531 // legal. 532 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE}, 533 {MVT::f32, MVT::f64}, Legal); 534 535 if (Subtarget->haveRoundOpsF64()) 536 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64, 537 Legal); 538 else 539 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR}, 540 MVT::f64, Custom); 541 542 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 543 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64}, 544 Legal); 545 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom); 546 547 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom); 548 setOperationAction(ISD::FDIV, MVT::f64, Custom); 549 550 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand); 551 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand); 552 553 // Custom lower these because we can't specify a rule based on an illegal 554 // source bf16. 
555 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom); 556 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom); 557 558 if (Subtarget->has16BitInsts()) { 559 setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN, 560 ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT}, 561 MVT::i16, Legal); 562 563 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32); 564 565 setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC}, 566 MVT::i16, Expand); 567 568 setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM, 569 ISD::UREM, ISD::BITREVERSE, ISD::CTTZ, 570 ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF, 571 ISD::CTPOP}, 572 MVT::i16, Promote); 573 574 setOperationAction(ISD::LOAD, MVT::i16, Custom); 575 576 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 577 578 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote); 579 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32); 580 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); 581 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); 582 583 setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom); 584 setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); 585 setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); 586 587 setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom); 588 589 // F16 - Constant Actions. 590 setOperationAction(ISD::ConstantFP, MVT::f16, Legal); 591 setOperationAction(ISD::ConstantFP, MVT::bf16, Legal); 592 593 // F16 - Load/Store Actions. 594 setOperationAction(ISD::LOAD, MVT::f16, Promote); 595 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16); 596 setOperationAction(ISD::STORE, MVT::f16, Promote); 597 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16); 598 599 // BF16 - Load/Store Actions. 600 setOperationAction(ISD::LOAD, MVT::bf16, Promote); 601 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16); 602 setOperationAction(ISD::STORE, MVT::bf16, Promote); 603 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16); 604 605 // F16 - VOP1 Actions. 606 setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS, 607 ISD::FSIN, ISD::FROUND}, 608 MVT::f16, Custom); 609 610 setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote); 611 setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote); 612 613 // F16 - VOP2 Actions. 614 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16}, 615 Expand); 616 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom); 617 setOperationAction(ISD::FFREXP, MVT::f16, Custom); 618 setOperationAction(ISD::FDIV, MVT::f16, Custom); 619 620 // F16 - VOP3 Actions. 
621 setOperationAction(ISD::FMA, MVT::f16, Legal); 622 if (STI.hasMadF16()) 623 setOperationAction(ISD::FMAD, MVT::f16, Legal); 624 625 for (MVT VT : 626 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16, 627 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, 628 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) { 629 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { 630 switch (Op) { 631 case ISD::LOAD: 632 case ISD::STORE: 633 case ISD::BUILD_VECTOR: 634 case ISD::BITCAST: 635 case ISD::UNDEF: 636 case ISD::EXTRACT_VECTOR_ELT: 637 case ISD::INSERT_VECTOR_ELT: 638 case ISD::INSERT_SUBVECTOR: 639 case ISD::SCALAR_TO_VECTOR: 640 case ISD::IS_FPCLASS: 641 break; 642 case ISD::EXTRACT_SUBVECTOR: 643 case ISD::CONCAT_VECTORS: 644 setOperationAction(Op, VT, Custom); 645 break; 646 default: 647 setOperationAction(Op, VT, Expand); 648 break; 649 } 650 } 651 } 652 653 // v_perm_b32 can handle either of these. 654 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal); 655 setOperationAction(ISD::BSWAP, MVT::v4i16, Custom); 656 657 // XXX - Do these do anything? Vector constants turn into build_vector. 658 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal); 659 660 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, 661 Legal); 662 663 setOperationAction(ISD::STORE, MVT::v2i16, Promote); 664 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32); 665 setOperationAction(ISD::STORE, MVT::v2f16, Promote); 666 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32); 667 668 setOperationAction(ISD::LOAD, MVT::v2i16, Promote); 669 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32); 670 setOperationAction(ISD::LOAD, MVT::v2f16, Promote); 671 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32); 672 673 setOperationAction(ISD::AND, MVT::v2i16, Promote); 674 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32); 675 setOperationAction(ISD::OR, MVT::v2i16, Promote); 676 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32); 677 setOperationAction(ISD::XOR, MVT::v2i16, Promote); 678 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32); 679 680 setOperationAction(ISD::LOAD, MVT::v4i16, Promote); 681 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32); 682 setOperationAction(ISD::LOAD, MVT::v4f16, Promote); 683 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32); 684 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote); 685 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32); 686 687 setOperationAction(ISD::STORE, MVT::v4i16, Promote); 688 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); 689 setOperationAction(ISD::STORE, MVT::v4f16, Promote); 690 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); 691 setOperationAction(ISD::STORE, MVT::v4bf16, Promote); 692 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32); 693 694 setOperationAction(ISD::LOAD, MVT::v8i16, Promote); 695 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32); 696 setOperationAction(ISD::LOAD, MVT::v8f16, Promote); 697 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32); 698 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote); 699 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32); 700 701 setOperationAction(ISD::STORE, MVT::v4i16, Promote); 702 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); 703 setOperationAction(ISD::STORE, MVT::v4f16, Promote); 704 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); 705 706 setOperationAction(ISD::STORE, MVT::v8i16, Promote); 707 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32); 708 setOperationAction(ISD::STORE, 
MVT::v8f16, Promote); 709 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32); 710 setOperationAction(ISD::STORE, MVT::v8bf16, Promote); 711 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32); 712 713 setOperationAction(ISD::LOAD, MVT::v16i16, Promote); 714 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32); 715 setOperationAction(ISD::LOAD, MVT::v16f16, Promote); 716 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32); 717 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote); 718 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32); 719 720 setOperationAction(ISD::STORE, MVT::v16i16, Promote); 721 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32); 722 setOperationAction(ISD::STORE, MVT::v16f16, Promote); 723 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32); 724 setOperationAction(ISD::STORE, MVT::v16bf16, Promote); 725 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32); 726 727 setOperationAction(ISD::LOAD, MVT::v32i16, Promote); 728 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32); 729 setOperationAction(ISD::LOAD, MVT::v32f16, Promote); 730 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32); 731 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote); 732 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32); 733 734 setOperationAction(ISD::STORE, MVT::v32i16, Promote); 735 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32); 736 setOperationAction(ISD::STORE, MVT::v32f16, Promote); 737 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32); 738 setOperationAction(ISD::STORE, MVT::v32bf16, Promote); 739 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32); 740 741 setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, 742 MVT::v2i32, Expand); 743 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); 744 745 setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, 746 MVT::v4i32, Expand); 747 748 setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, 749 MVT::v8i32, Expand); 750 751 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, 752 Subtarget->hasVOP3PInsts() ? 
Legal : Custom); 753 754 setOperationAction(ISD::FNEG, MVT::v2f16, Legal); 755 // This isn't really legal, but this avoids the legalizer unrolling it (and 756 // allows matching fneg (fabs x) patterns) 757 setOperationAction(ISD::FABS, MVT::v2f16, Legal); 758 759 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); 760 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal); 761 762 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM, 763 ISD::FMAXIMUMNUM}, 764 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, 765 Custom); 766 767 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, 768 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, 769 Expand); 770 771 for (MVT Vec16 : 772 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16, 773 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) { 774 setOperationAction( 775 {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR}, 776 Vec16, Custom); 777 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand); 778 } 779 } 780 781 if (Subtarget->hasVOP3PInsts()) { 782 setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL, 783 ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX, 784 ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT}, 785 MVT::v2i16, Legal); 786 787 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE, 788 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE}, 789 MVT::v2f16, Legal); 790 791 setOperationAction(ISD::EXTRACT_VECTOR_ELT, 792 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom); 793 794 setOperationAction(ISD::VECTOR_SHUFFLE, 795 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16, 796 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16, 797 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16}, 798 Custom); 799 800 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16}) 801 // Split vector operations. 802 setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB, 803 ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, 804 ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT, 805 ISD::SSUBSAT}, 806 VT, Custom); 807 808 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}) 809 // Split vector operations. 810 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE}, 811 VT, Custom); 812 813 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16}, 814 Custom); 815 816 setOperationAction(ISD::FEXP, MVT::v2f16, Custom); 817 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16}, 818 Custom); 819 820 if (Subtarget->hasPackedFP32Ops()) { 821 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG}, 822 MVT::v2f32, Legal); 823 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA}, 824 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32}, 825 Custom); 826 } 827 } 828 829 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom); 830 831 if (Subtarget->has16BitInsts()) { 832 setOperationAction(ISD::SELECT, MVT::v2i16, Promote); 833 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); 834 setOperationAction(ISD::SELECT, MVT::v2f16, Promote); 835 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); 836 } else { 837 // Legalization hack. 
838 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom); 839 840 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom); 841 } 842 843 setOperationAction(ISD::SELECT, 844 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8, 845 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16, 846 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16, 847 MVT::v32f16, MVT::v32bf16}, 848 Custom); 849 850 setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); 851 852 if (Subtarget->hasScalarSMulU64()) 853 setOperationAction(ISD::MUL, MVT::i64, Custom); 854 855 if (Subtarget->hasMad64_32()) 856 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); 857 858 if (Subtarget->hasPrefetch()) 859 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 860 861 if (Subtarget->hasIEEEMinMax()) { 862 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, 863 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal); 864 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM}, 865 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, 866 Custom); 867 } else { 868 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum 869 if (Subtarget->hasMinimum3Maximum3F32()) 870 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal); 871 872 if (Subtarget->hasMinimum3Maximum3PKF16()) 873 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal); 874 } 875 876 setOperationAction(ISD::INTRINSIC_WO_CHAIN, 877 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, 878 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128, 879 MVT::i8}, 880 Custom); 881 882 setOperationAction(ISD::INTRINSIC_W_CHAIN, 883 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16, 884 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16, 885 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16, 886 MVT::i16, MVT::bf16, MVT::i8, MVT::i128}, 887 Custom); 888 889 setOperationAction(ISD::INTRINSIC_VOID, 890 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16, 891 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16, 892 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, 893 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128}, 894 Custom); 895 896 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom); 897 setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); 898 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); 899 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom); 900 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom); 901 902 // TODO: Could move this to custom lowering, could benefit from combines on 903 // extract of relevant bits. 
904 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal); 905 906 setOperationAction(ISD::MUL, MVT::i1, Promote); 907 908 if (Subtarget->hasBF16ConversionInsts()) { 909 setOperationAction(ISD::FP_ROUND, MVT::v2bf16, Legal); 910 setOperationAction(ISD::FP_ROUND, MVT::bf16, Legal); 911 setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal); 912 } 913 914 if (Subtarget->hasCvtPkF16F32Inst()) { 915 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Legal); 916 } 917 918 setTargetDAGCombine({ISD::ADD, 919 ISD::UADDO_CARRY, 920 ISD::SUB, 921 ISD::USUBO_CARRY, 922 ISD::MUL, 923 ISD::FADD, 924 ISD::FSUB, 925 ISD::FDIV, 926 ISD::FMUL, 927 ISD::FMINNUM, 928 ISD::FMAXNUM, 929 ISD::FMINNUM_IEEE, 930 ISD::FMAXNUM_IEEE, 931 ISD::FMINIMUM, 932 ISD::FMAXIMUM, 933 ISD::FMA, 934 ISD::SMIN, 935 ISD::SMAX, 936 ISD::UMIN, 937 ISD::UMAX, 938 ISD::SETCC, 939 ISD::SELECT, 940 ISD::SMIN, 941 ISD::SMAX, 942 ISD::UMIN, 943 ISD::UMAX, 944 ISD::AND, 945 ISD::OR, 946 ISD::XOR, 947 ISD::SHL, 948 ISD::SRL, 949 ISD::SRA, 950 ISD::FSHR, 951 ISD::SINT_TO_FP, 952 ISD::UINT_TO_FP, 953 ISD::FCANONICALIZE, 954 ISD::SCALAR_TO_VECTOR, 955 ISD::ZERO_EXTEND, 956 ISD::SIGN_EXTEND_INREG, 957 ISD::EXTRACT_VECTOR_ELT, 958 ISD::INSERT_VECTOR_ELT, 959 ISD::FCOPYSIGN}); 960 961 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16()) 962 setTargetDAGCombine(ISD::FP_ROUND); 963 964 // All memory operations. Some folding on the pointer operand is done to help 965 // matching the constant offsets in the addressing modes. 966 setTargetDAGCombine({ISD::LOAD, 967 ISD::STORE, 968 ISD::ATOMIC_LOAD, 969 ISD::ATOMIC_STORE, 970 ISD::ATOMIC_CMP_SWAP, 971 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, 972 ISD::ATOMIC_SWAP, 973 ISD::ATOMIC_LOAD_ADD, 974 ISD::ATOMIC_LOAD_SUB, 975 ISD::ATOMIC_LOAD_AND, 976 ISD::ATOMIC_LOAD_OR, 977 ISD::ATOMIC_LOAD_XOR, 978 ISD::ATOMIC_LOAD_NAND, 979 ISD::ATOMIC_LOAD_MIN, 980 ISD::ATOMIC_LOAD_MAX, 981 ISD::ATOMIC_LOAD_UMIN, 982 ISD::ATOMIC_LOAD_UMAX, 983 ISD::ATOMIC_LOAD_FADD, 984 ISD::ATOMIC_LOAD_FMIN, 985 ISD::ATOMIC_LOAD_FMAX, 986 ISD::ATOMIC_LOAD_UINC_WRAP, 987 ISD::ATOMIC_LOAD_UDEC_WRAP, 988 ISD::INTRINSIC_VOID, 989 ISD::INTRINSIC_W_CHAIN}); 990 991 // FIXME: In other contexts we pretend this is a per-function property. 992 setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32); 993 994 setSchedulingPreference(Sched::RegPressure); 995 } 996 997 const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; } 998 999 ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const { 1000 static const MCPhysReg RCRegs[] = {AMDGPU::MODE}; 1001 return RCRegs; 1002 } 1003 1004 //===----------------------------------------------------------------------===// 1005 // TargetLowering queries 1006 //===----------------------------------------------------------------------===// 1007 1008 // v_mad_mix* support a conversion from f16 to f32. 1009 // 1010 // There is only one special case when denormals are enabled we don't currently, 1011 // where this is OK to use. 1012 bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, 1013 EVT DestVT, EVT SrcVT) const { 1014 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || 1015 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && 1016 DestVT.getScalarType() == MVT::f32 && 1017 SrcVT.getScalarType() == MVT::f16 && 1018 // TODO: This probably only requires no input flushing? 
1019 denormalModeIsFlushAllF32(DAG.getMachineFunction()); 1020 } 1021 1022 bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, 1023 LLT DestTy, LLT SrcTy) const { 1024 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) || 1025 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) && 1026 DestTy.getScalarSizeInBits() == 32 && 1027 SrcTy.getScalarSizeInBits() == 16 && 1028 // TODO: This probably only requires no input flushing? 1029 denormalModeIsFlushAllF32(*MI.getMF()); 1030 } 1031 1032 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { 1033 // SI has some legal vector types, but no legal vector operations. Say no 1034 // shuffles are legal in order to prefer scalarizing some vector operations. 1035 return false; 1036 } 1037 1038 MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, 1039 CallingConv::ID CC, 1040 EVT VT) const { 1041 if (CC == CallingConv::AMDGPU_KERNEL) 1042 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); 1043 1044 if (VT.isVector()) { 1045 EVT ScalarVT = VT.getScalarType(); 1046 unsigned Size = ScalarVT.getSizeInBits(); 1047 if (Size == 16) { 1048 if (Subtarget->has16BitInsts()) { 1049 if (VT.isInteger()) 1050 return MVT::v2i16; 1051 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16); 1052 } 1053 return VT.isInteger() ? MVT::i32 : MVT::f32; 1054 } 1055 1056 if (Size < 16) 1057 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32; 1058 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32; 1059 } 1060 1061 if (VT.getSizeInBits() > 32) 1062 return MVT::i32; 1063 1064 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); 1065 } 1066 1067 unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, 1068 CallingConv::ID CC, 1069 EVT VT) const { 1070 if (CC == CallingConv::AMDGPU_KERNEL) 1071 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); 1072 1073 if (VT.isVector()) { 1074 unsigned NumElts = VT.getVectorNumElements(); 1075 EVT ScalarVT = VT.getScalarType(); 1076 unsigned Size = ScalarVT.getSizeInBits(); 1077 1078 // FIXME: Should probably promote 8-bit vectors to i16. 1079 if (Size == 16 && Subtarget->has16BitInsts()) 1080 return (NumElts + 1) / 2; 1081 1082 if (Size <= 32) 1083 return NumElts; 1084 1085 if (Size > 32) 1086 return NumElts * ((Size + 31) / 32); 1087 } else if (VT.getSizeInBits() > 32) 1088 return (VT.getSizeInBits() + 31) / 32; 1089 1090 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); 1091 } 1092 1093 unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( 1094 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, 1095 unsigned &NumIntermediates, MVT &RegisterVT) const { 1096 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { 1097 unsigned NumElts = VT.getVectorNumElements(); 1098 EVT ScalarVT = VT.getScalarType(); 1099 unsigned Size = ScalarVT.getSizeInBits(); 1100 // FIXME: We should fix the ABI to be the same on targets without 16-bit 1101 // support, but unless we can properly handle 3-vectors, it will be still be 1102 // inconsistent. 1103 if (Size == 16 && Subtarget->has16BitInsts()) { 1104 if (ScalarVT == MVT::bf16) { 1105 RegisterVT = MVT::i32; 1106 IntermediateVT = MVT::v2bf16; 1107 } else { 1108 RegisterVT = VT.isInteger() ? 
MVT::v2i16 : MVT::v2f16; 1109 IntermediateVT = RegisterVT; 1110 } 1111 NumIntermediates = (NumElts + 1) / 2; 1112 return NumIntermediates; 1113 } 1114 1115 if (Size == 32) { 1116 RegisterVT = ScalarVT.getSimpleVT(); 1117 IntermediateVT = RegisterVT; 1118 NumIntermediates = NumElts; 1119 return NumIntermediates; 1120 } 1121 1122 if (Size < 16 && Subtarget->has16BitInsts()) { 1123 // FIXME: Should probably form v2i16 pieces 1124 RegisterVT = MVT::i16; 1125 IntermediateVT = ScalarVT; 1126 NumIntermediates = NumElts; 1127 return NumIntermediates; 1128 } 1129 1130 if (Size != 16 && Size <= 32) { 1131 RegisterVT = MVT::i32; 1132 IntermediateVT = ScalarVT; 1133 NumIntermediates = NumElts; 1134 return NumIntermediates; 1135 } 1136 1137 if (Size > 32) { 1138 RegisterVT = MVT::i32; 1139 IntermediateVT = RegisterVT; 1140 NumIntermediates = NumElts * ((Size + 31) / 32); 1141 return NumIntermediates; 1142 } 1143 } 1144 1145 return TargetLowering::getVectorTypeBreakdownForCallingConv( 1146 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); 1147 } 1148 1149 static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, 1150 const DataLayout &DL, Type *Ty, 1151 unsigned MaxNumLanes) { 1152 assert(MaxNumLanes != 0); 1153 1154 LLVMContext &Ctx = Ty->getContext(); 1155 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { 1156 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements()); 1157 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()), 1158 NumElts); 1159 } 1160 1161 return TLI.getValueType(DL, Ty); 1162 } 1163 1164 // Peek through TFE struct returns to only use the data size. 1165 static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, 1166 const DataLayout &DL, Type *Ty, 1167 unsigned MaxNumLanes) { 1168 auto *ST = dyn_cast<StructType>(Ty); 1169 if (!ST) 1170 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes); 1171 1172 // TFE intrinsics return an aggregate type. 1173 assert(ST->getNumContainedTypes() == 2 && 1174 ST->getContainedType(1)->isIntegerTy(32)); 1175 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes); 1176 } 1177 1178 /// Map address space 7 to MVT::v5i32 because that's its in-memory 1179 /// representation. This return value is vector-typed because there is no 1180 /// MVT::i160 and it is not clear if one can be added. While this could 1181 /// cause issues during codegen, these address space 7 pointers will be 1182 /// rewritten away by then. Therefore, we can return MVT::v5i32 in order 1183 /// to allow pre-codegen passes that query TargetTransformInfo, often for cost 1184 /// modeling, to work. 1185 MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const { 1186 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160) 1187 return MVT::v5i32; 1188 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS && 1189 DL.getPointerSizeInBits(AS) == 192) 1190 return MVT::v6i32; 1191 return AMDGPUTargetLowering::getPointerTy(DL, AS); 1192 } 1193 /// Similarly, the in-memory representation of a p7 is {p8, i32}, aka 1194 /// v8i32 when padding is added. 1195 /// The in-memory representation of a p9 is {p8, i32, i32}, which is 1196 /// also v8i32 with padding. 
1197 MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const { 1198 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS && 1199 DL.getPointerSizeInBits(AS) == 160) || 1200 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS && 1201 DL.getPointerSizeInBits(AS) == 192)) 1202 return MVT::v8i32; 1203 return AMDGPUTargetLowering::getPointerMemTy(DL, AS); 1204 } 1205 1206 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 1207 const CallInst &CI, 1208 MachineFunction &MF, 1209 unsigned IntrID) const { 1210 Info.flags = MachineMemOperand::MONone; 1211 if (CI.hasMetadata(LLVMContext::MD_invariant_load)) 1212 Info.flags |= MachineMemOperand::MOInvariant; 1213 if (CI.hasMetadata(LLVMContext::MD_nontemporal)) 1214 Info.flags |= MachineMemOperand::MONonTemporal; 1215 Info.flags |= getTargetMMOFlags(CI); 1216 1217 if (const AMDGPU::RsrcIntrinsic *RsrcIntr = 1218 AMDGPU::lookupRsrcIntrinsic(IntrID)) { 1219 AttributeList Attr = 1220 Intrinsic::getAttributes(CI.getContext(), (Intrinsic::ID)IntrID); 1221 MemoryEffects ME = Attr.getMemoryEffects(); 1222 if (ME.doesNotAccessMemory()) 1223 return false; 1224 1225 // TODO: Should images get their own address space? 1226 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE; 1227 1228 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr; 1229 if (RsrcIntr->IsImage) { 1230 const AMDGPU::ImageDimIntrinsicInfo *Intr = 1231 AMDGPU::getImageDimIntrinsicInfo(IntrID); 1232 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 1233 Info.align.reset(); 1234 } 1235 1236 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg); 1237 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) { 1238 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE) 1239 // We conservatively set the memory operand of a buffer intrinsic to the 1240 // base resource pointer, so that we can access alias information about 1241 // those pointers. Cases like "this points at the same value 1242 // but with a different offset" are handled in 1243 // areMemAccessesTriviallyDisjoint. 1244 Info.ptrVal = RsrcArg; 1245 } 1246 1247 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data; 1248 if (!IsSPrefetch) { 1249 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1)); 1250 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE) 1251 Info.flags |= MachineMemOperand::MOVolatile; 1252 } 1253 1254 Info.flags |= MachineMemOperand::MODereferenceable; 1255 if (ME.onlyReadsMemory()) { 1256 if (RsrcIntr->IsImage) { 1257 unsigned MaxNumLanes = 4; 1258 1259 if (!BaseOpcode->Gather4) { 1260 // If this isn't a gather, we may have excess loaded elements in the 1261 // IR type. Check the dmask for the real number of elements loaded. 1262 unsigned DMask = 1263 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue(); 1264 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask); 1265 } 1266 1267 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(), 1268 CI.getType(), MaxNumLanes); 1269 } else { 1270 Info.memVT = 1271 memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(), 1272 std::numeric_limits<unsigned>::max()); 1273 } 1274 1275 // FIXME: What does alignment mean for an image? 1276 Info.opc = ISD::INTRINSIC_W_CHAIN; 1277 Info.flags |= MachineMemOperand::MOLoad; 1278 } else if (ME.onlyWritesMemory()) { 1279 Info.opc = ISD::INTRINSIC_VOID; 1280 1281 Type *DataTy = CI.getArgOperand(0)->getType(); 1282 if (RsrcIntr->IsImage) { 1283 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue(); 1284 unsigned DMaskLanes = DMask == 0 ? 
1 : llvm::popcount(DMask); 1285 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy, 1286 DMaskLanes); 1287 } else 1288 Info.memVT = getValueType(MF.getDataLayout(), DataTy); 1289 1290 Info.flags |= MachineMemOperand::MOStore; 1291 } else { 1292 // Atomic, NoReturn Sampler or prefetch 1293 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID 1294 : ISD::INTRINSIC_W_CHAIN; 1295 Info.flags |= 1296 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable; 1297 1298 if (!IsSPrefetch) 1299 Info.flags |= MachineMemOperand::MOStore; 1300 1301 switch (IntrID) { 1302 default: 1303 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) { 1304 // Fake memory access type for no return sampler intrinsics 1305 Info.memVT = MVT::i32; 1306 } else { 1307 // XXX - Should this be volatile without known ordering? 1308 Info.flags |= MachineMemOperand::MOVolatile; 1309 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); 1310 } 1311 break; 1312 case Intrinsic::amdgcn_raw_buffer_load_lds: 1313 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: 1314 case Intrinsic::amdgcn_struct_buffer_load_lds: 1315 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { 1316 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); 1317 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); 1318 Info.ptrVal = CI.getArgOperand(1); 1319 return true; 1320 } 1321 case Intrinsic::amdgcn_raw_atomic_buffer_load: 1322 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: 1323 case Intrinsic::amdgcn_struct_atomic_buffer_load: 1324 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: { 1325 Info.memVT = 1326 memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(), 1327 std::numeric_limits<unsigned>::max()); 1328 Info.flags &= ~MachineMemOperand::MOStore; 1329 return true; 1330 } 1331 } 1332 } 1333 return true; 1334 } 1335 1336 switch (IntrID) { 1337 case Intrinsic::amdgcn_ds_ordered_add: 1338 case Intrinsic::amdgcn_ds_ordered_swap: { 1339 Info.opc = ISD::INTRINSIC_W_CHAIN; 1340 Info.memVT = MVT::getVT(CI.getType()); 1341 Info.ptrVal = CI.getOperand(0); 1342 Info.align.reset(); 1343 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 1344 1345 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4)); 1346 if (!Vol->isZero()) 1347 Info.flags |= MachineMemOperand::MOVolatile; 1348 1349 return true; 1350 } 1351 case Intrinsic::amdgcn_ds_add_gs_reg_rtn: 1352 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: { 1353 Info.opc = ISD::INTRINSIC_W_CHAIN; 1354 Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); 1355 Info.ptrVal = nullptr; 1356 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER; 1357 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 1358 return true; 1359 } 1360 case Intrinsic::amdgcn_ds_append: 1361 case Intrinsic::amdgcn_ds_consume: { 1362 Info.opc = ISD::INTRINSIC_W_CHAIN; 1363 Info.memVT = MVT::getVT(CI.getType()); 1364 Info.ptrVal = CI.getOperand(0); 1365 Info.align.reset(); 1366 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 1367 1368 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1)); 1369 if (!Vol->isZero()) 1370 Info.flags |= MachineMemOperand::MOVolatile; 1371 1372 return true; 1373 } 1374 case Intrinsic::amdgcn_global_atomic_csub: { 1375 Info.opc = ISD::INTRINSIC_W_CHAIN; 1376 Info.memVT = MVT::getVT(CI.getType()); 1377 Info.ptrVal = CI.getOperand(0); 1378 Info.align.reset(); 1379 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | 1380 MachineMemOperand::MOVolatile; 
1381 return true; 1382 } 1383 case Intrinsic::amdgcn_image_bvh_intersect_ray: { 1384 Info.opc = ISD::INTRINSIC_W_CHAIN; 1385 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT? 1386 1387 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE; 1388 Info.align.reset(); 1389 Info.flags |= 1390 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable; 1391 return true; 1392 } 1393 case Intrinsic::amdgcn_global_atomic_fmin_num: 1394 case Intrinsic::amdgcn_global_atomic_fmax_num: 1395 case Intrinsic::amdgcn_global_atomic_ordered_add_b64: 1396 case Intrinsic::amdgcn_flat_atomic_fmin_num: 1397 case Intrinsic::amdgcn_flat_atomic_fmax_num: 1398 case Intrinsic::amdgcn_atomic_cond_sub_u32: { 1399 Info.opc = ISD::INTRINSIC_W_CHAIN; 1400 Info.memVT = MVT::getVT(CI.getType()); 1401 Info.ptrVal = CI.getOperand(0); 1402 Info.align.reset(); 1403 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | 1404 MachineMemOperand::MODereferenceable | 1405 MachineMemOperand::MOVolatile; 1406 return true; 1407 } 1408 case Intrinsic::amdgcn_global_load_tr_b64: 1409 case Intrinsic::amdgcn_global_load_tr_b128: 1410 case Intrinsic::amdgcn_ds_read_tr4_b64: 1411 case Intrinsic::amdgcn_ds_read_tr6_b96: 1412 case Intrinsic::amdgcn_ds_read_tr8_b64: 1413 case Intrinsic::amdgcn_ds_read_tr16_b64: { 1414 Info.opc = ISD::INTRINSIC_W_CHAIN; 1415 Info.memVT = MVT::getVT(CI.getType()); 1416 Info.ptrVal = CI.getOperand(0); 1417 Info.align.reset(); 1418 Info.flags |= MachineMemOperand::MOLoad; 1419 return true; 1420 } 1421 case Intrinsic::amdgcn_ds_gws_init: 1422 case Intrinsic::amdgcn_ds_gws_barrier: 1423 case Intrinsic::amdgcn_ds_gws_sema_v: 1424 case Intrinsic::amdgcn_ds_gws_sema_br: 1425 case Intrinsic::amdgcn_ds_gws_sema_p: 1426 case Intrinsic::amdgcn_ds_gws_sema_release_all: { 1427 Info.opc = ISD::INTRINSIC_VOID; 1428 1429 const GCNTargetMachine &TM = 1430 static_cast<const GCNTargetMachine &>(getTargetMachine()); 1431 1432 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1433 Info.ptrVal = MFI->getGWSPSV(TM); 1434 1435 // This is an abstract access, but we need to specify a type and size. 1436 Info.memVT = MVT::i32; 1437 Info.size = 4; 1438 Info.align = Align(4); 1439 1440 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier) 1441 Info.flags |= MachineMemOperand::MOLoad; 1442 else 1443 Info.flags |= MachineMemOperand::MOStore; 1444 return true; 1445 } 1446 case Intrinsic::amdgcn_global_load_lds: { 1447 Info.opc = ISD::INTRINSIC_VOID; 1448 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); 1449 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); 1450 Info.ptrVal = CI.getArgOperand(1); 1451 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 1452 return true; 1453 } 1454 case Intrinsic::amdgcn_ds_bvh_stack_rtn: { 1455 Info.opc = ISD::INTRINSIC_W_CHAIN; 1456 1457 const GCNTargetMachine &TM = 1458 static_cast<const GCNTargetMachine &>(getTargetMachine()); 1459 1460 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1461 Info.ptrVal = MFI->getGWSPSV(TM); 1462 1463 // This is an abstract access, but we need to specify a type and size. 
1464 Info.memVT = MVT::i32; 1465 Info.size = 4; 1466 Info.align = Align(4); 1467 1468 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 1469 return true; 1470 } 1471 case Intrinsic::amdgcn_s_prefetch_data: { 1472 Info.opc = ISD::INTRINSIC_VOID; 1473 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8); 1474 Info.ptrVal = CI.getArgOperand(0); 1475 Info.flags |= MachineMemOperand::MOLoad; 1476 return true; 1477 } 1478 default: 1479 return false; 1480 } 1481 } 1482 1483 void SITargetLowering::CollectTargetIntrinsicOperands( 1484 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const { 1485 switch (cast<IntrinsicInst>(I).getIntrinsicID()) { 1486 case Intrinsic::amdgcn_addrspacecast_nonnull: { 1487 // The DAG's ValueType loses the addrspaces. 1488 // Add them as 2 extra Constant operands "from" and "to". 1489 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace(); 1490 unsigned DstAS = I.getType()->getPointerAddressSpace(); 1491 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32)); 1492 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32)); 1493 break; 1494 } 1495 default: 1496 break; 1497 } 1498 } 1499 1500 bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, 1501 SmallVectorImpl<Value *> &Ops, 1502 Type *&AccessTy) const { 1503 Value *Ptr = nullptr; 1504 switch (II->getIntrinsicID()) { 1505 case Intrinsic::amdgcn_atomic_cond_sub_u32: 1506 case Intrinsic::amdgcn_ds_append: 1507 case Intrinsic::amdgcn_ds_consume: 1508 case Intrinsic::amdgcn_ds_read_tr4_b64: 1509 case Intrinsic::amdgcn_ds_read_tr6_b96: 1510 case Intrinsic::amdgcn_ds_read_tr8_b64: 1511 case Intrinsic::amdgcn_ds_read_tr16_b64: 1512 case Intrinsic::amdgcn_ds_ordered_add: 1513 case Intrinsic::amdgcn_ds_ordered_swap: 1514 case Intrinsic::amdgcn_flat_atomic_fmax_num: 1515 case Intrinsic::amdgcn_flat_atomic_fmin_num: 1516 case Intrinsic::amdgcn_global_atomic_csub: 1517 case Intrinsic::amdgcn_global_atomic_fmax_num: 1518 case Intrinsic::amdgcn_global_atomic_fmin_num: 1519 case Intrinsic::amdgcn_global_atomic_ordered_add_b64: 1520 case Intrinsic::amdgcn_global_load_tr_b64: 1521 case Intrinsic::amdgcn_global_load_tr_b128: 1522 Ptr = II->getArgOperand(0); 1523 break; 1524 case Intrinsic::amdgcn_global_load_lds: 1525 Ptr = II->getArgOperand(1); 1526 break; 1527 default: 1528 return false; 1529 } 1530 AccessTy = II->getType(); 1531 Ops.push_back(Ptr); 1532 return true; 1533 } 1534 1535 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM, 1536 unsigned AddrSpace) const { 1537 if (!Subtarget->hasFlatInstOffsets()) { 1538 // Flat instructions do not have offsets, and only have the register 1539 // address. 1540 return AM.BaseOffs == 0 && AM.Scale == 0; 1541 } 1542 1543 decltype(SIInstrFlags::FLAT) FlatVariant = 1544 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? SIInstrFlags::FlatGlobal 1545 : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch 1546 : SIInstrFlags::FLAT; 1547 1548 return AM.Scale == 0 && 1549 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( 1550 AM.BaseOffs, AddrSpace, FlatVariant)); 1551 } 1552 1553 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { 1554 if (Subtarget->hasFlatGlobalInsts()) 1555 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS); 1556 1557 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { 1558 // Assume the we will use FLAT for all global memory accesses 1559 // on VI. 1560 // FIXME: This assumption is currently wrong. 
On VI we still use 1561 // MUBUF instructions for the r + i addressing mode. As currently 1562 // implemented, the MUBUF instructions only work on buffer < 4GB. 1563 // It may be possible to support > 4GB buffers with MUBUF instructions, 1564 // by setting the stride value in the resource descriptor which would 1565 // increase the size limit to (stride * 4GB). However, this is risky, 1566 // because it has never been validated. 1567 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS); 1568 } 1569 1570 return isLegalMUBUFAddressingMode(AM); 1571 } 1572 1573 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { 1574 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and 1575 // additionally can do r + r + i with addr64. 32-bit has more addressing 1576 // mode options. Depending on the resource constant, it can also do 1577 // (i64 r0) + (i32 r1) * (i14 i). 1578 // 1579 // Private arrays end up using a scratch buffer most of the time, so also 1580 // assume those use MUBUF instructions. Scratch loads / stores are currently 1581 // implemented as mubuf instructions with offen bit set, so slightly 1582 // different than the normal addr64. 1583 const SIInstrInfo *TII = Subtarget->getInstrInfo(); 1584 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs)) 1585 return false; 1586 1587 // FIXME: Since we can split immediate into soffset and immediate offset, 1588 // would it make sense to allow any immediate? 1589 1590 switch (AM.Scale) { 1591 case 0: // r + i or just i, depending on HasBaseReg. 1592 return true; 1593 case 1: 1594 return true; // We have r + r or r + i. 1595 case 2: 1596 if (AM.HasBaseReg) { 1597 // Reject 2 * r + r. 1598 return false; 1599 } 1600 1601 // Allow 2 * r as r + r 1602 // Or 2 * r + i is allowed as r + r + i. 1603 return true; 1604 default: // Don't allow n * r 1605 return false; 1606 } 1607 } 1608 1609 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, 1610 const AddrMode &AM, Type *Ty, 1611 unsigned AS, 1612 Instruction *I) const { 1613 // No global is ever allowed as a base. 1614 if (AM.BaseGV) 1615 return false; 1616 1617 if (AS == AMDGPUAS::GLOBAL_ADDRESS) 1618 return isLegalGlobalAddressingMode(AM); 1619 1620 if (AS == AMDGPUAS::CONSTANT_ADDRESS || 1621 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || 1622 AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE || 1623 AS == AMDGPUAS::BUFFER_STRIDED_POINTER) { 1624 // If the offset isn't a multiple of 4, it probably isn't going to be 1625 // correctly aligned. 1626 // FIXME: Can we get the real alignment here? 1627 if (AM.BaseOffs % 4 != 0) 1628 return isLegalMUBUFAddressingMode(AM); 1629 1630 if (!Subtarget->hasScalarSubwordLoads()) { 1631 // There are no SMRD extloads, so if we have to do a small type access we 1632 // will use a MUBUF load. 1633 // FIXME?: We also need to do this if unaligned, but we don't know the 1634 // alignment here. 1635 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4) 1636 return isLegalGlobalAddressingMode(AM); 1637 } 1638 1639 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { 1640 // SMRD instructions have an 8-bit, dword offset on SI. 1641 if (!isUInt<8>(AM.BaseOffs / 4)) 1642 return false; 1643 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { 1644 // On CI+, this can also be a 32-bit literal constant offset. If it fits 1645 // in 8-bits, it can use a smaller encoding. 
1646 if (!isUInt<32>(AM.BaseOffs / 4)) 1647 return false; 1648 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) { 1649 // On VI, these use the SMEM format and the offset is 20-bit in bytes. 1650 if (!isUInt<20>(AM.BaseOffs)) 1651 return false; 1652 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) { 1653 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative 1654 // for S_BUFFER_* instructions). 1655 if (!isInt<21>(AM.BaseOffs)) 1656 return false; 1657 } else { 1658 // On GFX12, all offsets are signed 24-bit in bytes. 1659 if (!isInt<24>(AM.BaseOffs)) 1660 return false; 1661 } 1662 1663 if ((AS == AMDGPUAS::CONSTANT_ADDRESS || 1664 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && 1665 AM.BaseOffs < 0) { 1666 // Scalar (non-buffer) loads can only use a negative offset if 1667 // soffset+offset is non-negative. Since the compiler can only prove that 1668 // in a few special cases, it is safer to claim that negative offsets are 1669 // not supported. 1670 return false; 1671 } 1672 1673 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 1674 return true; 1675 1676 if (AM.Scale == 1 && AM.HasBaseReg) 1677 return true; 1678 1679 return false; 1680 } 1681 1682 if (AS == AMDGPUAS::PRIVATE_ADDRESS) 1683 return Subtarget->enableFlatScratch() 1684 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS) 1685 : isLegalMUBUFAddressingMode(AM); 1686 1687 if (AS == AMDGPUAS::LOCAL_ADDRESS || 1688 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) { 1689 // Basic, single offset DS instructions allow a 16-bit unsigned immediate 1690 // field. 1691 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have 1692 // an 8-bit dword offset but we don't know the alignment here. 1693 if (!isUInt<16>(AM.BaseOffs)) 1694 return false; 1695 1696 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 1697 return true; 1698 1699 if (AM.Scale == 1 && AM.HasBaseReg) 1700 return true; 1701 1702 return false; 1703 } 1704 1705 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) { 1706 // For an unknown address space, this usually means that this is for some 1707 // reason being used for pure arithmetic, and not based on some addressing 1708 // computation. We don't have instructions that compute pointers with any 1709 // addressing modes, so treat them as having no offset like flat 1710 // instructions. 1711 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS); 1712 } 1713 1714 // Assume a user alias of global for unknown address spaces. 
1715 return isLegalGlobalAddressingMode(AM); 1716 } 1717 1718 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, 1719 const MachineFunction &MF) const { 1720 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) 1721 return (MemVT.getSizeInBits() <= 4 * 32); 1722 if (AS == AMDGPUAS::PRIVATE_ADDRESS) { 1723 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize(); 1724 return (MemVT.getSizeInBits() <= MaxPrivateBits); 1725 } 1726 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) 1727 return (MemVT.getSizeInBits() <= 2 * 32); 1728 return true; 1729 } 1730 1731 bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( 1732 unsigned Size, unsigned AddrSpace, Align Alignment, 1733 MachineMemOperand::Flags Flags, unsigned *IsFast) const { 1734 if (IsFast) 1735 *IsFast = 0; 1736 1737 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || 1738 AddrSpace == AMDGPUAS::REGION_ADDRESS) { 1739 // Check if alignment requirements for ds_read/write instructions are 1740 // disabled. 1741 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4)) 1742 return false; 1743 1744 Align RequiredAlignment( 1745 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment. 1746 if (Subtarget->hasLDSMisalignedBug() && Size > 32 && 1747 Alignment < RequiredAlignment) 1748 return false; 1749 1750 // Either, the alignment requirements are "enabled", or there is an 1751 // unaligned LDS access related hardware bug though alignment requirements 1752 // are "disabled". In either case, we need to check for proper alignment 1753 // requirements. 1754 // 1755 switch (Size) { 1756 case 64: 1757 // SI has a hardware bug in the LDS / GDS bounds checking: if the base 1758 // address is negative, then the instruction is incorrectly treated as 1759 // out-of-bounds even if base + offsets is in bounds. Split vectorized 1760 // loads here to avoid emitting ds_read2_b32. We may re-combine the 1761 // load later in the SILoadStoreOptimizer. 1762 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8)) 1763 return false; 1764 1765 // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we 1766 // can do a 4 byte aligned, 8 byte access in a single operation using 1767 // ds_read2/write2_b32 with adjacent offsets. 1768 RequiredAlignment = Align(4); 1769 1770 if (Subtarget->hasUnalignedDSAccessEnabled()) { 1771 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/ 1772 // ds_write2_b32 depending on the alignment. In either case with either 1773 // alignment there is no faster way of doing this. 1774 1775 // The numbers returned here and below are not additive, it is a 'speed 1776 // rank'. They are just meant to be compared to decide if a certain way 1777 // of lowering an operation is faster than another. For that purpose 1778 // naturally aligned operation gets it bitsize to indicate that "it 1779 // operates with a speed comparable to N-bit wide load". With the full 1780 // alignment ds128 is slower than ds96 for example. If underaligned it 1781 // is comparable to a speed of a single dword access, which would then 1782 // mean 32 < 128 and it is faster to issue a wide load regardless. 1783 // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a 1784 // wider load which will not be aligned anymore the latter is slower. 1785 if (IsFast) 1786 *IsFast = (Alignment >= RequiredAlignment) ? 64 1787 : (Alignment < Align(4)) ? 
32
1788 : 1;
1789 return true;
1790 }
1791
1792 break;
1793 case 96:
1794 if (!Subtarget->hasDS96AndDS128())
1795 return false;
1796
1797 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1798 // gfx8 and older.
1799
1800 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1801 // Naturally aligned access is fastest. However, also report it is Fast
1802 // if memory is aligned less than DWORD. A narrow load or store will
1803 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
1804 // be more of them, so overall we will pay less penalty issuing a single
1805 // instruction.
1806
1807 // See comment on the values above.
1808 if (IsFast)
1809 *IsFast = (Alignment >= RequiredAlignment) ? 96
1810 : (Alignment < Align(4)) ? 32
1811 : 1;
1812 return true;
1813 }
1814
1815 break;
1816 case 128:
1817 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1818 return false;
1819
1820 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1821 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1822 // single operation using ds_read2/write2_b64.
1823 RequiredAlignment = Align(8);
1824
1825 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1826 // Naturally aligned access is fastest. However, also report it is Fast
1827 // if memory is aligned less than DWORD. A narrow load or store will
1828 // be equally slow as a single ds_read_b128/ds_write_b128, but there
1829 // will be more of them, so overall we will pay less penalty issuing a
1830 // single instruction.
1831
1832 // See comment on the values above.
1833 if (IsFast)
1834 *IsFast = (Alignment >= RequiredAlignment) ? 128
1835 : (Alignment < Align(4)) ? 32
1836 : 1;
1837 return true;
1838 }
1839
1840 break;
1841 default:
1842 if (Size > 32)
1843 return false;
1844
1845 break;
1846 }
1847
1848 // See comment on the values above.
1849 // Note that we have a single-dword or sub-dword here, so if underaligned
1850 // it is the slowest possible access, hence the returned value is 0.
1851 if (IsFast)
1852 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1853
1854 return Alignment >= RequiredAlignment ||
1855 Subtarget->hasUnalignedDSAccessEnabled();
1856 }
1857
1858 // FIXME: We have to be conservative here and assume that flat operations
1859 // will access scratch. If we had access to the IR function, then we
1860 // could determine if any private memory was used in the function.
1861 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1862 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
1863 bool AlignedBy4 = Alignment >= Align(4);
1864 if (IsFast)
1865 *IsFast = AlignedBy4;
1866
1867 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1868 }
1869
1870 // So long as they are correct, wide global memory operations perform better
1871 // than multiple smaller memory ops -- even when misaligned
1872 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1873 if (IsFast)
1874 *IsFast = Size;
1875
1876 return Alignment >= Align(4) ||
1877 Subtarget->hasUnalignedBufferAccessEnabled();
1878 }
1879
1880 // Values smaller than a dword must be aligned.
1881 if (Size < 32)
1882 return false;
1883
1884 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1885 // byte-address are ignored, thus forcing Dword alignment.
1886 // This applies to private, global, and constant memory.
1887 if (IsFast) 1888 *IsFast = 1; 1889 1890 return Size >= 32 && Alignment >= Align(4); 1891 } 1892 1893 bool SITargetLowering::allowsMisalignedMemoryAccesses( 1894 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, 1895 unsigned *IsFast) const { 1896 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, 1897 Alignment, Flags, IsFast); 1898 } 1899 1900 EVT SITargetLowering::getOptimalMemOpType( 1901 const MemOp &Op, const AttributeList &FuncAttributes) const { 1902 // FIXME: Should account for address space here. 1903 1904 // The default fallback uses the private pointer size as a guess for a type to 1905 // use. Make sure we switch these to 64-bit accesses. 1906 1907 if (Op.size() >= 16 && 1908 Op.isDstAligned(Align(4))) // XXX: Should only do for global 1909 return MVT::v4i32; 1910 1911 if (Op.size() >= 8 && Op.isDstAligned(Align(4))) 1912 return MVT::v2i32; 1913 1914 // Use the default. 1915 return MVT::Other; 1916 } 1917 1918 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { 1919 const MemSDNode *MemNode = cast<MemSDNode>(N); 1920 return MemNode->getMemOperand()->getFlags() & MONoClobber; 1921 } 1922 1923 bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) { 1924 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS || 1925 AS == AMDGPUAS::PRIVATE_ADDRESS; 1926 } 1927 1928 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS, 1929 unsigned DestAS) const { 1930 // Flat -> private/local is a simple truncate. 1931 // Flat -> global is no-op 1932 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) 1933 return true; 1934 1935 const GCNTargetMachine &TM = 1936 static_cast<const GCNTargetMachine &>(getTargetMachine()); 1937 return TM.isNoopAddrSpaceCast(SrcAS, DestAS); 1938 } 1939 1940 TargetLoweringBase::LegalizeTypeAction 1941 SITargetLowering::getPreferredVectorAction(MVT VT) const { 1942 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && 1943 VT.getScalarType().bitsLE(MVT::i16)) 1944 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector; 1945 return TargetLoweringBase::getPreferredVectorAction(VT); 1946 } 1947 1948 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 1949 Type *Ty) const { 1950 // FIXME: Could be smarter if called for vector constants. 1951 return true; 1952 } 1953 1954 bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, 1955 unsigned Index) const { 1956 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 1957 return false; 1958 1959 // TODO: Add more cases that are cheap. 1960 return Index == 0; 1961 } 1962 1963 bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const { 1964 // TODO: This should be more aggressive, particular for 16-bit element 1965 // vectors. However there are some mixed improvements and regressions. 1966 EVT EltTy = VT.getVectorElementType(); 1967 return EltTy.getSizeInBits() % 32 == 0; 1968 } 1969 1970 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { 1971 if (Subtarget->has16BitInsts() && VT == MVT::i16) { 1972 switch (Op) { 1973 case ISD::LOAD: 1974 case ISD::STORE: 1975 return true; 1976 default: 1977 return false; 1978 } 1979 } 1980 1981 // SimplifySetCC uses this function to determine whether or not it should 1982 // create setcc with i1 operands. We don't have instructions for i1 setcc. 
1983 if (VT == MVT::i1 && Op == ISD::SETCC) 1984 return false; 1985 1986 return TargetLowering::isTypeDesirableForOp(Op, VT); 1987 } 1988 1989 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, 1990 const SDLoc &SL, 1991 SDValue Chain, 1992 uint64_t Offset) const { 1993 const DataLayout &DL = DAG.getDataLayout(); 1994 MachineFunction &MF = DAG.getMachineFunction(); 1995 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1996 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); 1997 1998 auto [InputPtrReg, RC, ArgTy] = 1999 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 2000 2001 // We may not have the kernarg segment argument if we have no kernel 2002 // arguments. 2003 if (!InputPtrReg) 2004 return DAG.getConstant(Offset, SL, PtrVT); 2005 2006 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 2007 SDValue BasePtr = DAG.getCopyFromReg( 2008 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); 2009 2010 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset)); 2011 } 2012 2013 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, 2014 const SDLoc &SL) const { 2015 uint64_t Offset = 2016 getImplicitParameterOffset(DAG.getMachineFunction(), FIRST_IMPLICIT); 2017 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset); 2018 } 2019 2020 SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG, 2021 const SDLoc &SL) const { 2022 2023 Function &F = DAG.getMachineFunction().getFunction(); 2024 std::optional<uint32_t> KnownSize = 2025 AMDGPUMachineFunction::getLDSKernelIdMetadata(F); 2026 if (KnownSize.has_value()) 2027 return DAG.getConstant(*KnownSize, SL, MVT::i32); 2028 return SDValue(); 2029 } 2030 2031 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, 2032 const SDLoc &SL, SDValue Val, 2033 bool Signed, 2034 const ISD::InputArg *Arg) const { 2035 // First, if it is a widened vector, narrow it. 2036 if (VT.isVector() && 2037 VT.getVectorNumElements() != MemVT.getVectorNumElements()) { 2038 EVT NarrowedVT = 2039 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 2040 VT.getVectorNumElements()); 2041 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val, 2042 DAG.getConstant(0, SL, MVT::i32)); 2043 } 2044 2045 // Then convert the vector elements or scalar value. 2046 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) { 2047 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext; 2048 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT)); 2049 } 2050 2051 if (MemVT.isFloatingPoint()) 2052 Val = getFPExtOrFPRound(DAG, Val, SL, VT); 2053 else if (Signed) 2054 Val = DAG.getSExtOrTrunc(Val, SL, VT); 2055 else 2056 Val = DAG.getZExtOrTrunc(Val, SL, VT); 2057 2058 return Val; 2059 } 2060 2061 SDValue SITargetLowering::lowerKernargMemParameter( 2062 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, 2063 uint64_t Offset, Align Alignment, bool Signed, 2064 const ISD::InputArg *Arg) const { 2065 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 2066 2067 // Try to avoid using an extload by loading earlier than the argument address, 2068 // and extracting the relevant bits. The load should hopefully be merged with 2069 // the previous argument. 2070 if (MemVT.getStoreSize() < 4 && Alignment < 4) { 2071 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs). 
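    // Illustrative example of the path below: an i16 argument at byte offset
    // 2 loads the dword at offset 0, shifts it right by OffsetDiff * 8 == 16
    // bits, and truncates the result back to 16 bits before the usual
    // argument conversion.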
2072 int64_t AlignDownOffset = alignDown(Offset, 4); 2073 int64_t OffsetDiff = Offset - AlignDownOffset; 2074 2075 EVT IntVT = MemVT.changeTypeToInteger(); 2076 2077 // TODO: If we passed in the base kernel offset we could have a better 2078 // alignment than 4, but we don't really need it. 2079 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset); 2080 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4), 2081 MachineMemOperand::MODereferenceable | 2082 MachineMemOperand::MOInvariant); 2083 2084 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32); 2085 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt); 2086 2087 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract); 2088 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal); 2089 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg); 2090 2091 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL); 2092 } 2093 2094 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); 2095 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment, 2096 MachineMemOperand::MODereferenceable | 2097 MachineMemOperand::MOInvariant); 2098 2099 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg); 2100 return DAG.getMergeValues({Val, Load.getValue(1)}, SL); 2101 } 2102 2103 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, 2104 CCValAssign &VA, const SDLoc &SL, 2105 SDValue Chain, 2106 const ISD::InputArg &Arg) const { 2107 MachineFunction &MF = DAG.getMachineFunction(); 2108 MachineFrameInfo &MFI = MF.getFrameInfo(); 2109 2110 if (Arg.Flags.isByVal()) { 2111 unsigned Size = Arg.Flags.getByValSize(); 2112 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false); 2113 return DAG.getFrameIndex(FrameIdx, MVT::i32); 2114 } 2115 2116 unsigned ArgOffset = VA.getLocMemOffset(); 2117 unsigned ArgSize = VA.getValVT().getStoreSize(); 2118 2119 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true); 2120 2121 // Create load nodes to retrieve arguments from the stack. 2122 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); 2123 SDValue ArgValue; 2124 2125 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) 2126 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; 2127 MVT MemVT = VA.getValVT(); 2128 2129 switch (VA.getLocInfo()) { 2130 default: 2131 break; 2132 case CCValAssign::BCvt: 2133 MemVT = VA.getLocVT(); 2134 break; 2135 case CCValAssign::SExt: 2136 ExtType = ISD::SEXTLOAD; 2137 break; 2138 case CCValAssign::ZExt: 2139 ExtType = ISD::ZEXTLOAD; 2140 break; 2141 case CCValAssign::AExt: 2142 ExtType = ISD::EXTLOAD; 2143 break; 2144 } 2145 2146 ArgValue = DAG.getExtLoad( 2147 ExtType, SL, VA.getLocVT(), Chain, FIN, 2148 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT); 2149 return ArgValue; 2150 } 2151 2152 SDValue SITargetLowering::getPreloadedValue( 2153 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT, 2154 AMDGPUFunctionArgInfo::PreloadedValue PVID) const { 2155 const ArgDescriptor *Reg = nullptr; 2156 const TargetRegisterClass *RC; 2157 LLT Ty; 2158 2159 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv(); 2160 const ArgDescriptor WorkGroupIDX = 2161 ArgDescriptor::createRegister(AMDGPU::TTMP9); 2162 // If GridZ is not programmed in an entry function then the hardware will set 2163 // it to all zeros, so there is no need to mask the GridY value in the low 2164 // order bits. 
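  // With architected SGPRs the workgroup IDs are read from trap temporary
  // registers rather than preloaded user SGPRs: TTMP9 holds the X id, while
  // TTMP7 packs the Y id in its low 16 bits and the Z id in its high 16 bits,
  // which is what the masks in the descriptors below select.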
2165 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( 2166 AMDGPU::TTMP7, 2167 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu); 2168 const ArgDescriptor WorkGroupIDZ = 2169 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); 2170 if (Subtarget->hasArchitectedSGPRs() && 2171 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) { 2172 switch (PVID) { 2173 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: 2174 Reg = &WorkGroupIDX; 2175 RC = &AMDGPU::SReg_32RegClass; 2176 Ty = LLT::scalar(32); 2177 break; 2178 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: 2179 Reg = &WorkGroupIDY; 2180 RC = &AMDGPU::SReg_32RegClass; 2181 Ty = LLT::scalar(32); 2182 break; 2183 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: 2184 Reg = &WorkGroupIDZ; 2185 RC = &AMDGPU::SReg_32RegClass; 2186 Ty = LLT::scalar(32); 2187 break; 2188 default: 2189 break; 2190 } 2191 } 2192 2193 if (!Reg) 2194 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); 2195 if (!Reg) { 2196 if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) { 2197 // It's possible for a kernarg intrinsic call to appear in a kernel with 2198 // no allocated segment, in which case we do not add the user sgpr 2199 // argument, so just return null. 2200 return DAG.getConstant(0, SDLoc(), VT); 2201 } 2202 2203 // It's undefined behavior if a function marked with the amdgpu-no-* 2204 // attributes uses the corresponding intrinsic. 2205 return DAG.getUNDEF(VT); 2206 } 2207 2208 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg); 2209 } 2210 2211 static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, 2212 CallingConv::ID CallConv, 2213 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped, 2214 FunctionType *FType, 2215 SIMachineFunctionInfo *Info) { 2216 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { 2217 const ISD::InputArg *Arg = &Ins[I]; 2218 2219 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) && 2220 "vector type argument should have been split"); 2221 2222 // First check if it's a PS input addr. 2223 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() && 2224 PSInputNum <= 15) { 2225 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum); 2226 2227 // Inconveniently only the first part of the split is marked as isSplit, 2228 // so skip to the end. We only want to increment PSInputNum once for the 2229 // entire split argument. 2230 if (Arg->Flags.isSplit()) { 2231 while (!Arg->Flags.isSplitEnd()) { 2232 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) && 2233 "unexpected vector split in ps argument type"); 2234 if (!SkipArg) 2235 Splits.push_back(*Arg); 2236 Arg = &Ins[++I]; 2237 } 2238 } 2239 2240 if (SkipArg) { 2241 // We can safely skip PS inputs. 2242 Skipped.set(Arg->getOrigArgIndex()); 2243 ++PSInputNum; 2244 continue; 2245 } 2246 2247 Info->markPSInputAllocated(PSInputNum); 2248 if (Arg->Used) 2249 Info->markPSInputEnabled(PSInputNum); 2250 2251 ++PSInputNum; 2252 } 2253 2254 Splits.push_back(*Arg); 2255 } 2256 } 2257 2258 // Allocate special inputs passed in VGPRs. 
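// For entry functions the workitem IDs arrive in the leading argument VGPRs:
// either one ID per register (VGPR0/1/2) or, on subtargets with packed TIDs,
// all three packed into VGPR0 as bits [9:0], [19:10] and [29:20], which is the
// layout the masks below describe.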
2259 void SITargetLowering::allocateSpecialEntryInputVGPRs(
2260 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2261 SIMachineFunctionInfo &Info) const {
2262 const LLT S32 = LLT::scalar(32);
2263 MachineRegisterInfo &MRI = MF.getRegInfo();
2264
2265 if (Info.hasWorkItemIDX()) {
2266 Register Reg = AMDGPU::VGPR0;
2267 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2268
2269 CCInfo.AllocateReg(Reg);
2270 unsigned Mask =
2271 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2272 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2273 }
2274
2275 if (Info.hasWorkItemIDY()) {
2276 assert(Info.hasWorkItemIDX());
2277 if (Subtarget->hasPackedTID()) {
2278 Info.setWorkItemIDY(
2279 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2280 } else {
2281 unsigned Reg = AMDGPU::VGPR1;
2282 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2283
2284 CCInfo.AllocateReg(Reg);
2285 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2286 }
2287 }
2288
2289 if (Info.hasWorkItemIDZ()) {
2290 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2291 if (Subtarget->hasPackedTID()) {
2292 Info.setWorkItemIDZ(
2293 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2294 } else {
2295 unsigned Reg = AMDGPU::VGPR2;
2296 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2297
2298 CCInfo.AllocateReg(Reg);
2299 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2300 }
2301 }
2302 }
2303
2304 // Try to allocate a VGPR at the end of the argument list, or, if no argument
2305 // VGPRs are left, allocate a stack slot instead.
2306 // If \p Mask is given, it indicates the bitfield position in the register.
2307 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2308 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2309 ArgDescriptor Arg = ArgDescriptor()) {
2310 if (Arg.isSet())
2311 return ArgDescriptor::createArg(Arg, Mask);
2312
2313 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2314 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2315 if (RegIdx == ArgVGPRs.size()) {
2316 // Spill to stack required.
2317 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2318
2319 return ArgDescriptor::createStack(Offset, Mask);
2320 }
2321
2322 unsigned Reg = ArgVGPRs[RegIdx];
2323 Reg = CCInfo.AllocateReg(Reg);
2324 assert(Reg != AMDGPU::NoRegister);
2325
2326 MachineFunction &MF = CCInfo.getMachineFunction();
2327 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2328 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2329 return ArgDescriptor::createRegister(Reg, Mask);
2330 }
2331
2332 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2333 const TargetRegisterClass *RC,
2334 unsigned NumArgRegs) {
2335 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2336 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2337 if (RegIdx == ArgSGPRs.size())
2338 report_fatal_error("ran out of SGPRs for arguments");
2339
2340 unsigned Reg = ArgSGPRs[RegIdx];
2341 Reg = CCInfo.AllocateReg(Reg);
2342 assert(Reg != AMDGPU::NoRegister);
2343
2344 MachineFunction &MF = CCInfo.getMachineFunction();
2345 MF.addLiveIn(Reg, RC);
2346 return ArgDescriptor::createRegister(Reg);
2347 }
2348
2349 // If this has a fixed position, we should still allocate the register in the
2350 // CCInfo state. Technically we could get away with this for values passed
2351 // outside of the normal argument range.
2352 static void allocateFixedSGPRInputImpl(CCState &CCInfo, 2353 const TargetRegisterClass *RC, 2354 MCRegister Reg) { 2355 Reg = CCInfo.AllocateReg(Reg); 2356 assert(Reg != AMDGPU::NoRegister); 2357 MachineFunction &MF = CCInfo.getMachineFunction(); 2358 MF.addLiveIn(Reg, RC); 2359 } 2360 2361 static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) { 2362 if (Arg) { 2363 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 2364 Arg.getRegister()); 2365 } else 2366 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32); 2367 } 2368 2369 static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) { 2370 if (Arg) { 2371 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 2372 Arg.getRegister()); 2373 } else 2374 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); 2375 } 2376 2377 /// Allocate implicit function VGPR arguments at the end of allocated user 2378 /// arguments. 2379 void SITargetLowering::allocateSpecialInputVGPRs( 2380 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, 2381 SIMachineFunctionInfo &Info) const { 2382 const unsigned Mask = 0x3ff; 2383 ArgDescriptor Arg; 2384 2385 if (Info.hasWorkItemIDX()) { 2386 Arg = allocateVGPR32Input(CCInfo, Mask); 2387 Info.setWorkItemIDX(Arg); 2388 } 2389 2390 if (Info.hasWorkItemIDY()) { 2391 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg); 2392 Info.setWorkItemIDY(Arg); 2393 } 2394 2395 if (Info.hasWorkItemIDZ()) 2396 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg)); 2397 } 2398 2399 /// Allocate implicit function VGPR arguments in fixed registers. 2400 void SITargetLowering::allocateSpecialInputVGPRsFixed( 2401 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, 2402 SIMachineFunctionInfo &Info) const { 2403 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31); 2404 if (!Reg) 2405 report_fatal_error("failed to allocated VGPR for implicit arguments"); 2406 2407 const unsigned Mask = 0x3ff; 2408 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask)); 2409 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10)); 2410 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20)); 2411 } 2412 2413 void SITargetLowering::allocateSpecialInputSGPRs( 2414 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, 2415 SIMachineFunctionInfo &Info) const { 2416 auto &ArgInfo = Info.getArgInfo(); 2417 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo(); 2418 2419 // TODO: Unify handling with private memory pointers. 2420 if (UserSGPRInfo.hasDispatchPtr()) 2421 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); 2422 2423 if (UserSGPRInfo.hasQueuePtr()) 2424 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); 2425 2426 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a 2427 // constant offset from the kernarg segment. 2428 if (Info.hasImplicitArgPtr()) 2429 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr); 2430 2431 if (UserSGPRInfo.hasDispatchID()) 2432 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID); 2433 2434 // flat_scratch_init is not applicable for non-kernel functions. 
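  // The remaining special inputs are single 32-bit SGPRs: the workgroup IDs
  // and, when requested, the LDS kernel id.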
2435
2436 if (Info.hasWorkGroupIDX())
2437 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2438
2439 if (Info.hasWorkGroupIDY())
2440 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2441
2442 if (Info.hasWorkGroupIDZ())
2443 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2444
2445 if (Info.hasLDSKernelId())
2446 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2447 }
2448
2449 // Allocate special inputs passed in user SGPRs.
2450 void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2451 MachineFunction &MF,
2452 const SIRegisterInfo &TRI,
2453 SIMachineFunctionInfo &Info) const {
2454 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2455 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2456 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2457 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2458 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2459 }
2460
2461 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2462 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2463 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2464 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2465 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2466 }
2467
2468 if (UserSGPRInfo.hasDispatchPtr()) {
2469 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2470 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2471 CCInfo.AllocateReg(DispatchPtrReg);
2472 }
2473
2474 if (UserSGPRInfo.hasQueuePtr()) {
2475 Register QueuePtrReg = Info.addQueuePtr(TRI);
2476 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2477 CCInfo.AllocateReg(QueuePtrReg);
2478 }
2479
2480 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2481 MachineRegisterInfo &MRI = MF.getRegInfo();
2482 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2483 CCInfo.AllocateReg(InputPtrReg);
2484
2485 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2486 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2487 }
2488
2489 if (UserSGPRInfo.hasDispatchID()) {
2490 Register DispatchIDReg = Info.addDispatchID(TRI);
2491 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2492 CCInfo.AllocateReg(DispatchIDReg);
2493 }
2494
2495 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2496 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2497 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2498 CCInfo.AllocateReg(FlatScratchInitReg);
2499 }
2500
2501 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2502 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2503 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2504 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2505 }
2506
2507 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2508 // these from the dispatch pointer.
2509 }
2510
2511 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2512 // sequential starting from the first argument.
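// Each preloaded argument consumes alignTo(size-in-bits, 32) / 32 user SGPRs,
// plus padding SGPRs for any gap from the previous argument; once the free
// user SGPRs are exhausted the remaining arguments fall back to ordinary
// kernarg segment loads. For example, an i64 argument at byte offset 8 whose
// predecessor ended at offset 4 takes one padding SGPR (bytes 4..7) and two
// SGPRs for the value itself.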
2513 void SITargetLowering::allocatePreloadKernArgSGPRs( 2514 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, 2515 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF, 2516 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { 2517 Function &F = MF.getFunction(); 2518 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset(); 2519 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo(); 2520 bool InPreloadSequence = true; 2521 unsigned InIdx = 0; 2522 bool AlignedForImplictArgs = false; 2523 unsigned ImplicitArgOffset = 0; 2524 for (auto &Arg : F.args()) { 2525 if (!InPreloadSequence || !Arg.hasInRegAttr()) 2526 break; 2527 2528 unsigned ArgIdx = Arg.getArgNo(); 2529 // Don't preload non-original args or parts not in the current preload 2530 // sequence. 2531 if (InIdx < Ins.size() && 2532 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx)) 2533 break; 2534 2535 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() && 2536 Ins[InIdx].getOrigArgIndex() == ArgIdx; 2537 InIdx++) { 2538 assert(ArgLocs[ArgIdx].isMemLoc()); 2539 auto &ArgLoc = ArgLocs[InIdx]; 2540 const Align KernelArgBaseAlign = Align(16); 2541 unsigned ArgOffset = ArgLoc.getLocMemOffset(); 2542 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset); 2543 unsigned NumAllocSGPRs = 2544 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32; 2545 2546 // Fix alignment for hidden arguments. 2547 if (Arg.hasAttribute("amdgpu-hidden-argument")) { 2548 if (!AlignedForImplictArgs) { 2549 ImplicitArgOffset = 2550 alignTo(LastExplicitArgOffset, 2551 Subtarget->getAlignmentForImplicitArgPtr()) - 2552 LastExplicitArgOffset; 2553 AlignedForImplictArgs = true; 2554 } 2555 ArgOffset += ImplicitArgOffset; 2556 } 2557 2558 // Arg is preloaded into the previous SGPR. 2559 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) { 2560 assert(InIdx >= 1 && "No previous SGPR"); 2561 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back( 2562 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]); 2563 continue; 2564 } 2565 2566 unsigned Padding = ArgOffset - LastExplicitArgOffset; 2567 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; 2568 // Check for free user SGPRs for preloading. 2569 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) { 2570 InPreloadSequence = false; 2571 break; 2572 } 2573 2574 // Preload this argument. 2575 const TargetRegisterClass *RC = 2576 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32); 2577 SmallVectorImpl<MCRegister> *PreloadRegs = 2578 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs); 2579 2580 if (PreloadRegs->size() > 1) 2581 RC = &AMDGPU::SGPR_32RegClass; 2582 for (auto &Reg : *PreloadRegs) { 2583 assert(Reg); 2584 MF.addLiveIn(Reg, RC); 2585 CCInfo.AllocateReg(Reg); 2586 } 2587 2588 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset; 2589 } 2590 } 2591 } 2592 2593 void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, 2594 const SIRegisterInfo &TRI, 2595 SIMachineFunctionInfo &Info) const { 2596 // Always allocate this last since it is a synthetic preload. 2597 if (Info.hasLDSKernelId()) { 2598 Register Reg = Info.addLDSKernelId(); 2599 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); 2600 CCInfo.AllocateReg(Reg); 2601 } 2602 } 2603 2604 // Allocate special input registers that are initialized per-wave. 
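// These are allocated after the user SGPRs: the workgroup IDs (unless the
// subtarget provides them as architected SGPRs), the workgroup info register,
// and the private segment wave byte offset used for scratch addressing.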
2605 void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, 2606 SIMachineFunctionInfo &Info, 2607 CallingConv::ID CallConv, 2608 bool IsShader) const { 2609 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs(); 2610 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) { 2611 // Note: user SGPRs are handled by the front-end for graphics shaders 2612 // Pad up the used user SGPRs with dead inputs. 2613 2614 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately 2615 // before enabling architected SGPRs for workgroup IDs. 2616 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget"); 2617 2618 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs(); 2619 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to 2620 // rely on it to reach 16 since if we end up having no stack usage, it will 2621 // not really be added. 2622 unsigned NumRequiredSystemSGPRs = 2623 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() + 2624 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo(); 2625 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) { 2626 Register Reg = Info.addReservedUserSGPR(); 2627 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); 2628 CCInfo.AllocateReg(Reg); 2629 } 2630 } 2631 2632 if (!HasArchitectedSGPRs) { 2633 if (Info.hasWorkGroupIDX()) { 2634 Register Reg = Info.addWorkGroupIDX(); 2635 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); 2636 CCInfo.AllocateReg(Reg); 2637 } 2638 2639 if (Info.hasWorkGroupIDY()) { 2640 Register Reg = Info.addWorkGroupIDY(); 2641 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); 2642 CCInfo.AllocateReg(Reg); 2643 } 2644 2645 if (Info.hasWorkGroupIDZ()) { 2646 Register Reg = Info.addWorkGroupIDZ(); 2647 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); 2648 CCInfo.AllocateReg(Reg); 2649 } 2650 } 2651 2652 if (Info.hasWorkGroupInfo()) { 2653 Register Reg = Info.addWorkGroupInfo(); 2654 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); 2655 CCInfo.AllocateReg(Reg); 2656 } 2657 2658 if (Info.hasPrivateSegmentWaveByteOffset()) { 2659 // Scratch wave offset passed in system SGPR. 2660 unsigned PrivateSegmentWaveByteOffsetReg; 2661 2662 if (IsShader) { 2663 PrivateSegmentWaveByteOffsetReg = 2664 Info.getPrivateSegmentWaveByteOffsetSystemSGPR(); 2665 2666 // This is true if the scratch wave byte offset doesn't have a fixed 2667 // location. 2668 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) { 2669 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); 2670 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); 2671 } 2672 } else 2673 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset(); 2674 2675 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); 2676 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); 2677 } 2678 2679 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader || 2680 Info.getNumPreloadedSGPRs() >= 16); 2681 } 2682 2683 static void reservePrivateMemoryRegs(const TargetMachine &TM, 2684 MachineFunction &MF, 2685 const SIRegisterInfo &TRI, 2686 SIMachineFunctionInfo &Info) { 2687 // Now that we've figured out where the scratch register inputs are, see if 2688 // should reserve the arguments and use them directly. 2689 MachineFrameInfo &MFI = MF.getFrameInfo(); 2690 bool HasStackObjects = MFI.hasStackObjects(); 2691 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 2692 2693 // Record that we know we have non-spill stack objects so we don't need to 2694 // check all stack objects later. 
2695 if (HasStackObjects) 2696 Info.setHasNonSpillStackObjects(true); 2697 2698 // Everything live out of a block is spilled with fast regalloc, so it's 2699 // almost certain that spilling will be required. 2700 if (TM.getOptLevel() == CodeGenOptLevel::None) 2701 HasStackObjects = true; 2702 2703 // For now assume stack access is needed in any callee functions, so we need 2704 // the scratch registers to pass in. 2705 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); 2706 2707 if (!ST.enableFlatScratch()) { 2708 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) { 2709 // If we have stack objects, we unquestionably need the private buffer 2710 // resource. For the Code Object V2 ABI, this will be the first 4 user 2711 // SGPR inputs. We can reserve those and use them directly. 2712 2713 Register PrivateSegmentBufferReg = 2714 Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); 2715 Info.setScratchRSrcReg(PrivateSegmentBufferReg); 2716 } else { 2717 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF); 2718 // We tentatively reserve the last registers (skipping the last registers 2719 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation, 2720 // we'll replace these with the ones immediately after those which were 2721 // really allocated. In the prologue copies will be inserted from the 2722 // argument to these reserved registers. 2723 2724 // Without HSA, relocations are used for the scratch pointer and the 2725 // buffer resource setup is always inserted in the prologue. Scratch wave 2726 // offset is still in an input SGPR. 2727 Info.setScratchRSrcReg(ReservedBufferReg); 2728 } 2729 } 2730 2731 MachineRegisterInfo &MRI = MF.getRegInfo(); 2732 2733 // For entry functions we have to set up the stack pointer if we use it, 2734 // whereas non-entry functions get this "for free". This means there is no 2735 // intrinsic advantage to using S32 over S34 in cases where we do not have 2736 // calls but do need a frame pointer (i.e. if we are requested to have one 2737 // because frame pointer elimination is disabled). To keep things simple we 2738 // only ever use S32 as the call ABI stack pointer, and so using it does not 2739 // imply we need a separate frame pointer. 2740 // 2741 // Try to use s32 as the SP, but move it if it would interfere with input 2742 // arguments. This won't work with calls though. 2743 // 2744 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input 2745 // registers. 2746 if (!MRI.isLiveIn(AMDGPU::SGPR32)) { 2747 Info.setStackPtrOffsetReg(AMDGPU::SGPR32); 2748 } else { 2749 assert(AMDGPU::isShader(MF.getFunction().getCallingConv())); 2750 2751 if (MFI.hasCalls()) 2752 report_fatal_error("call in graphics shader with too many input SGPRs"); 2753 2754 for (unsigned Reg : AMDGPU::SGPR_32RegClass) { 2755 if (!MRI.isLiveIn(Reg)) { 2756 Info.setStackPtrOffsetReg(Reg); 2757 break; 2758 } 2759 } 2760 2761 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG) 2762 report_fatal_error("failed to find register for SP"); 2763 } 2764 2765 // hasFP should be accurate for entry functions even before the frame is 2766 // finalized, because it does not rely on the known stack size, only 2767 // properties like whether variable sized objects are present. 
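  // When a frame pointer is required, SGPR33 is used as the frame offset
  // register, separate from the stack pointer register selected above.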
2768 if (ST.getFrameLowering()->hasFP(MF)) { 2769 Info.setFrameOffsetReg(AMDGPU::SGPR33); 2770 } 2771 } 2772 2773 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const { 2774 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 2775 return !Info->isEntryFunction(); 2776 } 2777 2778 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {} 2779 2780 void SITargetLowering::insertCopiesSplitCSR( 2781 MachineBasicBlock *Entry, 2782 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 2783 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 2784 2785 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 2786 if (!IStart) 2787 return; 2788 2789 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2790 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 2791 MachineBasicBlock::iterator MBBI = Entry->begin(); 2792 for (const MCPhysReg *I = IStart; *I; ++I) { 2793 const TargetRegisterClass *RC = nullptr; 2794 if (AMDGPU::SReg_64RegClass.contains(*I)) 2795 RC = &AMDGPU::SGPR_64RegClass; 2796 else if (AMDGPU::SReg_32RegClass.contains(*I)) 2797 RC = &AMDGPU::SGPR_32RegClass; 2798 else 2799 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 2800 2801 Register NewVR = MRI->createVirtualRegister(RC); 2802 // Create copy from CSR to a virtual register. 2803 Entry->addLiveIn(*I); 2804 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 2805 .addReg(*I); 2806 2807 // Insert the copy-back instructions right before the terminator. 2808 for (auto *Exit : Exits) 2809 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 2810 TII->get(TargetOpcode::COPY), *I) 2811 .addReg(NewVR); 2812 } 2813 } 2814 2815 SDValue SITargetLowering::LowerFormalArguments( 2816 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2817 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 2818 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 2819 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 2820 2821 MachineFunction &MF = DAG.getMachineFunction(); 2822 const Function &Fn = MF.getFunction(); 2823 FunctionType *FType = MF.getFunction().getFunctionType(); 2824 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2825 2826 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) { 2827 DiagnosticInfoUnsupported NoGraphicsHSA( 2828 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); 2829 DAG.getContext()->diagnose(NoGraphicsHSA); 2830 return DAG.getEntryNode(); 2831 } 2832 2833 SmallVector<ISD::InputArg, 16> Splits; 2834 SmallVector<CCValAssign, 16> ArgLocs; 2835 BitVector Skipped(Ins.size()); 2836 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2837 *DAG.getContext()); 2838 2839 bool IsGraphics = AMDGPU::isGraphics(CallConv); 2840 bool IsKernel = AMDGPU::isKernel(CallConv); 2841 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv); 2842 2843 if (IsGraphics) { 2844 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo(); 2845 assert(!UserSGPRInfo.hasDispatchPtr() && 2846 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() && 2847 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() && 2848 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); 2849 (void)UserSGPRInfo; 2850 if (!Subtarget->enableFlatScratch()) 2851 assert(!UserSGPRInfo.hasFlatScratchInit()); 2852 if ((CallConv != CallingConv::AMDGPU_CS && 2853 CallConv != CallingConv::AMDGPU_Gfx) || 2854 !Subtarget->hasArchitectedSGPRs()) 
2855 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && 2856 !Info->hasWorkGroupIDZ()); 2857 } 2858 2859 if (CallConv == CallingConv::AMDGPU_PS) { 2860 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info); 2861 2862 // At least one interpolation mode must be enabled or else the GPU will 2863 // hang. 2864 // 2865 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user 2866 // set PSInputAddr, the user wants to enable some bits after the compilation 2867 // based on run-time states. Since we can't know what the final PSInputEna 2868 // will look like, so we shouldn't do anything here and the user should take 2869 // responsibility for the correct programming. 2870 // 2871 // Otherwise, the following restrictions apply: 2872 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. 2873 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be 2874 // enabled too. 2875 if ((Info->getPSInputAddr() & 0x7F) == 0 || 2876 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) { 2877 CCInfo.AllocateReg(AMDGPU::VGPR0); 2878 CCInfo.AllocateReg(AMDGPU::VGPR1); 2879 Info->markPSInputAllocated(0); 2880 Info->markPSInputEnabled(0); 2881 } 2882 if (Subtarget->isAmdPalOS()) { 2883 // For isAmdPalOS, the user does not enable some bits after compilation 2884 // based on run-time states; the register values being generated here are 2885 // the final ones set in hardware. Therefore we need to apply the 2886 // workaround to PSInputAddr and PSInputEnable together. (The case where 2887 // a bit is set in PSInputAddr but not PSInputEnable is where the 2888 // frontend set up an input arg for a particular interpolation mode, but 2889 // nothing uses that input arg. Really we should have an earlier pass 2890 // that removes such an arg.) 2891 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); 2892 if ((PsInputBits & 0x7F) == 0 || 2893 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1))) 2894 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr())); 2895 } 2896 } else if (IsKernel) { 2897 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); 2898 } else { 2899 Splits.append(Ins.begin(), Ins.end()); 2900 } 2901 2902 if (IsKernel) 2903 analyzeFormalArgumentsCompute(CCInfo, Ins); 2904 2905 if (IsEntryFunc) { 2906 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); 2907 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); 2908 if (IsKernel && Subtarget->hasKernargPreload()) 2909 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info); 2910 2911 allocateLDSKernelId(CCInfo, MF, *TRI, *Info); 2912 } else if (!IsGraphics) { 2913 // For the fixed ABI, pass workitem IDs in the last argument register. 2914 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); 2915 2916 // FIXME: Sink this into allocateSpecialInputSGPRs 2917 if (!Subtarget->enableFlatScratch()) 2918 CCInfo.AllocateReg(Info->getScratchRSrcReg()); 2919 2920 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); 2921 } 2922 2923 if (!IsKernel) { 2924 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); 2925 CCInfo.AnalyzeFormalArguments(Splits, AssignFn); 2926 } 2927 2928 SmallVector<SDValue, 16> Chains; 2929 2930 // FIXME: This is the minimum kernel argument alignment. We should improve 2931 // this to the maximum alignment of the arguments. 2932 // 2933 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit 2934 // kern arg offset. 
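  // The loop below lowers each incoming argument: kernel arguments are
  // materialized from preloaded user SGPRs or from the kernarg segment, while
  // arguments of non-entry functions come from the stack or from the registers
  // assigned by the calling convention.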
2935 const Align KernelArgBaseAlign = Align(16);
2936
2937 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2938 const ISD::InputArg &Arg = Ins[i];
2939 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2940 InVals.push_back(DAG.getUNDEF(Arg.VT));
2941 continue;
2942 }
2943
2944 CCValAssign &VA = ArgLocs[ArgIdx++];
2945 MVT VT = VA.getLocVT();
2946
2947 if (IsEntryFunc && VA.isMemLoc()) {
2948 VT = Ins[i].VT;
2949 EVT MemVT = VA.getLocVT();
2950
2951 const uint64_t Offset = VA.getLocMemOffset();
2952 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2953
2954 if (Arg.Flags.isByRef()) {
2955 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2956
2957 const GCNTargetMachine &TM =
2958 static_cast<const GCNTargetMachine &>(getTargetMachine());
2959 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2960 Arg.Flags.getPointerAddrSpace())) {
2961 Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
2962 Arg.Flags.getPointerAddrSpace());
2963 }
2964
2965 InVals.push_back(Ptr);
2966 continue;
2967 }
2968
2969 SDValue NewArg;
2970 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2971 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2972 // In this case the argument is packed into the previous preload SGPR.
2973 int64_t AlignDownOffset = alignDown(Offset, 4);
2974 int64_t OffsetDiff = Offset - AlignDownOffset;
2975 EVT IntVT = MemVT.changeTypeToInteger();
2976
2977 const SIMachineFunctionInfo *Info =
2978 MF.getInfo<SIMachineFunctionInfo>();
2979 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2980 Register Reg =
2981 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2982
2983 assert(Reg);
2984 Register VReg = MRI.getLiveInVirtReg(Reg);
2985 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2986
2987 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2988 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2989
2990 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2991 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2992 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2993 Ins[i].Flags.isSExt(), &Ins[i]);
2994
2995 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2996 } else {
2997 const SIMachineFunctionInfo *Info =
2998 MF.getInfo<SIMachineFunctionInfo>();
2999 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3000 const SmallVectorImpl<MCRegister> &PreloadRegs =
3001 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3002
3003 SDValue Copy;
3004 if (PreloadRegs.size() == 1) {
3005 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3006 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3007 NewArg = DAG.getCopyFromReg(
3008 Chain, DL, VReg,
3009 EVT::getIntegerVT(*DAG.getContext(),
3010 TRI->getRegSizeInBits(*RC)));
3011
3012 } else {
3013 // If the kernarg alignment does not match the alignment of the SGPR
3014 // tuple RC that can accommodate this argument, it will be built up
3015 // via copies from the individual SGPRs that the argument was
3016 // preloaded to.
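            // For example, an argument that was preloaded into three
            // individual 32-bit SGPRs is copied out as three i32 values and
            // reassembled with the build_vector below.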
3017 SmallVector<SDValue, 4> Elts; 3018 for (auto Reg : PreloadRegs) { 3019 Register VReg = MRI.getLiveInVirtReg(Reg); 3020 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); 3021 Elts.push_back(Copy); 3022 } 3023 NewArg = 3024 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32, 3025 PreloadRegs.size()), 3026 DL, Elts); 3027 } 3028 3029 // If the argument was preloaded to multiple consecutive 32-bit 3030 // registers because of misalignment between addressable SGPR tuples 3031 // and the argument size, we can still assume that because of kernarg 3032 // segment alignment restrictions that NewArg's size is the same as 3033 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a 3034 // truncate since we cannot preload to less than a single SGPR and the 3035 // MemVT may be smaller. 3036 EVT MemVTInt = 3037 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); 3038 if (MemVT.bitsLT(NewArg.getSimpleValueType())) 3039 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg); 3040 3041 NewArg = DAG.getBitcast(MemVT, NewArg); 3042 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg, 3043 Ins[i].Flags.isSExt(), &Ins[i]); 3044 NewArg = DAG.getMergeValues({NewArg, Chain}, DL); 3045 } 3046 } else { 3047 // Hidden arguments that are in the kernel signature must be preloaded 3048 // to user SGPRs. Print a diagnostic error if a hidden argument is in 3049 // the argument list and is not preloaded. 3050 if (Arg.isOrigArg()) { 3051 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex()); 3052 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) { 3053 DiagnosticInfoUnsupported NonPreloadHiddenArg( 3054 *OrigArg->getParent(), 3055 "hidden argument in kernel signature was not preloaded", 3056 DL.getDebugLoc()); 3057 DAG.getContext()->diagnose(NonPreloadHiddenArg); 3058 } 3059 } 3060 3061 NewArg = 3062 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, 3063 Alignment, Ins[i].Flags.isSExt(), &Ins[i]); 3064 } 3065 Chains.push_back(NewArg.getValue(1)); 3066 3067 auto *ParamTy = 3068 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); 3069 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && 3070 ParamTy && 3071 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 3072 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) { 3073 // On SI local pointers are just offsets into LDS, so they are always 3074 // less than 16-bits. On CI and newer they could potentially be 3075 // real pointers, so we can't guarantee their size. 
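// Attach an AssertZext so later combines can rely on the high bits of these
// LDS/region pointer arguments being zero on SI.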
3076 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg, 3077 DAG.getValueType(MVT::i16)); 3078 } 3079 3080 InVals.push_back(NewArg); 3081 continue; 3082 } 3083 if (!IsEntryFunc && VA.isMemLoc()) { 3084 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg); 3085 InVals.push_back(Val); 3086 if (!Arg.Flags.isByVal()) 3087 Chains.push_back(Val.getValue(1)); 3088 continue; 3089 } 3090 3091 assert(VA.isRegLoc() && "Parameter must be in a register!"); 3092 3093 Register Reg = VA.getLocReg(); 3094 const TargetRegisterClass *RC = nullptr; 3095 if (AMDGPU::VGPR_32RegClass.contains(Reg)) 3096 RC = &AMDGPU::VGPR_32RegClass; 3097 else if (AMDGPU::SGPR_32RegClass.contains(Reg)) 3098 RC = &AMDGPU::SGPR_32RegClass; 3099 else 3100 llvm_unreachable("Unexpected register class in LowerFormalArguments!"); 3101 EVT ValVT = VA.getValVT(); 3102 3103 Reg = MF.addLiveIn(Reg, RC); 3104 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); 3105 3106 if (Arg.Flags.isSRet()) { 3107 // The return object should be reasonably addressable. 3108 3109 // FIXME: This helps when the return is a real sret. If it is a 3110 // automatically inserted sret (i.e. CanLowerReturn returns false), an 3111 // extra copy is inserted in SelectionDAGBuilder which obscures this. 3112 unsigned NumBits = 3113 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex(); 3114 Val = DAG.getNode( 3115 ISD::AssertZext, DL, VT, Val, 3116 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits))); 3117 } 3118 3119 // If this is an 8 or 16-bit value, it is really passed promoted 3120 // to 32 bits. Insert an assert[sz]ext to capture this, then 3121 // truncate to the right size. 3122 switch (VA.getLocInfo()) { 3123 case CCValAssign::Full: 3124 break; 3125 case CCValAssign::BCvt: 3126 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val); 3127 break; 3128 case CCValAssign::SExt: 3129 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, DAG.getValueType(ValVT)); 3130 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); 3131 break; 3132 case CCValAssign::ZExt: 3133 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, DAG.getValueType(ValVT)); 3134 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); 3135 break; 3136 case CCValAssign::AExt: 3137 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); 3138 break; 3139 default: 3140 llvm_unreachable("Unknown loc info!"); 3141 } 3142 3143 InVals.push_back(Val); 3144 } 3145 3146 // Start adding system SGPRs. 3147 if (IsEntryFunc) 3148 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics); 3149 3150 // DAG.getPass() returns nullptr when using new pass manager. 3151 // TODO: Use DAG.getMFAM() to access analysis result. 3152 if (DAG.getPass()) { 3153 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); 3154 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo()); 3155 } 3156 3157 unsigned StackArgSize = CCInfo.getStackSize(); 3158 Info->setBytesInStackArgArea(StackArgSize); 3159 3160 return Chains.empty() ? Chain 3161 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 3162 } 3163 3164 // TODO: If return values can't fit in registers, we should return as many as 3165 // possible in registers before passing on stack. 3166 bool SITargetLowering::CanLowerReturn( 3167 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, 3168 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context, 3169 const Type *RetTy) const { 3170 // Replacing returns with sret/stack usage doesn't make sense for shaders. 
3171 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn 3172 // for shaders. Vector types should be explicitly handled by CC. 3173 if (AMDGPU::isEntryFunctionCC(CallConv)) 3174 return true; 3175 3176 SmallVector<CCValAssign, 16> RVLocs; 3177 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); 3178 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg))) 3179 return false; 3180 3181 // We must use the stack if return would require unavailable registers. 3182 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF); 3183 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); 3184 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) 3185 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i))) 3186 return false; 3187 3188 return true; 3189 } 3190 3191 SDValue 3192 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 3193 bool isVarArg, 3194 const SmallVectorImpl<ISD::OutputArg> &Outs, 3195 const SmallVectorImpl<SDValue> &OutVals, 3196 const SDLoc &DL, SelectionDAG &DAG) const { 3197 MachineFunction &MF = DAG.getMachineFunction(); 3198 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 3199 3200 if (AMDGPU::isKernel(CallConv)) { 3201 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, 3202 OutVals, DL, DAG); 3203 } 3204 3205 bool IsShader = AMDGPU::isShader(CallConv); 3206 3207 Info->setIfReturnsVoid(Outs.empty()); 3208 bool IsWaveEnd = Info->returnsVoid() && IsShader; 3209 3210 // CCValAssign - represent the assignment of the return value to a location. 3211 SmallVector<CCValAssign, 48> RVLocs; 3212 SmallVector<ISD::OutputArg, 48> Splits; 3213 3214 // CCState - Info about the registers and stack slots. 3215 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 3216 *DAG.getContext()); 3217 3218 // Analyze outgoing return values. 3219 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 3220 3221 SDValue Glue; 3222 SmallVector<SDValue, 48> RetOps; 3223 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 3224 3225 // Copy the result values into the output registers. 3226 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E; 3227 ++I, ++RealRVLocIdx) { 3228 CCValAssign &VA = RVLocs[I]; 3229 assert(VA.isRegLoc() && "Can only return in registers!"); 3230 // TODO: Partially return in registers if return values don't fit. 3231 SDValue Arg = OutVals[RealRVLocIdx]; 3232 3233 // Copied from other backends. 3234 switch (VA.getLocInfo()) { 3235 case CCValAssign::Full: 3236 break; 3237 case CCValAssign::BCvt: 3238 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 3239 break; 3240 case CCValAssign::SExt: 3241 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 3242 break; 3243 case CCValAssign::ZExt: 3244 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 3245 break; 3246 case CCValAssign::AExt: 3247 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 3248 break; 3249 default: 3250 llvm_unreachable("Unknown loc info!"); 3251 } 3252 3253 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue); 3254 Glue = Chain.getValue(1); 3255 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3256 } 3257 3258 // FIXME: Does sret work properly? 
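// For non-entry functions, callee-saved registers preserved via copy are
// appended as extra return operands so their values stay live out to the
// return.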
3259 if (!Info->isEntryFunction()) { 3260 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 3261 const MCPhysReg *I = 3262 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 3263 if (I) { 3264 for (; *I; ++I) { 3265 if (AMDGPU::SReg_64RegClass.contains(*I)) 3266 RetOps.push_back(DAG.getRegister(*I, MVT::i64)); 3267 else if (AMDGPU::SReg_32RegClass.contains(*I)) 3268 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 3269 else 3270 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 3271 } 3272 } 3273 } 3274 3275 // Update chain and glue. 3276 RetOps[0] = Chain; 3277 if (Glue.getNode()) 3278 RetOps.push_back(Glue); 3279 3280 unsigned Opc = AMDGPUISD::ENDPGM; 3281 if (!IsWaveEnd) 3282 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE; 3283 return DAG.getNode(Opc, DL, MVT::Other, RetOps); 3284 } 3285 3286 SDValue SITargetLowering::LowerCallResult( 3287 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg, 3288 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, 3289 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn, 3290 SDValue ThisVal) const { 3291 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg); 3292 3293 // Assign locations to each value returned by this call. 3294 SmallVector<CCValAssign, 16> RVLocs; 3295 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, 3296 *DAG.getContext()); 3297 CCInfo.AnalyzeCallResult(Ins, RetCC); 3298 3299 // Copy all of the result registers out of their specified physreg. 3300 for (CCValAssign VA : RVLocs) { 3301 SDValue Val; 3302 3303 if (VA.isRegLoc()) { 3304 Val = 3305 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue); 3306 Chain = Val.getValue(1); 3307 InGlue = Val.getValue(2); 3308 } else if (VA.isMemLoc()) { 3309 report_fatal_error("TODO: return values in memory"); 3310 } else 3311 llvm_unreachable("unknown argument location type"); 3312 3313 switch (VA.getLocInfo()) { 3314 case CCValAssign::Full: 3315 break; 3316 case CCValAssign::BCvt: 3317 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); 3318 break; 3319 case CCValAssign::ZExt: 3320 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val, 3321 DAG.getValueType(VA.getValVT())); 3322 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); 3323 break; 3324 case CCValAssign::SExt: 3325 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val, 3326 DAG.getValueType(VA.getValVT())); 3327 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); 3328 break; 3329 case CCValAssign::AExt: 3330 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); 3331 break; 3332 default: 3333 llvm_unreachable("Unknown loc info!"); 3334 } 3335 3336 InVals.push_back(Val); 3337 } 3338 3339 return Chain; 3340 } 3341 3342 // Add code to pass special inputs required depending on used features separate 3343 // from the explicit user arguments present in the IR. 3344 void SITargetLowering::passSpecialInputs( 3345 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, 3346 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, 3347 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const { 3348 // If we don't have a call site, this was a call inserted by 3349 // legalization. These can never use special inputs. 
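// (Such calls are typically libcalls created while legalizing an operation
// the target cannot handle directly.)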
3350 if (!CLI.CB) 3351 return; 3352 3353 SelectionDAG &DAG = CLI.DAG; 3354 const SDLoc &DL = CLI.DL; 3355 const Function &F = DAG.getMachineFunction().getFunction(); 3356 3357 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 3358 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo(); 3359 3360 const AMDGPUFunctionArgInfo *CalleeArgInfo = 3361 &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; 3362 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) { 3363 // DAG.getPass() returns nullptr when using new pass manager. 3364 // TODO: Use DAG.getMFAM() to access analysis result. 3365 if (DAG.getPass()) { 3366 auto &ArgUsageInfo = 3367 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); 3368 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); 3369 } 3370 } 3371 3372 // TODO: Unify with private memory register handling. This is complicated by 3373 // the fact that at least in kernels, the input argument is not necessarily 3374 // in the same location as the input. 3375 // clang-format off 3376 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue, 3377 StringLiteral> ImplicitAttrs[] = { 3378 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"}, 3379 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" }, 3380 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"}, 3381 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"}, 3382 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"}, 3383 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"}, 3384 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"}, 3385 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"}, 3386 }; 3387 // clang-format on 3388 3389 for (auto [InputID, Attr] : ImplicitAttrs) { 3390 // If the callee does not use the attribute value, skip copying the value. 3391 if (CLI.CB->hasFnAttr(Attr)) 3392 continue; 3393 3394 const auto [OutgoingArg, ArgRC, ArgTy] = 3395 CalleeArgInfo->getPreloadedValue(InputID); 3396 if (!OutgoingArg) 3397 continue; 3398 3399 const auto [IncomingArg, IncomingArgRC, Ty] = 3400 CallerArgInfo.getPreloadedValue(InputID); 3401 assert(IncomingArgRC == ArgRC); 3402 3403 // All special arguments are ints for now. 3404 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32; 3405 SDValue InputReg; 3406 3407 if (IncomingArg) { 3408 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg); 3409 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) { 3410 // The implicit arg ptr is special because it doesn't have a corresponding 3411 // input for kernels, and is computed from the kernarg segment pointer. 3412 InputReg = getImplicitArgPtr(DAG, DL); 3413 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) { 3414 std::optional<uint32_t> Id = 3415 AMDGPUMachineFunction::getLDSKernelIdMetadata(F); 3416 if (Id.has_value()) { 3417 InputReg = DAG.getConstant(*Id, DL, ArgVT); 3418 } else { 3419 InputReg = DAG.getUNDEF(ArgVT); 3420 } 3421 } else { 3422 // We may have proven the input wasn't needed, although the ABI is 3423 // requiring it. We just need to allocate the register appropriately. 
3424 InputReg = DAG.getUNDEF(ArgVT); 3425 } 3426 3427 if (OutgoingArg->isRegister()) { 3428 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); 3429 if (!CCInfo.AllocateReg(OutgoingArg->getRegister())) 3430 report_fatal_error("failed to allocate implicit input argument"); 3431 } else { 3432 unsigned SpecialArgOffset = 3433 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4)); 3434 SDValue ArgStore = 3435 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset); 3436 MemOpChains.push_back(ArgStore); 3437 } 3438 } 3439 3440 // Pack workitem IDs into a single register or pass it as is if already 3441 // packed. 3442 3443 auto [OutgoingArg, ArgRC, Ty] = 3444 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X); 3445 if (!OutgoingArg) 3446 std::tie(OutgoingArg, ArgRC, Ty) = 3447 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 3448 if (!OutgoingArg) 3449 std::tie(OutgoingArg, ArgRC, Ty) = 3450 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 3451 if (!OutgoingArg) 3452 return; 3453 3454 const ArgDescriptor *IncomingArgX = std::get<0>( 3455 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X)); 3456 const ArgDescriptor *IncomingArgY = std::get<0>( 3457 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y)); 3458 const ArgDescriptor *IncomingArgZ = std::get<0>( 3459 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z)); 3460 3461 SDValue InputReg; 3462 SDLoc SL; 3463 3464 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x"); 3465 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y"); 3466 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z"); 3467 3468 // If incoming ids are not packed we need to pack them. 3469 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX && 3470 NeedWorkItemIDX) { 3471 if (Subtarget->getMaxWorkitemID(F, 0) != 0) { 3472 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX); 3473 } else { 3474 InputReg = DAG.getConstant(0, DL, MVT::i32); 3475 } 3476 } 3477 3478 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY && 3479 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) { 3480 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY); 3481 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y, 3482 DAG.getShiftAmountConstant(10, MVT::i32, SL)); 3483 InputReg = InputReg.getNode() 3484 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) 3485 : Y; 3486 } 3487 3488 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ && 3489 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) { 3490 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ); 3491 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z, 3492 DAG.getShiftAmountConstant(20, MVT::i32, SL)); 3493 InputReg = InputReg.getNode() 3494 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) 3495 : Z; 3496 } 3497 3498 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) { 3499 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) { 3500 // We're in a situation where the outgoing function requires the workitem 3501 // ID, but the calling function does not have it (e.g a graphics function 3502 // calling a C calling convention function). This is illegal, but we need 3503 // to produce something. 
3504 InputReg = DAG.getUNDEF(MVT::i32); 3505 } else { 3506 // Workitem ids are already packed, any of present incoming arguments 3507 // will carry all required fields. 3508 ArgDescriptor IncomingArg = 3509 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX 3510 : IncomingArgY ? *IncomingArgY 3511 : *IncomingArgZ, 3512 ~0u); 3513 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg); 3514 } 3515 } 3516 3517 if (OutgoingArg->isRegister()) { 3518 if (InputReg) 3519 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); 3520 3521 CCInfo.AllocateReg(OutgoingArg->getRegister()); 3522 } else { 3523 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4)); 3524 if (InputReg) { 3525 SDValue ArgStore = 3526 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset); 3527 MemOpChains.push_back(ArgStore); 3528 } 3529 } 3530 } 3531 3532 static bool canGuaranteeTCO(CallingConv::ID CC) { 3533 return CC == CallingConv::Fast; 3534 } 3535 3536 /// Return true if we might ever do TCO for calls with this calling convention. 3537 static bool mayTailCallThisCC(CallingConv::ID CC) { 3538 switch (CC) { 3539 case CallingConv::C: 3540 case CallingConv::AMDGPU_Gfx: 3541 return true; 3542 default: 3543 return canGuaranteeTCO(CC); 3544 } 3545 } 3546 3547 bool SITargetLowering::isEligibleForTailCallOptimization( 3548 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg, 3549 const SmallVectorImpl<ISD::OutputArg> &Outs, 3550 const SmallVectorImpl<SDValue> &OutVals, 3551 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { 3552 if (AMDGPU::isChainCC(CalleeCC)) 3553 return true; 3554 3555 if (!mayTailCallThisCC(CalleeCC)) 3556 return false; 3557 3558 // For a divergent call target, we need to do a waterfall loop over the 3559 // possible callees which precludes us from using a simple jump. 3560 if (Callee->isDivergent()) 3561 return false; 3562 3563 MachineFunction &MF = DAG.getMachineFunction(); 3564 const Function &CallerF = MF.getFunction(); 3565 CallingConv::ID CallerCC = CallerF.getCallingConv(); 3566 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 3567 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 3568 3569 // Kernels aren't callable, and don't have a live in return address so it 3570 // doesn't make sense to do a tail call with entry functions. 3571 if (!CallerPreserved) 3572 return false; 3573 3574 bool CCMatch = CallerCC == CalleeCC; 3575 3576 if (DAG.getTarget().Options.GuaranteedTailCallOpt) { 3577 if (canGuaranteeTCO(CalleeCC) && CCMatch) 3578 return true; 3579 return false; 3580 } 3581 3582 // TODO: Can we handle var args? 3583 if (IsVarArg) 3584 return false; 3585 3586 for (const Argument &Arg : CallerF.args()) { 3587 if (Arg.hasByValAttr()) 3588 return false; 3589 } 3590 3591 LLVMContext &Ctx = *DAG.getContext(); 3592 3593 // Check that the call results are passed in the same way. 3594 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins, 3595 CCAssignFnForCall(CalleeCC, IsVarArg), 3596 CCAssignFnForCall(CallerCC, IsVarArg))) 3597 return false; 3598 3599 // The callee has to preserve all registers the caller needs to preserve. 3600 if (!CCMatch) { 3601 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 3602 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 3603 return false; 3604 } 3605 3606 // Nothing more to check if the callee is taking no arguments. 
3607 if (Outs.empty()) 3608 return true; 3609 3610 SmallVector<CCValAssign, 16> ArgLocs; 3611 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx); 3612 3613 // FIXME: We are not allocating special input registers, so we will be 3614 // deciding based on incorrect register assignments. 3615 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg)); 3616 3617 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); 3618 // If the stack arguments for this call do not fit into our own save area then 3619 // the call cannot be made tail. 3620 // TODO: Is this really necessary? 3621 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) 3622 return false; 3623 3624 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) { 3625 // FIXME: What about inreg arguments that end up passed in memory? 3626 if (!CCVA.isRegLoc()) 3627 continue; 3628 3629 // If we are passing an argument in an SGPR, and the value is divergent, 3630 // this call requires a waterfall loop. 3631 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) { 3632 LLVM_DEBUG( 3633 dbgs() << "Cannot tail call due to divergent outgoing argument in " 3634 << printReg(CCVA.getLocReg(), TRI) << '\n'); 3635 return false; 3636 } 3637 } 3638 3639 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3640 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals); 3641 } 3642 3643 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 3644 if (!CI->isTailCall()) 3645 return false; 3646 3647 const Function *ParentFn = CI->getParent()->getParent(); 3648 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv())) 3649 return false; 3650 return true; 3651 } 3652 3653 // The wave scratch offset register is used as the global base pointer. 3654 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, 3655 SmallVectorImpl<SDValue> &InVals) const { 3656 CallingConv::ID CallConv = CLI.CallConv; 3657 bool IsChainCallConv = AMDGPU::isChainCC(CallConv); 3658 3659 SelectionDAG &DAG = CLI.DAG; 3660 3661 TargetLowering::ArgListEntry RequestedExec; 3662 if (IsChainCallConv) { 3663 // The last argument should be the value that we need to put in EXEC. 3664 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we 3665 // don't treat it like the rest of the arguments. 
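// Note that on wave64 targets the EXEC value is an i64 argument that has
// already been split into two i32 pieces in Outs/OutVals, which is why a
// second pop may be needed below.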
3666 RequestedExec = CLI.Args.back(); 3667 assert(RequestedExec.Node && "No node for EXEC"); 3668 3669 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize())) 3670 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC"); 3671 3672 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg"); 3673 CLI.Outs.pop_back(); 3674 CLI.OutVals.pop_back(); 3675 3676 if (RequestedExec.Ty->isIntegerTy(64)) { 3677 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up"); 3678 CLI.Outs.pop_back(); 3679 CLI.OutVals.pop_back(); 3680 } 3681 3682 assert(CLI.Outs.back().OrigArgIndex != 2 && 3683 "Haven't popped all the pieces of the EXEC mask"); 3684 } 3685 3686 const SDLoc &DL = CLI.DL; 3687 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; 3688 SmallVector<SDValue, 32> &OutVals = CLI.OutVals; 3689 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; 3690 SDValue Chain = CLI.Chain; 3691 SDValue Callee = CLI.Callee; 3692 bool &IsTailCall = CLI.IsTailCall; 3693 bool IsVarArg = CLI.IsVarArg; 3694 bool IsSibCall = false; 3695 MachineFunction &MF = DAG.getMachineFunction(); 3696 3697 if (Callee.isUndef() || isNullConstant(Callee)) { 3698 if (!CLI.IsTailCall) { 3699 for (ISD::InputArg &Arg : CLI.Ins) 3700 InVals.push_back(DAG.getUNDEF(Arg.VT)); 3701 } 3702 3703 return Chain; 3704 } 3705 3706 if (IsVarArg) { 3707 return lowerUnhandledCall(CLI, InVals, 3708 "unsupported call to variadic function "); 3709 } 3710 3711 if (!CLI.CB) 3712 report_fatal_error("unsupported libcall legalization"); 3713 3714 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) { 3715 return lowerUnhandledCall(CLI, InVals, 3716 "unsupported required tail call to function "); 3717 } 3718 3719 if (IsTailCall) { 3720 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg, 3721 Outs, OutVals, Ins, DAG); 3722 if (!IsTailCall && 3723 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) { 3724 report_fatal_error("failed to perform tail call elimination on a call " 3725 "site marked musttail or on llvm.amdgcn.cs.chain"); 3726 } 3727 3728 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 3729 3730 // A sibling call is one where we're under the usual C ABI and not planning 3731 // to change that but can still do a tail call: 3732 if (!TailCallOpt && IsTailCall) 3733 IsSibCall = true; 3734 3735 if (IsTailCall) 3736 ++NumTailCalls; 3737 } 3738 3739 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 3740 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 3741 SmallVector<SDValue, 8> MemOpChains; 3742 3743 // Analyze operands of the call, assigning locations to each operand. 3744 SmallVector<CCValAssign, 16> ArgLocs; 3745 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); 3746 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); 3747 3748 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) { 3749 // With a fixed ABI, allocate fixed registers before user arguments. 3750 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); 3751 } 3752 3753 CCInfo.AnalyzeCallOperands(Outs, AssignFn); 3754 3755 // Get a count of how many bytes are to be pushed on the stack. 3756 unsigned NumBytes = CCInfo.getStackSize(); 3757 3758 if (IsSibCall) { 3759 // Since we're not changing the ABI to make this a tail call, the memory 3760 // operands are already available in the caller's incoming argument space. 
3761 NumBytes = 0; 3762 } 3763 3764 // FPDiff is the byte offset of the call's argument area from the callee's. 3765 // Stores to callee stack arguments will be placed in FixedStackSlots offset 3766 // by this amount for a tail call. In a sibling call it must be 0 because the 3767 // caller will deallocate the entire stack and the callee still expects its 3768 // arguments to begin at SP+0. Completely unused for non-tail calls. 3769 int32_t FPDiff = 0; 3770 MachineFrameInfo &MFI = MF.getFrameInfo(); 3771 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); 3772 3773 // Adjust the stack pointer for the new arguments... 3774 // These operations are automatically eliminated by the prolog/epilog pass 3775 if (!IsSibCall) 3776 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); 3777 3778 if (!IsSibCall || IsChainCallConv) { 3779 if (!Subtarget->enableFlatScratch()) { 3780 SmallVector<SDValue, 4> CopyFromChains; 3781 3782 // In the HSA case, this should be an identity copy. 3783 SDValue ScratchRSrcReg = 3784 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); 3785 RegsToPass.emplace_back(IsChainCallConv 3786 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51 3787 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, 3788 ScratchRSrcReg); 3789 CopyFromChains.push_back(ScratchRSrcReg.getValue(1)); 3790 Chain = DAG.getTokenFactor(DL, CopyFromChains); 3791 } 3792 } 3793 3794 const unsigned NumSpecialInputs = RegsToPass.size(); 3795 3796 MVT PtrVT = MVT::i32; 3797 3798 // Walk the register/memloc assignments, inserting copies/loads. 3799 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 3800 CCValAssign &VA = ArgLocs[i]; 3801 SDValue Arg = OutVals[i]; 3802 3803 // Promote the value if needed. 3804 switch (VA.getLocInfo()) { 3805 case CCValAssign::Full: 3806 break; 3807 case CCValAssign::BCvt: 3808 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); 3809 break; 3810 case CCValAssign::ZExt: 3811 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); 3812 break; 3813 case CCValAssign::SExt: 3814 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); 3815 break; 3816 case CCValAssign::AExt: 3817 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); 3818 break; 3819 case CCValAssign::FPExt: 3820 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); 3821 break; 3822 default: 3823 llvm_unreachable("Unknown loc info!"); 3824 } 3825 3826 if (VA.isRegLoc()) { 3827 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg)); 3828 } else { 3829 assert(VA.isMemLoc()); 3830 3831 SDValue DstAddr; 3832 MachinePointerInfo DstInfo; 3833 3834 unsigned LocMemOffset = VA.getLocMemOffset(); 3835 int32_t Offset = LocMemOffset; 3836 3837 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT); 3838 MaybeAlign Alignment; 3839 3840 if (IsTailCall) { 3841 ISD::ArgFlagsTy Flags = Outs[i].Flags; 3842 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() 3843 : VA.getValVT().getStoreSize(); 3844 3845 // FIXME: We can have better than the minimum byval required alignment. 3846 Alignment = 3847 Flags.isByVal() 3848 ? Flags.getNonZeroByValAlign() 3849 : commonAlignment(Subtarget->getStackAlignment(), Offset); 3850 3851 Offset = Offset + FPDiff; 3852 int FI = MFI.CreateFixedObject(OpSize, Offset, true); 3853 3854 DstAddr = DAG.getFrameIndex(FI, PtrVT); 3855 DstInfo = MachinePointerInfo::getFixedStack(MF, FI); 3856 3857 // Make sure any stack arguments overlapping with where we're storing 3858 // are loaded before this eventual operation. Otherwise they'll be 3859 // clobbered. 
3860
3861 // FIXME: Why is this really necessary? This seems to just result in a
3862 // lot of code to copy the stack arguments and write them back to the same
3863 // locations, which are supposed to be immutable?
3864 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3865 } else {
3866 // Stores to the argument stack area are relative to the stack pointer.
3867 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3868 MVT::i32);
3869 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3870 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3871 Alignment =
3872 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3873 }
3874
3875 if (Outs[i].Flags.isByVal()) {
3876 SDValue SizeNode =
3877 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3878 SDValue Cpy =
3879 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3880 Outs[i].Flags.getNonZeroByValAlign(),
3881 /*isVol = */ false, /*AlwaysInline = */ true,
3882 /*CI=*/nullptr, std::nullopt, DstInfo,
3883 MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3884
3885 MemOpChains.push_back(Cpy);
3886 } else {
3887 SDValue Store =
3888 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3889 MemOpChains.push_back(Store);
3890 }
3891 }
3892 }
3893
3894 if (!MemOpChains.empty())
3895 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3896
3897 SDValue ReadFirstLaneID =
3898 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3899
3900 SDValue TokenGlue;
3901 if (CLI.ConvergenceControlToken) {
3902 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
3903 CLI.ConvergenceControlToken);
3904 }
3905
3906 // Build a sequence of copy-to-reg nodes chained together with token chain
3907 // and flag operands which copy the outgoing args into the appropriate regs.
3908 SDValue InGlue;
3909
3910 unsigned ArgIdx = 0;
3911 for (auto [Reg, Val] : RegsToPass) {
3912 if (ArgIdx++ >= NumSpecialInputs &&
3913 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
3914 // For chain calls, the inreg arguments are required to be
3915 // uniform. Speculatively insert a readfirstlane in case we cannot prove
3916 // they are uniform.
3917 //
3918 // For other calls, if an inreg argument is known to be uniform,
3919 // speculatively insert a readfirstlane in case it is in a VGPR.
3920 //
3921 // FIXME: We need to execute this in a waterfall loop if it is a divergent
3922 // value, so let that continue to produce invalid code.
3923
3924 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
3925 if (TokenGlue)
3926 ReadfirstlaneArgs.push_back(TokenGlue);
3927 Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
3928 ReadfirstlaneArgs);
3929 }
3930
3931 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
3932 InGlue = Chain.getValue(1);
3933 }
3934
3935 // We don't usually want to end the call-sequence here because we would tidy
3936 // the frame up *after* the call; however, in the ABI-changing tail-call case
3937 // we've carefully laid out the parameters so that when sp is reset they'll be
3938 // in the correct location.
3939 if (IsTailCall && !IsSibCall) {
3940 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3941 InGlue = Chain.getValue(1);
3942 }
3943
3944 std::vector<SDValue> Ops({Chain});
3945
3946 // Add a redundant copy of the callee global which will not be legalized, as
3947 // we need direct access to the callee later.
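// Direct calls keep both the legalized callee and a TargetGlobalAddress
// operand; indirect calls get a zero placeholder for the latter.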
3948 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3949 const GlobalValue *GV = GSD->getGlobal();
3950 Ops.push_back(Callee);
3951 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3952 } else {
3953 if (IsTailCall) {
3954 // isEligibleForTailCallOptimization considered whether the call target is
3955 // divergent, but we may still end up with a uniform value in a VGPR.
3956 // Insert a readfirstlane just in case.
3957 SDValue ReadFirstLaneID =
3958 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3959
3960 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
3961 if (TokenGlue)
3962 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
3963 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
3964 ReadfirstlaneArgs);
3965 }
3966
3967 Ops.push_back(Callee);
3968 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3969 }
3970
3971 if (IsTailCall) {
3972 // Each tail call may have to adjust the stack by a different amount, so
3973 // this information must travel along with the operation for eventual
3974 // consumption by emitEpilogue.
3975 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3976 }
3977
3978 if (IsChainCallConv)
3979 Ops.push_back(RequestedExec.Node);
3980
3981 // Add argument registers to the end of the list so that they are known live
3982 // into the call.
3983 for (auto &[Reg, Val] : RegsToPass)
3984 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
3985
3986 // Add a register mask operand representing the call-preserved registers.
3987 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3988 assert(Mask && "Missing call preserved mask for calling convention");
3989 Ops.push_back(DAG.getRegisterMask(Mask));
3990
3991 if (SDValue Token = CLI.ConvergenceControlToken) {
3992 SmallVector<SDValue, 2> GlueOps;
3993 GlueOps.push_back(Token);
3994 if (InGlue)
3995 GlueOps.push_back(InGlue);
3996
3997 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3998 MVT::Glue, GlueOps),
3999 0);
4000 }
4001
4002 if (InGlue)
4003 Ops.push_back(InGlue);
4004
4005 // If we're doing a tail call, use a TC_RETURN here rather than an
4006 // actual call instruction.
4007 if (IsTailCall) {
4008 MFI.setHasTailCall();
4009 unsigned OPC = AMDGPUISD::TC_RETURN;
4010 switch (CallConv) {
4011 case CallingConv::AMDGPU_Gfx:
4012 OPC = AMDGPUISD::TC_RETURN_GFX;
4013 break;
4014 case CallingConv::AMDGPU_CS_Chain:
4015 case CallingConv::AMDGPU_CS_ChainPreserve:
4016 OPC = AMDGPUISD::TC_RETURN_CHAIN;
4017 break;
4018 }
4019
4020 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4021 }
4022
4023 // Returns a chain and a flag for retval copy to use.
4024 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4025 Chain = Call.getValue(0);
4026 InGlue = Call.getValue(1);
4027
4028 uint64_t CalleePopBytes = NumBytes;
4029 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4030 if (!Ins.empty())
4031 InGlue = Chain.getValue(1);
4032
4033 // Handle result values, copying them out of physregs into vregs that we
4034 // return.
4035 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4036 InVals, /*IsThisReturn=*/false, SDValue());
4037 }
4038
4039 // This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4040 // except for:
4041 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4042 // 2.
Scale size where, scale = wave-reduction(alloca-size) * wave-size 4043 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 4044 SelectionDAG &DAG) const { 4045 const MachineFunction &MF = DAG.getMachineFunction(); 4046 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 4047 4048 SDLoc dl(Op); 4049 EVT VT = Op.getValueType(); 4050 SDValue Chain = Op.getOperand(0); 4051 Register SPReg = Info->getStackPtrOffsetReg(); 4052 4053 // Chain the dynamic stack allocation so that it doesn't modify the stack 4054 // pointer when other instructions are using the stack. 4055 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); 4056 4057 SDValue Size = Op.getOperand(1); 4058 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT); 4059 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue(); 4060 4061 const TargetFrameLowering *TFL = Subtarget->getFrameLowering(); 4062 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp && 4063 "Stack grows upwards for AMDGPU"); 4064 4065 Chain = BaseAddr.getValue(1); 4066 Align StackAlign = TFL->getStackAlign(); 4067 if (Alignment > StackAlign) { 4068 uint64_t ScaledAlignment = (uint64_t)Alignment.value() 4069 << Subtarget->getWavefrontSizeLog2(); 4070 uint64_t StackAlignMask = ScaledAlignment - 1; 4071 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, 4072 DAG.getConstant(StackAlignMask, dl, VT)); 4073 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr, 4074 DAG.getSignedConstant(-ScaledAlignment, dl, VT)); 4075 } 4076 4077 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit"); 4078 SDValue NewSP; 4079 if (isa<ConstantSDNode>(Size)) { 4080 // For constant sized alloca, scale alloca size by wave-size 4081 SDValue ScaledSize = DAG.getNode( 4082 ISD::SHL, dl, VT, Size, 4083 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32)); 4084 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value 4085 } else { 4086 // For dynamic sized alloca, perform wave-wide reduction to get max of 4087 // alloca size(divergent) and then scale it by wave-size 4088 SDValue WaveReduction = 4089 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32); 4090 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction, 4091 Size, DAG.getConstant(0, dl, MVT::i32)); 4092 SDValue ScaledSize = DAG.getNode( 4093 ISD::SHL, dl, VT, Size, 4094 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32)); 4095 NewSP = 4096 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr. 4097 SDValue ReadFirstLaneID = 4098 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32); 4099 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID, 4100 NewSP); 4101 } 4102 4103 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain 4104 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); 4105 4106 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl); 4107 } 4108 4109 SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const { 4110 if (Op.getValueType() != MVT::i32) 4111 return Op; // Defer to cannot select error. 4112 4113 Register SP = getStackPointerRegisterToSaveRestore(); 4114 SDLoc SL(Op); 4115 4116 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32); 4117 4118 // Convert from wave uniform to swizzled vector address. This should protect 4119 // from any edge cases where the stacksave result isn't directly used with 4120 // stackrestore. 
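// The stack pointer counts bytes for the entire wave (see the wave-size
// scaling in LowerDYNAMIC_STACKALLOC above), while the stacksave result is
// used as a per-lane address, hence the conversion below.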
4121 SDValue VectorAddress = 4122 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP); 4123 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL); 4124 } 4125 4126 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op, 4127 SelectionDAG &DAG) const { 4128 SDLoc SL(Op); 4129 assert(Op.getValueType() == MVT::i32); 4130 4131 uint32_t BothRoundHwReg = 4132 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4); 4133 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32); 4134 4135 SDValue IntrinID = 4136 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32); 4137 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(), 4138 Op.getOperand(0), IntrinID, GetRoundBothImm); 4139 4140 // There are two rounding modes, one for f32 and one for f64/f16. We only 4141 // report in the standard value range if both are the same. 4142 // 4143 // The raw values also differ from the expected FLT_ROUNDS values. Nearest 4144 // ties away from zero is not supported, and the other values are rotated by 4145 // 1. 4146 // 4147 // If the two rounding modes are not the same, report a target defined value. 4148 4149 // Mode register rounding mode fields: 4150 // 4151 // [1:0] Single-precision round mode. 4152 // [3:2] Double/Half-precision round mode. 4153 // 4154 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero. 4155 // 4156 // Hardware Spec 4157 // Toward-0 3 0 4158 // Nearest Even 0 1 4159 // +Inf 1 2 4160 // -Inf 2 3 4161 // NearestAway0 N/A 4 4162 // 4163 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit 4164 // table we can index by the raw hardware mode. 4165 // 4166 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf 4167 4168 SDValue BitTable = 4169 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64); 4170 4171 SDValue Two = DAG.getConstant(2, SL, MVT::i32); 4172 SDValue RoundModeTimesNumBits = 4173 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two); 4174 4175 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we 4176 // knew only one mode was demanded. 4177 SDValue TableValue = 4178 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits); 4179 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue); 4180 4181 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32); 4182 SDValue TableEntry = 4183 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask); 4184 4185 // There's a gap in the 4-bit encoded table and actual enum values, so offset 4186 // if it's an extended value. 4187 SDValue Four = DAG.getConstant(4, SL, MVT::i32); 4188 SDValue IsStandardValue = 4189 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT); 4190 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four); 4191 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue, 4192 TableEntry, EnumOffset); 4193 4194 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL); 4195 } 4196 4197 SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op, 4198 SelectionDAG &DAG) const { 4199 SDLoc SL(Op); 4200 4201 SDValue NewMode = Op.getOperand(1); 4202 assert(NewMode.getValueType() == MVT::i32); 4203 4204 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the 4205 // hardware MODE.fp_round values. 
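// For example, FLT_ROUNDS 1 (round to nearest even for both widths) maps to
// the hardware nibble 0b0000, while FLT_ROUNDS 0 (toward zero) maps to
// 0b1111, per the field encoding described in lowerGET_ROUNDING above.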
4206 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) { 4207 uint32_t ClampedVal = std::min( 4208 static_cast<uint32_t>(ConstMode->getZExtValue()), 4209 static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64)); 4210 NewMode = DAG.getConstant( 4211 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32); 4212 } else { 4213 // If we know the input can only be one of the supported standard modes in 4214 // the range 0-3, we can use a simplified mapping to hardware values. 4215 KnownBits KB = DAG.computeKnownBits(NewMode); 4216 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30; 4217 // The supported standard values are 0-3. The extended values start at 8. We 4218 // need to offset by 4 if the value is in the extended range. 4219 4220 if (UseReducedTable) { 4221 // Truncate to the low 32-bits. 4222 SDValue BitTable = DAG.getConstant( 4223 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32); 4224 4225 SDValue Two = DAG.getConstant(2, SL, MVT::i32); 4226 SDValue RoundModeTimesNumBits = 4227 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two); 4228 4229 NewMode = 4230 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits); 4231 4232 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce 4233 // the table extracted bits into inline immediates. 4234 } else { 4235 // table_index = umin(value, value - 4) 4236 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf 4237 SDValue BitTable = 4238 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64); 4239 4240 SDValue Four = DAG.getConstant(4, SL, MVT::i32); 4241 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four); 4242 SDValue IndexVal = 4243 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum); 4244 4245 SDValue Two = DAG.getConstant(2, SL, MVT::i32); 4246 SDValue RoundModeTimesNumBits = 4247 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two); 4248 4249 SDValue TableValue = 4250 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits); 4251 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue); 4252 4253 // No need to mask out the high bits since the setreg will ignore them 4254 // anyway. 4255 NewMode = TruncTable; 4256 } 4257 4258 // Insert a readfirstlane in case the value is a VGPR. We could do this 4259 // earlier and keep more operations scalar, but that interferes with 4260 // combining the source. 4261 SDValue ReadFirstLaneID = 4262 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32); 4263 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, 4264 ReadFirstLaneID, NewMode); 4265 } 4266 4267 // N.B. The setreg will be later folded into s_round_mode on supported 4268 // targets. 
4269 SDValue IntrinID = 4270 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32); 4271 uint32_t BothRoundHwReg = 4272 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4); 4273 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32); 4274 4275 SDValue SetReg = 4276 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0), 4277 IntrinID, RoundBothImm, NewMode); 4278 4279 return SetReg; 4280 } 4281 4282 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { 4283 if (Op->isDivergent()) 4284 return SDValue(); 4285 4286 switch (cast<MemSDNode>(Op)->getAddressSpace()) { 4287 case AMDGPUAS::FLAT_ADDRESS: 4288 case AMDGPUAS::GLOBAL_ADDRESS: 4289 case AMDGPUAS::CONSTANT_ADDRESS: 4290 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 4291 break; 4292 default: 4293 return SDValue(); 4294 } 4295 4296 return Op; 4297 } 4298 4299 // Work around DAG legality rules only based on the result type. 4300 SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 4301 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND; 4302 SDValue Src = Op.getOperand(IsStrict ? 1 : 0); 4303 EVT SrcVT = Src.getValueType(); 4304 4305 if (SrcVT.getScalarType() != MVT::bf16) 4306 return Op; 4307 4308 SDLoc SL(Op); 4309 SDValue BitCast = 4310 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src); 4311 4312 EVT DstVT = Op.getValueType(); 4313 if (IsStrict) 4314 llvm_unreachable("Need STRICT_BF16_TO_FP"); 4315 4316 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast); 4317 } 4318 4319 SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const { 4320 SDLoc SL(Op); 4321 if (Op.getValueType() != MVT::i64) 4322 return Op; 4323 4324 uint32_t ModeHwReg = 4325 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23); 4326 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32); 4327 uint32_t TrapHwReg = 4328 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5); 4329 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32); 4330 4331 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other); 4332 SDValue IntrinID = 4333 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32); 4334 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList, 4335 Op.getOperand(0), IntrinID, ModeHwRegImm); 4336 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList, 4337 Op.getOperand(0), IntrinID, TrapHwRegImm); 4338 SDValue TokenReg = 4339 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1), 4340 GetTrapReg.getValue(1)); 4341 4342 SDValue CvtPtr = 4343 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg); 4344 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr); 4345 4346 return DAG.getMergeValues({Result, TokenReg}, SL); 4347 } 4348 4349 SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const { 4350 SDLoc SL(Op); 4351 if (Op.getOperand(1).getValueType() != MVT::i64) 4352 return Op; 4353 4354 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1)); 4355 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input, 4356 DAG.getConstant(0, SL, MVT::i32)); 4357 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input, 4358 DAG.getConstant(1, SL, MVT::i32)); 4359 4360 SDValue ReadFirstLaneID = 4361 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32); 4362 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, 
MVT::i32, 4363 ReadFirstLaneID, NewModeReg); 4364 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, 4365 ReadFirstLaneID, NewTrapReg); 4366 4367 unsigned ModeHwReg = 4368 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23); 4369 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32); 4370 unsigned TrapHwReg = 4371 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5); 4372 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32); 4373 4374 SDValue IntrinID = 4375 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32); 4376 SDValue SetModeReg = 4377 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0), 4378 IntrinID, ModeHwRegImm, NewModeReg); 4379 SDValue SetTrapReg = 4380 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0), 4381 IntrinID, TrapHwRegImm, NewTrapReg); 4382 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg); 4383 } 4384 4385 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT, 4386 const MachineFunction &MF) const { 4387 Register Reg = StringSwitch<Register>(RegName) 4388 .Case("m0", AMDGPU::M0) 4389 .Case("exec", AMDGPU::EXEC) 4390 .Case("exec_lo", AMDGPU::EXEC_LO) 4391 .Case("exec_hi", AMDGPU::EXEC_HI) 4392 .Case("flat_scratch", AMDGPU::FLAT_SCR) 4393 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) 4394 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) 4395 .Default(Register()); 4396 4397 if (Reg == AMDGPU::NoRegister) { 4398 report_fatal_error( 4399 Twine("invalid register name \"" + StringRef(RegName) + "\".")); 4400 } 4401 4402 if (!Subtarget->hasFlatScrRegister() && 4403 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) { 4404 report_fatal_error(Twine("invalid register \"" + StringRef(RegName) + 4405 "\" for subtarget.")); 4406 } 4407 4408 switch (Reg) { 4409 case AMDGPU::M0: 4410 case AMDGPU::EXEC_LO: 4411 case AMDGPU::EXEC_HI: 4412 case AMDGPU::FLAT_SCR_LO: 4413 case AMDGPU::FLAT_SCR_HI: 4414 if (VT.getSizeInBits() == 32) 4415 return Reg; 4416 break; 4417 case AMDGPU::EXEC: 4418 case AMDGPU::FLAT_SCR: 4419 if (VT.getSizeInBits() == 64) 4420 return Reg; 4421 break; 4422 default: 4423 llvm_unreachable("missing register type checking"); 4424 } 4425 4426 report_fatal_error( 4427 Twine("invalid type for register \"" + StringRef(RegName) + "\".")); 4428 } 4429 4430 // If kill is not the last instruction, split the block so kill is always a 4431 // proper terminator. 4432 MachineBasicBlock * 4433 SITargetLowering::splitKillBlock(MachineInstr &MI, 4434 MachineBasicBlock *BB) const { 4435 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/); 4436 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4437 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode())); 4438 return SplitBB; 4439 } 4440 4441 // Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true, 4442 // \p MI will be the only instruction in the loop body block. Otherwise, it will 4443 // be the first instruction in the remainder block. 4444 // 4445 /// \returns { LoopBody, Remainder } 4446 static std::pair<MachineBasicBlock *, MachineBasicBlock *> 4447 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) { 4448 MachineFunction *MF = MBB.getParent(); 4449 MachineBasicBlock::iterator I(&MI); 4450 4451 // To insert the loop we need to split the block. Move everything after this 4452 // point to a new block, and insert a new empty block between the two. 
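// The resulting control flow is:
//   MBB -> LoopBB -> RemainderBB
// with LoopBB also branching back to itself until the loop condition
// inserted by the caller is satisfied.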
4453 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); 4454 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); 4455 MachineFunction::iterator MBBI(MBB); 4456 ++MBBI; 4457 4458 MF->insert(MBBI, LoopBB); 4459 MF->insert(MBBI, RemainderBB); 4460 4461 LoopBB->addSuccessor(LoopBB); 4462 LoopBB->addSuccessor(RemainderBB); 4463 4464 // Move the rest of the block into a new block. 4465 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 4466 4467 if (InstInLoop) { 4468 auto Next = std::next(I); 4469 4470 // Move instruction to loop body. 4471 LoopBB->splice(LoopBB->begin(), &MBB, I, Next); 4472 4473 // Move the rest of the block. 4474 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end()); 4475 } else { 4476 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); 4477 } 4478 4479 MBB.addSuccessor(LoopBB); 4480 4481 return std::pair(LoopBB, RemainderBB); 4482 } 4483 4484 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it. 4485 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const { 4486 MachineBasicBlock *MBB = MI.getParent(); 4487 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4488 auto I = MI.getIterator(); 4489 auto E = std::next(I); 4490 4491 // clang-format off 4492 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) 4493 .addImm(0); 4494 // clang-format on 4495 4496 MIBundleBuilder Bundler(*MBB, I, E); 4497 finalizeBundle(*MBB, Bundler.begin()); 4498 } 4499 4500 MachineBasicBlock * 4501 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, 4502 MachineBasicBlock *BB) const { 4503 const DebugLoc &DL = MI.getDebugLoc(); 4504 4505 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 4506 4507 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 4508 4509 // Apparently kill flags are only valid if the def is in the same block? 4510 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) 4511 Src->setIsKill(false); 4512 4513 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true); 4514 4515 MachineBasicBlock::iterator I = LoopBB->end(); 4516 4517 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode( 4518 AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1); 4519 4520 // Clear TRAP_STS.MEM_VIOL 4521 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32)) 4522 .addImm(0) 4523 .addImm(EncodedReg); 4524 4525 bundleInstWithWaitcnt(MI); 4526 4527 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4528 4529 // Load and check TRAP_STS.MEM_VIOL 4530 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg) 4531 .addImm(EncodedReg); 4532 4533 // FIXME: Do we need to use an isel pseudo that may clobber scc? 4534 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)) 4535 .addReg(Reg, RegState::Kill) 4536 .addImm(0); 4537 // clang-format off 4538 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) 4539 .addMBB(LoopBB); 4540 // clang-format on 4541 4542 return RemainderBB; 4543 } 4544 4545 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the 4546 // wavefront. If the value is uniform and just happens to be in a VGPR, this 4547 // will only do one iteration. In the worst case, this will loop 64 times. 4548 // 4549 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value. 
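// Each iteration reads one lane's index with V_READFIRSTLANE_B32, selects
// every active lane holding the same index via V_CMP_EQ_U32 and
// S_AND_SAVEEXEC, moves that index (plus any offset) into M0 or the SGPR
// index register, then clears those lanes from EXEC with S_XOR and branches
// back while any lanes remain active.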
4550 static MachineBasicBlock::iterator 4551 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, 4552 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, 4553 const DebugLoc &DL, const MachineOperand &Idx, 4554 unsigned InitReg, unsigned ResultReg, unsigned PhiReg, 4555 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, 4556 Register &SGPRIdxReg) { 4557 4558 MachineFunction *MF = OrigBB.getParent(); 4559 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 4560 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 4561 MachineBasicBlock::iterator I = LoopBB.begin(); 4562 4563 const TargetRegisterClass *BoolRC = TRI->getBoolRC(); 4564 Register PhiExec = MRI.createVirtualRegister(BoolRC); 4565 Register NewExec = MRI.createVirtualRegister(BoolRC); 4566 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4567 Register CondReg = MRI.createVirtualRegister(BoolRC); 4568 4569 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) 4570 .addReg(InitReg) 4571 .addMBB(&OrigBB) 4572 .addReg(ResultReg) 4573 .addMBB(&LoopBB); 4574 4575 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec) 4576 .addReg(InitSaveExecReg) 4577 .addMBB(&OrigBB) 4578 .addReg(NewExec) 4579 .addMBB(&LoopBB); 4580 4581 // Read the next variant <- also loop target. 4582 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg) 4583 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef())); 4584 4585 // Compare the just read M0 value to all possible Idx values. 4586 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg) 4587 .addReg(CurrentIdxReg) 4588 .addReg(Idx.getReg(), 0, Idx.getSubReg()); 4589 4590 // Update EXEC, save the original EXEC value to VCC. 4591 BuildMI(LoopBB, I, DL, 4592 TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 4593 : AMDGPU::S_AND_SAVEEXEC_B64), 4594 NewExec) 4595 .addReg(CondReg, RegState::Kill); 4596 4597 MRI.setSimpleHint(NewExec, CondReg); 4598 4599 if (UseGPRIdxMode) { 4600 if (Offset == 0) { 4601 SGPRIdxReg = CurrentIdxReg; 4602 } else { 4603 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 4604 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg) 4605 .addReg(CurrentIdxReg, RegState::Kill) 4606 .addImm(Offset); 4607 } 4608 } else { 4609 // Move index from VCC into M0 4610 if (Offset == 0) { 4611 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) 4612 .addReg(CurrentIdxReg, RegState::Kill); 4613 } else { 4614 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) 4615 .addReg(CurrentIdxReg, RegState::Kill) 4616 .addImm(Offset); 4617 } 4618 } 4619 4620 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 4621 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 4622 MachineInstr *InsertPt = 4623 BuildMI(LoopBB, I, DL, 4624 TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term 4625 : AMDGPU::S_XOR_B64_term), 4626 Exec) 4627 .addReg(Exec) 4628 .addReg(NewExec); 4629 4630 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use 4631 // s_cbranch_scc0? 4632 4633 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. 4634 // clang-format off 4635 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) 4636 .addMBB(&LoopBB); 4637 // clang-format on 4638 4639 return InsertPt->getIterator(); 4640 } 4641 4642 // This has slightly sub-optimal regalloc when the source vector is killed by 4643 // the read. 
The register allocator does not understand that the kill is 4644 // per-workitem, so the value is kept alive for the whole loop and we end up not 4645 // reusing a subregister from it, using one more VGPR than necessary. This extra 4646 // register was saved back when this was expanded after register allocation. 4647 static MachineBasicBlock::iterator 4648 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, 4649 unsigned InitResultReg, unsigned PhiReg, int Offset, 4650 bool UseGPRIdxMode, Register &SGPRIdxReg) { 4651 MachineFunction *MF = MBB.getParent(); 4652 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 4653 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 4654 MachineRegisterInfo &MRI = MF->getRegInfo(); 4655 const DebugLoc &DL = MI.getDebugLoc(); 4656 MachineBasicBlock::iterator I(&MI); 4657 4658 const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); 4659 Register DstReg = MI.getOperand(0).getReg(); 4660 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); 4661 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC); 4662 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 4663 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 4664 4665 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec); 4666 4667 // Save the EXEC mask 4668 // clang-format off 4669 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec) 4670 .addReg(Exec); 4671 // clang-format on 4672 4673 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false); 4674 4675 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 4676 4677 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx, 4678 InitResultReg, DstReg, PhiReg, TmpExec, 4679 Offset, UseGPRIdxMode, SGPRIdxReg); 4680 4681 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock(); 4682 MachineFunction::iterator MBBI(LoopBB); 4683 ++MBBI; 4684 MF->insert(MBBI, LandingPad); 4685 LoopBB->removeSuccessor(RemainderBB); 4686 LandingPad->addSuccessor(RemainderBB); 4687 LoopBB->addSuccessor(LandingPad); 4688 MachineBasicBlock::iterator First = LandingPad->begin(); 4689 // clang-format off 4690 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec) 4691 .addReg(SaveExec); 4692 // clang-format on 4693 4694 return InsPt; 4695 } 4696 4697 // Returns subreg index, offset 4698 static std::pair<unsigned, int> 4699 computeIndirectRegAndOffset(const SIRegisterInfo &TRI, 4700 const TargetRegisterClass *SuperRC, unsigned VecReg, 4701 int Offset) { 4702 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32; 4703 4704 // Skip out of bounds offsets, or else we would end up using an undefined 4705 // register.
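// For example (illustrative), with a 128-bit (4 x 32-bit) super register
// class: Offset == 2 yields { sub2, 0 }, while Offset == 7 is out of bounds
// and yields { sub0, 7 }, leaving the offset to be applied dynamically by the
// caller.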
4706 if (Offset >= NumElts || Offset < 0) 4707 return std::pair(AMDGPU::sub0, Offset); 4708 4709 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0); 4710 } 4711 4712 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, 4713 MachineRegisterInfo &MRI, MachineInstr &MI, 4714 int Offset) { 4715 MachineBasicBlock *MBB = MI.getParent(); 4716 const DebugLoc &DL = MI.getDebugLoc(); 4717 MachineBasicBlock::iterator I(&MI); 4718 4719 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 4720 4721 assert(Idx->getReg() != AMDGPU::NoRegister); 4722 4723 if (Offset == 0) { 4724 // clang-format off 4725 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) 4726 .add(*Idx); 4727 // clang-format on 4728 } else { 4729 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) 4730 .add(*Idx) 4731 .addImm(Offset); 4732 } 4733 } 4734 4735 static Register getIndirectSGPRIdx(const SIInstrInfo *TII, 4736 MachineRegisterInfo &MRI, MachineInstr &MI, 4737 int Offset) { 4738 MachineBasicBlock *MBB = MI.getParent(); 4739 const DebugLoc &DL = MI.getDebugLoc(); 4740 MachineBasicBlock::iterator I(&MI); 4741 4742 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 4743 4744 if (Offset == 0) 4745 return Idx->getReg(); 4746 4747 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4748 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) 4749 .add(*Idx) 4750 .addImm(Offset); 4751 return Tmp; 4752 } 4753 4754 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, 4755 MachineBasicBlock &MBB, 4756 const GCNSubtarget &ST) { 4757 const SIInstrInfo *TII = ST.getInstrInfo(); 4758 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 4759 MachineFunction *MF = MBB.getParent(); 4760 MachineRegisterInfo &MRI = MF->getRegInfo(); 4761 4762 Register Dst = MI.getOperand(0).getReg(); 4763 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 4764 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); 4765 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); 4766 4767 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg); 4768 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); 4769 4770 unsigned SubReg; 4771 std::tie(SubReg, Offset) = 4772 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset); 4773 4774 const bool UseGPRIdxMode = ST.useVGPRIndexMode(); 4775 4776 // Check for a SGPR index. 4777 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) { 4778 MachineBasicBlock::iterator I(&MI); 4779 const DebugLoc &DL = MI.getDebugLoc(); 4780 4781 if (UseGPRIdxMode) { 4782 // TODO: Look at the uses to avoid the copy. This may require rescheduling 4783 // to avoid interfering with other uses, so probably requires a new 4784 // optimization pass. 4785 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset); 4786 4787 const MCInstrDesc &GPRIDXDesc = 4788 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true); 4789 BuildMI(MBB, I, DL, GPRIDXDesc, Dst) 4790 .addReg(SrcReg) 4791 .addReg(Idx) 4792 .addImm(SubReg); 4793 } else { 4794 setM0ToIndexFromSGPR(TII, MRI, MI, Offset); 4795 4796 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) 4797 .addReg(SrcReg, 0, SubReg) 4798 .addReg(SrcReg, RegState::Implicit); 4799 } 4800 4801 MI.eraseFromParent(); 4802 4803 return &MBB; 4804 } 4805 4806 // Control flow needs to be inserted if indexing with a VGPR. 
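// loadM0FromVGPR (above) wraps the access in a waterfall loop: the indexed
// read emitted below lands inside that loop, so it executes once per unique
// index value present in the wave.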
4807 const DebugLoc &DL = MI.getDebugLoc(); 4808 MachineBasicBlock::iterator I(&MI); 4809 4810 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4811 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4812 4813 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); 4814 4815 Register SGPRIdxReg; 4816 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, 4817 UseGPRIdxMode, SGPRIdxReg); 4818 4819 MachineBasicBlock *LoopBB = InsPt->getParent(); 4820 4821 if (UseGPRIdxMode) { 4822 const MCInstrDesc &GPRIDXDesc = 4823 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true); 4824 4825 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst) 4826 .addReg(SrcReg) 4827 .addReg(SGPRIdxReg) 4828 .addImm(SubReg); 4829 } else { 4830 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) 4831 .addReg(SrcReg, 0, SubReg) 4832 .addReg(SrcReg, RegState::Implicit); 4833 } 4834 4835 MI.eraseFromParent(); 4836 4837 return LoopBB; 4838 } 4839 4840 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, 4841 MachineBasicBlock &MBB, 4842 const GCNSubtarget &ST) { 4843 const SIInstrInfo *TII = ST.getInstrInfo(); 4844 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 4845 MachineFunction *MF = MBB.getParent(); 4846 MachineRegisterInfo &MRI = MF->getRegInfo(); 4847 4848 Register Dst = MI.getOperand(0).getReg(); 4849 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); 4850 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); 4851 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); 4852 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); 4853 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg()); 4854 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); 4855 4856 // This can be an immediate, but will be folded later. 4857 assert(Val->getReg()); 4858 4859 unsigned SubReg; 4860 std::tie(SubReg, Offset) = 4861 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset); 4862 const bool UseGPRIdxMode = ST.useVGPRIndexMode(); 4863 4864 if (Idx->getReg() == AMDGPU::NoRegister) { 4865 MachineBasicBlock::iterator I(&MI); 4866 const DebugLoc &DL = MI.getDebugLoc(); 4867 4868 assert(Offset == 0); 4869 4870 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst) 4871 .add(*SrcVec) 4872 .add(*Val) 4873 .addImm(SubReg); 4874 4875 MI.eraseFromParent(); 4876 return &MBB; 4877 } 4878 4879 // Check for a SGPR index. 4880 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) { 4881 MachineBasicBlock::iterator I(&MI); 4882 const DebugLoc &DL = MI.getDebugLoc(); 4883 4884 if (UseGPRIdxMode) { 4885 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset); 4886 4887 const MCInstrDesc &GPRIDXDesc = 4888 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); 4889 BuildMI(MBB, I, DL, GPRIDXDesc, Dst) 4890 .addReg(SrcVec->getReg()) 4891 .add(*Val) 4892 .addReg(Idx) 4893 .addImm(SubReg); 4894 } else { 4895 setM0ToIndexFromSGPR(TII, MRI, MI, Offset); 4896 4897 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo( 4898 TRI.getRegSizeInBits(*VecRC), 32, false); 4899 BuildMI(MBB, I, DL, MovRelDesc, Dst) 4900 .addReg(SrcVec->getReg()) 4901 .add(*Val) 4902 .addImm(SubReg); 4903 } 4904 MI.eraseFromParent(); 4905 return &MBB; 4906 } 4907 4908 // Control flow needs to be inserted if indexing with a VGPR. 
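// As in emitIndirectSrc, the write below ends up inside a waterfall loop.
// Val is then read on every iteration of that loop, so a kill flag on it
// (valid for a single use only) would be stale and is cleared first.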
if (Val->isReg()) 4910 MRI.clearKillFlags(Val->getReg()); 4911 4912 const DebugLoc &DL = MI.getDebugLoc(); 4913 4914 Register PhiReg = MRI.createVirtualRegister(VecRC); 4915 4916 Register SGPRIdxReg; 4917 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset, 4918 UseGPRIdxMode, SGPRIdxReg); 4919 MachineBasicBlock *LoopBB = InsPt->getParent(); 4920 4921 if (UseGPRIdxMode) { 4922 const MCInstrDesc &GPRIDXDesc = 4923 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); 4924 4925 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst) 4926 .addReg(PhiReg) 4927 .add(*Val) 4928 .addReg(SGPRIdxReg) 4929 .addImm(SubReg); 4930 } else { 4931 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo( 4932 TRI.getRegSizeInBits(*VecRC), 32, false); 4933 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst) 4934 .addReg(PhiReg) 4935 .add(*Val) 4936 .addImm(SubReg); 4937 } 4938 4939 MI.eraseFromParent(); 4940 return LoopBB; 4941 } 4942 4943 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, 4944 MachineBasicBlock &BB, 4945 const GCNSubtarget &ST, 4946 unsigned Opc) { 4947 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo(); 4948 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 4949 const DebugLoc &DL = MI.getDebugLoc(); 4950 const SIInstrInfo *TII = ST.getInstrInfo(); 4951 4952 // Reduction operations depend on whether the input operand is SGPR or VGPR. 4953 Register SrcReg = MI.getOperand(1).getReg(); 4954 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg)); 4955 Register DstReg = MI.getOperand(0).getReg(); 4956 MachineBasicBlock *RetBB = nullptr; 4957 if (isSGPR) { 4958 // These operations with a uniform value (i.e. an SGPR input) are idempotent: 4959 // the reduced value is the same as the given SGPR. 4960 // clang-format off 4961 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg) 4962 .addReg(SrcReg); 4963 // clang-format on 4964 RetBB = &BB; 4965 } else { 4966 // TODO: Implement the DPP strategy and switch based on the immediate strategy 4967 // operand. For now, for all the cases (Default, Iterative and DPP) we use the 4968 // iterative approach by default. 4969 4970 // To reduce the VGPR with the iterative approach, we need to iterate 4971 // over all the active lanes. The lowering consists of a ComputeLoop, 4972 // which iterates over only the active lanes. A copy of the EXEC register is 4973 // used as the induction variable; every active lane clears its own bit with bitset0 4974 // so that the next iteration picks up the next active lane. 4975 MachineBasicBlock::iterator I = BB.end(); 4976 Register SrcReg = MI.getOperand(1).getReg(); 4977 4978 // Create control flow for the loop: 4979 // split MI's machine basic block into the loop body and its continuation. 4980 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true); 4981 4982 // Create virtual registers required for lowering. 4983 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); 4984 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); 4985 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass); 4986 Register InitalValReg = MRI.createVirtualRegister(DstRegClass); 4987 4988 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass); 4989 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); 4990 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); 4991 4992 Register FF1Reg = MRI.createVirtualRegister(DstRegClass); 4993 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass); 4994 4995 bool IsWave32 = ST.isWave32(); 4996 unsigned MovOpc = IsWave32 ?
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 4997 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 4998 4999 // Create the initial values of the induction variable (from EXEC) and of the 5000 // accumulator, and insert a branch to the newly created ComputeLoop block. 5001 uint32_t InitalValue = 5002 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0; 5003 auto TmpSReg = 5004 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg); 5005 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg) 5006 .addImm(InitalValue); 5007 // clang-format off 5008 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)) 5009 .addMBB(ComputeLoop); 5010 // clang-format on 5011 5012 // Start constructing ComputeLoop 5013 I = ComputeLoop->end(); 5014 auto Accumulator = 5015 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg) 5016 .addReg(InitalValReg) 5017 .addMBB(&BB); 5018 auto ActiveBits = 5019 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg) 5020 .addReg(TmpSReg->getOperand(0).getReg()) 5021 .addMBB(&BB); 5022 5023 // Perform the computations 5024 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64; 5025 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg) 5026 .addReg(ActiveBits->getOperand(0).getReg()); 5027 auto LaneValue = BuildMI(*ComputeLoop, I, DL, 5028 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg) 5029 .addReg(SrcReg) 5030 .addReg(FF1->getOperand(0).getReg()); 5031 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) 5032 .addReg(Accumulator->getOperand(0).getReg()) 5033 .addReg(LaneValue->getOperand(0).getReg()); 5034 5035 // Manipulate the iterator to get the next active lane 5036 unsigned BITSETOpc = 5037 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64; 5038 auto NewActiveBits = 5039 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg) 5040 .addReg(FF1->getOperand(0).getReg()) 5041 .addReg(ActiveBits->getOperand(0).getReg()); 5042 5043 // Complete the PHI nodes with their loop back-edge values 5044 Accumulator.addReg(NewAccumulator->getOperand(0).getReg()) 5045 .addMBB(ComputeLoop); 5046 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg()) 5047 .addMBB(ComputeLoop); 5048 5049 // Create the loop branch: loop back while there are still active lanes left. 5050 unsigned CMPOpc = IsWave32 ?
AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64; 5051 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc)) 5052 .addReg(NewActiveBits->getOperand(0).getReg()) 5053 .addImm(0); 5054 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) 5055 .addMBB(ComputeLoop); 5056 5057 RetBB = ComputeEnd; 5058 } 5059 MI.eraseFromParent(); 5060 return RetBB; 5061 } 5062 5063 MachineBasicBlock * 5064 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 5065 MachineBasicBlock *BB) const { 5066 5067 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 5068 MachineFunction *MF = BB->getParent(); 5069 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 5070 5071 switch (MI.getOpcode()) { 5072 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32: 5073 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32); 5074 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32: 5075 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32); 5076 case AMDGPU::S_UADDO_PSEUDO: 5077 case AMDGPU::S_USUBO_PSEUDO: { 5078 const DebugLoc &DL = MI.getDebugLoc(); 5079 MachineOperand &Dest0 = MI.getOperand(0); 5080 MachineOperand &Dest1 = MI.getOperand(1); 5081 MachineOperand &Src0 = MI.getOperand(2); 5082 MachineOperand &Src1 = MI.getOperand(3); 5083 5084 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO) 5085 ? AMDGPU::S_ADD_I32 5086 : AMDGPU::S_SUB_I32; 5087 // clang-format off 5088 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()) 5089 .add(Src0) 5090 .add(Src1); 5091 // clang-format on 5092 5093 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg()) 5094 .addImm(1) 5095 .addImm(0); 5096 5097 MI.eraseFromParent(); 5098 return BB; 5099 } 5100 case AMDGPU::S_ADD_U64_PSEUDO: 5101 case AMDGPU::S_SUB_U64_PSEUDO: { 5102 // For targets older than GFX12, we emit a sequence of 32-bit operations. 5103 // For GFX12, we emit s_add_u64 and s_sub_u64. 5104 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 5105 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5106 const DebugLoc &DL = MI.getDebugLoc(); 5107 MachineOperand &Dest = MI.getOperand(0); 5108 MachineOperand &Src0 = MI.getOperand(1); 5109 MachineOperand &Src1 = MI.getOperand(2); 5110 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); 5111 if (Subtarget->hasScalarAddSub64()) { 5112 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64; 5113 // clang-format off 5114 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg()) 5115 .add(Src0) 5116 .add(Src1); 5117 // clang-format on 5118 } else { 5119 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5120 const TargetRegisterClass *BoolRC = TRI->getBoolRC(); 5121 5122 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5123 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5124 5125 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( 5126 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); 5127 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( 5128 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); 5129 5130 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( 5131 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); 5132 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( 5133 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); 5134 5135 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 5136 unsigned HiOpc = IsAdd ? 
AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; 5137 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) 5138 .add(Src0Sub0) 5139 .add(Src1Sub0); 5140 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) 5141 .add(Src0Sub1) 5142 .add(Src1Sub1); 5143 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) 5144 .addReg(DestSub0) 5145 .addImm(AMDGPU::sub0) 5146 .addReg(DestSub1) 5147 .addImm(AMDGPU::sub1); 5148 } 5149 MI.eraseFromParent(); 5150 return BB; 5151 } 5152 case AMDGPU::V_ADD_U64_PSEUDO: 5153 case AMDGPU::V_SUB_U64_PSEUDO: { 5154 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5155 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 5156 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5157 const DebugLoc &DL = MI.getDebugLoc(); 5158 5159 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO); 5160 5161 MachineOperand &Dest = MI.getOperand(0); 5162 MachineOperand &Src0 = MI.getOperand(1); 5163 MachineOperand &Src1 = MI.getOperand(2); 5164 5165 if (IsAdd && ST.hasLshlAddB64()) { 5166 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64), 5167 Dest.getReg()) 5168 .add(Src0) 5169 .addImm(0) 5170 .add(Src1); 5171 TII->legalizeOperands(*Add); 5172 MI.eraseFromParent(); 5173 return BB; 5174 } 5175 5176 const auto *CarryRC = TRI->getWaveMaskRegClass(); 5177 5178 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5179 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5180 5181 Register CarryReg = MRI.createVirtualRegister(CarryRC); 5182 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); 5183 5184 const TargetRegisterClass *Src0RC = Src0.isReg() 5185 ? MRI.getRegClass(Src0.getReg()) 5186 : &AMDGPU::VReg_64RegClass; 5187 const TargetRegisterClass *Src1RC = Src1.isReg() 5188 ? MRI.getRegClass(Src1.getReg()) 5189 : &AMDGPU::VReg_64RegClass; 5190 5191 const TargetRegisterClass *Src0SubRC = 5192 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); 5193 const TargetRegisterClass *Src1SubRC = 5194 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1); 5195 5196 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm( 5197 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); 5198 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm( 5199 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); 5200 5201 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm( 5202 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); 5203 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm( 5204 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); 5205 5206 unsigned LoOpc = 5207 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64; 5208 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) 5209 .addReg(CarryReg, RegState::Define) 5210 .add(SrcReg0Sub0) 5211 .add(SrcReg1Sub0) 5212 .addImm(0); // clamp bit 5213 5214 unsigned HiOpc = IsAdd ? 
AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; 5215 MachineInstr *HiHalf = 5216 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) 5217 .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 5218 .add(SrcReg0Sub1) 5219 .add(SrcReg1Sub1) 5220 .addReg(CarryReg, RegState::Kill) 5221 .addImm(0); // clamp bit 5222 5223 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) 5224 .addReg(DestSub0) 5225 .addImm(AMDGPU::sub0) 5226 .addReg(DestSub1) 5227 .addImm(AMDGPU::sub1); 5228 TII->legalizeOperands(*LoHalf); 5229 TII->legalizeOperands(*HiHalf); 5230 MI.eraseFromParent(); 5231 return BB; 5232 } 5233 case AMDGPU::S_ADD_CO_PSEUDO: 5234 case AMDGPU::S_SUB_CO_PSEUDO: { 5235 // This pseudo has a chance to be selected 5236 // only from uniform add/subcarry node. All the VGPR operands 5237 // therefore assumed to be splat vectors. 5238 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5239 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 5240 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5241 MachineBasicBlock::iterator MII = MI; 5242 const DebugLoc &DL = MI.getDebugLoc(); 5243 MachineOperand &Dest = MI.getOperand(0); 5244 MachineOperand &CarryDest = MI.getOperand(1); 5245 MachineOperand &Src0 = MI.getOperand(2); 5246 MachineOperand &Src1 = MI.getOperand(3); 5247 MachineOperand &Src2 = MI.getOperand(4); 5248 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) 5249 ? AMDGPU::S_ADDC_U32 5250 : AMDGPU::S_SUBB_U32; 5251 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) { 5252 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5253 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0) 5254 .addReg(Src0.getReg()); 5255 Src0.setReg(RegOp0); 5256 } 5257 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) { 5258 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5259 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1) 5260 .addReg(Src1.getReg()); 5261 Src1.setReg(RegOp1); 5262 } 5263 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5264 if (TRI->isVectorRegister(MRI, Src2.getReg())) { 5265 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2) 5266 .addReg(Src2.getReg()); 5267 Src2.setReg(RegOp2); 5268 } 5269 5270 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg()); 5271 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC); 5272 assert(WaveSize == 64 || WaveSize == 32); 5273 5274 if (WaveSize == 64) { 5275 if (ST.hasScalarCompareEq64()) { 5276 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64)) 5277 .addReg(Src2.getReg()) 5278 .addImm(0); 5279 } else { 5280 const TargetRegisterClass *SubRC = 5281 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0); 5282 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm( 5283 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC); 5284 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm( 5285 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC); 5286 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5287 5288 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32) 5289 .add(Src2Sub0) 5290 .add(Src2Sub1); 5291 5292 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32)) 5293 .addReg(Src2_32, RegState::Kill) 5294 .addImm(0); 5295 } 5296 } else { 5297 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32)) 5298 .addReg(Src2.getReg()) 5299 .addImm(0); 5300 } 5301 5302 // clang-format off 5303 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()) 5304 .add(Src0) 5305 .add(Src1); 5306 // 
clang-format on 5307 5308 unsigned SelOpc = 5309 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; 5310 5311 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg()) 5312 .addImm(-1) 5313 .addImm(0); 5314 5315 MI.eraseFromParent(); 5316 return BB; 5317 } 5318 case AMDGPU::SI_INIT_M0: { 5319 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), 5320 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) 5321 .add(MI.getOperand(0)); 5322 MI.eraseFromParent(); 5323 return BB; 5324 } 5325 case AMDGPU::GET_GROUPSTATICSIZE: { 5326 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || 5327 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL); 5328 DebugLoc DL = MI.getDebugLoc(); 5329 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) 5330 .add(MI.getOperand(0)) 5331 .addImm(MFI->getLDSSize()); 5332 MI.eraseFromParent(); 5333 return BB; 5334 } 5335 case AMDGPU::GET_SHADERCYCLESHILO: { 5336 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters()); 5337 MachineRegisterInfo &MRI = MF->getRegInfo(); 5338 const DebugLoc &DL = MI.getDebugLoc(); 5339 // The algorithm is: 5340 // 5341 // hi1 = getreg(SHADER_CYCLES_HI) 5342 // lo1 = getreg(SHADER_CYCLES_LO) 5343 // hi2 = getreg(SHADER_CYCLES_HI) 5344 // 5345 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1. 5346 // Otherwise there was overflow and the result is hi2:0. In both cases the 5347 // result should represent the actual time at some point during the sequence 5348 // of three getregs. 5349 using namespace AMDGPU::Hwreg; 5350 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5351 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1) 5352 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32)); 5353 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5354 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1) 5355 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32)); 5356 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5357 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2) 5358 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32)); 5359 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) 5360 .addReg(RegHi1) 5361 .addReg(RegHi2); 5362 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5363 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo) 5364 .addReg(RegLo1) 5365 .addImm(0); 5366 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE)) 5367 .add(MI.getOperand(0)) 5368 .addReg(RegLo) 5369 .addImm(AMDGPU::sub0) 5370 .addReg(RegHi2) 5371 .addImm(AMDGPU::sub1); 5372 MI.eraseFromParent(); 5373 return BB; 5374 } 5375 case AMDGPU::SI_INDIRECT_SRC_V1: 5376 case AMDGPU::SI_INDIRECT_SRC_V2: 5377 case AMDGPU::SI_INDIRECT_SRC_V4: 5378 case AMDGPU::SI_INDIRECT_SRC_V8: 5379 case AMDGPU::SI_INDIRECT_SRC_V9: 5380 case AMDGPU::SI_INDIRECT_SRC_V10: 5381 case AMDGPU::SI_INDIRECT_SRC_V11: 5382 case AMDGPU::SI_INDIRECT_SRC_V12: 5383 case AMDGPU::SI_INDIRECT_SRC_V16: 5384 case AMDGPU::SI_INDIRECT_SRC_V32: 5385 return emitIndirectSrc(MI, *BB, *getSubtarget()); 5386 case AMDGPU::SI_INDIRECT_DST_V1: 5387 case AMDGPU::SI_INDIRECT_DST_V2: 5388 case AMDGPU::SI_INDIRECT_DST_V4: 5389 case AMDGPU::SI_INDIRECT_DST_V8: 5390 case AMDGPU::SI_INDIRECT_DST_V9: 5391 case AMDGPU::SI_INDIRECT_DST_V10: 5392 case AMDGPU::SI_INDIRECT_DST_V11: 5393 case AMDGPU::SI_INDIRECT_DST_V12: 5394 case AMDGPU::SI_INDIRECT_DST_V16: 5395 case AMDGPU::SI_INDIRECT_DST_V32: 5396 return emitIndirectDst(MI, *BB, 
*getSubtarget()); 5397 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: 5398 case AMDGPU::SI_KILL_I1_PSEUDO: 5399 return splitKillBlock(MI, BB); 5400 case AMDGPU::V_CNDMASK_B64_PSEUDO: { 5401 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5402 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 5403 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5404 5405 Register Dst = MI.getOperand(0).getReg(); 5406 const MachineOperand &Src0 = MI.getOperand(1); 5407 const MachineOperand &Src1 = MI.getOperand(2); 5408 const DebugLoc &DL = MI.getDebugLoc(); 5409 Register SrcCond = MI.getOperand(3).getReg(); 5410 5411 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5412 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5413 const auto *CondRC = TRI->getWaveMaskRegClass(); 5414 Register SrcCondCopy = MRI.createVirtualRegister(CondRC); 5415 5416 const TargetRegisterClass *Src0RC = Src0.isReg() 5417 ? MRI.getRegClass(Src0.getReg()) 5418 : &AMDGPU::VReg_64RegClass; 5419 const TargetRegisterClass *Src1RC = Src1.isReg() 5420 ? MRI.getRegClass(Src1.getReg()) 5421 : &AMDGPU::VReg_64RegClass; 5422 5423 const TargetRegisterClass *Src0SubRC = 5424 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); 5425 const TargetRegisterClass *Src1SubRC = 5426 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1); 5427 5428 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( 5429 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); 5430 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( 5431 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); 5432 5433 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( 5434 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); 5435 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( 5436 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); 5437 5438 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond); 5439 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo) 5440 .addImm(0) 5441 .add(Src0Sub0) 5442 .addImm(0) 5443 .add(Src1Sub0) 5444 .addReg(SrcCondCopy); 5445 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi) 5446 .addImm(0) 5447 .add(Src0Sub1) 5448 .addImm(0) 5449 .add(Src1Sub1) 5450 .addReg(SrcCondCopy); 5451 5452 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst) 5453 .addReg(DstLo) 5454 .addImm(AMDGPU::sub0) 5455 .addReg(DstHi) 5456 .addImm(AMDGPU::sub1); 5457 MI.eraseFromParent(); 5458 return BB; 5459 } 5460 case AMDGPU::SI_BR_UNDEF: { 5461 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 5462 const DebugLoc &DL = MI.getDebugLoc(); 5463 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) 5464 .add(MI.getOperand(0)); 5465 Br->getOperand(1).setIsUndef(); // read undef SCC 5466 MI.eraseFromParent(); 5467 return BB; 5468 } 5469 case AMDGPU::ADJCALLSTACKUP: 5470 case AMDGPU::ADJCALLSTACKDOWN: { 5471 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 5472 MachineInstrBuilder MIB(*MF, &MI); 5473 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine) 5474 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit); 5475 return BB; 5476 } 5477 case AMDGPU::SI_CALL_ISEL: { 5478 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 5479 const DebugLoc &DL = MI.getDebugLoc(); 5480 5481 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF); 5482 5483 MachineInstrBuilder MIB; 5484 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg); 5485 5486 for (const MachineOperand &MO : MI.operands()) 5487 
MIB.add(MO); 5488 5489 MIB.cloneMemRefs(MI); 5490 MI.eraseFromParent(); 5491 return BB; 5492 } 5493 case AMDGPU::V_ADD_CO_U32_e32: 5494 case AMDGPU::V_SUB_CO_U32_e32: 5495 case AMDGPU::V_SUBREV_CO_U32_e32: { 5496 // TODO: Define distinct V_*_I32_Pseudo instructions instead. 5497 const DebugLoc &DL = MI.getDebugLoc(); 5498 unsigned Opc = MI.getOpcode(); 5499 5500 bool NeedClampOperand = false; 5501 if (TII->pseudoToMCOpcode(Opc) == -1) { 5502 Opc = AMDGPU::getVOPe64(Opc); 5503 NeedClampOperand = true; 5504 } 5505 5506 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg()); 5507 if (TII->isVOP3(*I)) { 5508 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 5509 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5510 I.addReg(TRI->getVCC(), RegState::Define); 5511 } 5512 I.add(MI.getOperand(1)).add(MI.getOperand(2)); 5513 if (NeedClampOperand) 5514 I.addImm(0); // clamp bit for e64 encoding 5515 5516 TII->legalizeOperands(*I); 5517 5518 MI.eraseFromParent(); 5519 return BB; 5520 } 5521 case AMDGPU::V_ADDC_U32_e32: 5522 case AMDGPU::V_SUBB_U32_e32: 5523 case AMDGPU::V_SUBBREV_U32_e32: 5524 // These instructions have an implicit use of vcc which counts towards the 5525 // constant bus limit. 5526 TII->legalizeOperands(MI); 5527 return BB; 5528 case AMDGPU::DS_GWS_INIT: 5529 case AMDGPU::DS_GWS_SEMA_BR: 5530 case AMDGPU::DS_GWS_BARRIER: 5531 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0); 5532 [[fallthrough]]; 5533 case AMDGPU::DS_GWS_SEMA_V: 5534 case AMDGPU::DS_GWS_SEMA_P: 5535 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: 5536 // A s_waitcnt 0 is required to be the instruction immediately following. 5537 if (getSubtarget()->hasGWSAutoReplay()) { 5538 bundleInstWithWaitcnt(MI); 5539 return BB; 5540 } 5541 5542 return emitGWSMemViolTestLoop(MI, BB); 5543 case AMDGPU::S_SETREG_B32: { 5544 // Try to optimize cases that only set the denormal mode or rounding mode. 5545 // 5546 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or 5547 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode 5548 // instead. 5549 // 5550 // FIXME: This could be predicates on the immediate, but tablegen doesn't 5551 // allow you to have a no side effect instruction in the output of a 5552 // sideeffecting pattern. 5553 auto [ID, Offset, Width] = 5554 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm()); 5555 if (ID != AMDGPU::Hwreg::ID_MODE) 5556 return BB; 5557 5558 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width); 5559 const unsigned SetMask = WidthMask << Offset; 5560 5561 if (getSubtarget()->hasDenormModeInst()) { 5562 unsigned SetDenormOp = 0; 5563 unsigned SetRoundOp = 0; 5564 5565 // The dedicated instructions can only set the whole denorm or round mode 5566 // at once, not a subset of bits in either. 5567 if (SetMask == 5568 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) { 5569 // If this fully sets both the round and denorm mode, emit the two 5570 // dedicated instructions for these. 
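// For example (illustrative): writing an 8-bit immediate to
// hwreg(HW_REG_MODE, 0, 8) covers both 4-bit fields, so it can be emitted as
//   s_round_mode  (imm & 0xf)
//   s_denorm_mode (imm >> 4)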
5571 SetRoundOp = AMDGPU::S_ROUND_MODE; 5572 SetDenormOp = AMDGPU::S_DENORM_MODE; 5573 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) { 5574 SetRoundOp = AMDGPU::S_ROUND_MODE; 5575 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) { 5576 SetDenormOp = AMDGPU::S_DENORM_MODE; 5577 } 5578 5579 if (SetRoundOp || SetDenormOp) { 5580 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5581 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg()); 5582 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) { 5583 unsigned ImmVal = Def->getOperand(1).getImm(); 5584 if (SetRoundOp) { 5585 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp)) 5586 .addImm(ImmVal & 0xf); 5587 5588 // If we also have the denorm mode, get just the denorm mode bits. 5589 ImmVal >>= 4; 5590 } 5591 5592 if (SetDenormOp) { 5593 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp)) 5594 .addImm(ImmVal & 0xf); 5595 } 5596 5597 MI.eraseFromParent(); 5598 return BB; 5599 } 5600 } 5601 } 5602 5603 // If only FP bits are touched, used the no side effects pseudo. 5604 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK | 5605 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) 5606 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode)); 5607 5608 return BB; 5609 } 5610 case AMDGPU::S_INVERSE_BALLOT_U32: 5611 case AMDGPU::S_INVERSE_BALLOT_U64: 5612 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if 5613 // necessary. After that they are equivalent to a COPY. 5614 MI.setDesc(TII->get(AMDGPU::COPY)); 5615 return BB; 5616 case AMDGPU::ENDPGM_TRAP: { 5617 const DebugLoc &DL = MI.getDebugLoc(); 5618 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) { 5619 MI.setDesc(TII->get(AMDGPU::S_ENDPGM)); 5620 MI.addOperand(MachineOperand::CreateImm(0)); 5621 return BB; 5622 } 5623 5624 // We need a block split to make the real endpgm a terminator. We also don't 5625 // want to break phis in successor blocks, so we can't just delete to the 5626 // end of the block. 5627 5628 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/); 5629 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 5630 MF->push_back(TrapBB); 5631 // clang-format off 5632 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM)) 5633 .addImm(0); 5634 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) 5635 .addMBB(TrapBB); 5636 // clang-format on 5637 5638 BB->addSuccessor(TrapBB); 5639 MI.eraseFromParent(); 5640 return SplitBB; 5641 } 5642 case AMDGPU::SIMULATED_TRAP: { 5643 assert(Subtarget->hasPrivEnabledTrap2NopBug()); 5644 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 5645 MachineBasicBlock *SplitBB = 5646 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc()); 5647 MI.eraseFromParent(); 5648 return SplitBB; 5649 } 5650 default: 5651 if (TII->isImage(MI) || TII->isMUBUF(MI)) { 5652 if (!MI.mayStore()) 5653 AddMemOpInit(MI); 5654 return BB; 5655 } 5656 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 5657 } 5658 } 5659 5660 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { 5661 // This currently forces unfolding various combinations of fsub into fma with 5662 // free fneg'd operands. As long as we have fast FMA (controlled by 5663 // isFMAFasterThanFMulAndFAdd), we should perform these. 5664 5665 // When fma is quarter rate, for f64 where add / sub are at best half rate, 5666 // most of these combines appear to be cycle neutral but save on instruction 5667 // count / code size. 
5668 return true; 5669 } 5670 5671 bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; } 5672 5673 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, 5674 EVT VT) const { 5675 if (!VT.isVector()) { 5676 return MVT::i1; 5677 } 5678 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); 5679 } 5680 5681 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const { 5682 // TODO: Should i16 be used always if legal? For now it would force VALU 5683 // shifts. 5684 return (VT == MVT::i16) ? MVT::i16 : MVT::i32; 5685 } 5686 5687 LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const { 5688 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts()) 5689 ? Ty.changeElementSize(16) 5690 : Ty.changeElementSize(32); 5691 } 5692 5693 // Answering this is somewhat tricky and depends on the specific device which 5694 // have different rates for fma or all f64 operations. 5695 // 5696 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other 5697 // regardless of which device (although the number of cycles differs between 5698 // devices), so it is always profitable for f64. 5699 // 5700 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable 5701 // only on full rate devices. Normally, we should prefer selecting v_mad_f32 5702 // which we can always do even without fused FP ops since it returns the same 5703 // result as the separate operations and since it is always full 5704 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 5705 // however does not support denormals, so we do report fma as faster if we have 5706 // a fast fma device and require denormals. 5707 // 5708 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, 5709 EVT VT) const { 5710 VT = VT.getScalarType(); 5711 5712 switch (VT.getSimpleVT().SimpleTy) { 5713 case MVT::f32: { 5714 // If mad is not available this depends only on if f32 fma is full rate. 5715 if (!Subtarget->hasMadMacF32Insts()) 5716 return Subtarget->hasFastFMAF32(); 5717 5718 // Otherwise f32 mad is always full rate and returns the same result as 5719 // the separate operations so should be preferred over fma. 5720 // However does not support denormals. 5721 if (!denormalModeIsFlushAllF32(MF)) 5722 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); 5723 5724 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32. 5725 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts(); 5726 } 5727 case MVT::f64: 5728 return true; 5729 case MVT::f16: 5730 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF); 5731 default: 5732 break; 5733 } 5734 5735 return false; 5736 } 5737 5738 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, 5739 LLT Ty) const { 5740 switch (Ty.getScalarSizeInBits()) { 5741 case 16: 5742 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16); 5743 case 32: 5744 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32); 5745 case 64: 5746 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64); 5747 default: 5748 break; 5749 } 5750 5751 return false; 5752 } 5753 5754 // Refer to comments added to the MIR variant of isFMAFasterThanFMulAndFAdd for 5755 // specific details. 
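// In short: f64 is always reported as profitable; f16 requires 16-bit
// instructions and fp16 denormals that are not flushed; f32 depends on the
// interplay of mad availability, the f32 denormal mode, and fast-FMA / DL
// support, mirroring the checks in the MIR variant above.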
5756 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F, 5757 Type *Ty) const { 5758 switch (Ty->getScalarSizeInBits()) { 5759 case 16: { 5760 SIModeRegisterDefaults Mode = SIModeRegisterDefaults(F, *Subtarget); 5761 return Subtarget->has16BitInsts() && 5762 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign(); 5763 } 5764 case 32: { 5765 if (!Subtarget->hasMadMacF32Insts()) 5766 return Subtarget->hasFastFMAF32(); 5767 5768 SIModeRegisterDefaults Mode = SIModeRegisterDefaults(F, *Subtarget); 5769 if (Mode.FP32Denormals != DenormalMode::getPreserveSign()) 5770 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); 5771 5772 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts(); 5773 } 5774 case 64: 5775 return true; 5776 default: 5777 break; 5778 } 5779 5780 return false; 5781 } 5782 5783 bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const { 5784 if (!Ty.isScalar()) 5785 return false; 5786 5787 if (Ty.getScalarSizeInBits() == 16) 5788 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF()); 5789 if (Ty.getScalarSizeInBits() == 32) 5790 return Subtarget->hasMadMacF32Insts() && 5791 denormalModeIsFlushAllF32(*MI.getMF()); 5792 5793 return false; 5794 } 5795 5796 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG, 5797 const SDNode *N) const { 5798 // TODO: Check future ftz flag 5799 // v_mad_f32/v_mac_f32 do not support denormals. 5800 EVT VT = N->getValueType(0); 5801 if (VT == MVT::f32) 5802 return Subtarget->hasMadMacF32Insts() && 5803 denormalModeIsFlushAllF32(DAG.getMachineFunction()); 5804 if (VT == MVT::f16) { 5805 return Subtarget->hasMadF16() && 5806 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()); 5807 } 5808 5809 return false; 5810 } 5811 5812 //===----------------------------------------------------------------------===// 5813 // Custom DAG Lowering Operations 5814 //===----------------------------------------------------------------------===// 5815 5816 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the 5817 // wider vector type is legal. 5818 SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, 5819 SelectionDAG &DAG) const { 5820 unsigned Opc = Op.getOpcode(); 5821 EVT VT = Op.getValueType(); 5822 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || 5823 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || 5824 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || 5825 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); 5826 5827 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0); 5828 5829 SDLoc SL(Op); 5830 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags()); 5831 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags()); 5832 5833 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); 5834 } 5835 5836 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the 5837 // wider vector type is legal. 
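// For example (illustrative): a v4f16 FADD is split into two v2f16 FADDs on
// the low and high halves, and the halves are rejoined with CONCAT_VECTORS.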
5838 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, 5839 SelectionDAG &DAG) const { 5840 unsigned Opc = Op.getOpcode(); 5841 EVT VT = Op.getValueType(); 5842 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || 5843 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || 5844 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || 5845 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); 5846 5847 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); 5848 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); 5849 5850 SDLoc SL(Op); 5851 5852 SDValue OpLo = 5853 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags()); 5854 SDValue OpHi = 5855 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags()); 5856 5857 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); 5858 } 5859 5860 SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, 5861 SelectionDAG &DAG) const { 5862 unsigned Opc = Op.getOpcode(); 5863 EVT VT = Op.getValueType(); 5864 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || 5865 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 || 5866 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || 5867 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 || 5868 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 || 5869 VT == MVT::v32bf16); 5870 5871 SDValue Op0 = Op.getOperand(0); 5872 auto [Lo0, Hi0] = Op0.getValueType().isVector() 5873 ? DAG.SplitVectorOperand(Op.getNode(), 0) 5874 : std::pair(Op0, Op0); 5875 5876 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); 5877 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2); 5878 5879 SDLoc SL(Op); 5880 auto ResVT = DAG.GetSplitDestVTs(VT); 5881 5882 SDValue OpLo = 5883 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags()); 5884 SDValue OpHi = 5885 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags()); 5886 5887 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); 5888 } 5889 5890 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 5891 switch (Op.getOpcode()) { 5892 default: 5893 return AMDGPUTargetLowering::LowerOperation(Op, DAG); 5894 case ISD::BRCOND: 5895 return LowerBRCOND(Op, DAG); 5896 case ISD::RETURNADDR: 5897 return LowerRETURNADDR(Op, DAG); 5898 case ISD::LOAD: { 5899 SDValue Result = LowerLOAD(Op, DAG); 5900 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) && 5901 "Load should return a value and a chain"); 5902 return Result; 5903 } 5904 case ISD::FSQRT: { 5905 EVT VT = Op.getValueType(); 5906 if (VT == MVT::f32) 5907 return lowerFSQRTF32(Op, DAG); 5908 if (VT == MVT::f64) 5909 return lowerFSQRTF64(Op, DAG); 5910 return SDValue(); 5911 } 5912 case ISD::FSIN: 5913 case ISD::FCOS: 5914 return LowerTrig(Op, DAG); 5915 case ISD::SELECT: 5916 return LowerSELECT(Op, DAG); 5917 case ISD::FDIV: 5918 return LowerFDIV(Op, DAG); 5919 case ISD::FFREXP: 5920 return LowerFFREXP(Op, DAG); 5921 case ISD::ATOMIC_CMP_SWAP: 5922 return LowerATOMIC_CMP_SWAP(Op, DAG); 5923 case ISD::STORE: 5924 return LowerSTORE(Op, DAG); 5925 case ISD::GlobalAddress: { 5926 MachineFunction &MF = DAG.getMachineFunction(); 5927 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 5928 return LowerGlobalAddress(MFI, Op, DAG); 5929 } 5930 case ISD::INTRINSIC_WO_CHAIN: 5931 return LowerINTRINSIC_WO_CHAIN(Op, DAG); 5932 case ISD::INTRINSIC_W_CHAIN: 5933 return LowerINTRINSIC_W_CHAIN(Op, DAG); 5934 case 
ISD::INTRINSIC_VOID: 5935 return LowerINTRINSIC_VOID(Op, DAG); 5936 case ISD::ADDRSPACECAST: 5937 return lowerADDRSPACECAST(Op, DAG); 5938 case ISD::INSERT_SUBVECTOR: 5939 return lowerINSERT_SUBVECTOR(Op, DAG); 5940 case ISD::INSERT_VECTOR_ELT: 5941 return lowerINSERT_VECTOR_ELT(Op, DAG); 5942 case ISD::EXTRACT_VECTOR_ELT: 5943 return lowerEXTRACT_VECTOR_ELT(Op, DAG); 5944 case ISD::VECTOR_SHUFFLE: 5945 return lowerVECTOR_SHUFFLE(Op, DAG); 5946 case ISD::SCALAR_TO_VECTOR: 5947 return lowerSCALAR_TO_VECTOR(Op, DAG); 5948 case ISD::BUILD_VECTOR: 5949 return lowerBUILD_VECTOR(Op, DAG); 5950 case ISD::FP_ROUND: 5951 case ISD::STRICT_FP_ROUND: 5952 return lowerFP_ROUND(Op, DAG); 5953 case ISD::TRAP: 5954 return lowerTRAP(Op, DAG); 5955 case ISD::DEBUGTRAP: 5956 return lowerDEBUGTRAP(Op, DAG); 5957 case ISD::ABS: 5958 case ISD::FABS: 5959 case ISD::FNEG: 5960 case ISD::FCANONICALIZE: 5961 case ISD::BSWAP: 5962 return splitUnaryVectorOp(Op, DAG); 5963 case ISD::FMINNUM: 5964 case ISD::FMAXNUM: 5965 return lowerFMINNUM_FMAXNUM(Op, DAG); 5966 case ISD::FLDEXP: 5967 case ISD::STRICT_FLDEXP: 5968 return lowerFLDEXP(Op, DAG); 5969 case ISD::FMA: 5970 return splitTernaryVectorOp(Op, DAG); 5971 case ISD::FP_TO_SINT: 5972 case ISD::FP_TO_UINT: 5973 return LowerFP_TO_INT(Op, DAG); 5974 case ISD::SHL: 5975 case ISD::SRA: 5976 case ISD::SRL: 5977 case ISD::ADD: 5978 case ISD::SUB: 5979 case ISD::SMIN: 5980 case ISD::SMAX: 5981 case ISD::UMIN: 5982 case ISD::UMAX: 5983 case ISD::FADD: 5984 case ISD::FMUL: 5985 case ISD::FMINNUM_IEEE: 5986 case ISD::FMAXNUM_IEEE: 5987 case ISD::FMINIMUM: 5988 case ISD::FMAXIMUM: 5989 case ISD::FMINIMUMNUM: 5990 case ISD::FMAXIMUMNUM: 5991 case ISD::UADDSAT: 5992 case ISD::USUBSAT: 5993 case ISD::SADDSAT: 5994 case ISD::SSUBSAT: 5995 return splitBinaryVectorOp(Op, DAG); 5996 case ISD::MUL: 5997 return lowerMUL(Op, DAG); 5998 case ISD::SMULO: 5999 case ISD::UMULO: 6000 return lowerXMULO(Op, DAG); 6001 case ISD::SMUL_LOHI: 6002 case ISD::UMUL_LOHI: 6003 return lowerXMUL_LOHI(Op, DAG); 6004 case ISD::DYNAMIC_STACKALLOC: 6005 return LowerDYNAMIC_STACKALLOC(Op, DAG); 6006 case ISD::STACKSAVE: 6007 return LowerSTACKSAVE(Op, DAG); 6008 case ISD::GET_ROUNDING: 6009 return lowerGET_ROUNDING(Op, DAG); 6010 case ISD::SET_ROUNDING: 6011 return lowerSET_ROUNDING(Op, DAG); 6012 case ISD::PREFETCH: 6013 return lowerPREFETCH(Op, DAG); 6014 case ISD::FP_EXTEND: 6015 case ISD::STRICT_FP_EXTEND: 6016 return lowerFP_EXTEND(Op, DAG); 6017 case ISD::GET_FPENV: 6018 return lowerGET_FPENV(Op, DAG); 6019 case ISD::SET_FPENV: 6020 return lowerSET_FPENV(Op, DAG); 6021 } 6022 return SDValue(); 6023 } 6024 6025 // Used for D16: Casts the result of an instruction into the right vector, 6026 // packs values if loads return unpacked values. 6027 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, 6028 const SDLoc &DL, SelectionDAG &DAG, 6029 bool Unpacked) { 6030 if (!LoadVT.isVector()) 6031 return Result; 6032 6033 // Cast back to the original packed type or to a larger type that is a 6034 // multiple of 32 bit for D16. Widening the return type is a required for 6035 // legalization. 6036 EVT FittingLoadVT = LoadVT; 6037 if ((LoadVT.getVectorNumElements() % 2) == 1) { 6038 FittingLoadVT = 6039 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(), 6040 LoadVT.getVectorNumElements() + 1); 6041 } 6042 6043 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16. 6044 // Truncate to v2i16/v4i16. 
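// e.g. (illustrative, unpacked D16): a v2f16 load is returned as v2i32; each
// element is truncated to i16, rebuilt as a v2i16 vector, then bitcast back
// to v2f16.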
EVT IntLoadVT = FittingLoadVT.changeTypeToInteger(); 6046 6047 // Work around the legalizer not scalarizing a truncate after vector op 6048 // legalization: truncate element-wise instead of creating an intermediate vector trunc. 6049 SmallVector<SDValue, 4> Elts; 6050 DAG.ExtractVectorElements(Result, Elts); 6051 for (SDValue &Elt : Elts) 6052 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt); 6053 6054 // Pad illegal v1i16/v3i16 to v4i16 6055 if ((LoadVT.getVectorNumElements() % 2) == 1) 6056 Elts.push_back(DAG.getUNDEF(MVT::i16)); 6057 6058 Result = DAG.getBuildVector(IntLoadVT, DL, Elts); 6059 6060 // Bitcast to original type (v2f16/v4f16). 6061 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result); 6062 } 6063 6064 // Cast back to the original packed type. 6065 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result); 6066 } 6067 6068 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M, 6069 SelectionDAG &DAG, 6070 ArrayRef<SDValue> Ops, 6071 bool IsIntrinsic) const { 6072 SDLoc DL(M); 6073 6074 bool Unpacked = Subtarget->hasUnpackedD16VMem(); 6075 EVT LoadVT = M->getValueType(0); 6076 6077 EVT EquivLoadVT = LoadVT; 6078 if (LoadVT.isVector()) { 6079 if (Unpacked) { 6080 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, 6081 LoadVT.getVectorNumElements()); 6082 } else if ((LoadVT.getVectorNumElements() % 2) == 1) { 6083 // Widen v3f16 to a legal type 6084 EquivLoadVT = 6085 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(), 6086 LoadVT.getVectorNumElements() + 1); 6087 } 6088 } 6089 6090 // Change from v4f16/v2f16 to EquivLoadVT. 6091 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other); 6092 6093 SDValue Load = DAG.getMemIntrinsicNode( 6094 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops, 6095 M->getMemoryVT(), M->getMemOperand()); 6096 6097 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked); 6098 6099 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL); 6100 } 6101 6102 SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, 6103 SelectionDAG &DAG, 6104 ArrayRef<SDValue> Ops) const { 6105 SDLoc DL(M); 6106 EVT LoadVT = M->getValueType(0); 6107 EVT EltType = LoadVT.getScalarType(); 6108 EVT IntVT = LoadVT.changeTypeToInteger(); 6109 6110 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); 6111 6112 assert(M->getNumValues() == 2 || M->getNumValues() == 3); 6113 bool IsTFE = M->getNumValues() == 3; 6114 6115 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE 6116 : AMDGPUISD::BUFFER_LOAD_FORMAT) 6117 : IsTFE ?
AMDGPUISD::BUFFER_LOAD_TFE 6118 : AMDGPUISD::BUFFER_LOAD; 6119 6120 if (IsD16) { 6121 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); 6122 } 6123 6124 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics 6125 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32) 6126 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(), 6127 IsTFE); 6128 6129 if (isTypeLegal(LoadVT)) { 6130 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT, 6131 M->getMemOperand(), DAG); 6132 } 6133 6134 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT); 6135 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other); 6136 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT, 6137 M->getMemOperand(), DAG); 6138 return DAG.getMergeValues( 6139 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)}, 6140 DL); 6141 } 6142 6143 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, 6144 SelectionDAG &DAG) { 6145 EVT VT = N->getValueType(0); 6146 unsigned CondCode = N->getConstantOperandVal(3); 6147 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode))) 6148 return DAG.getUNDEF(VT); 6149 6150 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); 6151 6152 SDValue LHS = N->getOperand(1); 6153 SDValue RHS = N->getOperand(2); 6154 6155 SDLoc DL(N); 6156 6157 EVT CmpVT = LHS.getValueType(); 6158 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) { 6159 unsigned PromoteOp = 6160 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 6161 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS); 6162 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS); 6163 } 6164 6165 ISD::CondCode CCOpcode = getICmpCondCode(IcInput); 6166 6167 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); 6168 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize); 6169 6170 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS, 6171 DAG.getCondCode(CCOpcode)); 6172 if (VT.bitsEq(CCVT)) 6173 return SetCC; 6174 return DAG.getZExtOrTrunc(SetCC, DL, VT); 6175 } 6176 6177 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, 6178 SelectionDAG &DAG) { 6179 EVT VT = N->getValueType(0); 6180 6181 unsigned CondCode = N->getConstantOperandVal(3); 6182 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode))) 6183 return DAG.getUNDEF(VT); 6184 6185 SDValue Src0 = N->getOperand(1); 6186 SDValue Src1 = N->getOperand(2); 6187 EVT CmpVT = Src0.getValueType(); 6188 SDLoc SL(N); 6189 6190 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) { 6191 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0); 6192 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1); 6193 } 6194 6195 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode); 6196 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput); 6197 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); 6198 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize); 6199 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1, 6200 DAG.getCondCode(CCOpcode)); 6201 if (VT.bitsEq(CCVT)) 6202 return SetCC; 6203 return DAG.getZExtOrTrunc(SetCC, SL, VT); 6204 } 6205 6206 static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, 6207 SelectionDAG &DAG) { 6208 EVT VT = N->getValueType(0); 6209 SDValue Src = N->getOperand(1); 6210 SDLoc SL(N); 6211 6212 if (Src.getOpcode() == ISD::SETCC) { 6213 // (ballot (ISD::SETCC ...)) -> 
(AMDGPUISD::SETCC ...) 6214 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0), 6215 Src.getOperand(1), Src.getOperand(2)); 6216 } 6217 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) { 6218 // (ballot 0) -> 0 6219 if (Arg->isZero()) 6220 return DAG.getConstant(0, SL, VT); 6221 6222 // (ballot 1) -> EXEC/EXEC_LO 6223 if (Arg->isOne()) { 6224 Register Exec; 6225 if (VT.getScalarSizeInBits() == 32) 6226 Exec = AMDGPU::EXEC_LO; 6227 else if (VT.getScalarSizeInBits() == 64) 6228 Exec = AMDGPU::EXEC; 6229 else 6230 return SDValue(); 6231 6232 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT); 6233 } 6234 } 6235 6236 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0) 6237 // ISD::SETNE) 6238 return DAG.getNode( 6239 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32), 6240 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE)); 6241 } 6242 6243 static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, 6244 SelectionDAG &DAG) { 6245 EVT VT = N->getValueType(0); 6246 unsigned ValSize = VT.getSizeInBits(); 6247 unsigned IID = N->getConstantOperandVal(0); 6248 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 || 6249 IID == Intrinsic::amdgcn_permlanex16; 6250 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive || 6251 IID == Intrinsic::amdgcn_set_inactive_chain_arg; 6252 SDLoc SL(N); 6253 MVT IntVT = MVT::getIntegerVT(ValSize); 6254 const GCNSubtarget *ST = TLI.getSubtarget(); 6255 unsigned SplitSize = 32; 6256 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) && 6257 ST->hasDPALU_DPP() && 6258 AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3))) 6259 SplitSize = 64; 6260 6261 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1, 6262 SDValue Src2, MVT ValT) -> SDValue { 6263 SmallVector<SDValue, 8> Operands; 6264 switch (IID) { 6265 case Intrinsic::amdgcn_permlane16: 6266 case Intrinsic::amdgcn_permlanex16: 6267 case Intrinsic::amdgcn_update_dpp: 6268 Operands.push_back(N->getOperand(6)); 6269 Operands.push_back(N->getOperand(5)); 6270 Operands.push_back(N->getOperand(4)); 6271 [[fallthrough]]; 6272 case Intrinsic::amdgcn_writelane: 6273 Operands.push_back(Src2); 6274 [[fallthrough]]; 6275 case Intrinsic::amdgcn_readlane: 6276 case Intrinsic::amdgcn_set_inactive: 6277 case Intrinsic::amdgcn_set_inactive_chain_arg: 6278 case Intrinsic::amdgcn_mov_dpp8: 6279 Operands.push_back(Src1); 6280 [[fallthrough]]; 6281 case Intrinsic::amdgcn_readfirstlane: 6282 case Intrinsic::amdgcn_permlane64: 6283 Operands.push_back(Src0); 6284 break; 6285 default: 6286 llvm_unreachable("unhandled lane op"); 6287 } 6288 6289 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32)); 6290 std::reverse(Operands.begin(), Operands.end()); 6291 6292 if (SDNode *GL = N->getGluedNode()) { 6293 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); 6294 GL = GL->getOperand(0).getNode(); 6295 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue, 6296 SDValue(GL, 0))); 6297 } 6298 6299 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands); 6300 }; 6301 6302 SDValue Src0 = N->getOperand(1); 6303 SDValue Src1, Src2; 6304 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || 6305 IID == Intrinsic::amdgcn_mov_dpp8 || 6306 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) { 6307 Src1 = N->getOperand(2); 6308 if (IID == Intrinsic::amdgcn_writelane || 6309 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16) 6310 Src2 = 
N->getOperand(3); 6311 } 6312 6313 if (ValSize == SplitSize) { 6314 // Already legal 6315 return SDValue(); 6316 } 6317 6318 if (ValSize < 32) { 6319 bool IsFloat = VT.isFloatingPoint(); 6320 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0, 6321 SL, MVT::i32); 6322 6323 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) { 6324 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1, 6325 SL, MVT::i32); 6326 } 6327 6328 if (IID == Intrinsic::amdgcn_writelane) { 6329 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2, 6330 SL, MVT::i32); 6331 } 6332 6333 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32); 6334 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT); 6335 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc; 6336 } 6337 6338 if (ValSize % SplitSize != 0) 6339 return SDValue(); 6340 6341 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue { 6342 EVT VT = N->getValueType(0); 6343 unsigned NE = VT.getVectorNumElements(); 6344 EVT EltVT = VT.getVectorElementType(); 6345 SmallVector<SDValue, 8> Scalars; 6346 unsigned NumOperands = N->getNumOperands(); 6347 SmallVector<SDValue, 4> Operands(NumOperands); 6348 SDNode *GL = N->getGluedNode(); 6349 6350 // only handle convergencectrl_glue 6351 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); 6352 6353 for (unsigned i = 0; i != NE; ++i) { 6354 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e; 6355 ++j) { 6356 SDValue Operand = N->getOperand(j); 6357 EVT OperandVT = Operand.getValueType(); 6358 if (OperandVT.isVector()) { 6359 // A vector operand; extract a single element. 6360 EVT OperandEltVT = OperandVT.getVectorElementType(); 6361 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT, 6362 Operand, DAG.getVectorIdxConstant(i, SL)); 6363 } else { 6364 // A scalar operand; just use it as is. 
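          // (Illustrative: for a v2i32 readlane the lane index and the
          // intrinsic-ID constant are scalar operands and are reused verbatim
          // for every unrolled piece; only vector operands are split above.)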
6365 Operands[j] = Operand; 6366 } 6367 } 6368 6369 if (GL) 6370 Operands[NumOperands - 1] = 6371 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue, 6372 SDValue(GL->getOperand(0).getNode(), 0)); 6373 6374 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands)); 6375 } 6376 6377 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE); 6378 return DAG.getBuildVector(VecVT, SL, Scalars); 6379 }; 6380 6381 if (VT.isVector()) { 6382 switch (MVT::SimpleValueType EltTy = 6383 VT.getVectorElementType().getSimpleVT().SimpleTy) { 6384 case MVT::i32: 6385 case MVT::f32: 6386 if (SplitSize == 32) { 6387 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT()); 6388 return unrollLaneOp(LaneOp.getNode()); 6389 } 6390 [[fallthrough]]; 6391 case MVT::i16: 6392 case MVT::f16: 6393 case MVT::bf16: { 6394 unsigned SubVecNumElt = 6395 SplitSize / VT.getVectorElementType().getSizeInBits(); 6396 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt); 6397 SmallVector<SDValue, 4> Pieces; 6398 SDValue Src0SubVec, Src1SubVec, Src2SubVec; 6399 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) { 6400 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0, 6401 DAG.getConstant(EltIdx, SL, MVT::i32)); 6402 6403 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || 6404 IsPermLane16) 6405 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1, 6406 DAG.getConstant(EltIdx, SL, MVT::i32)); 6407 6408 if (IID == Intrinsic::amdgcn_writelane) 6409 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2, 6410 DAG.getConstant(EltIdx, SL, MVT::i32)); 6411 6412 Pieces.push_back( 6413 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 6414 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT) 6415 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT)); 6416 EltIdx += SubVecNumElt; 6417 } 6418 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces); 6419 } 6420 default: 6421 // Handle all other cases by bitcasting to i32 vectors 6422 break; 6423 } 6424 } 6425 6426 MVT VecVT = 6427 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize); 6428 Src0 = DAG.getBitcast(VecVT, Src0); 6429 6430 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) 6431 Src1 = DAG.getBitcast(VecVT, Src1); 6432 6433 if (IID == Intrinsic::amdgcn_writelane) 6434 Src2 = DAG.getBitcast(VecVT, Src2); 6435 6436 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT); 6437 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode()); 6438 return DAG.getBitcast(VT, UnrolledLaneOp); 6439 } 6440 6441 void SITargetLowering::ReplaceNodeResults(SDNode *N, 6442 SmallVectorImpl<SDValue> &Results, 6443 SelectionDAG &DAG) const { 6444 switch (N->getOpcode()) { 6445 case ISD::INSERT_VECTOR_ELT: { 6446 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG)) 6447 Results.push_back(Res); 6448 return; 6449 } 6450 case ISD::EXTRACT_VECTOR_ELT: { 6451 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG)) 6452 Results.push_back(Res); 6453 return; 6454 } 6455 case ISD::INTRINSIC_WO_CHAIN: { 6456 unsigned IID = N->getConstantOperandVal(0); 6457 switch (IID) { 6458 case Intrinsic::amdgcn_make_buffer_rsrc: 6459 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG)); 6460 return; 6461 case Intrinsic::amdgcn_cvt_pkrtz: { 6462 SDValue Src0 = N->getOperand(1); 6463 SDValue Src1 = N->getOperand(2); 6464 SDLoc SL(N); 6465 SDValue Cvt = 6466 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1); 6467 
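      // CVT_PKRTZ_F16_F32 produces the packed pair as a single i32 (e.g. the
      // result of v_cvt_pkrtz_f16_f32); bitcasting to v2f16 recovers the type
      // the intrinsic is declared to return.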
Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt)); 6468 return; 6469 } 6470 case Intrinsic::amdgcn_cvt_pknorm_i16: 6471 case Intrinsic::amdgcn_cvt_pknorm_u16: 6472 case Intrinsic::amdgcn_cvt_pk_i16: 6473 case Intrinsic::amdgcn_cvt_pk_u16: { 6474 SDValue Src0 = N->getOperand(1); 6475 SDValue Src1 = N->getOperand(2); 6476 SDLoc SL(N); 6477 unsigned Opcode; 6478 6479 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16) 6480 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; 6481 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16) 6482 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; 6483 else if (IID == Intrinsic::amdgcn_cvt_pk_i16) 6484 Opcode = AMDGPUISD::CVT_PK_I16_I32; 6485 else 6486 Opcode = AMDGPUISD::CVT_PK_U16_U32; 6487 6488 EVT VT = N->getValueType(0); 6489 if (isTypeLegal(VT)) 6490 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1)); 6491 else { 6492 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1); 6493 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt)); 6494 } 6495 return; 6496 } 6497 case Intrinsic::amdgcn_s_buffer_load: { 6498 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate 6499 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG 6500 // combiner tries to merge the s_buffer_load_u8 with a sext instruction 6501 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with 6502 // s_buffer_load_i8. 6503 if (!Subtarget->hasScalarSubwordLoads()) 6504 return; 6505 SDValue Op = SDValue(N, 0); 6506 SDValue Rsrc = Op.getOperand(1); 6507 SDValue Offset = Op.getOperand(2); 6508 SDValue CachePolicy = Op.getOperand(3); 6509 EVT VT = Op.getValueType(); 6510 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n"); 6511 SDLoc DL(Op); 6512 MachineFunction &MF = DAG.getMachineFunction(); 6513 const DataLayout &DataLayout = DAG.getDataLayout(); 6514 Align Alignment = 6515 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext())); 6516 MachineMemOperand *MMO = MF.getMachineMemOperand( 6517 MachinePointerInfo(), 6518 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 6519 MachineMemOperand::MOInvariant, 6520 VT.getStoreSize(), Alignment); 6521 SDValue LoadVal; 6522 if (!Offset->isDivergent()) { 6523 SDValue Ops[] = {Rsrc, // source register 6524 Offset, CachePolicy}; 6525 SDValue BufferLoad = 6526 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL, 6527 DAG.getVTList(MVT::i32), Ops, VT, MMO); 6528 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad); 6529 } else { 6530 SDValue Ops[] = { 6531 DAG.getEntryNode(), // Chain 6532 Rsrc, // rsrc 6533 DAG.getConstant(0, DL, MVT::i32), // vindex 6534 {}, // voffset 6535 {}, // soffset 6536 {}, // offset 6537 CachePolicy, // cachepolicy 6538 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 6539 }; 6540 setBufferOffsets(Offset, DAG, &Ops[3], Align(4)); 6541 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO); 6542 } 6543 Results.push_back(LoadVal); 6544 return; 6545 } 6546 } 6547 break; 6548 } 6549 case ISD::INTRINSIC_W_CHAIN: { 6550 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) { 6551 if (Res.getOpcode() == ISD::MERGE_VALUES) { 6552 // FIXME: Hacky 6553 for (unsigned I = 0; I < Res.getNumOperands(); I++) { 6554 Results.push_back(Res.getOperand(I)); 6555 } 6556 } else { 6557 Results.push_back(Res); 6558 Results.push_back(Res.getValue(1)); 6559 } 6560 return; 6561 } 6562 6563 break; 6564 } 6565 case ISD::SELECT: { 6566 SDLoc SL(N); 6567 EVT VT = N->getValueType(0); 6568 EVT NewVT = 
getEquivalentMemType(*DAG.getContext(), VT); 6569 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1)); 6570 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2)); 6571 6572 EVT SelectVT = NewVT; 6573 if (NewVT.bitsLT(MVT::i32)) { 6574 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS); 6575 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS); 6576 SelectVT = MVT::i32; 6577 } 6578 6579 SDValue NewSelect = 6580 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS); 6581 6582 if (NewVT != SelectVT) 6583 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect); 6584 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect)); 6585 return; 6586 } 6587 case ISD::FNEG: { 6588 if (N->getValueType(0) != MVT::v2f16) 6589 break; 6590 6591 SDLoc SL(N); 6592 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); 6593 6594 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC, 6595 DAG.getConstant(0x80008000, SL, MVT::i32)); 6596 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); 6597 return; 6598 } 6599 case ISD::FABS: { 6600 if (N->getValueType(0) != MVT::v2f16) 6601 break; 6602 6603 SDLoc SL(N); 6604 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); 6605 6606 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC, 6607 DAG.getConstant(0x7fff7fff, SL, MVT::i32)); 6608 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); 6609 return; 6610 } 6611 case ISD::FSQRT: { 6612 if (N->getValueType(0) != MVT::f16) 6613 break; 6614 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG)); 6615 break; 6616 } 6617 default: 6618 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); 6619 break; 6620 } 6621 } 6622 6623 /// Helper function for LowerBRCOND 6624 static SDNode *findUser(SDValue Value, unsigned Opcode) { 6625 6626 for (SDUse &U : Value->uses()) { 6627 if (U.get() != Value) 6628 continue; 6629 6630 if (U.getUser()->getOpcode() == Opcode) 6631 return U.getUser(); 6632 } 6633 return nullptr; 6634 } 6635 6636 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { 6637 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) { 6638 switch (Intr->getConstantOperandVal(1)) { 6639 case Intrinsic::amdgcn_if: 6640 return AMDGPUISD::IF; 6641 case Intrinsic::amdgcn_else: 6642 return AMDGPUISD::ELSE; 6643 case Intrinsic::amdgcn_loop: 6644 return AMDGPUISD::LOOP; 6645 case Intrinsic::amdgcn_end_cf: 6646 llvm_unreachable("should not occur"); 6647 default: 6648 return 0; 6649 } 6650 } 6651 6652 // break, if_break, else_break are all only used as inputs to loop, not 6653 // directly as branch conditions. 6654 return 0; 6655 } 6656 6657 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { 6658 const Triple &TT = getTargetMachine().getTargetTriple(); 6659 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || 6660 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && 6661 AMDGPU::shouldEmitConstantsToTextSection(TT); 6662 } 6663 6664 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { 6665 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) 6666 return false; 6667 6668 // FIXME: Either avoid relying on address space here or change the default 6669 // address space for functions to avoid the explicit check. 
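  // Roughly: functions and globals in a global-like address space go through
  // the GOT unless they can be reached via a fixup into the text section or
  // are known DSO-local, in which case a PC-relative access suffices.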
  return (GV->getValueType()->isFunctionTy() ||
          !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
         !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
}

bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
}

bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
  if (!GV->hasExternalLinkage())
    return true;

  const auto OS = getTargetMachine().getTargetTriple().getOS();
  return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
}

/// This transforms the control flow intrinsics to get the branch destination
/// as the last parameter, and also switches the branch target with BR if the
/// need arises.
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = nullptr;
  SDNode *SetCC = nullptr;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition everything is fine
    SetCC = Intr;
    Intr = SetCC->getOperand(0).getNode();

  } else {
    // Get the target from BR if we don't negate the condition
    BR = findUser(BRCOND, ISD::BR);
    assert(BR && "brcond missing unconditional branch user");
    Target = BR->getOperand(1);
  }

  unsigned CFNode = isCFIntrinsic(Intr);
  if (CFNode == 0) {
    // This is a uniform branch so we don't need to legalize.
    return BRCOND;
  }

  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;

  assert(!SetCC ||
         (SetCC->getConstantOperandVal(1) == 1 &&
          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
              ISD::SETNE));

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  if (HaveChain)
    Ops.push_back(BRCOND.getOperand(0));

  Ops.append(Intr->op_begin() + (HaveChain ?
2 : 1), Intr->op_end()); 6729 Ops.push_back(Target); 6730 6731 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); 6732 6733 // build the new intrinsic call 6734 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode(); 6735 6736 if (!HaveChain) { 6737 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)}; 6738 6739 Result = DAG.getMergeValues(Ops, DL).getNode(); 6740 } 6741 6742 if (BR) { 6743 // Give the branch instruction our target 6744 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)}; 6745 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); 6746 DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); 6747 } 6748 6749 SDValue Chain = SDValue(Result, Result->getNumValues() - 1); 6750 6751 // Copy the intrinsic results to registers 6752 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { 6753 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); 6754 if (!CopyToReg) 6755 continue; 6756 6757 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1), 6758 SDValue(Result, i - 1), SDValue()); 6759 6760 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); 6761 } 6762 6763 // Remove the old intrinsic from the chain 6764 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1), 6765 Intr->getOperand(0)); 6766 6767 return Chain; 6768 } 6769 6770 SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { 6771 MVT VT = Op.getSimpleValueType(); 6772 SDLoc DL(Op); 6773 // Checking the depth 6774 if (Op.getConstantOperandVal(0) != 0) 6775 return DAG.getConstant(0, DL, VT); 6776 6777 MachineFunction &MF = DAG.getMachineFunction(); 6778 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 6779 // Check for kernel and shader functions 6780 if (Info->isEntryFunction()) 6781 return DAG.getConstant(0, DL, VT); 6782 6783 MachineFrameInfo &MFI = MF.getFrameInfo(); 6784 // There is a call to @llvm.returnaddress in this function 6785 MFI.setReturnAddressIsTaken(true); 6786 6787 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); 6788 // Get the return address reg and mark it as an implicit live-in 6789 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), 6790 getRegClassFor(VT, Op.getNode()->isDivergent())); 6791 6792 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); 6793 } 6794 6795 SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op, 6796 const SDLoc &DL, EVT VT) const { 6797 return Op.getValueType().bitsLE(VT) 6798 ? 
DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) 6799 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op, 6800 DAG.getTargetConstant(0, DL, MVT::i32)); 6801 } 6802 6803 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 6804 assert(Op.getValueType() == MVT::f16 && 6805 "Do not know how to custom lower FP_ROUND for non-f16 type"); 6806 6807 SDValue Src = Op.getOperand(0); 6808 EVT SrcVT = Src.getValueType(); 6809 if (SrcVT != MVT::f64) 6810 return Op; 6811 6812 // TODO: Handle strictfp 6813 if (Op.getOpcode() != ISD::FP_ROUND) 6814 return Op; 6815 6816 SDLoc DL(Op); 6817 6818 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src); 6819 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); 6820 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); 6821 } 6822 6823 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, 6824 SelectionDAG &DAG) const { 6825 EVT VT = Op.getValueType(); 6826 const MachineFunction &MF = DAG.getMachineFunction(); 6827 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 6828 bool IsIEEEMode = Info->getMode().IEEE; 6829 6830 // FIXME: Assert during selection that this is only selected for 6831 // ieee_mode. Currently a combine can produce the ieee version for non-ieee 6832 // mode functions, but this happens to be OK since it's only done in cases 6833 // where there is known no sNaN. 6834 if (IsIEEEMode) 6835 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); 6836 6837 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 || 6838 VT == MVT::v16bf16) 6839 return splitBinaryVectorOp(Op, DAG); 6840 return Op; 6841 } 6842 6843 SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const { 6844 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP; 6845 EVT VT = Op.getValueType(); 6846 assert(VT == MVT::f16); 6847 6848 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1); 6849 EVT ExpVT = Exp.getValueType(); 6850 if (ExpVT == MVT::i16) 6851 return Op; 6852 6853 SDLoc DL(Op); 6854 6855 // Correct the exponent type for f16 to i16. 6856 // Clamp the range of the exponent to the instruction's range. 6857 6858 // TODO: This should be a generic narrowing legalization, and can easily be 6859 // for GlobalISel. 6860 6861 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT); 6862 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp); 6863 6864 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT); 6865 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp); 6866 6867 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp); 6868 6869 if (IsStrict) { 6870 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other}, 6871 {Op.getOperand(0), Op.getOperand(1), TruncExp}); 6872 } 6873 6874 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp); 6875 } 6876 6877 static unsigned getExtOpcodeForPromotedOp(SDValue Op) { 6878 switch (Op->getOpcode()) { 6879 case ISD::SRA: 6880 case ISD::SMIN: 6881 case ISD::SMAX: 6882 return ISD::SIGN_EXTEND; 6883 case ISD::SRL: 6884 case ISD::UMIN: 6885 case ISD::UMAX: 6886 return ISD::ZERO_EXTEND; 6887 case ISD::ADD: 6888 case ISD::SUB: 6889 case ISD::AND: 6890 case ISD::OR: 6891 case ISD::XOR: 6892 case ISD::SHL: 6893 case ISD::SELECT: 6894 case ISD::MUL: 6895 // operation result won't be influenced by garbage high bits. 6896 // TODO: are all of those cases correct, and are there more? 
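    // E.g. a 16-bit add promoted to 32 bits: the low 16 bits of the result
    // depend only on the low 16 bits of the operands, so whatever an
    // any_extend leaves in the high half is discarded by the final truncate.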
6897 return ISD::ANY_EXTEND; 6898 case ISD::SETCC: { 6899 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 6900 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 6901 } 6902 default: 6903 llvm_unreachable("unexpected opcode!"); 6904 } 6905 } 6906 6907 SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op, 6908 DAGCombinerInfo &DCI) const { 6909 const unsigned Opc = Op.getOpcode(); 6910 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL || 6911 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND || 6912 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL || 6913 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN || 6914 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX); 6915 6916 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType() 6917 : Op->getOperand(0).getValueType(); 6918 auto ExtTy = OpTy.changeElementType(MVT::i32); 6919 6920 if (DCI.isBeforeLegalizeOps() || 6921 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy)) 6922 return SDValue(); 6923 6924 auto &DAG = DCI.DAG; 6925 6926 SDLoc DL(Op); 6927 SDValue LHS; 6928 SDValue RHS; 6929 if (Opc == ISD::SELECT) { 6930 LHS = Op->getOperand(1); 6931 RHS = Op->getOperand(2); 6932 } else { 6933 LHS = Op->getOperand(0); 6934 RHS = Op->getOperand(1); 6935 } 6936 6937 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op); 6938 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS}); 6939 6940 // Special case: for shifts, the RHS always needs a zext. 6941 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) 6942 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS}); 6943 else 6944 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS}); 6945 6946 // setcc always return i1/i1 vec so no need to truncate after. 6947 if (Opc == ISD::SETCC) { 6948 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 6949 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC); 6950 } 6951 6952 // For other ops, we extend the operation's return type as well so we need to 6953 // truncate back to the original type. 6954 SDValue NewVal; 6955 if (Opc == ISD::SELECT) 6956 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS}); 6957 else 6958 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS}); 6959 6960 return DAG.getZExtOrTrunc(NewVal, DL, OpTy); 6961 } 6962 6963 // Custom lowering for vector multiplications and s_mul_u64. 6964 SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const { 6965 EVT VT = Op.getValueType(); 6966 6967 // Split vector operands. 6968 if (VT.isVector()) 6969 return splitBinaryVectorOp(Op, DAG); 6970 6971 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64"); 6972 6973 // There are four ways to lower s_mul_u64: 6974 // 6975 // 1. If all the operands are uniform, then we lower it as it is. 6976 // 6977 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit 6978 // multiplications because there is not a vector equivalent of s_mul_u64. 6979 // 6980 // 3. If the cost model decides that it is more efficient to use vector 6981 // registers, then we have to split s_mul_u64 in 32-bit multiplications. 6982 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp . 6983 // 6984 // 4. If the cost model decides to use vector registers and both of the 6985 // operands are zero-extended/sign-extended from 32-bits, then we split the 6986 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not 6987 // possible to check if the operands are zero-extended or sign-extended in 6988 // SIInstrInfo.cpp. 
// For this reason, here, we replace s_mul_u64 with
  // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
  // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
  // If the cost model decides that we have to use vector registers, then
  // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
  // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
  // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
  // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
  // SIInstrInfo.cpp.

  if (Op->isDivergent())
    return SDValue();

  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
  // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
  // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
  KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
  unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
  KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
  unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
  SDLoc SL(Op);
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
  unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
  // If all the operands are uniform, then we lower s_mul_u64 as it is.
  return Op;
}

SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  bool isSigned = Op.getOpcode() == ISD::SMULO;

  if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
    const APInt &C = RHSC->getAPIntValue();
    // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
    if (C.isPowerOf2()) {
      // smulo(x, signed_min) is same as umulo(x, signed_min).
      bool UseArithShift = isSigned && !C.isMinSignedValue();
      SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
      SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
      SDValue Overflow =
          DAG.getSetCC(SL, MVT::i1,
                       DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
                                   Result, ShiftAmt),
                       LHS, ISD::SETNE);
      return DAG.getMergeValues({Result, Overflow}, SL);
    }
  }

  SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
  SDValue Top =
      DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);

  SDValue Sign = isSigned
                     ? DAG.getNode(ISD::SRA, SL, VT, Result,
                                   DAG.getConstant(VT.getScalarSizeInBits() - 1,
                                                   SL, MVT::i32))
                     : DAG.getConstant(0, SL, VT);
  SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);

  return DAG.getMergeValues({Result, Overflow}, SL);
}

SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
  if (Op->isDivergent()) {
    // Select to V_MAD_[IU]64_[IU]32.
    return Op;
  }
  if (Subtarget->hasSMulHi()) {
    // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
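    // Returning SDValue() here defers to the generic expansion, which splits
    // the [SU]MUL_LOHI into separate low and high multiplies that can then be
    // selected as scalar instructions.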
7068 return SDValue(); 7069 } 7070 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to 7071 // calculate the high part, so we might as well do the whole thing with 7072 // V_MAD_[IU]64_[IU]32. 7073 return Op; 7074 } 7075 7076 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { 7077 if (!Subtarget->isTrapHandlerEnabled() || 7078 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 7079 return lowerTrapEndpgm(Op, DAG); 7080 7081 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) 7082 : lowerTrapHsaQueuePtr(Op, DAG); 7083 } 7084 7085 SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const { 7086 SDLoc SL(Op); 7087 SDValue Chain = Op.getOperand(0); 7088 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain); 7089 } 7090 7091 SDValue 7092 SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, 7093 const SDLoc &DL, Align Alignment, 7094 ImplicitParameter Param) const { 7095 MachineFunction &MF = DAG.getMachineFunction(); 7096 uint64_t Offset = getImplicitParameterOffset(MF, Param); 7097 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset); 7098 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 7099 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment, 7100 MachineMemOperand::MODereferenceable | 7101 MachineMemOperand::MOInvariant); 7102 } 7103 7104 SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op, 7105 SelectionDAG &DAG) const { 7106 SDLoc SL(Op); 7107 SDValue Chain = Op.getOperand(0); 7108 7109 SDValue QueuePtr; 7110 // For code object version 5, QueuePtr is passed through implicit kernarg. 7111 const Module *M = DAG.getMachineFunction().getFunction().getParent(); 7112 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { 7113 QueuePtr = 7114 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR); 7115 } else { 7116 MachineFunction &MF = DAG.getMachineFunction(); 7117 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 7118 Register UserSGPR = Info->getQueuePtrUserSGPR(); 7119 7120 if (UserSGPR == AMDGPU::NoRegister) { 7121 // We probably are in a function incorrectly marked with 7122 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the 7123 // trap, so just use a null pointer. 7124 QueuePtr = DAG.getConstant(0, SL, MVT::i64); 7125 } else { 7126 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, 7127 MVT::i64); 7128 } 7129 } 7130 7131 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); 7132 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue()); 7133 7134 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); 7135 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01, 7136 ToReg.getValue(1)}; 7137 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); 7138 } 7139 7140 SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const { 7141 SDLoc SL(Op); 7142 SDValue Chain = Op.getOperand(0); 7143 7144 // We need to simulate the 's_trap 2' instruction on targets that run in 7145 // PRIV=1 (where it is treated as a nop). 
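  // In that mode the trap handler is never invoked and a plain s_trap would
  // silently fall through, so the simulated-trap node gives a later expansion
  // the chance to emulate the trap's effect instead.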
7146 if (Subtarget->hasPrivEnabledTrap2NopBug()) 7147 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain); 7148 7149 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); 7150 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)}; 7151 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); 7152 } 7153 7154 SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { 7155 SDLoc SL(Op); 7156 SDValue Chain = Op.getOperand(0); 7157 MachineFunction &MF = DAG.getMachineFunction(); 7158 7159 if (!Subtarget->isTrapHandlerEnabled() || 7160 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 7161 DiagnosticInfoUnsupported NoTrap(MF.getFunction(), 7162 "debugtrap handler not supported", 7163 Op.getDebugLoc(), DS_Warning); 7164 LLVMContext &Ctx = MF.getFunction().getContext(); 7165 Ctx.diagnose(NoTrap); 7166 return Chain; 7167 } 7168 7169 uint64_t TrapID = 7170 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap); 7171 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)}; 7172 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); 7173 } 7174 7175 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, 7176 SelectionDAG &DAG) const { 7177 if (Subtarget->hasApertureRegs()) { 7178 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) 7179 ? AMDGPU::SRC_SHARED_BASE 7180 : AMDGPU::SRC_PRIVATE_BASE; 7181 // Note: this feature (register) is broken. When used as a 32-bit operand, 7182 // it returns a wrong value (all zeroes?). The real value is in the upper 32 7183 // bits. 7184 // 7185 // To work around the issue, directly emit a 64 bit mov from this register 7186 // then extract the high bits. Note that this shouldn't even result in a 7187 // shift being emitted and simply become a pair of registers (e.g.): 7188 // s_mov_b64 s[6:7], src_shared_base 7189 // v_mov_b32_e32 v1, s7 7190 // 7191 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy 7192 // coalescing would kick in and it would think it's okay to use the "HI" 7193 // subregister directly (instead of extracting the HI 32 bits) which is an 7194 // artificial (unusable) register. 7195 // Register TableGen definitions would need an overhaul to get rid of the 7196 // artificial "HI" aperture registers and prevent this kind of issue from 7197 // happening. 7198 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, 7199 DAG.getRegister(ApertureRegNo, MVT::i64)); 7200 return DAG.getNode( 7201 ISD::TRUNCATE, DL, MVT::i32, 7202 DAG.getNode(ISD::SRL, DL, MVT::i64, 7203 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)})); 7204 } 7205 7206 // For code object version 5, private_base and shared_base are passed through 7207 // implicit kernargs. 7208 const Module *M = DAG.getMachineFunction().getFunction().getParent(); 7209 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { 7210 ImplicitParameter Param = 7211 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE; 7212 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param); 7213 } 7214 7215 MachineFunction &MF = DAG.getMachineFunction(); 7216 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 7217 Register UserSGPR = Info->getQueuePtrUserSGPR(); 7218 if (UserSGPR == AMDGPU::NoRegister) { 7219 // We probably are in a function incorrectly marked with 7220 // amdgpu-no-queue-ptr. This is undefined. 
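    // Returning undef keeps lowering going; the aperture (and any flat
    // address built from it) is meaningless in this situation anyway.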
7221 return DAG.getUNDEF(MVT::i32); 7222 } 7223 7224 SDValue QueuePtr = 7225 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); 7226 7227 // Offset into amd_queue_t for group_segment_aperture_base_hi / 7228 // private_segment_aperture_base_hi. 7229 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 7230 7231 SDValue Ptr = 7232 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset)); 7233 7234 // TODO: Use custom target PseudoSourceValue. 7235 // TODO: We should use the value from the IR intrinsic call, but it might not 7236 // be available and how do we get it? 7237 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 7238 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo, 7239 commonAlignment(Align(64), StructOffset), 7240 MachineMemOperand::MODereferenceable | 7241 MachineMemOperand::MOInvariant); 7242 } 7243 7244 /// Return true if the value is a known valid address, such that a null check is 7245 /// not necessary. 7246 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG, 7247 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 7248 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) || 7249 isa<BasicBlockSDNode>(Val)) 7250 return true; 7251 7252 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val)) 7253 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace); 7254 7255 // TODO: Search through arithmetic, handle arguments and loads 7256 // marked nonnull. 7257 return false; 7258 } 7259 7260 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, 7261 SelectionDAG &DAG) const { 7262 SDLoc SL(Op); 7263 7264 const AMDGPUTargetMachine &TM = 7265 static_cast<const AMDGPUTargetMachine &>(getTargetMachine()); 7266 7267 unsigned DestAS, SrcAS; 7268 SDValue Src; 7269 bool IsNonNull = false; 7270 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) { 7271 SrcAS = ASC->getSrcAddressSpace(); 7272 Src = ASC->getOperand(0); 7273 DestAS = ASC->getDestAddressSpace(); 7274 } else { 7275 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN && 7276 Op.getConstantOperandVal(0) == 7277 Intrinsic::amdgcn_addrspacecast_nonnull); 7278 Src = Op->getOperand(1); 7279 SrcAS = Op->getConstantOperandVal(2); 7280 DestAS = Op->getConstantOperandVal(3); 7281 IsNonNull = true; 7282 } 7283 7284 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); 7285 7286 // flat -> local/private 7287 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 7288 if (DestAS == AMDGPUAS::LOCAL_ADDRESS || 7289 DestAS == AMDGPUAS::PRIVATE_ADDRESS) { 7290 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); 7291 7292 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS)) 7293 return Ptr; 7294 7295 unsigned NullVal = TM.getNullPointerValue(DestAS); 7296 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); 7297 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); 7298 7299 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr, 7300 SegmentNullPtr); 7301 } 7302 } 7303 7304 // local/private -> flat 7305 if (DestAS == AMDGPUAS::FLAT_ADDRESS) { 7306 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 7307 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) { 7308 7309 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG); 7310 SDValue CvtPtr = 7311 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); 7312 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr); 7313 7314 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS)) 7315 return CvtPtr; 7316 7317 unsigned NullVal = TM.getNullPointerValue(SrcAS); 7318 SDValue 
SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); 7319 7320 SDValue NonNull = 7321 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); 7322 7323 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr, 7324 FlatNullPtr); 7325 } 7326 } 7327 7328 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 7329 Op.getValueType() == MVT::i64) { 7330 const SIMachineFunctionInfo *Info = 7331 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>(); 7332 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32); 7333 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi); 7334 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 7335 } 7336 7337 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 7338 Src.getValueType() == MVT::i64) 7339 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); 7340 7341 // global <-> flat are no-ops and never emitted. 7342 7343 const MachineFunction &MF = DAG.getMachineFunction(); 7344 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 7345 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); 7346 DAG.getContext()->diagnose(InvalidAddrSpaceCast); 7347 7348 return DAG.getUNDEF(Op->getValueType(0)); 7349 } 7350 7351 // This lowers an INSERT_SUBVECTOR by extracting the individual elements from 7352 // the small vector and inserting them into the big vector. That is better than 7353 // the default expansion of doing it via a stack slot. Even though the use of 7354 // the stack slot would be optimized away afterwards, the stack slot itself 7355 // remains. 7356 SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, 7357 SelectionDAG &DAG) const { 7358 SDValue Vec = Op.getOperand(0); 7359 SDValue Ins = Op.getOperand(1); 7360 SDValue Idx = Op.getOperand(2); 7361 EVT VecVT = Vec.getValueType(); 7362 EVT InsVT = Ins.getValueType(); 7363 EVT EltVT = VecVT.getVectorElementType(); 7364 unsigned InsNumElts = InsVT.getVectorNumElements(); 7365 unsigned IdxVal = Idx->getAsZExtVal(); 7366 SDLoc SL(Op); 7367 7368 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) { 7369 // Insert 32-bit registers at a time. 7370 assert(InsNumElts % 2 == 0 && "expect legal vector types"); 7371 7372 unsigned VecNumElts = VecVT.getVectorNumElements(); 7373 EVT NewVecVT = 7374 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2); 7375 EVT NewInsVT = InsNumElts == 2 ? 
MVT::i32 7376 : EVT::getVectorVT(*DAG.getContext(), 7377 MVT::i32, InsNumElts / 2); 7378 7379 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec); 7380 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins); 7381 7382 for (unsigned I = 0; I != InsNumElts / 2; ++I) { 7383 SDValue Elt; 7384 if (InsNumElts == 2) { 7385 Elt = Ins; 7386 } else { 7387 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins, 7388 DAG.getConstant(I, SL, MVT::i32)); 7389 } 7390 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt, 7391 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32)); 7392 } 7393 7394 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec); 7395 } 7396 7397 for (unsigned I = 0; I != InsNumElts; ++I) { 7398 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins, 7399 DAG.getConstant(I, SL, MVT::i32)); 7400 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt, 7401 DAG.getConstant(IdxVal + I, SL, MVT::i32)); 7402 } 7403 return Vec; 7404 } 7405 7406 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, 7407 SelectionDAG &DAG) const { 7408 SDValue Vec = Op.getOperand(0); 7409 SDValue InsVal = Op.getOperand(1); 7410 SDValue Idx = Op.getOperand(2); 7411 EVT VecVT = Vec.getValueType(); 7412 EVT EltVT = VecVT.getVectorElementType(); 7413 unsigned VecSize = VecVT.getSizeInBits(); 7414 unsigned EltSize = EltVT.getSizeInBits(); 7415 SDLoc SL(Op); 7416 7417 // Specially handle the case of v4i16 with static indexing. 7418 unsigned NumElts = VecVT.getVectorNumElements(); 7419 auto *KIdx = dyn_cast<ConstantSDNode>(Idx); 7420 if (NumElts == 4 && EltSize == 16 && KIdx) { 7421 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec); 7422 7423 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec, 7424 DAG.getConstant(0, SL, MVT::i32)); 7425 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec, 7426 DAG.getConstant(1, SL, MVT::i32)); 7427 7428 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf); 7429 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf); 7430 7431 unsigned Idx = KIdx->getZExtValue(); 7432 bool InsertLo = Idx < 2; 7433 SDValue InsHalf = DAG.getNode( 7434 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec, 7435 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal), 7436 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32)); 7437 7438 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf); 7439 7440 SDValue Concat = 7441 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf}) 7442 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf}); 7443 7444 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat); 7445 } 7446 7447 // Static indexing does not lower to stack access, and hence there is no need 7448 // for special custom lowering to avoid stack access. 7449 if (isa<ConstantSDNode>(Idx)) 7450 return SDValue(); 7451 7452 // Avoid stack access for dynamic indexing by custom lowering to 7453 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec 7454 7455 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits"); 7456 7457 MVT IntVT = MVT::getIntegerVT(VecSize); 7458 7459 // Convert vector index to bit-index and get the required bit mask. 
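  // For example, inserting Val into element Idx of a v4i16 works out to
  // roughly:
  //   ScaledIdx = Idx << 4                  ; bit offset of the element
  //   BFM       = 0xffff << ScaledIdx       ; mask covering that element
  //   Result    = (splat(Val) & BFM) | (Vec & ~BFM)
  // i.e. a bitfield insert, with no stack traffic for the dynamic index.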
7460 assert(isPowerOf2_32(EltSize)); 7461 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize); 7462 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); 7463 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); 7464 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT, 7465 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx); 7466 7467 // 1. Create a congruent vector with the target value in each element. 7468 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, 7469 DAG.getSplatBuildVector(VecVT, SL, InsVal)); 7470 7471 // 2. Mask off all other indices except the required index within (1). 7472 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal); 7473 7474 // 3. Mask off the required index within the target vector. 7475 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); 7476 SDValue RHS = 7477 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec); 7478 7479 // 4. Get (2) and (3) ORed into the target vector. 7480 SDValue BFI = 7481 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint); 7482 7483 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI); 7484 } 7485 7486 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, 7487 SelectionDAG &DAG) const { 7488 SDLoc SL(Op); 7489 7490 EVT ResultVT = Op.getValueType(); 7491 SDValue Vec = Op.getOperand(0); 7492 SDValue Idx = Op.getOperand(1); 7493 EVT VecVT = Vec.getValueType(); 7494 unsigned VecSize = VecVT.getSizeInBits(); 7495 EVT EltVT = VecVT.getVectorElementType(); 7496 7497 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); 7498 7499 // Make sure we do any optimizations that will make it easier to fold 7500 // source modifiers before obscuring it with bit operations. 7501 7502 // XXX - Why doesn't this get called when vector_shuffle is expanded? 
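  // (e.g. extracting an element of an fneg'd vector should stay an fneg of
  // the scalar element rather than become shifts of the sign-flipped bits.)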
7503 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) 7504 return Combined; 7505 7506 if (VecSize == 128 || VecSize == 256 || VecSize == 512) { 7507 SDValue Lo, Hi; 7508 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT); 7509 7510 if (VecSize == 128) { 7511 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec); 7512 Lo = DAG.getBitcast(LoVT, 7513 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, 7514 DAG.getConstant(0, SL, MVT::i32))); 7515 Hi = DAG.getBitcast(HiVT, 7516 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, 7517 DAG.getConstant(1, SL, MVT::i32))); 7518 } else if (VecSize == 256) { 7519 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec); 7520 SDValue Parts[4]; 7521 for (unsigned P = 0; P < 4; ++P) { 7522 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, 7523 DAG.getConstant(P, SL, MVT::i32)); 7524 } 7525 7526 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64, 7527 Parts[0], Parts[1])); 7528 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64, 7529 Parts[2], Parts[3])); 7530 } else { 7531 assert(VecSize == 512); 7532 7533 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec); 7534 SDValue Parts[8]; 7535 for (unsigned P = 0; P < 8; ++P) { 7536 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, 7537 DAG.getConstant(P, SL, MVT::i32)); 7538 } 7539 7540 Lo = DAG.getBitcast(LoVT, 7541 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, 7542 Parts[0], Parts[1], Parts[2], Parts[3])); 7543 Hi = DAG.getBitcast(HiVT, 7544 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, 7545 Parts[4], Parts[5], Parts[6], Parts[7])); 7546 } 7547 7548 EVT IdxVT = Idx.getValueType(); 7549 unsigned NElem = VecVT.getVectorNumElements(); 7550 assert(isPowerOf2_32(NElem)); 7551 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT); 7552 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask); 7553 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT); 7554 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx); 7555 } 7556 7557 assert(VecSize <= 64); 7558 7559 MVT IntVT = MVT::getIntegerVT(VecSize); 7560 7561 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly. 
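  // e.g. (extract_vector_elt (bitcast (scalar_to_vector x)), idx) can shift
  // and truncate x itself instead of first materializing a vector.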
7562 SDValue VecBC = peekThroughBitcasts(Vec); 7563 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) { 7564 SDValue Src = VecBC.getOperand(0); 7565 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src); 7566 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT); 7567 } 7568 7569 unsigned EltSize = EltVT.getSizeInBits(); 7570 assert(isPowerOf2_32(EltSize)); 7571 7572 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); 7573 7574 // Convert vector index to bit-index (* EltSize) 7575 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); 7576 7577 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); 7578 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx); 7579 7580 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) { 7581 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt); 7582 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); 7583 } 7584 7585 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT); 7586 } 7587 7588 static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) { 7589 assert(Elt % 2 == 0); 7590 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0); 7591 } 7592 7593 static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) { 7594 assert(Elt % 2 == 0); 7595 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) && 7596 !(Mask[Elt + 1] & 1); 7597 } 7598 7599 SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, 7600 SelectionDAG &DAG) const { 7601 SDLoc SL(Op); 7602 EVT ResultVT = Op.getValueType(); 7603 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op); 7604 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT(); 7605 const int NewSrcNumElts = 2; 7606 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts); 7607 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements(); 7608 7609 // Break up the shuffle into registers sized pieces. 7610 // 7611 // We're trying to form sub-shuffles that the register allocation pipeline 7612 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register 7613 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a 7614 // pair of copies into a consecutive register copy, so use the ordinary 7615 // extract_vector_elt lowering unless we can use the shuffle. 7616 // 7617 // TODO: This is a bit of hack, and we should probably always use 7618 // extract_subvector for the largest possible subvector we can (or at least 7619 // use it for PackVT aligned pieces). However we have worse support for 7620 // combines on them don't directly treat extract_subvector / insert_subvector 7621 // as legal. The DAG scheduler also ends up doing a worse job with the 7622 // extract_subvectors. 7623 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16; 7624 7625 // vector_shuffle <0,1,6,7> lhs, rhs 7626 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2) 7627 // 7628 // vector_shuffle <6,7,2,3> lhs, rhs 7629 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2) 7630 // 7631 // vector_shuffle <6,7,0,1> lhs, rhs 7632 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0) 7633 7634 // Avoid scalarizing when both halves are reading from consecutive elements. 7635 7636 // If we're treating 2 element shuffles as legal, also create odd-to-even 7637 // shuffles of neighboring pairs. 
7638 // 7639 // vector_shuffle <3,2,7,6> lhs, rhs 7640 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0) 7641 // vector_shuffle <1, 0> (extract_subvector rhs, 2) 7642 7643 SmallVector<SDValue, 16> Pieces; 7644 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) { 7645 if (ShouldUseConsecutiveExtract && 7646 elementPairIsContiguous(SVN->getMask(), I)) { 7647 const int Idx = SVN->getMaskElt(I); 7648 int VecIdx = Idx < SrcNumElts ? 0 : 1; 7649 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts; 7650 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, 7651 SVN->getOperand(VecIdx), 7652 DAG.getConstant(EltIdx, SL, MVT::i32)); 7653 Pieces.push_back(SubVec); 7654 } else if (elementPairIsOddToEven(SVN->getMask(), I) && 7655 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) { 7656 int Idx0 = SVN->getMaskElt(I); 7657 int Idx1 = SVN->getMaskElt(I + 1); 7658 7659 SDValue SrcOp0 = SVN->getOperand(0); 7660 SDValue SrcOp1 = SrcOp0; 7661 if (Idx0 >= SrcNumElts) { 7662 SrcOp0 = SVN->getOperand(1); 7663 Idx0 -= SrcNumElts; 7664 } 7665 7666 if (Idx1 >= SrcNumElts) { 7667 SrcOp1 = SVN->getOperand(1); 7668 Idx1 -= SrcNumElts; 7669 } 7670 7671 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1); 7672 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1); 7673 7674 // Extract nearest even aligned piece. 7675 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0, 7676 DAG.getConstant(AlignedIdx0, SL, MVT::i32)); 7677 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1, 7678 DAG.getConstant(AlignedIdx1, SL, MVT::i32)); 7679 7680 int NewMaskIdx0 = Idx0 - AlignedIdx0; 7681 int NewMaskIdx1 = Idx1 - AlignedIdx1; 7682 7683 SDValue Result0 = SubVec0; 7684 SDValue Result1 = SubVec0; 7685 7686 if (SubVec0 != SubVec1) { 7687 NewMaskIdx1 += NewSrcNumElts; 7688 Result1 = SubVec1; 7689 } else { 7690 Result1 = DAG.getUNDEF(PackVT); 7691 } 7692 7693 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1, 7694 {NewMaskIdx0, NewMaskIdx1}); 7695 Pieces.push_back(Shuf); 7696 } else { 7697 const int Idx0 = SVN->getMaskElt(I); 7698 const int Idx1 = SVN->getMaskElt(I + 1); 7699 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1; 7700 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1; 7701 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts; 7702 int EltIdx1 = Idx1 < SrcNumElts ? 
Idx1 : Idx1 - SrcNumElts; 7703 7704 SDValue Vec0 = SVN->getOperand(VecIdx0); 7705 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0, 7706 DAG.getSignedConstant(EltIdx0, SL, MVT::i32)); 7707 7708 SDValue Vec1 = SVN->getOperand(VecIdx1); 7709 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1, 7710 DAG.getSignedConstant(EltIdx1, SL, MVT::i32)); 7711 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1})); 7712 } 7713 } 7714 7715 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces); 7716 } 7717 7718 SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op, 7719 SelectionDAG &DAG) const { 7720 SDValue SVal = Op.getOperand(0); 7721 EVT ResultVT = Op.getValueType(); 7722 EVT SValVT = SVal.getValueType(); 7723 SDValue UndefVal = DAG.getUNDEF(SValVT); 7724 SDLoc SL(Op); 7725 7726 SmallVector<SDValue, 8> VElts; 7727 VElts.push_back(SVal); 7728 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I) 7729 VElts.push_back(UndefVal); 7730 7731 return DAG.getBuildVector(ResultVT, SL, VElts); 7732 } 7733 7734 SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, 7735 SelectionDAG &DAG) const { 7736 SDLoc SL(Op); 7737 EVT VT = Op.getValueType(); 7738 7739 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) { 7740 assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); 7741 7742 SDValue Lo = Op.getOperand(0); 7743 SDValue Hi = Op.getOperand(1); 7744 7745 // Avoid adding defined bits with the zero_extend. 7746 if (Hi.isUndef()) { 7747 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo); 7748 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo); 7749 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo); 7750 } 7751 7752 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi); 7753 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi); 7754 7755 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi, 7756 DAG.getConstant(16, SL, MVT::i32)); 7757 if (Lo.isUndef()) 7758 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi); 7759 7760 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo); 7761 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo); 7762 7763 SDValue Or = 7764 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint); 7765 return DAG.getNode(ISD::BITCAST, SL, VT, Or); 7766 } 7767 7768 // Split into 2-element chunks. 7769 const unsigned NumParts = VT.getVectorNumElements() / 2; 7770 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2); 7771 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits()); 7772 7773 SmallVector<SDValue> Casts; 7774 for (unsigned P = 0; P < NumParts; ++P) { 7775 SDValue Vec = DAG.getBuildVector( 7776 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)}); 7777 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec)); 7778 } 7779 7780 SDValue Blend = 7781 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts); 7782 return DAG.getNode(ISD::BITCAST, SL, VT, Blend); 7783 } 7784 7785 bool SITargetLowering::isOffsetFoldingLegal( 7786 const GlobalAddressSDNode *GA) const { 7787 // OSes that use ELF REL relocations (instead of RELA) can only store a 7788 // 32-bit addend in the instruction, so it is not safe to allow offset folding 7789 // which can create arbitrary 64-bit addends. (This is only a problem for 7790 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by 7791 // the high 32 bits of the addend.) 
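  // For example (illustrative): folding an offset like gv + 0x1'0000'0000
  // would need a 64-bit addend; RELA can carry it in r_addend, but a REL-style
  // addend stored in the 32-bit instruction field cannot represent it.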
7792 // 7793 // This should be kept in sync with how HasRelocationAddend is initialized in 7794 // the constructor of ELFAMDGPUAsmBackend. 7795 if (!Subtarget->isAmdHsaOS()) 7796 return false; 7797 7798 // We can fold offsets for anything that doesn't require a GOT relocation. 7799 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || 7800 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || 7801 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && 7802 !shouldEmitGOTReloc(GA->getGlobal()); 7803 } 7804 7805 static SDValue 7806 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, 7807 const SDLoc &DL, int64_t Offset, EVT PtrVT, 7808 unsigned GAFlags = SIInstrInfo::MO_NONE) { 7809 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 7810 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is 7811 // lowered to the following code sequence: 7812 // 7813 // For constant address space: 7814 // s_getpc_b64 s[0:1] 7815 // s_add_u32 s0, s0, $symbol 7816 // s_addc_u32 s1, s1, 0 7817 // 7818 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 7819 // a fixup or relocation is emitted to replace $symbol with a literal 7820 // constant, which is a pc-relative offset from the encoding of the $symbol 7821 // operand to the global variable. 7822 // 7823 // For global address space: 7824 // s_getpc_b64 s[0:1] 7825 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 7826 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 7827 // 7828 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 7829 // fixups or relocations are emitted to replace $symbol@*@lo and 7830 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 7831 // which is a 64-bit pc-relative offset from the encoding of the $symbol 7832 // operand to the global variable. 7833 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags); 7834 SDValue PtrHi; 7835 if (GAFlags == SIInstrInfo::MO_NONE) 7836 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32); 7837 else 7838 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1); 7839 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi); 7840 } 7841 7842 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, 7843 SDValue Op, 7844 SelectionDAG &DAG) const { 7845 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); 7846 SDLoc DL(GSD); 7847 EVT PtrVT = Op.getValueType(); 7848 7849 const GlobalValue *GV = GSD->getGlobal(); 7850 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && 7851 shouldUseLDSConstAddress(GV)) || 7852 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS || 7853 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { 7854 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && 7855 GV->hasExternalLinkage()) { 7856 Type *Ty = GV->getValueType(); 7857 // HIP uses an unsized array `extern __shared__ T s[]` or similar 7858 // zero-sized type in other languages to declare the dynamic shared 7859 // memory which size is not known at the compile time. They will be 7860 // allocated by the runtime and placed directly after the static 7861 // allocated ones. They all share the same offset. 7862 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) { 7863 assert(PtrVT == MVT::i32 && "32-bit pointer is expected."); 7864 // Adjust alignment for that dynamic shared memory array. 
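        // For illustration: a HIP declaration such as
        //   extern __shared__ float dynShared[];
        // produces a zero-sized LDS global here; its address is the total
        // static LDS size, which GET_GROUPSTATICSIZE materializes below.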
7865 Function &F = DAG.getMachineFunction().getFunction(); 7866 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV)); 7867 MFI->setUsesDynamicLDS(true); 7868 return SDValue( 7869 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0); 7870 } 7871 } 7872 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); 7873 } 7874 7875 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { 7876 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(), 7877 SIInstrInfo::MO_ABS32_LO); 7878 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA); 7879 } 7880 7881 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) { 7882 SDValue AddrLo = DAG.getTargetGlobalAddress( 7883 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO); 7884 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0}; 7885 7886 SDValue AddrHi = DAG.getTargetGlobalAddress( 7887 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI); 7888 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0}; 7889 7890 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi); 7891 } 7892 7893 if (shouldEmitFixup(GV)) 7894 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); 7895 7896 if (shouldEmitPCReloc(GV)) 7897 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT, 7898 SIInstrInfo::MO_REL32); 7899 7900 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT, 7901 SIInstrInfo::MO_GOTPCREL32); 7902 PointerType *PtrTy = 7903 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS); 7904 const DataLayout &DataLayout = DAG.getDataLayout(); 7905 Align Alignment = DataLayout.getABITypeAlign(PtrTy); 7906 MachinePointerInfo PtrInfo = 7907 MachinePointerInfo::getGOT(DAG.getMachineFunction()); 7908 7909 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment, 7910 MachineMemOperand::MODereferenceable | 7911 MachineMemOperand::MOInvariant); 7912 } 7913 7914 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, 7915 const SDLoc &DL, SDValue V) const { 7916 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as 7917 // the destination register. 7918 // 7919 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, 7920 // so we will end up with redundant moves to m0. 7921 // 7922 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result. 7923 7924 // A Null SDValue creates a glue result. 7925 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue, 7926 V, Chain); 7927 return SDValue(M0, 0); 7928 } 7929 7930 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, 7931 MVT VT, 7932 unsigned Offset) const { 7933 SDLoc SL(Op); 7934 SDValue Param = lowerKernargMemParameter( 7935 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false); 7936 // The local size values will have the hi 16-bits as zero. 
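  // For example (illustrative): a local size of 256 loads the dword
  // 0x00000100; the AssertZext below records that bits 31:16 are known zero,
  // so later combines can narrow uses to 16 bits.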
7937 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, 7938 DAG.getValueType(VT)); 7939 } 7940 7941 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, 7942 EVT VT) { 7943 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(), 7944 "non-hsa intrinsic with hsa target", 7945 DL.getDebugLoc()); 7946 DAG.getContext()->diagnose(BadIntrin); 7947 return DAG.getUNDEF(VT); 7948 } 7949 7950 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, 7951 EVT VT) { 7952 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(), 7953 "intrinsic not supported on subtarget", 7954 DL.getDebugLoc()); 7955 DAG.getContext()->diagnose(BadIntrin); 7956 return DAG.getUNDEF(VT); 7957 } 7958 7959 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, 7960 ArrayRef<SDValue> Elts) { 7961 assert(!Elts.empty()); 7962 MVT Type; 7963 unsigned NumElts = Elts.size(); 7964 7965 if (NumElts <= 12) { 7966 Type = MVT::getVectorVT(MVT::f32, NumElts); 7967 } else { 7968 assert(Elts.size() <= 16); 7969 Type = MVT::v16f32; 7970 NumElts = 16; 7971 } 7972 7973 SmallVector<SDValue, 16> VecElts(NumElts); 7974 for (unsigned i = 0; i < Elts.size(); ++i) { 7975 SDValue Elt = Elts[i]; 7976 if (Elt.getValueType() != MVT::f32) 7977 Elt = DAG.getBitcast(MVT::f32, Elt); 7978 VecElts[i] = Elt; 7979 } 7980 for (unsigned i = Elts.size(); i < NumElts; ++i) 7981 VecElts[i] = DAG.getUNDEF(MVT::f32); 7982 7983 if (NumElts == 1) 7984 return VecElts[0]; 7985 return DAG.getBuildVector(Type, DL, VecElts); 7986 } 7987 7988 static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, 7989 SDValue Src, int ExtraElts) { 7990 EVT SrcVT = Src.getValueType(); 7991 7992 SmallVector<SDValue, 8> Elts; 7993 7994 if (SrcVT.isVector()) 7995 DAG.ExtractVectorElements(Src, Elts); 7996 else 7997 Elts.push_back(Src); 7998 7999 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType()); 8000 while (ExtraElts--) 8001 Elts.push_back(Undef); 8002 8003 return DAG.getBuildVector(CastVT, DL, Elts); 8004 } 8005 8006 // Re-construct the required return value for a image load intrinsic. 8007 // This is more complicated due to the optional use TexFailCtrl which means the 8008 // required return type is an aggregate 8009 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, 8010 ArrayRef<EVT> ResultTypes, bool IsTexFail, 8011 bool Unpacked, bool IsD16, int DMaskPop, 8012 int NumVDataDwords, bool IsAtomicPacked16Bit, 8013 const SDLoc &DL) { 8014 // Determine the required return type. This is the same regardless of 8015 // IsTexFail flag 8016 EVT ReqRetVT = ResultTypes[0]; 8017 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1; 8018 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit) 8019 ? (ReqRetNumElts + 1) / 2 8020 : ReqRetNumElts; 8021 8022 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2; 8023 8024 MVT DataDwordVT = 8025 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords); 8026 8027 MVT MaskPopVT = 8028 MaskPopDwords == 1 ? 
MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords); 8029 8030 SDValue Data(Result, 0); 8031 SDValue TexFail; 8032 8033 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) { 8034 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32); 8035 if (MaskPopVT.isVector()) { 8036 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT, 8037 SDValue(Result, 0), ZeroIdx); 8038 } else { 8039 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT, 8040 SDValue(Result, 0), ZeroIdx); 8041 } 8042 } 8043 8044 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit) 8045 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data, 8046 NumDataDwords - MaskPopDwords); 8047 8048 if (IsD16) 8049 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked); 8050 8051 EVT LegalReqRetVT = ReqRetVT; 8052 if (!ReqRetVT.isVector()) { 8053 if (!Data.getValueType().isInteger()) 8054 Data = DAG.getNode(ISD::BITCAST, DL, 8055 Data.getValueType().changeTypeToInteger(), Data); 8056 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data); 8057 } else { 8058 // We need to widen the return vector to a legal type 8059 if ((ReqRetVT.getVectorNumElements() % 2) == 1 && 8060 ReqRetVT.getVectorElementType().getSizeInBits() == 16) { 8061 LegalReqRetVT = 8062 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(), 8063 ReqRetVT.getVectorNumElements() + 1); 8064 } 8065 } 8066 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data); 8067 8068 if (IsTexFail) { 8069 TexFail = 8070 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0), 8071 DAG.getConstant(MaskPopDwords, DL, MVT::i32)); 8072 8073 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL); 8074 } 8075 8076 if (Result->getNumValues() == 1) 8077 return Data; 8078 8079 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL); 8080 } 8081 8082 static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, 8083 SDValue *LWE, bool &IsTexFail) { 8084 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode()); 8085 8086 uint64_t Value = TexFailCtrlConst->getZExtValue(); 8087 if (Value) { 8088 IsTexFail = true; 8089 } 8090 8091 SDLoc DL(TexFailCtrlConst); 8092 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32); 8093 Value &= ~(uint64_t)0x1; 8094 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32); 8095 Value &= ~(uint64_t)0x2; 8096 8097 return Value == 0; 8098 } 8099 8100 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op, 8101 MVT PackVectorVT, 8102 SmallVectorImpl<SDValue> &PackedAddrs, 8103 unsigned DimIdx, unsigned EndIdx, 8104 unsigned NumGradients) { 8105 SDLoc DL(Op); 8106 for (unsigned I = DimIdx; I < EndIdx; I++) { 8107 SDValue Addr = Op.getOperand(I); 8108 8109 // Gradients are packed with undef for each coordinate. 
8110 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this: 8111 // 1D: undef,dx/dh; undef,dx/dv 8112 // 2D: dy/dh,dx/dh; dy/dv,dx/dv 8113 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv 8114 if (((I + 1) >= EndIdx) || 8115 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 || 8116 I == DimIdx + NumGradients - 1))) { 8117 if (Addr.getValueType() != MVT::i16) 8118 Addr = DAG.getBitcast(MVT::i16, Addr); 8119 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr); 8120 } else { 8121 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)}); 8122 I++; 8123 } 8124 Addr = DAG.getBitcast(MVT::f32, Addr); 8125 PackedAddrs.push_back(Addr); 8126 } 8127 } 8128 8129 SDValue SITargetLowering::lowerImage(SDValue Op, 8130 const AMDGPU::ImageDimIntrinsicInfo *Intr, 8131 SelectionDAG &DAG, bool WithChain) const { 8132 SDLoc DL(Op); 8133 MachineFunction &MF = DAG.getMachineFunction(); 8134 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>(); 8135 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 8136 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 8137 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); 8138 unsigned IntrOpcode = Intr->BaseOpcode; 8139 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); 8140 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); 8141 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); 8142 8143 SmallVector<EVT, 3> ResultTypes(Op->values()); 8144 SmallVector<EVT, 3> OrigResultTypes(Op->values()); 8145 bool IsD16 = false; 8146 bool IsG16 = false; 8147 bool IsA16 = false; 8148 SDValue VData; 8149 int NumVDataDwords = 0; 8150 bool AdjustRetType = false; 8151 bool IsAtomicPacked16Bit = false; 8152 8153 // Offset of intrinsic arguments 8154 const unsigned ArgOffset = WithChain ? 2 : 1; 8155 8156 unsigned DMask; 8157 unsigned DMaskLanes = 0; 8158 8159 if (BaseOpcode->Atomic) { 8160 VData = Op.getOperand(2); 8161 8162 IsAtomicPacked16Bit = 8163 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || 8164 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); 8165 8166 bool Is64Bit = VData.getValueSizeInBits() == 64; 8167 if (BaseOpcode->AtomicX2) { 8168 SDValue VData2 = Op.getOperand(3); 8169 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL, 8170 {VData, VData2}); 8171 if (Is64Bit) 8172 VData = DAG.getBitcast(MVT::v4i32, VData); 8173 8174 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; 8175 DMask = Is64Bit ? 0xf : 0x3; 8176 NumVDataDwords = Is64Bit ? 4 : 2; 8177 } else { 8178 DMask = Is64Bit ? 0x3 : 0x1; 8179 NumVDataDwords = Is64Bit ? 2 : 1; 8180 } 8181 } else { 8182 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex); 8183 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask); 8184 8185 if (BaseOpcode->Store) { 8186 VData = Op.getOperand(2); 8187 8188 MVT StoreVT = VData.getSimpleValueType(); 8189 if (StoreVT.getScalarType() == MVT::f16) { 8190 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) 8191 return Op; // D16 is unsupported for this instruction 8192 8193 IsD16 = true; 8194 VData = handleD16VData(VData, DAG, true); 8195 } 8196 8197 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32; 8198 } else if (!BaseOpcode->NoReturn) { 8199 // Work out the num dwords based on the dmask popcount and underlying type 8200 // and whether packing is supported. 
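      // For example (illustrative): dmask = 0b0111 gives DMaskLanes = 3, so a
      // d16 load needs (3 + 1) / 2 = 2 dwords when packing is supported and 3
      // dwords when it is not.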
8201 MVT LoadVT = ResultTypes[0].getSimpleVT(); 8202 if (LoadVT.getScalarType() == MVT::f16) { 8203 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) 8204 return Op; // D16 is unsupported for this instruction 8205 8206 IsD16 = true; 8207 } 8208 8209 // Confirm that the return type is large enough for the dmask specified 8210 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) || 8211 (!LoadVT.isVector() && DMaskLanes > 1)) 8212 return Op; 8213 8214 // The sq block of gfx8 and gfx9 do not estimate register use correctly 8215 // for d16 image_gather4, image_gather4_l, and image_gather4_lz 8216 // instructions. 8217 if (IsD16 && !Subtarget->hasUnpackedD16VMem() && 8218 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug())) 8219 NumVDataDwords = (DMaskLanes + 1) / 2; 8220 else 8221 NumVDataDwords = DMaskLanes; 8222 8223 AdjustRetType = true; 8224 } 8225 } 8226 8227 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd; 8228 SmallVector<SDValue, 4> VAddrs; 8229 8230 // Check for 16 bit addresses or derivatives and pack if true. 8231 MVT VAddrVT = 8232 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType(); 8233 MVT VAddrScalarVT = VAddrVT.getScalarType(); 8234 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; 8235 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; 8236 8237 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType(); 8238 VAddrScalarVT = VAddrVT.getScalarType(); 8239 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; 8240 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; 8241 8242 // Push back extra arguments. 8243 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) { 8244 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) { 8245 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 8246 // Special handling of bias when A16 is on. Bias is of type half but 8247 // occupies full 32-bit. 
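      // For illustration: a half bias b is emitted as (v2f16 b, undef), i.e.
      // b occupies the low 16 bits of the address dword and the high half is
      // left undefined.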
8248 SDValue Bias = DAG.getBuildVector( 8249 MVT::v2f16, DL, 8250 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)}); 8251 VAddrs.push_back(Bias); 8252 } else { 8253 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 8254 "Bias needs to be converted to 16 bit in A16 mode"); 8255 VAddrs.push_back(Op.getOperand(ArgOffset + I)); 8256 } 8257 } 8258 8259 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) { 8260 // 16 bit gradients are supported, but are tied to the A16 control 8261 // so both gradients and addresses must be 16 bit 8262 LLVM_DEBUG( 8263 dbgs() << "Failed to lower image intrinsic: 16 bit addresses " 8264 "require 16 bit args for both gradients and addresses"); 8265 return Op; 8266 } 8267 8268 if (IsA16) { 8269 if (!ST->hasA16()) { 8270 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " 8271 "support 16 bit addresses\n"); 8272 return Op; 8273 } 8274 } 8275 8276 // We've dealt with incorrect input so we know that if IsA16, IsG16 8277 // are set then we have to compress/pack operands (either address, 8278 // gradient or both) 8279 // In the case where a16 and gradients are tied (no G16 support) then we 8280 // have already verified that both IsA16 and IsG16 are true 8281 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) { 8282 // Activate g16 8283 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = 8284 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); 8285 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16 8286 } 8287 8288 // Add gradients (packed or unpacked) 8289 if (IsG16) { 8290 // Pack the gradients 8291 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart); 8292 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs, 8293 ArgOffset + Intr->GradientStart, 8294 ArgOffset + Intr->CoordStart, Intr->NumGradients); 8295 } else { 8296 for (unsigned I = ArgOffset + Intr->GradientStart; 8297 I < ArgOffset + Intr->CoordStart; I++) 8298 VAddrs.push_back(Op.getOperand(I)); 8299 } 8300 8301 // Add addresses (packed or unpacked) 8302 if (IsA16) { 8303 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs, 8304 ArgOffset + Intr->CoordStart, VAddrEnd, 8305 0 /* No gradients */); 8306 } else { 8307 // Add uncompressed address 8308 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++) 8309 VAddrs.push_back(Op.getOperand(I)); 8310 } 8311 8312 // If the register allocator cannot place the address registers contiguously 8313 // without introducing moves, then using the non-sequential address encoding 8314 // is always preferable, since it saves VALU instructions and is usually a 8315 // wash in terms of code size or even better. 8316 // 8317 // However, we currently have no way of hinting to the register allocator that 8318 // MIMG addresses should be placed contiguously when it is possible to do so, 8319 // so force non-NSA for the common 2-address case as a heuristic. 8320 // 8321 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 8322 // allocation when possible. 8323 // 8324 // Partial NSA is allowed on GFX11+ where the final register is a contiguous 8325 // set of the remaining addresses. 
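  // For example (illustrative, assuming an NSA limit of 5): with 7 address
  // dwords, the first 4 are passed as individual operands and the remaining 3
  // are packed into one contiguous VGPR tuple built below.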
8326 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler); 8327 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding(); 8328 const bool UseNSA = ST->hasNSAEncoding() && 8329 VAddrs.size() >= ST->getNSAThreshold(MF) && 8330 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding); 8331 const bool UsePartialNSA = 8332 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize; 8333 8334 SDValue VAddr; 8335 if (UsePartialNSA) { 8336 VAddr = getBuildDwordsVector(DAG, DL, 8337 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1)); 8338 } else if (!UseNSA) { 8339 VAddr = getBuildDwordsVector(DAG, DL, VAddrs); 8340 } 8341 8342 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1); 8343 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1); 8344 SDValue Unorm; 8345 if (!BaseOpcode->Sampler) { 8346 Unorm = True; 8347 } else { 8348 uint64_t UnormConst = 8349 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex); 8350 8351 Unorm = UnormConst ? True : False; 8352 } 8353 8354 SDValue TFE; 8355 SDValue LWE; 8356 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex); 8357 bool IsTexFail = false; 8358 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail)) 8359 return Op; 8360 8361 if (IsTexFail) { 8362 if (!DMaskLanes) { 8363 // Expecting to get an error flag since TFC is on - and dmask is 0 8364 // Force dmask to be at least 1 otherwise the instruction will fail 8365 DMask = 0x1; 8366 DMaskLanes = 1; 8367 NumVDataDwords = 1; 8368 } 8369 NumVDataDwords += 1; 8370 AdjustRetType = true; 8371 } 8372 8373 // Has something earlier tagged that the return type needs adjusting 8374 // This happens if the instruction is a load or has set TexFailCtrl flags 8375 if (AdjustRetType) { 8376 // NumVDataDwords reflects the true number of dwords required in the return 8377 // type 8378 if (DMaskLanes == 0 && !BaseOpcode->Store) { 8379 // This is a no-op load. This can be eliminated 8380 SDValue Undef = DAG.getUNDEF(Op.getValueType()); 8381 if (isa<MemSDNode>(Op)) 8382 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL); 8383 return Undef; 8384 } 8385 8386 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(), 8387 MVT::i32, NumVDataDwords) 8388 : MVT::i32; 8389 8390 ResultTypes[0] = NewVT; 8391 if (ResultTypes.size() == 3) { 8392 // Original result was aggregate type used for TexFailCtrl results 8393 // The actual instruction returns as a vector type which has now been 8394 // created. Remove the aggregate result. 8395 ResultTypes.erase(&ResultTypes[1]); 8396 } 8397 } 8398 8399 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex); 8400 if (BaseOpcode->Atomic) 8401 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization 8402 if (CPol & ~((IsGFX12Plus ? 
AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | 8403 AMDGPU::CPol::VOLATILE)) 8404 return Op; 8405 8406 SmallVector<SDValue, 26> Ops; 8407 if (BaseOpcode->Store || BaseOpcode->Atomic) 8408 Ops.push_back(VData); // vdata 8409 if (UsePartialNSA) { 8410 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1)); 8411 Ops.push_back(VAddr); 8412 } else if (UseNSA) 8413 append_range(Ops, VAddrs); 8414 else 8415 Ops.push_back(VAddr); 8416 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex); 8417 EVT RsrcVT = Rsrc.getValueType(); 8418 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32) 8419 return Op; 8420 Ops.push_back(Rsrc); 8421 if (BaseOpcode->Sampler) { 8422 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex); 8423 if (Samp.getValueType() != MVT::v4i32) 8424 return Op; 8425 Ops.push_back(Samp); 8426 } 8427 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32)); 8428 if (IsGFX10Plus) 8429 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32)); 8430 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) 8431 Ops.push_back(Unorm); 8432 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32)); 8433 Ops.push_back(IsA16 && // r128, a16 for gfx9 8434 ST->hasFeature(AMDGPU::FeatureR128A16) 8435 ? True 8436 : False); 8437 if (IsGFX10Plus) 8438 Ops.push_back(IsA16 ? True : False); 8439 if (!Subtarget->hasGFX90AInsts()) { 8440 Ops.push_back(TFE); // tfe 8441 } else if (TFE->getAsZExtVal()) { 8442 report_fatal_error("TFE is not supported on this GPU"); 8443 } 8444 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) 8445 Ops.push_back(LWE); // lwe 8446 if (!IsGFX10Plus) 8447 Ops.push_back(DimInfo->DA ? True : False); 8448 if (BaseOpcode->HasD16) 8449 Ops.push_back(IsD16 ? True : False); 8450 if (isa<MemSDNode>(Op)) 8451 Ops.push_back(Op.getOperand(0)); // chain 8452 8453 int NumVAddrDwords = 8454 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32; 8455 int Opcode = -1; 8456 8457 if (IsGFX12Plus) { 8458 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12, 8459 NumVDataDwords, NumVAddrDwords); 8460 } else if (IsGFX11Plus) { 8461 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, 8462 UseNSA ? AMDGPU::MIMGEncGfx11NSA 8463 : AMDGPU::MIMGEncGfx11Default, 8464 NumVDataDwords, NumVAddrDwords); 8465 } else if (IsGFX10Plus) { 8466 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, 8467 UseNSA ? 
AMDGPU::MIMGEncGfx10NSA 8468 : AMDGPU::MIMGEncGfx10Default, 8469 NumVDataDwords, NumVAddrDwords); 8470 } else { 8471 if (Subtarget->hasGFX90AInsts()) { 8472 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a, 8473 NumVDataDwords, NumVAddrDwords); 8474 if (Opcode == -1) 8475 report_fatal_error( 8476 "requested image instruction is not supported on this GPU"); 8477 } 8478 if (Opcode == -1 && 8479 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 8480 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, 8481 NumVDataDwords, NumVAddrDwords); 8482 if (Opcode == -1) 8483 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, 8484 NumVDataDwords, NumVAddrDwords); 8485 } 8486 if (Opcode == -1) 8487 return Op; 8488 8489 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops); 8490 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) { 8491 MachineMemOperand *MemRef = MemOp->getMemOperand(); 8492 DAG.setNodeMemRefs(NewNode, {MemRef}); 8493 } 8494 8495 if (BaseOpcode->AtomicX2) { 8496 SmallVector<SDValue, 1> Elt; 8497 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); 8498 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); 8499 } 8500 if (BaseOpcode->NoReturn) 8501 return SDValue(NewNode, 0); 8502 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail, 8503 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes, 8504 NumVDataDwords, IsAtomicPacked16Bit, DL); 8505 } 8506 8507 SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, 8508 SDValue Offset, SDValue CachePolicy, 8509 SelectionDAG &DAG) const { 8510 MachineFunction &MF = DAG.getMachineFunction(); 8511 8512 const DataLayout &DataLayout = DAG.getDataLayout(); 8513 Align Alignment = 8514 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext())); 8515 8516 MachineMemOperand *MMO = MF.getMachineMemOperand( 8517 MachinePointerInfo(), 8518 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 8519 MachineMemOperand::MOInvariant, 8520 VT.getStoreSize(), Alignment); 8521 8522 if (!Offset->isDivergent()) { 8523 SDValue Ops[] = {Rsrc, Offset, CachePolicy}; 8524 8525 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the 8526 // s_buffer_load_u16 instruction is emitted for both signed and unsigned 8527 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext 8528 // and generates s_buffer_load_i16 (performSignExtendInRegCombine). 8529 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) { 8530 SDValue BufferLoad = 8531 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL, 8532 DAG.getVTList(MVT::i32), Ops, VT, MMO); 8533 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad); 8534 } 8535 8536 // Widen vec3 load to vec4. 8537 if (VT.isVector() && VT.getVectorNumElements() == 3 && 8538 !Subtarget->hasScalarDwordx3Loads()) { 8539 EVT WidenedVT = 8540 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4); 8541 auto WidenedOp = DAG.getMemIntrinsicNode( 8542 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT, 8543 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize())); 8544 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp, 8545 DAG.getVectorIdxConstant(0, DL)); 8546 return Subvector; 8547 } 8548 8549 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL, 8550 DAG.getVTList(VT), Ops, VT, MMO); 8551 } 8552 8553 // We have a divergent offset. Emit a MUBUF buffer load instead. We can 8554 // assume that the buffer is unswizzled. 
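  // For example (illustrative): a divergent v8f32 s.buffer.load becomes two
  // 4-dword MUBUF loads whose immediate offsets differ by 16, and the results
  // are concatenated back into the v8f32 result below.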
8555 SDValue Ops[] = { 8556 DAG.getEntryNode(), // Chain 8557 Rsrc, // rsrc 8558 DAG.getConstant(0, DL, MVT::i32), // vindex 8559 {}, // voffset 8560 {}, // soffset 8561 {}, // offset 8562 CachePolicy, // cachepolicy 8563 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 8564 }; 8565 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) { 8566 setBufferOffsets(Offset, DAG, &Ops[3], Align(4)); 8567 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO); 8568 } 8569 8570 SmallVector<SDValue, 4> Loads; 8571 unsigned NumLoads = 1; 8572 MVT LoadVT = VT.getSimpleVT(); 8573 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1; 8574 assert((LoadVT.getScalarType() == MVT::i32 || 8575 LoadVT.getScalarType() == MVT::f32)); 8576 8577 if (NumElts == 8 || NumElts == 16) { 8578 NumLoads = NumElts / 4; 8579 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4); 8580 } 8581 8582 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue}); 8583 8584 // Use the alignment to ensure that the required offsets will fit into the 8585 // immediate offsets. 8586 setBufferOffsets(Offset, DAG, &Ops[3], 8587 NumLoads > 1 ? Align(16 * NumLoads) : Align(4)); 8588 8589 uint64_t InstOffset = Ops[5]->getAsZExtVal(); 8590 for (unsigned i = 0; i < NumLoads; ++i) { 8591 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32); 8592 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops, 8593 LoadVT, MMO, DAG)); 8594 } 8595 8596 if (NumElts == 8 || NumElts == 16) 8597 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads); 8598 8599 return Loads[0]; 8600 } 8601 8602 SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const { 8603 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25]. 8604 if (!Subtarget->hasArchitectedSGPRs()) 8605 return {}; 8606 SDLoc SL(Op); 8607 MVT VT = MVT::i32; 8608 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT); 8609 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8, 8610 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT)); 8611 } 8612 8613 SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op, 8614 unsigned Dim, 8615 const ArgDescriptor &Arg) const { 8616 SDLoc SL(Op); 8617 MachineFunction &MF = DAG.getMachineFunction(); 8618 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim); 8619 if (MaxID == 0) 8620 return DAG.getConstant(0, SL, MVT::i32); 8621 8622 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, 8623 SDLoc(DAG.getEntryNode()), Arg); 8624 8625 // Don't bother inserting AssertZext for packed IDs since we're emitting the 8626 // masking operations anyway. 8627 // 8628 // TODO: We could assert the top bit is 0 for the source copy. 8629 if (Arg.isMasked()) 8630 return Val; 8631 8632 // Preserve the known bits after expansion to a copy. 8633 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID)); 8634 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val, 8635 DAG.getValueType(SmallVT)); 8636 } 8637 8638 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 8639 SelectionDAG &DAG) const { 8640 MachineFunction &MF = DAG.getMachineFunction(); 8641 auto *MFI = MF.getInfo<SIMachineFunctionInfo>(); 8642 8643 EVT VT = Op.getValueType(); 8644 SDLoc DL(Op); 8645 unsigned IntrinsicID = Op.getConstantOperandVal(0); 8646 8647 // TODO: Should this propagate fast-math-flags? 
8648 8649 switch (IntrinsicID) { 8650 case Intrinsic::amdgcn_implicit_buffer_ptr: { 8651 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction())) 8652 return emitNonHSAIntrinsicError(DAG, DL, VT); 8653 return getPreloadedValue(DAG, *MFI, VT, 8654 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 8655 } 8656 case Intrinsic::amdgcn_dispatch_ptr: 8657 case Intrinsic::amdgcn_queue_ptr: { 8658 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) { 8659 DiagnosticInfoUnsupported BadIntrin( 8660 MF.getFunction(), "unsupported hsa intrinsic without hsa target", 8661 DL.getDebugLoc()); 8662 DAG.getContext()->diagnose(BadIntrin); 8663 return DAG.getUNDEF(VT); 8664 } 8665 8666 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr 8667 ? AMDGPUFunctionArgInfo::DISPATCH_PTR 8668 : AMDGPUFunctionArgInfo::QUEUE_PTR; 8669 return getPreloadedValue(DAG, *MFI, VT, RegID); 8670 } 8671 case Intrinsic::amdgcn_implicitarg_ptr: { 8672 if (MFI->isEntryFunction()) 8673 return getImplicitArgPtr(DAG, DL); 8674 return getPreloadedValue(DAG, *MFI, VT, 8675 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 8676 } 8677 case Intrinsic::amdgcn_kernarg_segment_ptr: { 8678 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) { 8679 // This only makes sense to call in a kernel, so just lower to null. 8680 return DAG.getConstant(0, DL, VT); 8681 } 8682 8683 return getPreloadedValue(DAG, *MFI, VT, 8684 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 8685 } 8686 case Intrinsic::amdgcn_dispatch_id: { 8687 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID); 8688 } 8689 case Intrinsic::amdgcn_rcp: 8690 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); 8691 case Intrinsic::amdgcn_rsq: 8692 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 8693 case Intrinsic::amdgcn_rsq_legacy: 8694 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 8695 return emitRemovedIntrinsicError(DAG, DL, VT); 8696 return SDValue(); 8697 case Intrinsic::amdgcn_rcp_legacy: 8698 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 8699 return emitRemovedIntrinsicError(DAG, DL, VT); 8700 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1)); 8701 case Intrinsic::amdgcn_rsq_clamp: { 8702 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 8703 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); 8704 8705 Type *Type = VT.getTypeForEVT(*DAG.getContext()); 8706 APFloat Max = APFloat::getLargest(Type->getFltSemantics()); 8707 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); 8708 8709 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 8710 SDValue Tmp = 8711 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT)); 8712 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, 8713 DAG.getConstantFP(Min, DL, VT)); 8714 } 8715 case Intrinsic::r600_read_ngroups_x: 8716 if (Subtarget->isAmdHsaOS()) 8717 return emitNonHSAIntrinsicError(DAG, DL, VT); 8718 8719 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8720 SI::KernelInputOffsets::NGROUPS_X, Align(4), 8721 false); 8722 case Intrinsic::r600_read_ngroups_y: 8723 if (Subtarget->isAmdHsaOS()) 8724 return emitNonHSAIntrinsicError(DAG, DL, VT); 8725 8726 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8727 SI::KernelInputOffsets::NGROUPS_Y, Align(4), 8728 false); 8729 case Intrinsic::r600_read_ngroups_z: 8730 if (Subtarget->isAmdHsaOS()) 8731 return emitNonHSAIntrinsicError(DAG, DL, VT); 8732 8733 return 
lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8734 SI::KernelInputOffsets::NGROUPS_Z, Align(4), 8735 false); 8736 case Intrinsic::r600_read_global_size_x: 8737 if (Subtarget->isAmdHsaOS()) 8738 return emitNonHSAIntrinsicError(DAG, DL, VT); 8739 8740 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8741 SI::KernelInputOffsets::GLOBAL_SIZE_X, 8742 Align(4), false); 8743 case Intrinsic::r600_read_global_size_y: 8744 if (Subtarget->isAmdHsaOS()) 8745 return emitNonHSAIntrinsicError(DAG, DL, VT); 8746 8747 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8748 SI::KernelInputOffsets::GLOBAL_SIZE_Y, 8749 Align(4), false); 8750 case Intrinsic::r600_read_global_size_z: 8751 if (Subtarget->isAmdHsaOS()) 8752 return emitNonHSAIntrinsicError(DAG, DL, VT); 8753 8754 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8755 SI::KernelInputOffsets::GLOBAL_SIZE_Z, 8756 Align(4), false); 8757 case Intrinsic::r600_read_local_size_x: 8758 if (Subtarget->isAmdHsaOS()) 8759 return emitNonHSAIntrinsicError(DAG, DL, VT); 8760 8761 return lowerImplicitZextParam(DAG, Op, MVT::i16, 8762 SI::KernelInputOffsets::LOCAL_SIZE_X); 8763 case Intrinsic::r600_read_local_size_y: 8764 if (Subtarget->isAmdHsaOS()) 8765 return emitNonHSAIntrinsicError(DAG, DL, VT); 8766 8767 return lowerImplicitZextParam(DAG, Op, MVT::i16, 8768 SI::KernelInputOffsets::LOCAL_SIZE_Y); 8769 case Intrinsic::r600_read_local_size_z: 8770 if (Subtarget->isAmdHsaOS()) 8771 return emitNonHSAIntrinsicError(DAG, DL, VT); 8772 8773 return lowerImplicitZextParam(DAG, Op, MVT::i16, 8774 SI::KernelInputOffsets::LOCAL_SIZE_Z); 8775 case Intrinsic::amdgcn_workgroup_id_x: 8776 return getPreloadedValue(DAG, *MFI, VT, 8777 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 8778 case Intrinsic::amdgcn_workgroup_id_y: 8779 return getPreloadedValue(DAG, *MFI, VT, 8780 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 8781 case Intrinsic::amdgcn_workgroup_id_z: 8782 return getPreloadedValue(DAG, *MFI, VT, 8783 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 8784 case Intrinsic::amdgcn_wave_id: 8785 return lowerWaveID(DAG, Op); 8786 case Intrinsic::amdgcn_lds_kernel_id: { 8787 if (MFI->isEntryFunction()) 8788 return getLDSKernelId(DAG, DL); 8789 return getPreloadedValue(DAG, *MFI, VT, 8790 AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 8791 } 8792 case Intrinsic::amdgcn_workitem_id_x: 8793 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX); 8794 case Intrinsic::amdgcn_workitem_id_y: 8795 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY); 8796 case Intrinsic::amdgcn_workitem_id_z: 8797 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ); 8798 case Intrinsic::amdgcn_wavefrontsize: 8799 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(), 8800 SDLoc(Op), MVT::i32); 8801 case Intrinsic::amdgcn_s_buffer_load: { 8802 unsigned CPol = Op.getConstantOperandVal(3); 8803 // s_buffer_load, because of how it's optimized, can't be volatile 8804 // so reject ones with the volatile bit set. 8805 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12) 8806 ? 
AMDGPU::CPol::ALL 8807 : AMDGPU::CPol::ALL_pregfx12)) 8808 return Op; 8809 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), 8810 Op.getOperand(3), DAG); 8811 } 8812 case Intrinsic::amdgcn_fdiv_fast: 8813 return lowerFDIV_FAST(Op, DAG); 8814 case Intrinsic::amdgcn_sin: 8815 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); 8816 8817 case Intrinsic::amdgcn_cos: 8818 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); 8819 8820 case Intrinsic::amdgcn_mul_u24: 8821 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), 8822 Op.getOperand(2)); 8823 case Intrinsic::amdgcn_mul_i24: 8824 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), 8825 Op.getOperand(2)); 8826 8827 case Intrinsic::amdgcn_log_clamp: { 8828 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 8829 return SDValue(); 8830 8831 return emitRemovedIntrinsicError(DAG, DL, VT); 8832 } 8833 case Intrinsic::amdgcn_fract: 8834 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); 8835 8836 case Intrinsic::amdgcn_class: 8837 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1), 8838 Op.getOperand(2)); 8839 case Intrinsic::amdgcn_div_fmas: 8840 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1), 8841 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); 8842 8843 case Intrinsic::amdgcn_div_fixup: 8844 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1), 8845 Op.getOperand(2), Op.getOperand(3)); 8846 8847 case Intrinsic::amdgcn_div_scale: { 8848 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3)); 8849 8850 // Translate to the operands expected by the machine instruction. The 8851 // first parameter must be the same as the first instruction. 8852 SDValue Numerator = Op.getOperand(1); 8853 SDValue Denominator = Op.getOperand(2); 8854 8855 // Note this order is opposite of the machine instruction's operations, 8856 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The 8857 // intrinsic has the numerator as the first operand to match a normal 8858 // division operation. 8859 8860 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator; 8861 8862 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, 8863 Denominator, Numerator); 8864 } 8865 case Intrinsic::amdgcn_icmp: { 8866 // There is a Pat that handles this variant, so return it as-is. 
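    // For illustration: a call like llvm.amdgcn.icmp(i1 %b, i1 false,
    // 33 /*ne*/) matches that existing pattern, so it is returned unmodified
    // here rather than going through lowerICMPIntrinsic.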
8867 if (Op.getOperand(1).getValueType() == MVT::i1 && 8868 Op.getConstantOperandVal(2) == 0 && 8869 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE) 8870 return Op; 8871 return lowerICMPIntrinsic(*this, Op.getNode(), DAG); 8872 } 8873 case Intrinsic::amdgcn_fcmp: { 8874 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG); 8875 } 8876 case Intrinsic::amdgcn_ballot: 8877 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG); 8878 case Intrinsic::amdgcn_fmed3: 8879 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1), 8880 Op.getOperand(2), Op.getOperand(3)); 8881 case Intrinsic::amdgcn_fdot2: 8882 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1), 8883 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); 8884 case Intrinsic::amdgcn_fmul_legacy: 8885 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1), 8886 Op.getOperand(2)); 8887 case Intrinsic::amdgcn_sffbh: 8888 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1)); 8889 case Intrinsic::amdgcn_sbfe: 8890 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1), 8891 Op.getOperand(2), Op.getOperand(3)); 8892 case Intrinsic::amdgcn_ubfe: 8893 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1), 8894 Op.getOperand(2), Op.getOperand(3)); 8895 case Intrinsic::amdgcn_cvt_pkrtz: 8896 case Intrinsic::amdgcn_cvt_pknorm_i16: 8897 case Intrinsic::amdgcn_cvt_pknorm_u16: 8898 case Intrinsic::amdgcn_cvt_pk_i16: 8899 case Intrinsic::amdgcn_cvt_pk_u16: { 8900 // FIXME: Stop adding cast if v2f16/v2i16 are legal. 8901 EVT VT = Op.getValueType(); 8902 unsigned Opcode; 8903 8904 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz) 8905 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32; 8906 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16) 8907 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; 8908 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16) 8909 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; 8910 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16) 8911 Opcode = AMDGPUISD::CVT_PK_I16_I32; 8912 else 8913 Opcode = AMDGPUISD::CVT_PK_U16_U32; 8914 8915 if (isTypeLegal(VT)) 8916 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2)); 8917 8918 SDValue Node = 8919 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2)); 8920 return DAG.getNode(ISD::BITCAST, DL, VT, Node); 8921 } 8922 case Intrinsic::amdgcn_fmad_ftz: 8923 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1), 8924 Op.getOperand(2), Op.getOperand(3)); 8925 8926 case Intrinsic::amdgcn_if_break: 8927 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT, 8928 Op->getOperand(1), Op->getOperand(2)), 8929 0); 8930 8931 case Intrinsic::amdgcn_groupstaticsize: { 8932 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS(); 8933 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) 8934 return Op; 8935 8936 const Module *M = MF.getFunction().getParent(); 8937 const GlobalValue *GV = 8938 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize); 8939 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0, 8940 SIInstrInfo::MO_ABS32_LO); 8941 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; 8942 } 8943 case Intrinsic::amdgcn_is_shared: 8944 case Intrinsic::amdgcn_is_private: { 8945 SDLoc SL(Op); 8946 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) 8947 ? 
AMDGPUAS::LOCAL_ADDRESS 8948 : AMDGPUAS::PRIVATE_ADDRESS; 8949 SDValue Aperture = getSegmentAperture(AS, SL, DAG); 8950 SDValue SrcVec = 8951 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); 8952 8953 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec, 8954 DAG.getConstant(1, SL, MVT::i32)); 8955 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ); 8956 } 8957 case Intrinsic::amdgcn_perm: 8958 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), 8959 Op.getOperand(2), Op.getOperand(3)); 8960 case Intrinsic::amdgcn_reloc_constant: { 8961 Module *M = const_cast<Module *>(MF.getFunction().getParent()); 8962 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD(); 8963 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString(); 8964 auto *RelocSymbol = cast<GlobalVariable>( 8965 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext()))); 8966 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0, 8967 SIInstrInfo::MO_ABS32_LO); 8968 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; 8969 } 8970 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: 8971 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: 8972 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: 8973 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: 8974 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: 8975 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: 8976 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: 8977 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { 8978 if (Op.getOperand(4).getValueType() == MVT::i32) 8979 return SDValue(); 8980 8981 SDLoc SL(Op); 8982 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32); 8983 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), 8984 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), 8985 Op.getOperand(3), IndexKeyi32); 8986 } 8987 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: 8988 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: 8989 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { 8990 if (Op.getOperand(6).getValueType() == MVT::i32) 8991 return SDValue(); 8992 8993 SDLoc SL(Op); 8994 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32); 8995 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), 8996 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), 8997 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), 8998 IndexKeyi32, Op.getOperand(7)}); 8999 } 9000 case Intrinsic::amdgcn_addrspacecast_nonnull: 9001 return lowerADDRSPACECAST(Op, DAG); 9002 case Intrinsic::amdgcn_readlane: 9003 case Intrinsic::amdgcn_readfirstlane: 9004 case Intrinsic::amdgcn_writelane: 9005 case Intrinsic::amdgcn_permlane16: 9006 case Intrinsic::amdgcn_permlanex16: 9007 case Intrinsic::amdgcn_permlane64: 9008 case Intrinsic::amdgcn_set_inactive: 9009 case Intrinsic::amdgcn_set_inactive_chain_arg: 9010 case Intrinsic::amdgcn_mov_dpp8: 9011 case Intrinsic::amdgcn_update_dpp: 9012 return lowerLaneOp(*this, Op.getNode(), DAG); 9013 default: 9014 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 9015 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) 9016 return lowerImage(Op, ImageDimIntr, DAG, false); 9017 9018 return Op; 9019 } 9020 } 9021 9022 // On targets not supporting constant in soffset field, turn zero to 9023 // SGPR_NULL to avoid generating an extra s_mov with zero. 
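// For example (illustrative): the buffer instruction then takes the null
// register ('null' in the assembly) as its soffset operand directly, instead
// of first materializing the zero into an SGPR with s_mov_b32.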
9024 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, 9025 const GCNSubtarget *Subtarget) { 9026 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset)) 9027 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32); 9028 return SOffset; 9029 } 9030 9031 SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, 9032 SelectionDAG &DAG, 9033 unsigned NewOpcode) const { 9034 SDLoc DL(Op); 9035 9036 SDValue VData = Op.getOperand(2); 9037 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 9038 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG); 9039 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); 9040 SDValue Ops[] = { 9041 Op.getOperand(0), // Chain 9042 VData, // vdata 9043 Rsrc, // rsrc 9044 DAG.getConstant(0, DL, MVT::i32), // vindex 9045 VOffset, // voffset 9046 SOffset, // soffset 9047 Offset, // offset 9048 Op.getOperand(6), // cachepolicy 9049 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 9050 }; 9051 9052 auto *M = cast<MemSDNode>(Op); 9053 9054 EVT MemVT = VData.getValueType(); 9055 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, 9056 M->getMemOperand()); 9057 } 9058 9059 SDValue 9060 SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, 9061 unsigned NewOpcode) const { 9062 SDLoc DL(Op); 9063 9064 SDValue VData = Op.getOperand(2); 9065 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 9066 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG); 9067 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); 9068 SDValue Ops[] = { 9069 Op.getOperand(0), // Chain 9070 VData, // vdata 9071 Rsrc, // rsrc 9072 Op.getOperand(4), // vindex 9073 VOffset, // voffset 9074 SOffset, // soffset 9075 Offset, // offset 9076 Op.getOperand(7), // cachepolicy 9077 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 9078 }; 9079 9080 auto *M = cast<MemSDNode>(Op); 9081 9082 EVT MemVT = VData.getValueType(); 9083 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, 9084 M->getMemOperand()); 9085 } 9086 9087 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, 9088 SelectionDAG &DAG) const { 9089 unsigned IntrID = Op.getConstantOperandVal(1); 9090 SDLoc DL(Op); 9091 9092 switch (IntrID) { 9093 case Intrinsic::amdgcn_ds_ordered_add: 9094 case Intrinsic::amdgcn_ds_ordered_swap: { 9095 MemSDNode *M = cast<MemSDNode>(Op); 9096 SDValue Chain = M->getOperand(0); 9097 SDValue M0 = M->getOperand(2); 9098 SDValue Value = M->getOperand(3); 9099 unsigned IndexOperand = M->getConstantOperandVal(7); 9100 unsigned WaveRelease = M->getConstantOperandVal(8); 9101 unsigned WaveDone = M->getConstantOperandVal(9); 9102 9103 unsigned OrderedCountIndex = IndexOperand & 0x3f; 9104 IndexOperand &= ~0x3f; 9105 unsigned CountDw = 0; 9106 9107 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) { 9108 CountDw = (IndexOperand >> 24) & 0xf; 9109 IndexOperand &= ~(0xf << 24); 9110 9111 if (CountDw < 1 || CountDw > 4) { 9112 report_fatal_error( 9113 "ds_ordered_count: dword count must be between 1 and 4"); 9114 } 9115 } 9116 9117 if (IndexOperand) 9118 report_fatal_error("ds_ordered_count: bad index operand"); 9119 9120 if (WaveDone && !WaveRelease) 9121 report_fatal_error("ds_ordered_count: wave_done requires wave_release"); 9122 9123 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 
0 : 1; 9124 unsigned ShaderType = 9125 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction()); 9126 unsigned Offset0 = OrderedCountIndex << 2; 9127 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4); 9128 9129 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) 9130 Offset1 |= (CountDw - 1) << 6; 9131 9132 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11) 9133 Offset1 |= ShaderType << 2; 9134 9135 unsigned Offset = Offset0 | (Offset1 << 8); 9136 9137 SDValue Ops[] = { 9138 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16), 9139 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue 9140 }; 9141 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL, 9142 M->getVTList(), Ops, M->getMemoryVT(), 9143 M->getMemOperand()); 9144 } 9145 case Intrinsic::amdgcn_raw_buffer_load: 9146 case Intrinsic::amdgcn_raw_ptr_buffer_load: 9147 case Intrinsic::amdgcn_raw_atomic_buffer_load: 9148 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: 9149 case Intrinsic::amdgcn_raw_buffer_load_format: 9150 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: { 9151 const bool IsFormat = 9152 IntrID == Intrinsic::amdgcn_raw_buffer_load_format || 9153 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format; 9154 9155 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); 9156 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG); 9157 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget); 9158 SDValue Ops[] = { 9159 Op.getOperand(0), // Chain 9160 Rsrc, // rsrc 9161 DAG.getConstant(0, DL, MVT::i32), // vindex 9162 VOffset, // voffset 9163 SOffset, // soffset 9164 Offset, // offset 9165 Op.getOperand(5), // cachepolicy, swizzled buffer 9166 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 9167 }; 9168 9169 auto *M = cast<MemSDNode>(Op); 9170 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops); 9171 } 9172 case Intrinsic::amdgcn_struct_buffer_load: 9173 case Intrinsic::amdgcn_struct_ptr_buffer_load: 9174 case Intrinsic::amdgcn_struct_buffer_load_format: 9175 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: 9176 case Intrinsic::amdgcn_struct_atomic_buffer_load: 9177 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: { 9178 const bool IsFormat = 9179 IntrID == Intrinsic::amdgcn_struct_buffer_load_format || 9180 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format; 9181 9182 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); 9183 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG); 9184 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); 9185 SDValue Ops[] = { 9186 Op.getOperand(0), // Chain 9187 Rsrc, // rsrc 9188 Op.getOperand(3), // vindex 9189 VOffset, // voffset 9190 SOffset, // soffset 9191 Offset, // offset 9192 Op.getOperand(6), // cachepolicy, swizzled buffer 9193 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 9194 }; 9195 9196 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops); 9197 } 9198 case Intrinsic::amdgcn_raw_tbuffer_load: 9199 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { 9200 MemSDNode *M = cast<MemSDNode>(Op); 9201 EVT LoadVT = Op.getValueType(); 9202 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); 9203 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG); 9204 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget); 9205 9206 SDValue Ops[] = { 9207 Op.getOperand(0), // Chain 9208 Rsrc, // rsrc 9209 DAG.getConstant(0, DL, MVT::i32), // vindex 9210 VOffset, // voffset 9211 SOffset, // soffset 9212 Offset, // offset 9213 Op.getOperand(5), 
// format 9214 Op.getOperand(6), // cachepolicy, swizzled buffer 9215 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 9216 }; 9217 9218 if (LoadVT.getScalarType() == MVT::f16) 9219 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, 9220 Ops); 9221 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, 9222 Op->getVTList(), Ops, LoadVT, M->getMemOperand(), 9223 DAG); 9224 } 9225 case Intrinsic::amdgcn_struct_tbuffer_load: 9226 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: { 9227 MemSDNode *M = cast<MemSDNode>(Op); 9228 EVT LoadVT = Op.getValueType(); 9229 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); 9230 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG); 9231 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); 9232 9233 SDValue Ops[] = { 9234 Op.getOperand(0), // Chain 9235 Rsrc, // rsrc 9236 Op.getOperand(3), // vindex 9237 VOffset, // voffset 9238 SOffset, // soffset 9239 Offset, // offset 9240 Op.getOperand(6), // format 9241 Op.getOperand(7), // cachepolicy, swizzled buffer 9242 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 9243 }; 9244 9245 if (LoadVT.getScalarType() == MVT::f16) 9246 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, 9247 Ops); 9248 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, 9249 Op->getVTList(), Ops, LoadVT, M->getMemOperand(), 9250 DAG); 9251 } 9252 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 9253 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: 9254 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); 9255 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 9256 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: 9257 return lowerStructBufferAtomicIntrin(Op, DAG, 9258 AMDGPUISD::BUFFER_ATOMIC_FADD); 9259 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 9260 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: 9261 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); 9262 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 9263 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: 9264 return lowerStructBufferAtomicIntrin(Op, DAG, 9265 AMDGPUISD::BUFFER_ATOMIC_FMIN); 9266 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 9267 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: 9268 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX); 9269 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 9270 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: 9271 return lowerStructBufferAtomicIntrin(Op, DAG, 9272 AMDGPUISD::BUFFER_ATOMIC_FMAX); 9273 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 9274 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: 9275 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP); 9276 case Intrinsic::amdgcn_raw_buffer_atomic_add: 9277 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: 9278 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD); 9279 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 9280 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: 9281 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB); 9282 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 9283 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: 9284 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN); 9285 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 9286 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: 9287 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN); 9288 case 
Intrinsic::amdgcn_raw_buffer_atomic_smax: 9289 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: 9290 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX); 9291 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 9292 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: 9293 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX); 9294 case Intrinsic::amdgcn_raw_buffer_atomic_and: 9295 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: 9296 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND); 9297 case Intrinsic::amdgcn_raw_buffer_atomic_or: 9298 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: 9299 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR); 9300 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 9301 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: 9302 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR); 9303 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 9304 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: 9305 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC); 9306 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 9307 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: 9308 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); 9309 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: 9310 return lowerRawBufferAtomicIntrin(Op, DAG, 9311 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); 9312 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 9313 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: 9314 return lowerStructBufferAtomicIntrin(Op, DAG, 9315 AMDGPUISD::BUFFER_ATOMIC_SWAP); 9316 case Intrinsic::amdgcn_struct_buffer_atomic_add: 9317 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: 9318 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD); 9319 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 9320 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: 9321 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB); 9322 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 9323 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: 9324 return lowerStructBufferAtomicIntrin(Op, DAG, 9325 AMDGPUISD::BUFFER_ATOMIC_SMIN); 9326 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 9327 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: 9328 return lowerStructBufferAtomicIntrin(Op, DAG, 9329 AMDGPUISD::BUFFER_ATOMIC_UMIN); 9330 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 9331 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: 9332 return lowerStructBufferAtomicIntrin(Op, DAG, 9333 AMDGPUISD::BUFFER_ATOMIC_SMAX); 9334 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 9335 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: 9336 return lowerStructBufferAtomicIntrin(Op, DAG, 9337 AMDGPUISD::BUFFER_ATOMIC_UMAX); 9338 case Intrinsic::amdgcn_struct_buffer_atomic_and: 9339 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: 9340 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND); 9341 case Intrinsic::amdgcn_struct_buffer_atomic_or: 9342 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: 9343 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR); 9344 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 9345 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: 9346 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR); 9347 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 9348 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: 9349 return 
lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC); 9350 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 9351 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: 9352 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); 9353 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: 9354 return lowerStructBufferAtomicIntrin(Op, DAG, 9355 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); 9356 9357 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 9358 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: { 9359 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG); 9360 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG); 9361 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); 9362 SDValue Ops[] = { 9363 Op.getOperand(0), // Chain 9364 Op.getOperand(2), // src 9365 Op.getOperand(3), // cmp 9366 Rsrc, // rsrc 9367 DAG.getConstant(0, DL, MVT::i32), // vindex 9368 VOffset, // voffset 9369 SOffset, // soffset 9370 Offset, // offset 9371 Op.getOperand(7), // cachepolicy 9372 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 9373 }; 9374 EVT VT = Op.getValueType(); 9375 auto *M = cast<MemSDNode>(Op); 9376 9377 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, 9378 Op->getVTList(), Ops, VT, 9379 M->getMemOperand()); 9380 } 9381 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 9382 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: { 9383 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG); 9384 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG); 9385 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget); 9386 SDValue Ops[] = { 9387 Op.getOperand(0), // Chain 9388 Op.getOperand(2), // src 9389 Op.getOperand(3), // cmp 9390 Rsrc, // rsrc 9391 Op.getOperand(5), // vindex 9392 VOffset, // voffset 9393 SOffset, // soffset 9394 Offset, // offset 9395 Op.getOperand(8), // cachepolicy 9396 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 9397 }; 9398 EVT VT = Op.getValueType(); 9399 auto *M = cast<MemSDNode>(Op); 9400 9401 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, 9402 Op->getVTList(), Ops, VT, 9403 M->getMemOperand()); 9404 } 9405 case Intrinsic::amdgcn_image_bvh_intersect_ray: { 9406 MemSDNode *M = cast<MemSDNode>(Op); 9407 SDValue NodePtr = M->getOperand(2); 9408 SDValue RayExtent = M->getOperand(3); 9409 SDValue RayOrigin = M->getOperand(4); 9410 SDValue RayDir = M->getOperand(5); 9411 SDValue RayInvDir = M->getOperand(6); 9412 SDValue TDescr = M->getOperand(7); 9413 9414 assert(NodePtr.getValueType() == MVT::i32 || 9415 NodePtr.getValueType() == MVT::i64); 9416 assert(RayDir.getValueType() == MVT::v3f16 || 9417 RayDir.getValueType() == MVT::v3f32); 9418 9419 if (!Subtarget->hasGFX10_AEncoding()) { 9420 emitRemovedIntrinsicError(DAG, DL, Op.getValueType()); 9421 return SDValue(); 9422 } 9423 9424 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget); 9425 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget); 9426 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); 9427 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; 9428 const bool Is64 = NodePtr.getValueType() == MVT::i64; 9429 const unsigned NumVDataDwords = 4; 9430 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 9431 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 
4 : 5) : NumVAddrDwords; 9432 const bool UseNSA = (Subtarget->hasNSAEncoding() && 9433 NumVAddrs <= Subtarget->getNSAMaxSize()) || 9434 IsGFX12Plus; 9435 const unsigned BaseOpcodes[2][2] = { 9436 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 9437 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 9438 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 9439 int Opcode; 9440 if (UseNSA) { 9441 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 9442 IsGFX12Plus ? AMDGPU::MIMGEncGfx12 9443 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA 9444 : AMDGPU::MIMGEncGfx10NSA, 9445 NumVDataDwords, NumVAddrDwords); 9446 } else { 9447 assert(!IsGFX12Plus); 9448 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 9449 IsGFX11 ? AMDGPU::MIMGEncGfx11Default 9450 : AMDGPU::MIMGEncGfx10Default, 9451 NumVDataDwords, NumVAddrDwords); 9452 } 9453 assert(Opcode != -1); 9454 9455 SmallVector<SDValue, 16> Ops; 9456 9457 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) { 9458 SmallVector<SDValue, 3> Lanes; 9459 DAG.ExtractVectorElements(Op, Lanes, 0, 3); 9460 if (Lanes[0].getValueSizeInBits() == 32) { 9461 for (unsigned I = 0; I < 3; ++I) 9462 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I])); 9463 } else { 9464 if (IsAligned) { 9465 Ops.push_back(DAG.getBitcast( 9466 MVT::i32, 9467 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]}))); 9468 Ops.push_back(Lanes[2]); 9469 } else { 9470 SDValue Elt0 = Ops.pop_back_val(); 9471 Ops.push_back(DAG.getBitcast( 9472 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]}))); 9473 Ops.push_back(DAG.getBitcast( 9474 MVT::i32, 9475 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]}))); 9476 } 9477 } 9478 }; 9479 9480 if (UseNSA && IsGFX11Plus) { 9481 Ops.push_back(NodePtr); 9482 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); 9483 Ops.push_back(RayOrigin); 9484 if (IsA16) { 9485 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes; 9486 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3); 9487 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3); 9488 for (unsigned I = 0; I < 3; ++I) { 9489 MergedLanes.push_back(DAG.getBitcast( 9490 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, 9491 {DirLanes[I], InvDirLanes[I]}))); 9492 } 9493 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes)); 9494 } else { 9495 Ops.push_back(RayDir); 9496 Ops.push_back(RayInvDir); 9497 } 9498 } else { 9499 if (Is64) 9500 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, 9501 2); 9502 else 9503 Ops.push_back(NodePtr); 9504 9505 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); 9506 packLanes(RayOrigin, true); 9507 packLanes(RayDir, true); 9508 packLanes(RayInvDir, false); 9509 } 9510 9511 if (!UseNSA) { 9512 // Build a single vector containing all the operands so far prepared. 
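      // Note (inferred from the surrounding code): without the NSA encoding
      // the VADDR dwords are expected to live in one contiguous register
      // tuple, so the individually prepared i32 operands are merged into a
      // single BUILD_VECTOR (padded with undef where needed) before being fed
      // to the MIMG node.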
9513 if (NumVAddrDwords > 12) { 9514 SDValue Undef = DAG.getUNDEF(MVT::i32); 9515 Ops.append(16 - Ops.size(), Undef); 9516 } 9517 assert(Ops.size() >= 8 && Ops.size() <= 12); 9518 SDValue MergedOps = 9519 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops); 9520 Ops.clear(); 9521 Ops.push_back(MergedOps); 9522 } 9523 9524 Ops.push_back(TDescr); 9525 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1)); 9526 Ops.push_back(M->getChain()); 9527 9528 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops); 9529 MachineMemOperand *MemRef = M->getMemOperand(); 9530 DAG.setNodeMemRefs(NewNode, {MemRef}); 9531 return SDValue(NewNode, 0); 9532 } 9533 case Intrinsic::amdgcn_global_atomic_fmin_num: 9534 case Intrinsic::amdgcn_global_atomic_fmax_num: 9535 case Intrinsic::amdgcn_flat_atomic_fmin_num: 9536 case Intrinsic::amdgcn_flat_atomic_fmax_num: { 9537 MemSDNode *M = cast<MemSDNode>(Op); 9538 SDValue Ops[] = { 9539 M->getOperand(0), // Chain 9540 M->getOperand(2), // Ptr 9541 M->getOperand(3) // Value 9542 }; 9543 unsigned Opcode = 0; 9544 switch (IntrID) { 9545 case Intrinsic::amdgcn_global_atomic_fmin_num: 9546 case Intrinsic::amdgcn_flat_atomic_fmin_num: { 9547 Opcode = ISD::ATOMIC_LOAD_FMIN; 9548 break; 9549 } 9550 case Intrinsic::amdgcn_global_atomic_fmax_num: 9551 case Intrinsic::amdgcn_flat_atomic_fmax_num: { 9552 Opcode = ISD::ATOMIC_LOAD_FMAX; 9553 break; 9554 } 9555 default: 9556 llvm_unreachable("unhandled atomic opcode"); 9557 } 9558 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(), 9559 Ops, M->getMemOperand()); 9560 } 9561 case Intrinsic::amdgcn_s_get_barrier_state: 9562 case Intrinsic::amdgcn_s_get_named_barrier_state: { 9563 SDValue Chain = Op->getOperand(0); 9564 SmallVector<SDValue, 2> Ops; 9565 unsigned Opc; 9566 9567 if (isa<ConstantSDNode>(Op->getOperand(2))) { 9568 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue(); 9569 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) 9570 BarID = (BarID >> 4) & 0x3F; 9571 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM; 9572 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); 9573 Ops.push_back(K); 9574 Ops.push_back(Chain); 9575 } else { 9576 Opc = AMDGPU::S_GET_BARRIER_STATE_M0; 9577 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) { 9578 SDValue M0Val; 9579 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2), 9580 DAG.getShiftAmountConstant(4, MVT::i32, DL)); 9581 M0Val = SDValue( 9582 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val, 9583 DAG.getTargetConstant(0x3F, DL, MVT::i32)), 9584 0); 9585 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); 9586 } else 9587 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0)); 9588 } 9589 9590 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); 9591 return SDValue(NewMI, 0); 9592 } 9593 default: 9594 9595 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 9596 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 9597 return lowerImage(Op, ImageDimIntr, DAG, true); 9598 9599 return SDValue(); 9600 } 9601 } 9602 9603 // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to 9604 // dwordx4 if on SI and handle TFE loads. 
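// Worked example for the TFE path below: a TFE load returning v2f32 has
// NumValueDWords == 2, so the operation is first emitted as a v3i32 load (two
// value dwords plus one status dword); the value is re-extracted as a v2i32
// subvector and bitcast back to v2f32, and the extra dword becomes the TFE
// status result.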
9605 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, 9606 SDVTList VTList, 9607 ArrayRef<SDValue> Ops, EVT MemVT, 9608 MachineMemOperand *MMO, 9609 SelectionDAG &DAG) const { 9610 LLVMContext &C = *DAG.getContext(); 9611 MachineFunction &MF = DAG.getMachineFunction(); 9612 EVT VT = VTList.VTs[0]; 9613 9614 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3); 9615 bool IsTFE = VTList.NumVTs == 3; 9616 if (IsTFE) { 9617 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32); 9618 unsigned NumOpDWords = NumValueDWords + 1; 9619 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords); 9620 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]); 9621 MachineMemOperand *OpDWordsMMO = 9622 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4); 9623 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops, 9624 OpDWordsVT, OpDWordsMMO, DAG); 9625 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, 9626 DAG.getVectorIdxConstant(NumValueDWords, DL)); 9627 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL); 9628 SDValue ValueDWords = 9629 NumValueDWords == 1 9630 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx) 9631 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, 9632 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op, 9633 ZeroIdx); 9634 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords); 9635 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL); 9636 } 9637 9638 if (!Subtarget->hasDwordx3LoadStores() && 9639 (VT == MVT::v3i32 || VT == MVT::v3f32)) { 9640 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4); 9641 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4); 9642 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16); 9643 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]); 9644 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops, 9645 WidenedMemVT, WidenedMMO); 9646 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op, 9647 DAG.getVectorIdxConstant(0, DL)); 9648 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL); 9649 } 9650 9651 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO); 9652 } 9653 9654 SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG, 9655 bool ImageStore) const { 9656 EVT StoreVT = VData.getValueType(); 9657 9658 // No change for f16 and legal vector D16 types. 9659 if (!StoreVT.isVector()) 9660 return VData; 9661 9662 SDLoc DL(VData); 9663 unsigned NumElements = StoreVT.getVectorNumElements(); 9664 9665 if (Subtarget->hasUnpackedD16VMem()) { 9666 // We need to unpack the packed data to store. 9667 EVT IntStoreVT = StoreVT.changeTypeToInteger(); 9668 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); 9669 9670 EVT EquivStoreVT = 9671 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements); 9672 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData); 9673 return DAG.UnrollVectorOp(ZExt.getNode()); 9674 } 9675 9676 // The sq block of gfx8.1 does not estimate register use correctly for d16 9677 // image store instructions. The data operand is computed as if it were not a 9678 // d16 image instruction. 
9679 if (ImageStore && Subtarget->hasImageStoreD16Bug()) { 9680 // Bitcast to i16 9681 EVT IntStoreVT = StoreVT.changeTypeToInteger(); 9682 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); 9683 9684 // Decompose into scalars 9685 SmallVector<SDValue, 4> Elts; 9686 DAG.ExtractVectorElements(IntVData, Elts); 9687 9688 // Group pairs of i16 into v2i16 and bitcast to i32 9689 SmallVector<SDValue, 4> PackedElts; 9690 for (unsigned I = 0; I < Elts.size() / 2; I += 1) { 9691 SDValue Pair = 9692 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]}); 9693 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair); 9694 PackedElts.push_back(IntPair); 9695 } 9696 if ((NumElements % 2) == 1) { 9697 // Handle v3i16 9698 unsigned I = Elts.size() / 2; 9699 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL, 9700 {Elts[I * 2], DAG.getUNDEF(MVT::i16)}); 9701 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair); 9702 PackedElts.push_back(IntPair); 9703 } 9704 9705 // Pad using UNDEF 9706 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32)); 9707 9708 // Build final vector 9709 EVT VecVT = 9710 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size()); 9711 return DAG.getBuildVector(VecVT, DL, PackedElts); 9712 } 9713 9714 if (NumElements == 3) { 9715 EVT IntStoreVT = 9716 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits()); 9717 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); 9718 9719 EVT WidenedStoreVT = EVT::getVectorVT( 9720 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1); 9721 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(), 9722 WidenedStoreVT.getStoreSizeInBits()); 9723 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData); 9724 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt); 9725 } 9726 9727 assert(isTypeLegal(StoreVT)); 9728 return VData; 9729 } 9730 9731 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, 9732 SelectionDAG &DAG) const { 9733 SDLoc DL(Op); 9734 SDValue Chain = Op.getOperand(0); 9735 unsigned IntrinsicID = Op.getConstantOperandVal(1); 9736 MachineFunction &MF = DAG.getMachineFunction(); 9737 9738 switch (IntrinsicID) { 9739 case Intrinsic::amdgcn_exp_compr: { 9740 if (!Subtarget->hasCompressedExport()) { 9741 DiagnosticInfoUnsupported BadIntrin( 9742 DAG.getMachineFunction().getFunction(), 9743 "intrinsic not supported on subtarget", DL.getDebugLoc()); 9744 DAG.getContext()->diagnose(BadIntrin); 9745 } 9746 SDValue Src0 = Op.getOperand(4); 9747 SDValue Src1 = Op.getOperand(5); 9748 // Hack around illegal type on SI by directly selecting it. 9749 if (isTypeLegal(Src0.getValueType())) 9750 return SDValue(); 9751 9752 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6)); 9753 SDValue Undef = DAG.getUNDEF(MVT::f32); 9754 const SDValue Ops[] = { 9755 Op.getOperand(2), // tgt 9756 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0 9757 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1 9758 Undef, // src2 9759 Undef, // src3 9760 Op.getOperand(7), // vm 9761 DAG.getTargetConstant(1, DL, MVT::i1), // compr 9762 Op.getOperand(3), // en 9763 Op.getOperand(0) // Chain 9764 }; 9765 9766 unsigned Opc = Done->isZero() ? 
AMDGPU::EXP : AMDGPU::EXP_DONE; 9767 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0); 9768 } 9769 case Intrinsic::amdgcn_s_barrier: 9770 case Intrinsic::amdgcn_s_barrier_signal: 9771 case Intrinsic::amdgcn_s_barrier_wait: { 9772 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 9773 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) { 9774 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second; 9775 if (WGSize <= ST.getWavefrontSize()) { 9776 // If the workgroup fits in a wave, remove s_barrier_signal and lower 9777 // s_barrier/s_barrier_wait to wave_barrier. 9778 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal) 9779 return Op.getOperand(0); 9780 else 9781 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, 9782 MVT::Other, Op.getOperand(0)), 9783 0); 9784 } 9785 } 9786 9787 if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) { 9788 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait 9789 SDValue K = 9790 DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32); 9791 SDValue BarSignal = 9792 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL, 9793 MVT::Other, K, Op.getOperand(0)), 9794 0); 9795 SDValue BarWait = 9796 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K, 9797 BarSignal.getValue(0)), 9798 0); 9799 return BarWait; 9800 } 9801 9802 return SDValue(); 9803 }; 9804 9805 case Intrinsic::amdgcn_struct_tbuffer_store: 9806 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { 9807 SDValue VData = Op.getOperand(2); 9808 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); 9809 if (IsD16) 9810 VData = handleD16VData(VData, DAG); 9811 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 9812 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG); 9813 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); 9814 SDValue Ops[] = { 9815 Chain, 9816 VData, // vdata 9817 Rsrc, // rsrc 9818 Op.getOperand(4), // vindex 9819 VOffset, // voffset 9820 SOffset, // soffset 9821 Offset, // offset 9822 Op.getOperand(7), // format 9823 Op.getOperand(8), // cachepolicy, swizzled buffer 9824 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 9825 }; 9826 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 9827 : AMDGPUISD::TBUFFER_STORE_FORMAT; 9828 MemSDNode *M = cast<MemSDNode>(Op); 9829 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, 9830 M->getMemoryVT(), M->getMemOperand()); 9831 } 9832 9833 case Intrinsic::amdgcn_raw_tbuffer_store: 9834 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: { 9835 SDValue VData = Op.getOperand(2); 9836 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); 9837 if (IsD16) 9838 VData = handleD16VData(VData, DAG); 9839 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 9840 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG); 9841 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); 9842 SDValue Ops[] = { 9843 Chain, 9844 VData, // vdata 9845 Rsrc, // rsrc 9846 DAG.getConstant(0, DL, MVT::i32), // vindex 9847 VOffset, // voffset 9848 SOffset, // soffset 9849 Offset, // offset 9850 Op.getOperand(6), // format 9851 Op.getOperand(7), // cachepolicy, swizzled buffer 9852 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 9853 }; 9854 unsigned Opc = IsD16 ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_D16 9855 : AMDGPUISD::TBUFFER_STORE_FORMAT; 9856 MemSDNode *M = cast<MemSDNode>(Op); 9857 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, 9858 M->getMemoryVT(), M->getMemOperand()); 9859 } 9860 9861 case Intrinsic::amdgcn_raw_buffer_store: 9862 case Intrinsic::amdgcn_raw_ptr_buffer_store: 9863 case Intrinsic::amdgcn_raw_buffer_store_format: 9864 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: { 9865 const bool IsFormat = 9866 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format || 9867 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format; 9868 9869 SDValue VData = Op.getOperand(2); 9870 EVT VDataVT = VData.getValueType(); 9871 EVT EltType = VDataVT.getScalarType(); 9872 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); 9873 if (IsD16) { 9874 VData = handleD16VData(VData, DAG); 9875 VDataVT = VData.getValueType(); 9876 } 9877 9878 if (!isTypeLegal(VDataVT)) { 9879 VData = 9880 DAG.getNode(ISD::BITCAST, DL, 9881 getEquivalentMemType(*DAG.getContext(), VDataVT), VData); 9882 } 9883 9884 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 9885 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG); 9886 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); 9887 SDValue Ops[] = { 9888 Chain, 9889 VData, 9890 Rsrc, 9891 DAG.getConstant(0, DL, MVT::i32), // vindex 9892 VOffset, // voffset 9893 SOffset, // soffset 9894 Offset, // offset 9895 Op.getOperand(6), // cachepolicy, swizzled buffer 9896 DAG.getTargetConstant(0, DL, MVT::i1), // idxen 9897 }; 9898 unsigned Opc = 9899 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE; 9900 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; 9901 MemSDNode *M = cast<MemSDNode>(Op); 9902 9903 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics 9904 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) 9905 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M); 9906 9907 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, 9908 M->getMemoryVT(), M->getMemOperand()); 9909 } 9910 9911 case Intrinsic::amdgcn_struct_buffer_store: 9912 case Intrinsic::amdgcn_struct_ptr_buffer_store: 9913 case Intrinsic::amdgcn_struct_buffer_store_format: 9914 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: { 9915 const bool IsFormat = 9916 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format || 9917 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format; 9918 9919 SDValue VData = Op.getOperand(2); 9920 EVT VDataVT = VData.getValueType(); 9921 EVT EltType = VDataVT.getScalarType(); 9922 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); 9923 9924 if (IsD16) { 9925 VData = handleD16VData(VData, DAG); 9926 VDataVT = VData.getValueType(); 9927 } 9928 9929 if (!isTypeLegal(VDataVT)) { 9930 VData = 9931 DAG.getNode(ISD::BITCAST, DL, 9932 getEquivalentMemType(*DAG.getContext(), VDataVT), VData); 9933 } 9934 9935 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); 9936 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG); 9937 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget); 9938 SDValue Ops[] = { 9939 Chain, 9940 VData, 9941 Rsrc, 9942 Op.getOperand(4), // vindex 9943 VOffset, // voffset 9944 SOffset, // soffset 9945 Offset, // offset 9946 Op.getOperand(7), // cachepolicy, swizzled buffer 9947 DAG.getTargetConstant(1, DL, MVT::i1), // idxen 9948 }; 9949 unsigned Opc = 9950 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; 9951 Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; 9952 MemSDNode *M = cast<MemSDNode>(Op); 9953 9954 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics 9955 EVT VDataType = VData.getValueType().getScalarType(); 9956 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) 9957 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); 9958 9959 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, 9960 M->getMemoryVT(), M->getMemOperand()); 9961 } 9962 case Intrinsic::amdgcn_raw_buffer_load_lds: 9963 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: 9964 case Intrinsic::amdgcn_struct_buffer_load_lds: 9965 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { 9966 assert(!AMDGPU::isGFX12Plus(*Subtarget)); 9967 unsigned Opc; 9968 bool HasVIndex = 9969 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds || 9970 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds; 9971 unsigned OpOffset = HasVIndex ? 1 : 0; 9972 SDValue VOffset = Op.getOperand(5 + OpOffset); 9973 bool HasVOffset = !isNullConstant(VOffset); 9974 unsigned Size = Op->getConstantOperandVal(4); 9975 9976 switch (Size) { 9977 default: 9978 return SDValue(); 9979 case 1: 9980 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN 9981 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN 9982 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN 9983 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET; 9984 break; 9985 case 2: 9986 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN 9987 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN 9988 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN 9989 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET; 9990 break; 9991 case 4: 9992 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN 9993 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN 9994 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN 9995 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; 9996 break; 9997 case 12: 9998 if (!Subtarget->hasLDSLoadB96_B128()) 9999 return SDValue(); 10000 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN 10001 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN 10002 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN 10003 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET; 10004 break; 10005 case 16: 10006 if (!Subtarget->hasLDSLoadB96_B128()) 10007 return SDValue(); 10008 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN 10009 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN 10010 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN 10011 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET; 10012 break; 10013 } 10014 10015 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3)); 10016 10017 SmallVector<SDValue, 8> Ops; 10018 10019 if (HasVIndex && HasVOffset) 10020 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL, 10021 {Op.getOperand(5), // VIndex 10022 VOffset})); 10023 else if (HasVIndex) 10024 Ops.push_back(Op.getOperand(5)); 10025 else if (HasVOffset) 10026 Ops.push_back(VOffset); 10027 10028 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); 10029 Ops.push_back(Rsrc); 10030 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset 10031 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset 10032 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); 10033 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset); 10034 Ops.push_back(DAG.getTargetConstant( 10035 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12), 10036 DL, MVT::i8)); // cpol 10037 Ops.push_back(DAG.getTargetConstant( 10038 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12) 10039 ? 
1 10040 : 0, 10041 DL, MVT::i8)); // swz 10042 Ops.push_back(M0Val.getValue(0)); // Chain 10043 Ops.push_back(M0Val.getValue(1)); // Glue 10044 10045 auto *M = cast<MemSDNode>(Op); 10046 MachineMemOperand *LoadMMO = M->getMemOperand(); 10047 // Don't set the offset value here because the pointer points to the base of 10048 // the buffer. 10049 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); 10050 10051 MachinePointerInfo StorePtrI = LoadPtrI; 10052 LoadPtrI.V = PoisonValue::get( 10053 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS)); 10054 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; 10055 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; 10056 10057 auto F = LoadMMO->getFlags() & 10058 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); 10059 LoadMMO = 10060 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, 10061 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); 10062 10063 MachineMemOperand *StoreMMO = MF.getMachineMemOperand( 10064 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), 10065 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); 10066 10067 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops); 10068 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); 10069 10070 return SDValue(Load, 0); 10071 } 10072 case Intrinsic::amdgcn_global_load_lds: { 10073 unsigned Opc; 10074 unsigned Size = Op->getConstantOperandVal(4); 10075 switch (Size) { 10076 default: 10077 return SDValue(); 10078 case 1: 10079 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE; 10080 break; 10081 case 2: 10082 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT; 10083 break; 10084 case 4: 10085 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; 10086 break; 10087 case 12: 10088 if (!Subtarget->hasLDSLoadB96_B128()) 10089 return SDValue(); 10090 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3; 10091 break; 10092 case 16: 10093 if (!Subtarget->hasLDSLoadB96_B128()) 10094 return SDValue(); 10095 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4; 10096 break; 10097 } 10098 10099 auto *M = cast<MemSDNode>(Op); 10100 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3)); 10101 10102 SmallVector<SDValue, 6> Ops; 10103 10104 SDValue Addr = Op.getOperand(2); // Global ptr 10105 SDValue VOffset; 10106 // Try to split SAddr and VOffset. Global and LDS pointers share the same 10107 // immediate offset, so we cannot use a regular SelectGlobalSAddr(). 
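    // Illustrative shape of the split below: an address of the form
    //   (add (i64 uniform_base), (zero_extend (i32 divergent_off)))
    // places uniform_base in the SADDR operand and divergent_off in the 32-bit
    // VOFFSET operand of the _SADDR opcode; a fully uniform address also uses
    // the SADDR form, with a zero VOFFSET materialized below.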
10108 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) { 10109 SDValue LHS = Addr.getOperand(0); 10110 SDValue RHS = Addr.getOperand(1); 10111 10112 if (LHS->isDivergent()) 10113 std::swap(LHS, RHS); 10114 10115 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND && 10116 RHS.getOperand(0).getValueType() == MVT::i32) { 10117 // add (i64 sgpr), (zero_extend (i32 vgpr)) 10118 Addr = LHS; 10119 VOffset = RHS.getOperand(0); 10120 } 10121 } 10122 10123 Ops.push_back(Addr); 10124 if (!Addr->isDivergent()) { 10125 Opc = AMDGPU::getGlobalSaddrOp(Opc); 10126 if (!VOffset) 10127 VOffset = 10128 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, 10129 DAG.getTargetConstant(0, DL, MVT::i32)), 10130 0); 10131 Ops.push_back(VOffset); 10132 } 10133 10134 Ops.push_back(Op.getOperand(5)); // Offset 10135 Ops.push_back(Op.getOperand(6)); // CPol 10136 Ops.push_back(M0Val.getValue(0)); // Chain 10137 Ops.push_back(M0Val.getValue(1)); // Glue 10138 10139 MachineMemOperand *LoadMMO = M->getMemOperand(); 10140 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); 10141 LoadPtrI.Offset = Op->getConstantOperandVal(5); 10142 MachinePointerInfo StorePtrI = LoadPtrI; 10143 LoadPtrI.V = PoisonValue::get( 10144 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS)); 10145 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; 10146 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; 10147 auto F = LoadMMO->getFlags() & 10148 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); 10149 LoadMMO = 10150 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, 10151 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); 10152 MachineMemOperand *StoreMMO = MF.getMachineMemOperand( 10153 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4), 10154 LoadMMO->getAAInfo()); 10155 10156 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); 10157 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); 10158 10159 return SDValue(Load, 0); 10160 } 10161 case Intrinsic::amdgcn_end_cf: 10162 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, 10163 Op->getOperand(2), Chain), 10164 0); 10165 case Intrinsic::amdgcn_s_barrier_init: 10166 case Intrinsic::amdgcn_s_barrier_signal_var: { 10167 // these two intrinsics have two operands: barrier pointer and member count 10168 SDValue Chain = Op->getOperand(0); 10169 SmallVector<SDValue, 2> Ops; 10170 SDValue BarOp = Op->getOperand(2); 10171 SDValue CntOp = Op->getOperand(3); 10172 SDValue M0Val; 10173 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init 10174 ? 
                       AMDGPU::S_BARRIER_INIT_M0
                       : AMDGPU::S_BARRIER_SIGNAL_M0;
    // Extract the barrier ID from bits 4-9 of BarOp.
    SDValue BarID;
    BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
                        DAG.getShiftAmountConstant(4, MVT::i32, DL));
    BarID =
        SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
                                   DAG.getTargetConstant(0x3F, DL, MVT::i32)),
                0);
    // Member count should be put into M0[ShAmt:+6]
    // Barrier ID should be put into M0[5:0]
    M0Val =
        SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
                                   DAG.getTargetConstant(0x3F, DL, MVT::i32)),
                0);
    constexpr unsigned ShAmt = 16;
    M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
                        DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));

    M0Val = SDValue(
        DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);

    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));

    auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
    return SDValue(NewMI, 0);
  }
  case Intrinsic::amdgcn_s_barrier_join: {
    // This intrinsic has a single operand: the barrier pointer.
    SDValue Chain = Op->getOperand(0);
    SmallVector<SDValue, 2> Ops;
    SDValue BarOp = Op->getOperand(2);
    unsigned Opc;

    if (isa<ConstantSDNode>(BarOp)) {
      uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
      Opc = AMDGPU::S_BARRIER_JOIN_IMM;

      // Extract the barrier ID from bits 4-9 of the immediate.
      unsigned BarID = (BarVal >> 4) & 0x3F;
      SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
      Ops.push_back(K);
      Ops.push_back(Chain);
    } else {
      Opc = AMDGPU::S_BARRIER_JOIN_M0;

      // Extract the barrier ID from bits 4-9 of BarOp and copy it to M0[5:0].
      SDValue M0Val;
      M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
                          DAG.getShiftAmountConstant(4, MVT::i32, DL));
      M0Val =
          SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
                                     DAG.getTargetConstant(0x3F, DL, MVT::i32)),
                  0);
      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
    }

    auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
    return SDValue(NewMI, 0);
  }
  case Intrinsic::amdgcn_s_prefetch_data: {
    // For non-global address spaces, preserve the chain and remove the call.
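    // In other words, returning the chain operand below drops the intrinsic
    // entirely, so prefetches of pointers outside the flat/global address
    // spaces become no-ops, while flat and global pointers keep the prefetch.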
10237 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace())) 10238 return Op.getOperand(0); 10239 return Op; 10240 } 10241 case Intrinsic::amdgcn_s_buffer_prefetch_data: { 10242 SDValue Ops[] = { 10243 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG), 10244 Op.getOperand(3), // offset 10245 Op.getOperand(4), // length 10246 }; 10247 10248 MemSDNode *M = cast<MemSDNode>(Op); 10249 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL, 10250 Op->getVTList(), Ops, M->getMemoryVT(), 10251 M->getMemOperand()); 10252 } 10253 default: { 10254 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 10255 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) 10256 return lowerImage(Op, ImageDimIntr, DAG, true); 10257 10258 return Op; 10259 } 10260 } 10261 } 10262 10263 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 10264 // offset (the offset that is included in bounds checking and swizzling, to be 10265 // split between the instruction's voffset and immoffset fields) and soffset 10266 // (the offset that is excluded from bounds checking and swizzling, to go in 10267 // the instruction's soffset field). This function takes the first kind of 10268 // offset and figures out how to split it between voffset and immoffset. 10269 std::pair<SDValue, SDValue> 10270 SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const { 10271 SDLoc DL(Offset); 10272 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget); 10273 SDValue N0 = Offset; 10274 ConstantSDNode *C1 = nullptr; 10275 10276 if ((C1 = dyn_cast<ConstantSDNode>(N0))) 10277 N0 = SDValue(); 10278 else if (DAG.isBaseWithConstantOffset(N0)) { 10279 C1 = cast<ConstantSDNode>(N0.getOperand(1)); 10280 N0 = N0.getOperand(0); 10281 } 10282 10283 if (C1) { 10284 unsigned ImmOffset = C1->getZExtValue(); 10285 // If the immediate value is too big for the immoffset field, put only bits 10286 // that would normally fit in the immoffset field. The remaining value that 10287 // is copied/added for the voffset field is a large power of 2, and it 10288 // stands more chance of being CSEd with the copy/add for another similar 10289 // load/store. 10290 // However, do not do that rounding down if that is a negative 10291 // number, as it appears to be illegal to have a negative offset in the 10292 // vgpr, even if adding the immediate offset makes it positive. 10293 unsigned Overflow = ImmOffset & ~MaxImm; 10294 ImmOffset -= Overflow; 10295 if ((int32_t)Overflow < 0) { 10296 Overflow += ImmOffset; 10297 ImmOffset = 0; 10298 } 10299 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32)); 10300 if (Overflow) { 10301 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32); 10302 if (!N0) 10303 N0 = OverflowVal; 10304 else { 10305 SDValue Ops[] = {N0, OverflowVal}; 10306 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops); 10307 } 10308 } 10309 } 10310 if (!N0) 10311 N0 = DAG.getConstant(0, DL, MVT::i32); 10312 if (!C1) 10313 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32)); 10314 return {N0, SDValue(C1, 0)}; 10315 } 10316 10317 // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store 10318 // the three offsets (voffset, soffset and instoffset) into the SDValue[3] array 10319 // pointed to by Offsets. 
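// Summary of the cases handled below: a purely constant combined offset
// becomes a zero voffset plus an soffset/instoffset pair chosen by
// splitMUBUFOffset; a base-plus-constant offset keeps the base in voffset and
// splits only the constant; anything else is passed through unchanged in
// voffset with a zero (or SGPR_NULL) soffset and a zero instoffset.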
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        SelectionDAG &DAG, SDValue *Offsets,
                                        Align Alignment) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  SDLoc DL(CombinedOffset);
  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
      Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }
  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
    SDValue N0 = CombinedOffset.getOperand(0);
    SDValue N1 = CombinedOffset.getOperand(1);
    uint32_t SOffset, ImmOffset;
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
    if (Offset >= 0 &&
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
      Offsets[0] = N0;
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }

  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
                            ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                            : DAG.getConstant(0, DL, MVT::i32);

  Offsets[0] = CombinedOffset;
  Offsets[1] = SOffsetZero;
  Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
}

SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
                                                SelectionDAG &DAG) const {
  if (!MaybePointer.getValueType().isScalarInteger())
    return MaybePointer;

  SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
  return Rsrc;
}

// Wrap a global or flat pointer into a buffer intrinsic using the flags
// specified in the intrinsic.
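// Resulting descriptor layout (as built below):
//   word0 = pointer[31:0]
//   word1 = pointer[47:32] | (stride << 16)
//   word2 = NumRecords
//   word3 = Flags
// The stride OR is skipped when the stride is a known zero.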
10369 SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op, 10370 SelectionDAG &DAG) const { 10371 SDLoc Loc(Op); 10372 10373 SDValue Pointer = Op->getOperand(1); 10374 SDValue Stride = Op->getOperand(2); 10375 SDValue NumRecords = Op->getOperand(3); 10376 SDValue Flags = Op->getOperand(4); 10377 10378 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32); 10379 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32); 10380 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask); 10381 std::optional<uint32_t> ConstStride = std::nullopt; 10382 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride)) 10383 ConstStride = ConstNode->getZExtValue(); 10384 10385 SDValue NewHighHalf = Masked; 10386 if (!ConstStride || *ConstStride != 0) { 10387 SDValue ShiftedStride; 10388 if (ConstStride) { 10389 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32); 10390 } else { 10391 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32); 10392 ShiftedStride = 10393 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride, 10394 DAG.getShiftAmountConstant(16, MVT::i32, Loc)); 10395 } 10396 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride); 10397 } 10398 10399 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, 10400 NewHighHalf, NumRecords, Flags); 10401 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc); 10402 return RsrcPtr; 10403 } 10404 10405 // Handle 8 bit and 16 bit buffer loads 10406 SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, 10407 EVT LoadVT, SDLoc DL, 10408 ArrayRef<SDValue> Ops, 10409 MachineMemOperand *MMO, 10410 bool IsTFE) const { 10411 EVT IntVT = LoadVT.changeTypeToInteger(); 10412 10413 if (IsTFE) { 10414 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) 10415 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE 10416 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE; 10417 MachineFunction &MF = DAG.getMachineFunction(); 10418 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8); 10419 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other); 10420 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG); 10421 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, 10422 DAG.getConstant(1, DL, MVT::i32)); 10423 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, 10424 DAG.getConstant(0, DL, MVT::i32)); 10425 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data); 10426 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc); 10427 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL); 10428 } 10429 10430 unsigned Opc = LoadVT.getScalarType() == MVT::i8 10431 ? 
AMDGPUISD::BUFFER_LOAD_UBYTE 10432 : AMDGPUISD::BUFFER_LOAD_USHORT; 10433 10434 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other); 10435 SDValue BufferLoad = 10436 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO); 10437 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad); 10438 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal); 10439 10440 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL); 10441 } 10442 10443 // Handle 8 bit and 16 bit buffer stores 10444 SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG, 10445 EVT VDataType, SDLoc DL, 10446 SDValue Ops[], 10447 MemSDNode *M) const { 10448 if (VDataType == MVT::f16 || VDataType == MVT::bf16) 10449 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]); 10450 10451 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]); 10452 Ops[1] = BufferStoreExt; 10453 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE 10454 : AMDGPUISD::BUFFER_STORE_SHORT; 10455 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9); 10456 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType, 10457 M->getMemOperand()); 10458 } 10459 10460 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, 10461 SDValue Op, const SDLoc &SL, EVT VT) { 10462 if (VT.bitsLT(Op.getValueType())) 10463 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op); 10464 10465 switch (ExtType) { 10466 case ISD::SEXTLOAD: 10467 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op); 10468 case ISD::ZEXTLOAD: 10469 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op); 10470 case ISD::EXTLOAD: 10471 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op); 10472 case ISD::NON_EXTLOAD: 10473 return Op; 10474 } 10475 10476 llvm_unreachable("invalid ext type"); 10477 } 10478 10479 // Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads. 10480 // TODO: Skip this on GFX12 which does have scalar sub-dword loads. 10481 SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, 10482 DAGCombinerInfo &DCI) const { 10483 SelectionDAG &DAG = DCI.DAG; 10484 if (Ld->getAlign() < Align(4) || Ld->isDivergent()) 10485 return SDValue(); 10486 10487 // FIXME: Constant loads should all be marked invariant. 10488 unsigned AS = Ld->getAddressSpace(); 10489 if (AS != AMDGPUAS::CONSTANT_ADDRESS && 10490 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT && 10491 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant())) 10492 return SDValue(); 10493 10494 // Don't do this early, since it may interfere with adjacent load merging for 10495 // illegal types. We can avoid losing alignment information for exotic types 10496 // pre-legalize. 10497 EVT MemVT = Ld->getMemoryVT(); 10498 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) || 10499 MemVT.getSizeInBits() >= 32) 10500 return SDValue(); 10501 10502 SDLoc SL(Ld); 10503 10504 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) && 10505 "unexpected vector extload"); 10506 10507 // TODO: Drop only high part of range. 
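  // In effect, the sub-dword load is replaced by a 32-bit load below so it can
  // select to an SMEM load; for example, a uniform, 4-byte aligned zextload of
  // i8 becomes an i32 load followed by a zero-extend-in-reg of the low 8 bits.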
10508 SDValue Ptr = Ld->getBasePtr(); 10509 SDValue NewLoad = DAG.getLoad( 10510 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr, 10511 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(), 10512 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(), 10513 nullptr); // Drop ranges 10514 10515 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); 10516 if (MemVT.isFloatingPoint()) { 10517 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD && 10518 "unexpected fp extload"); 10519 TruncVT = MemVT.changeTypeToInteger(); 10520 } 10521 10522 SDValue Cvt = NewLoad; 10523 if (Ld->getExtensionType() == ISD::SEXTLOAD) { 10524 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad, 10525 DAG.getValueType(TruncVT)); 10526 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD || 10527 Ld->getExtensionType() == ISD::NON_EXTLOAD) { 10528 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT); 10529 } else { 10530 assert(Ld->getExtensionType() == ISD::EXTLOAD); 10531 } 10532 10533 EVT VT = Ld->getValueType(0); 10534 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); 10535 10536 DCI.AddToWorklist(Cvt.getNode()); 10537 10538 // We may need to handle exotic cases, such as i16->i64 extloads, so insert 10539 // the appropriate extension from the 32-bit load. 10540 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT); 10541 DCI.AddToWorklist(Cvt.getNode()); 10542 10543 // Handle conversion back to floating point if necessary. 10544 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt); 10545 10546 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL); 10547 } 10548 10549 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, 10550 const SIMachineFunctionInfo &Info) { 10551 // TODO: Should check if the address can definitely not access stack. 10552 if (Info.isEntryFunction()) 10553 return Info.getUserSGPRInfo().hasFlatScratchInit(); 10554 return true; 10555 } 10556 10557 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 10558 SDLoc DL(Op); 10559 LoadSDNode *Load = cast<LoadSDNode>(Op); 10560 ISD::LoadExtType ExtType = Load->getExtensionType(); 10561 EVT MemVT = Load->getMemoryVT(); 10562 MachineMemOperand *MMO = Load->getMemOperand(); 10563 10564 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { 10565 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16)) 10566 return SDValue(); 10567 10568 // FIXME: Copied from PPC 10569 // First, load into 32 bits, then truncate to 1 bit. 10570 10571 SDValue Chain = Load->getChain(); 10572 SDValue BasePtr = Load->getBasePtr(); 10573 10574 EVT RealMemVT = (MemVT == MVT::i1) ? 
MVT::i8 : MVT::i16; 10575 10576 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr, 10577 RealMemVT, MMO); 10578 10579 if (!MemVT.isVector()) { 10580 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), 10581 NewLD.getValue(1)}; 10582 10583 return DAG.getMergeValues(Ops, DL); 10584 } 10585 10586 SmallVector<SDValue, 3> Elts; 10587 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) { 10588 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD, 10589 DAG.getConstant(I, DL, MVT::i32)); 10590 10591 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt)); 10592 } 10593 10594 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)}; 10595 10596 return DAG.getMergeValues(Ops, DL); 10597 } 10598 10599 if (!MemVT.isVector()) 10600 return SDValue(); 10601 10602 assert(Op.getValueType().getVectorElementType() == MVT::i32 && 10603 "Custom lowering for non-i32 vectors hasn't been implemented."); 10604 10605 Align Alignment = Load->getAlign(); 10606 unsigned AS = Load->getAddressSpace(); 10607 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && 10608 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) { 10609 return SplitVectorLoad(Op, DAG); 10610 } 10611 10612 MachineFunction &MF = DAG.getMachineFunction(); 10613 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 10614 // If there is a possibility that flat instruction access scratch memory 10615 // then we need to use the same legalization rules we use for private. 10616 if (AS == AMDGPUAS::FLAT_ADDRESS && 10617 !Subtarget->hasMultiDwordFlatScratchAddressing()) 10618 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) 10619 ? AMDGPUAS::PRIVATE_ADDRESS 10620 : AMDGPUAS::GLOBAL_ADDRESS; 10621 10622 unsigned NumElements = MemVT.getVectorNumElements(); 10623 10624 if (AS == AMDGPUAS::CONSTANT_ADDRESS || 10625 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || 10626 (AS == AMDGPUAS::GLOBAL_ADDRESS && 10627 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() && 10628 isMemOpHasNoClobberedMemOperand(Load))) { 10629 if ((!Op->isDivergent() || AMDGPUInstrInfo::isUniformMMO(MMO)) && 10630 Alignment >= Align(4) && NumElements < 32) { 10631 if (MemVT.isPow2VectorType() || 10632 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3)) 10633 return SDValue(); 10634 return WidenOrSplitVectorLoad(Op, DAG); 10635 } 10636 // Non-uniform loads will be selected to MUBUF instructions, so they 10637 // have the same legalization requirements as global and private 10638 // loads. 10639 // 10640 } 10641 if (AS == AMDGPUAS::CONSTANT_ADDRESS || 10642 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || 10643 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) { 10644 if (NumElements > 4) 10645 return SplitVectorLoad(Op, DAG); 10646 // v3 loads not supported on SI. 10647 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) 10648 return WidenOrSplitVectorLoad(Op, DAG); 10649 10650 // v3 and v4 loads are supported for private and global memory. 10651 return SDValue(); 10652 } 10653 if (AS == AMDGPUAS::PRIVATE_ADDRESS) { 10654 // Depending on the setting of the private_element_size field in the 10655 // resource descriptor, we can only make private accesses up to a certain 10656 // size. 
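  // For illustration (assumed sizes, not from the source): with
  // MaxPrivateElementSize == 4 a v4i32 private load is scalarized into four
  // i32 loads, with 8 it is split into two v2i32 loads, and with 16 it
  // follows the same rules as the global/flat handling above.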
10657 switch (Subtarget->getMaxPrivateElementSize()) { 10658 case 4: { 10659 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG); 10660 return DAG.getMergeValues({Op0, Op1}, DL); 10661 } 10662 case 8: 10663 if (NumElements > 2) 10664 return SplitVectorLoad(Op, DAG); 10665 return SDValue(); 10666 case 16: 10667 // Same as global/flat 10668 if (NumElements > 4) 10669 return SplitVectorLoad(Op, DAG); 10670 // v3 loads not supported on SI. 10671 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) 10672 return WidenOrSplitVectorLoad(Op, DAG); 10673 10674 return SDValue(); 10675 default: 10676 llvm_unreachable("unsupported private_element_size"); 10677 } 10678 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 10679 unsigned Fast = 0; 10680 auto Flags = Load->getMemOperand()->getFlags(); 10681 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS, 10682 Load->getAlign(), Flags, &Fast) && 10683 Fast > 1) 10684 return SDValue(); 10685 10686 if (MemVT.isVector()) 10687 return SplitVectorLoad(Op, DAG); 10688 } 10689 10690 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 10691 MemVT, *Load->getMemOperand())) { 10692 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG); 10693 return DAG.getMergeValues({Op0, Op1}, DL); 10694 } 10695 10696 return SDValue(); 10697 } 10698 10699 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 10700 EVT VT = Op.getValueType(); 10701 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 || 10702 VT.getSizeInBits() == 512) 10703 return splitTernaryVectorOp(Op, DAG); 10704 10705 assert(VT.getSizeInBits() == 64); 10706 10707 SDLoc DL(Op); 10708 SDValue Cond = Op.getOperand(0); 10709 10710 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 10711 SDValue One = DAG.getConstant(1, DL, MVT::i32); 10712 10713 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); 10714 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); 10715 10716 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); 10717 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); 10718 10719 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); 10720 10721 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); 10722 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); 10723 10724 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); 10725 10726 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); 10727 return DAG.getNode(ISD::BITCAST, DL, VT, Res); 10728 } 10729 10730 // Catch division cases where we can use shortcuts with rcp and rsq 10731 // instructions. 10732 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, 10733 SelectionDAG &DAG) const { 10734 SDLoc SL(Op); 10735 SDValue LHS = Op.getOperand(0); 10736 SDValue RHS = Op.getOperand(1); 10737 EVT VT = Op.getValueType(); 10738 const SDNodeFlags Flags = Op->getFlags(); 10739 10740 bool AllowInaccurateRcp = 10741 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath; 10742 10743 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { 10744 // Without !fpmath accuracy information, we can't do more because we don't 10745 // know exactly whether rcp is accurate enough to meet !fpmath requirement. 
10746 // f16 is always accurate enough 10747 if (!AllowInaccurateRcp && VT != MVT::f16) 10748 return SDValue(); 10749 10750 if (CLHS->isExactlyValue(1.0)) { 10751 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 10752 // the CI documentation has a worst case error of 1 ulp. 10753 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 10754 // use it as long as we aren't trying to use denormals. 10755 // 10756 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp. 10757 10758 // 1.0 / sqrt(x) -> rsq(x) 10759 10760 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP 10761 // error seems really high at 2^29 ULP. 10762 // 1.0 / x -> rcp(x) 10763 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 10764 } 10765 10766 // Same as for 1.0, but expand the sign out of the constant. 10767 if (CLHS->isExactlyValue(-1.0)) { 10768 // -1.0 / x -> rcp (fneg x) 10769 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 10770 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS); 10771 } 10772 } 10773 10774 // For f16 require afn or arcp. 10775 // For f32 require afn. 10776 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal())) 10777 return SDValue(); 10778 10779 // Turn into multiply by the reciprocal. 10780 // x / y -> x * (1.0 / y) 10781 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 10782 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags); 10783 } 10784 10785 SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op, 10786 SelectionDAG &DAG) const { 10787 SDLoc SL(Op); 10788 SDValue X = Op.getOperand(0); 10789 SDValue Y = Op.getOperand(1); 10790 EVT VT = Op.getValueType(); 10791 const SDNodeFlags Flags = Op->getFlags(); 10792 10793 bool AllowInaccurateDiv = 10794 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath; 10795 if (!AllowInaccurateDiv) 10796 return SDValue(); 10797 10798 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y); 10799 SDValue One = DAG.getConstantFP(1.0, SL, VT); 10800 10801 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y); 10802 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One); 10803 10804 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R); 10805 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One); 10806 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R); 10807 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R); 10808 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X); 10809 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret); 10810 } 10811 10812 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, 10813 EVT VT, SDValue A, SDValue B, SDValue GlueChain, 10814 SDNodeFlags Flags) { 10815 if (GlueChain->getNumValues() <= 1) { 10816 return DAG.getNode(Opcode, SL, VT, A, B, Flags); 10817 } 10818 10819 assert(GlueChain->getNumValues() == 3); 10820 10821 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); 10822 switch (Opcode) { 10823 default: 10824 llvm_unreachable("no chain equivalent for opcode"); 10825 case ISD::FMUL: 10826 Opcode = AMDGPUISD::FMUL_W_CHAIN; 10827 break; 10828 } 10829 10830 return DAG.getNode(Opcode, SL, VTList, 10831 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)}, 10832 Flags); 10833 } 10834 10835 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, 10836 EVT VT, SDValue A, SDValue B, SDValue C, 10837 SDValue GlueChain, SDNodeFlags Flags) { 10838 if (GlueChain->getNumValues() <= 1) { 10839 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags); 10840 } 10841 10842 
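  // A (chain, glue) triple means this FMA must stay ordered with the
  // surrounding mode changes (see the denormal-mode toggling in LowerFDIV32
  // below), so we switch to the glued AMDGPUISD variant instead of ISD::FMA.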
assert(GlueChain->getNumValues() == 3); 10843 10844 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); 10845 switch (Opcode) { 10846 default: 10847 llvm_unreachable("no chain equivalent for opcode"); 10848 case ISD::FMA: 10849 Opcode = AMDGPUISD::FMA_W_CHAIN; 10850 break; 10851 } 10852 10853 return DAG.getNode(Opcode, SL, VTList, 10854 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)}, 10855 Flags); 10856 } 10857 10858 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { 10859 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) 10860 return FastLowered; 10861 10862 SDLoc SL(Op); 10863 SDValue LHS = Op.getOperand(0); 10864 SDValue RHS = Op.getOperand(1); 10865 10866 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32 10867 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32 10868 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d 10869 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp 10870 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n 10871 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp 10872 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n 10873 // tmp.u = opx(V_MUL_F32, e32.u, r32.u); 10874 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000) 10875 // q32.u = opx(V_ADD_F32, tmp.u, q32.u); 10876 // q16.u = opx(V_CVT_F16_F32, q32.u); 10877 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n) 10878 10879 // We will use ISD::FMA on targets that don't support ISD::FMAD. 10880 unsigned FMADOpCode = 10881 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA; 10882 10883 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS); 10884 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS); 10885 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt); 10886 SDValue Rcp = 10887 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags()); 10888 SDValue Quot = 10889 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags()); 10890 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt, 10891 Op->getFlags()); 10892 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags()); 10893 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt, 10894 Op->getFlags()); 10895 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags()); 10896 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp); 10897 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast, 10898 DAG.getConstant(0xff800000, SL, MVT::i32)); 10899 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast); 10900 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags()); 10901 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, 10902 DAG.getTargetConstant(0, SL, MVT::i32)); 10903 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS, 10904 Op->getFlags()); 10905 } 10906 10907 // Faster 2.5 ULP division that does not support denormals. 
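// A rough sketch of the scaling used below (illustrative pseudocode): when
// |rhs| is very large its reciprocal could flush to zero, so both the rcp
// input and the final product are rescaled:
//   s   = |rhs| > 0x1p+96 ? 0x1p-32 : 1.0
//   q   = lhs * rcp(rhs * s)
//   res = s * q   // == lhs / rhs, to within the stated ~2.5 ULP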
10908 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { 10909 SDNodeFlags Flags = Op->getFlags(); 10910 SDLoc SL(Op); 10911 SDValue LHS = Op.getOperand(1); 10912 SDValue RHS = Op.getOperand(2); 10913 10914 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags); 10915 10916 const APFloat K0Val(0x1p+96f); 10917 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); 10918 10919 const APFloat K1Val(0x1p-32f); 10920 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); 10921 10922 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 10923 10924 EVT SetCCVT = 10925 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); 10926 10927 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); 10928 10929 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags); 10930 10931 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags); 10932 10933 // rcp does not support denormals. 10934 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags); 10935 10936 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags); 10937 10938 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags); 10939 } 10940 10941 // Returns immediate value for setting the F32 denorm mode when using the 10942 // S_DENORM_MODE instruction. 10943 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, 10944 const SIMachineFunctionInfo *Info, 10945 const GCNSubtarget *ST) { 10946 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE"); 10947 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue(); 10948 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2); 10949 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32); 10950 } 10951 10952 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { 10953 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) 10954 return FastLowered; 10955 10956 // The selection matcher assumes anything with a chain selecting to a 10957 // mayRaiseFPException machine instruction. Since we're introducing a chain 10958 // here, we need to explicitly report nofpexcept for the regular fdiv 10959 // lowering. 10960 SDNodeFlags Flags = Op->getFlags(); 10961 Flags.setNoFPExcept(true); 10962 10963 SDLoc SL(Op); 10964 SDValue LHS = Op.getOperand(0); 10965 SDValue RHS = Op.getOperand(1); 10966 10967 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); 10968 10969 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); 10970 10971 SDValue DenominatorScaled = 10972 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags); 10973 SDValue NumeratorScaled = 10974 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags); 10975 10976 // Denominator is scaled to not be denormal, so using rcp is ok. 
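  // The sequence below is essentially a Newton-Raphson style refinement: Fma0
  // and Fma1 refine the approximate reciprocal, Mul forms a quotient estimate,
  // Fma2..Fma4 correct its residual, and DIV_FMAS/DIV_FIXUP fold the scaling
  // and special cases back in.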
10977 SDValue ApproxRcp = 10978 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags); 10979 SDValue NegDivScale0 = 10980 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags); 10981 10982 using namespace AMDGPU::Hwreg; 10983 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2); 10984 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32); 10985 10986 const MachineFunction &MF = DAG.getMachineFunction(); 10987 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 10988 const DenormalMode DenormMode = Info->getMode().FP32Denormals; 10989 10990 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE(); 10991 const bool HasDynamicDenormals = 10992 (DenormMode.Input == DenormalMode::Dynamic) || 10993 (DenormMode.Output == DenormalMode::Dynamic); 10994 10995 SDValue SavedDenormMode; 10996 10997 if (!PreservesDenormals) { 10998 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV 10999 // lowering. The chain dependence is insufficient, and we need glue. We do 11000 // not need the glue variants in a strictfp function. 11001 11002 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 11003 11004 SDValue Glue = DAG.getEntryNode(); 11005 if (HasDynamicDenormals) { 11006 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL, 11007 DAG.getVTList(MVT::i32, MVT::Glue), 11008 {BitField, Glue}); 11009 SavedDenormMode = SDValue(GetReg, 0); 11010 11011 Glue = DAG.getMergeValues( 11012 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL); 11013 } 11014 11015 SDNode *EnableDenorm; 11016 if (Subtarget->hasDenormModeInst()) { 11017 const SDValue EnableDenormValue = 11018 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget); 11019 11020 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue, 11021 EnableDenormValue) 11022 .getNode(); 11023 } else { 11024 const SDValue EnableDenormValue = 11025 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32); 11026 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs, 11027 {EnableDenormValue, BitField, Glue}); 11028 } 11029 11030 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0), 11031 SDValue(EnableDenorm, 1)}; 11032 11033 NegDivScale0 = DAG.getMergeValues(Ops, SL); 11034 } 11035 11036 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, 11037 ApproxRcp, One, NegDivScale0, Flags); 11038 11039 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, 11040 ApproxRcp, Fma0, Flags); 11041 11042 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1, 11043 Fma1, Flags); 11044 11045 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, 11046 NumeratorScaled, Mul, Flags); 11047 11048 SDValue Fma3 = 11049 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags); 11050 11051 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, 11052 NumeratorScaled, Fma3, Flags); 11053 11054 if (!PreservesDenormals) { 11055 SDNode *DisableDenorm; 11056 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) { 11057 const SDValue DisableDenormValue = getSPDenormModeValue( 11058 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget); 11059 11060 DisableDenorm = 11061 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, Fma4.getValue(1), 11062 DisableDenormValue, Fma4.getValue(2)) 11063 .getNode(); 11064 } else { 11065 assert(HasDynamicDenormals == (bool)SavedDenormMode); 11066 const SDValue 
DisableDenormValue = 11067 HasDynamicDenormals 11068 ? SavedDenormMode 11069 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); 11070 11071 DisableDenorm = DAG.getMachineNode( 11072 AMDGPU::S_SETREG_B32, SL, MVT::Other, 11073 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)}); 11074 } 11075 11076 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, 11077 SDValue(DisableDenorm, 0), DAG.getRoot()); 11078 DAG.setRoot(OutputChain); 11079 } 11080 11081 SDValue Scale = NumeratorScaled.getValue(1); 11082 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, 11083 {Fma4, Fma1, Fma3, Scale}, Flags); 11084 11085 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags); 11086 } 11087 11088 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { 11089 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG)) 11090 return FastLowered; 11091 11092 SDLoc SL(Op); 11093 SDValue X = Op.getOperand(0); 11094 SDValue Y = Op.getOperand(1); 11095 11096 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); 11097 11098 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); 11099 11100 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); 11101 11102 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); 11103 11104 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); 11105 11106 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); 11107 11108 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); 11109 11110 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); 11111 11112 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); 11113 11114 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); 11115 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); 11116 11117 SDValue Fma4 = 11118 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1); 11119 11120 SDValue Scale; 11121 11122 if (!Subtarget->hasUsableDivScaleConditionOutput()) { 11123 // Workaround a hardware bug on SI where the condition output from div_scale 11124 // is not usable. 11125 11126 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32); 11127 11128 // Figure out if the scale to use for div_fmas. 
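    // The code below reconstructs an equivalent flag without that output:
    // compare the high dword of each operand against the high dword of the
    // corresponding div_scale result (detecting, in effect, which operand was
    // actually rescaled) and XOR the two compares.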
11129 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); 11130 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); 11131 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); 11132 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); 11133 11134 SDValue NumHi = 11135 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); 11136 SDValue DenHi = 11137 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); 11138 11139 SDValue Scale0Hi = 11140 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); 11141 SDValue Scale1Hi = 11142 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); 11143 11144 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); 11145 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); 11146 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); 11147 } else { 11148 Scale = DivScale1.getValue(1); 11149 } 11150 11151 SDValue Fmas = 11152 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale); 11153 11154 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); 11155 } 11156 11157 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { 11158 EVT VT = Op.getValueType(); 11159 11160 if (VT == MVT::f32) 11161 return LowerFDIV32(Op, DAG); 11162 11163 if (VT == MVT::f64) 11164 return LowerFDIV64(Op, DAG); 11165 11166 if (VT == MVT::f16) 11167 return LowerFDIV16(Op, DAG); 11168 11169 llvm_unreachable("Unexpected type for fdiv"); 11170 } 11171 11172 SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const { 11173 SDLoc dl(Op); 11174 SDValue Val = Op.getOperand(0); 11175 EVT VT = Val.getValueType(); 11176 EVT ResultExpVT = Op->getValueType(1); 11177 EVT InstrExpVT = VT == MVT::f16 ? 
MVT::i16 : MVT::i32; 11178 11179 SDValue Mant = DAG.getNode( 11180 ISD::INTRINSIC_WO_CHAIN, dl, VT, 11181 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val); 11182 11183 SDValue Exp = DAG.getNode( 11184 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT, 11185 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val); 11186 11187 if (Subtarget->hasFractBug()) { 11188 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val); 11189 SDValue Inf = 11190 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT); 11191 11192 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT); 11193 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT); 11194 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero); 11195 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val); 11196 } 11197 11198 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT); 11199 return DAG.getMergeValues({Mant, CastExp}, dl); 11200 } 11201 11202 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 11203 SDLoc DL(Op); 11204 StoreSDNode *Store = cast<StoreSDNode>(Op); 11205 EVT VT = Store->getMemoryVT(); 11206 11207 if (VT == MVT::i1) { 11208 return DAG.getTruncStore( 11209 Store->getChain(), DL, 11210 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), 11211 Store->getBasePtr(), MVT::i1, Store->getMemOperand()); 11212 } 11213 11214 assert(VT.isVector() && 11215 Store->getValue().getValueType().getScalarType() == MVT::i32); 11216 11217 unsigned AS = Store->getAddressSpace(); 11218 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && 11219 Store->getAlign().value() < VT.getStoreSize() && 11220 VT.getSizeInBits() > 32) { 11221 return SplitVectorStore(Op, DAG); 11222 } 11223 11224 MachineFunction &MF = DAG.getMachineFunction(); 11225 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 11226 // If there is a possibility that flat instruction access scratch memory 11227 // then we need to use the same legalization rules we use for private. 11228 if (AS == AMDGPUAS::FLAT_ADDRESS && 11229 !Subtarget->hasMultiDwordFlatScratchAddressing()) 11230 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) 11231 ? AMDGPUAS::PRIVATE_ADDRESS 11232 : AMDGPUAS::GLOBAL_ADDRESS; 11233 11234 unsigned NumElements = VT.getVectorNumElements(); 11235 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) { 11236 if (NumElements > 4) 11237 return SplitVectorStore(Op, DAG); 11238 // v3 stores not supported on SI. 
11239 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) 11240 return SplitVectorStore(Op, DAG); 11241 11242 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 11243 VT, *Store->getMemOperand())) 11244 return expandUnalignedStore(Store, DAG); 11245 11246 return SDValue(); 11247 } 11248 if (AS == AMDGPUAS::PRIVATE_ADDRESS) { 11249 switch (Subtarget->getMaxPrivateElementSize()) { 11250 case 4: 11251 return scalarizeVectorStore(Store, DAG); 11252 case 8: 11253 if (NumElements > 2) 11254 return SplitVectorStore(Op, DAG); 11255 return SDValue(); 11256 case 16: 11257 if (NumElements > 4 || 11258 (NumElements == 3 && !Subtarget->enableFlatScratch())) 11259 return SplitVectorStore(Op, DAG); 11260 return SDValue(); 11261 default: 11262 llvm_unreachable("unsupported private_element_size"); 11263 } 11264 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 11265 unsigned Fast = 0; 11266 auto Flags = Store->getMemOperand()->getFlags(); 11267 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS, 11268 Store->getAlign(), Flags, &Fast) && 11269 Fast > 1) 11270 return SDValue(); 11271 11272 if (VT.isVector()) 11273 return SplitVectorStore(Op, DAG); 11274 11275 return expandUnalignedStore(Store, DAG); 11276 } 11277 11278 // Probably an invalid store. If so we'll end up emitting a selection error. 11279 return SDValue(); 11280 } 11281 11282 // Avoid the full correct expansion for f32 sqrt when promoting from f16. 11283 SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const { 11284 SDLoc SL(Op); 11285 assert(!Subtarget->has16BitInsts()); 11286 SDNodeFlags Flags = Op->getFlags(); 11287 SDValue Ext = 11288 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags); 11289 11290 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32); 11291 SDValue Sqrt = 11292 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags); 11293 11294 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt, 11295 DAG.getTargetConstant(0, SL, MVT::i32), Flags); 11296 } 11297 11298 SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const { 11299 SDLoc DL(Op); 11300 SDNodeFlags Flags = Op->getFlags(); 11301 MVT VT = Op.getValueType().getSimpleVT(); 11302 const SDValue X = Op.getOperand(0); 11303 11304 if (allowApproxFunc(DAG, Flags)) { 11305 // Instruction is 1ulp but ignores denormals. 
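    // This fast path just emits the hardware sqrt. The general path below
    // instead pre-scales inputs smaller than 0x1p-96 by 0x1p+32 and rescales
    // the result by 0x1p-16 (= 1/sqrt(0x1p+32)), keeping the refinement away
    // from denormal inputs.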
11306 return DAG.getNode( 11307 ISD::INTRINSIC_WO_CHAIN, DL, VT, 11308 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags); 11309 } 11310 11311 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT); 11312 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT); 11313 11314 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT); 11315 11316 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags); 11317 11318 SDValue SqrtX = 11319 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags); 11320 11321 SDValue SqrtS; 11322 if (needsDenormHandlingF32(DAG, X, Flags)) { 11323 SDValue SqrtID = 11324 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32); 11325 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags); 11326 11327 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS); 11328 SDValue SqrtSNextDownInt = 11329 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, 11330 DAG.getAllOnesConstant(DL, MVT::i32)); 11331 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt); 11332 11333 SDValue NegSqrtSNextDown = 11334 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags); 11335 11336 SDValue SqrtVP = 11337 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags); 11338 11339 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, 11340 DAG.getConstant(1, DL, MVT::i32)); 11341 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt); 11342 11343 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags); 11344 SDValue SqrtVS = 11345 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags); 11346 11347 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT); 11348 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE); 11349 11350 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS, 11351 Flags); 11352 11353 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT); 11354 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS, 11355 Flags); 11356 } else { 11357 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags); 11358 11359 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags); 11360 11361 SDValue Half = DAG.getConstantFP(0.5f, DL, VT); 11362 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags); 11363 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags); 11364 11365 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags); 11366 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags); 11367 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags); 11368 11369 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags); 11370 SDValue SqrtD = 11371 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags); 11372 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags); 11373 } 11374 11375 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT); 11376 11377 SDValue ScaledDown = 11378 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags); 11379 11380 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags); 11381 SDValue IsZeroOrInf = 11382 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX, 11383 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32)); 11384 11385 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags); 11386 } 11387 11388 SDValue SITargetLowering::lowerFSQRTF64(SDValue 
Op, SelectionDAG &DAG) const { 11389 // For double type, the SQRT and RSQ instructions don't have required 11390 // precision, we apply Goldschmidt's algorithm to improve the result: 11391 // 11392 // y0 = rsq(x) 11393 // g0 = x * y0 11394 // h0 = 0.5 * y0 11395 // 11396 // r0 = 0.5 - h0 * g0 11397 // g1 = g0 * r0 + g0 11398 // h1 = h0 * r0 + h0 11399 // 11400 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 11401 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1 11402 // h2 = h1 * r1 + h1 11403 // 11404 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 11405 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 11406 // 11407 // sqrt(x) = g3 11408 11409 SDNodeFlags Flags = Op->getFlags(); 11410 11411 SDLoc DL(Op); 11412 11413 SDValue X = Op.getOperand(0); 11414 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64); 11415 11416 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT); 11417 11418 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32); 11419 11420 // Scale up input if it is too small. 11421 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32); 11422 SDValue ScaleUp = 11423 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt); 11424 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags); 11425 11426 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX); 11427 11428 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY); 11429 11430 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64); 11431 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half); 11432 11433 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0); 11434 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half); 11435 11436 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0); 11437 11438 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0); 11439 11440 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1); 11441 SDValue SqrtD0 = 11442 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX); 11443 11444 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1); 11445 11446 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2); 11447 SDValue SqrtD1 = 11448 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX); 11449 11450 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2); 11451 11452 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32); 11453 SDValue ScaleDown = 11454 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt); 11455 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags); 11456 11457 // TODO: Switch to fcmp oeq 0 for finite only. 
Can't fully remove this check 11458 // with finite only or nsz because rsq(+/-0) = +/-inf 11459 11460 // TODO: Check for DAZ and expand to subnormals 11461 SDValue IsZeroOrInf = 11462 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX, 11463 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32)); 11464 11465 // If x is +INF, +0, or -0, use its original value 11466 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet, 11467 Flags); 11468 } 11469 11470 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 11471 SDLoc DL(Op); 11472 EVT VT = Op.getValueType(); 11473 SDValue Arg = Op.getOperand(0); 11474 SDValue TrigVal; 11475 11476 // Propagate fast-math flags so that the multiply we introduce can be folded 11477 // if Arg is already the result of a multiply by constant. 11478 auto Flags = Op->getFlags(); 11479 11480 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT); 11481 11482 if (Subtarget->hasTrigReducedRange()) { 11483 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); 11484 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags); 11485 } else { 11486 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); 11487 } 11488 11489 switch (Op.getOpcode()) { 11490 case ISD::FCOS: 11491 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags); 11492 case ISD::FSIN: 11493 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags); 11494 default: 11495 llvm_unreachable("Wrong trig opcode"); 11496 } 11497 } 11498 11499 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, 11500 SelectionDAG &DAG) const { 11501 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op); 11502 assert(AtomicNode->isCompareAndSwap()); 11503 unsigned AS = AtomicNode->getAddressSpace(); 11504 11505 // No custom lowering required for local address space 11506 if (!AMDGPU::isFlatGlobalAddrSpace(AS)) 11507 return Op; 11508 11509 // Non-local address space requires custom lowering for atomic compare 11510 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2 11511 SDLoc DL(Op); 11512 SDValue ChainIn = Op.getOperand(0); 11513 SDValue Addr = Op.getOperand(1); 11514 SDValue Old = Op.getOperand(2); 11515 SDValue New = Op.getOperand(3); 11516 EVT VT = Op.getValueType(); 11517 MVT SimpleVT = VT.getSimpleVT(); 11518 MVT VecType = MVT::getVectorVT(SimpleVT, 2); 11519 11520 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old}); 11521 SDValue Ops[] = {ChainIn, Addr, NewOld}; 11522 11523 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, 11524 Op->getVTList(), Ops, VT, 11525 AtomicNode->getMemOperand()); 11526 } 11527 11528 //===----------------------------------------------------------------------===// 11529 // Custom DAG optimizations 11530 //===----------------------------------------------------------------------===// 11531 11532 SDValue 11533 SITargetLowering::performUCharToFloatCombine(SDNode *N, 11534 DAGCombinerInfo &DCI) const { 11535 EVT VT = N->getValueType(0); 11536 EVT ScalarVT = VT.getScalarType(); 11537 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16) 11538 return SDValue(); 11539 11540 SelectionDAG &DAG = DCI.DAG; 11541 SDLoc DL(N); 11542 11543 SDValue Src = N->getOperand(0); 11544 EVT SrcVT = Src.getValueType(); 11545 11546 // TODO: We could try to match extracting the higher bytes, which would be 11547 // easier if i8 vectors weren't promoted to i32 vectors, particularly after 11548 // types are legalized. 
v4i8 -> v4f32 is probably the only case to worry 11549 // about in practice. 11550 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) { 11551 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { 11552 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src); 11553 DCI.AddToWorklist(Cvt.getNode()); 11554 11555 // For the f16 case, fold to a cast to f32 and then cast back to f16. 11556 if (ScalarVT != MVT::f32) { 11557 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt, 11558 DAG.getTargetConstant(0, DL, MVT::i32)); 11559 } 11560 return Cvt; 11561 } 11562 } 11563 11564 return SDValue(); 11565 } 11566 11567 SDValue SITargetLowering::performFCopySignCombine(SDNode *N, 11568 DAGCombinerInfo &DCI) const { 11569 SDValue MagnitudeOp = N->getOperand(0); 11570 SDValue SignOp = N->getOperand(1); 11571 SelectionDAG &DAG = DCI.DAG; 11572 SDLoc DL(N); 11573 11574 // f64 fcopysign is really an f32 copysign on the high bits, so replace the 11575 // lower half with a copy. 11576 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) 11577 if (MagnitudeOp.getValueType() == MVT::f64) { 11578 SDValue MagAsVector = 11579 DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp); 11580 SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, 11581 MagAsVector, DAG.getConstant(0, DL, MVT::i32)); 11582 SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, 11583 MagAsVector, DAG.getConstant(1, DL, MVT::i32)); 11584 11585 SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp); 11586 11587 SDValue Vector = 11588 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); 11589 11590 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); 11591 } 11592 11593 if (SignOp.getValueType() != MVT::f64) 11594 return SDValue(); 11595 11596 // Reduce width of sign operand, we only need the highest bit. 11597 // 11598 // fcopysign f64:x, f64:y -> 11599 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1) 11600 // TODO: In some cases it might make sense to go all the way to f16. 11601 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp); 11602 SDValue SignAsF32 = 11603 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, 11604 DAG.getConstant(1, DL, MVT::i32)); 11605 11606 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0), 11607 SignAsF32); 11608 } 11609 11610 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) 11611 // (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no 11612 // bits 11613 11614 // This is a variant of 11615 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), 11616 // 11617 // The normal DAG combiner will do this, but only if the add has one use since 11618 // that would increase the number of instructions. 11619 // 11620 // This prevents us from seeing a constant offset that can be folded into a 11621 // memory instruction's addressing mode. If we know the resulting add offset of 11622 // a pointer can be folded into an addressing offset, we can replace the pointer 11623 // operand with the add of new constant offset. This eliminates one of the uses, 11624 // and may allow the remaining use to also be simplified. 
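// Hypothetical example (values assumed for illustration): for a load whose
// address is (shl (add x, 16), 2), where the add has other uses, the combine
// below produces (add (shl x, 2), 64); the +64 can then be folded into the
// memory instruction's immediate offset instead of keeping the pre-shifted
// add alive.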
11625 // 11626 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace, 11627 EVT MemVT, 11628 DAGCombinerInfo &DCI) const { 11629 SDValue N0 = N->getOperand(0); 11630 SDValue N1 = N->getOperand(1); 11631 11632 // We only do this to handle cases where it's profitable when there are 11633 // multiple uses of the add, so defer to the standard combine. 11634 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) || 11635 N0->hasOneUse()) 11636 return SDValue(); 11637 11638 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1); 11639 if (!CN1) 11640 return SDValue(); 11641 11642 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 11643 if (!CAdd) 11644 return SDValue(); 11645 11646 SelectionDAG &DAG = DCI.DAG; 11647 11648 if (N0->getOpcode() == ISD::OR && 11649 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) 11650 return SDValue(); 11651 11652 // If the resulting offset is too large, we can't fold it into the 11653 // addressing mode offset. 11654 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); 11655 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext()); 11656 11657 AddrMode AM; 11658 AM.HasBaseReg = true; 11659 AM.BaseOffs = Offset.getSExtValue(); 11660 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace)) 11661 return SDValue(); 11662 11663 SDLoc SL(N); 11664 EVT VT = N->getValueType(0); 11665 11666 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); 11667 SDValue COffset = DAG.getConstant(Offset, SL, VT); 11668 11669 SDNodeFlags Flags; 11670 Flags.setNoUnsignedWrap( 11671 N->getFlags().hasNoUnsignedWrap() && 11672 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap())); 11673 11674 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags); 11675 } 11676 11677 /// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset 11678 /// by the chain and intrinsic ID. Theoretically we would also need to check the 11679 /// specific intrinsic, but they all place the pointer operand first. 11680 static unsigned getBasePtrIndex(const MemSDNode *N) { 11681 switch (N->getOpcode()) { 11682 case ISD::STORE: 11683 case ISD::INTRINSIC_W_CHAIN: 11684 case ISD::INTRINSIC_VOID: 11685 return 2; 11686 default: 11687 return 1; 11688 } 11689 } 11690 11691 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N, 11692 DAGCombinerInfo &DCI) const { 11693 SelectionDAG &DAG = DCI.DAG; 11694 SDLoc SL(N); 11695 11696 unsigned PtrIdx = getBasePtrIndex(N); 11697 SDValue Ptr = N->getOperand(PtrIdx); 11698 11699 // TODO: We could also do this for multiplies. 11700 if (Ptr.getOpcode() == ISD::SHL) { 11701 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(), 11702 N->getMemoryVT(), DCI); 11703 if (NewPtr) { 11704 SmallVector<SDValue, 8> NewOps(N->ops()); 11705 11706 NewOps[PtrIdx] = NewPtr; 11707 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); 11708 } 11709 } 11710 11711 return SDValue(); 11712 } 11713 11714 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) { 11715 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) || 11716 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) || 11717 (Opc == ISD::XOR && Val == 0); 11718 } 11719 11720 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This 11721 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit 11722 // integer combine opportunities since most 64-bit operations are decomposed 11723 // this way. 
TODO: We won't want this for SALU especially if it is an inline 11724 // immediate. 11725 SDValue SITargetLowering::splitBinaryBitConstantOp( 11726 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, 11727 const ConstantSDNode *CRHS) const { 11728 uint64_t Val = CRHS->getZExtValue(); 11729 uint32_t ValLo = Lo_32(Val); 11730 uint32_t ValHi = Hi_32(Val); 11731 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 11732 11733 if ((bitOpWithConstantIsReducible(Opc, ValLo) || 11734 bitOpWithConstantIsReducible(Opc, ValHi)) || 11735 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) { 11736 // If we need to materialize a 64-bit immediate, it will be split up later 11737 // anyway. Avoid creating the harder to understand 64-bit immediate 11738 // materialization. 11739 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi); 11740 } 11741 11742 return SDValue(); 11743 } 11744 11745 bool llvm::isBoolSGPR(SDValue V) { 11746 if (V.getValueType() != MVT::i1) 11747 return false; 11748 switch (V.getOpcode()) { 11749 default: 11750 break; 11751 case ISD::SETCC: 11752 case AMDGPUISD::FP_CLASS: 11753 return true; 11754 case ISD::AND: 11755 case ISD::OR: 11756 case ISD::XOR: 11757 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1)); 11758 } 11759 return false; 11760 } 11761 11762 // If a constant has all zeroes or all ones within each byte return it. 11763 // Otherwise return 0. 11764 static uint32_t getConstantPermuteMask(uint32_t C) { 11765 // 0xff for any zero byte in the mask 11766 uint32_t ZeroByteMask = 0; 11767 if (!(C & 0x000000ff)) 11768 ZeroByteMask |= 0x000000ff; 11769 if (!(C & 0x0000ff00)) 11770 ZeroByteMask |= 0x0000ff00; 11771 if (!(C & 0x00ff0000)) 11772 ZeroByteMask |= 0x00ff0000; 11773 if (!(C & 0xff000000)) 11774 ZeroByteMask |= 0xff000000; 11775 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte 11776 if ((NonZeroByteMask & C) != NonZeroByteMask) 11777 return 0; // Partial bytes selected. 11778 return C; 11779 } 11780 11781 // Check if a node selects whole bytes from its operand 0 starting at a byte 11782 // boundary while masking the rest. Returns select mask as in the v_perm_b32 11783 // or -1 if not succeeded. 11784 // Note byte select encoding: 11785 // value 0-3 selects corresponding source byte; 11786 // value 0xc selects zero; 11787 // value 0xff selects 0xff. 
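// A few worked mask values for the helper below (illustrative only, assuming
// a 32-bit source x):
//   and x, 0x0000ffff  -> 0x0c0c0100  (bytes 1:0 from x, bytes 3:2 zero)
//   shl x, 8           -> 0x0201000c  (byte 0 zero, bytes 3:1 from x's 2:0)
//   srl x, 16          -> 0x0c0c0302  (bytes 1:0 from x's 3:2, bytes 3:2 zero)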
11788 static uint32_t getPermuteMask(SDValue V) { 11789 assert(V.getValueSizeInBits() == 32); 11790 11791 if (V.getNumOperands() != 2) 11792 return ~0; 11793 11794 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1)); 11795 if (!N1) 11796 return ~0; 11797 11798 uint32_t C = N1->getZExtValue(); 11799 11800 switch (V.getOpcode()) { 11801 default: 11802 break; 11803 case ISD::AND: 11804 if (uint32_t ConstMask = getConstantPermuteMask(C)) 11805 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask); 11806 break; 11807 11808 case ISD::OR: 11809 if (uint32_t ConstMask = getConstantPermuteMask(C)) 11810 return (0x03020100 & ~ConstMask) | ConstMask; 11811 break; 11812 11813 case ISD::SHL: 11814 if (C % 8) 11815 return ~0; 11816 11817 return uint32_t((0x030201000c0c0c0cull << C) >> 32); 11818 11819 case ISD::SRL: 11820 if (C % 8) 11821 return ~0; 11822 11823 return uint32_t(0x0c0c0c0c03020100ull >> C); 11824 } 11825 11826 return ~0; 11827 } 11828 11829 SDValue SITargetLowering::performAndCombine(SDNode *N, 11830 DAGCombinerInfo &DCI) const { 11831 if (DCI.isBeforeLegalize()) 11832 return SDValue(); 11833 11834 SelectionDAG &DAG = DCI.DAG; 11835 EVT VT = N->getValueType(0); 11836 SDValue LHS = N->getOperand(0); 11837 SDValue RHS = N->getOperand(1); 11838 11839 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); 11840 if (VT == MVT::i64 && CRHS) { 11841 if (SDValue Split = 11842 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) 11843 return Split; 11844 } 11845 11846 if (CRHS && VT == MVT::i32) { 11847 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb 11848 // nb = number of trailing zeroes in mask 11849 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass, 11850 // given that we are selecting 8 or 16 bit fields starting at byte boundary. 11851 uint64_t Mask = CRHS->getZExtValue(); 11852 unsigned Bits = llvm::popcount(Mask); 11853 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL && 11854 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) { 11855 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { 11856 unsigned Shift = CShift->getZExtValue(); 11857 unsigned NB = CRHS->getAPIntValue().countr_zero(); 11858 unsigned Offset = NB + Shift; 11859 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary. 
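          // Illustrative instance (constants assumed): and (srl x, 8), 0xff00
          // has Bits = 8, NB = 8, Offset = 16, so it becomes
          //   (shl (AssertZext (bfe_u32 x, 16, 8), i8), 8)
          // i.e. bits [23:16] of x repositioned to bits [15:8].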
11860 SDLoc SL(N); 11861 SDValue BFE = 11862 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0), 11863 DAG.getConstant(Offset, SL, MVT::i32), 11864 DAG.getConstant(Bits, SL, MVT::i32)); 11865 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits); 11866 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE, 11867 DAG.getValueType(NarrowVT)); 11868 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext, 11869 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32)); 11870 return Shl; 11871 } 11872 } 11873 } 11874 11875 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) 11876 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM && 11877 isa<ConstantSDNode>(LHS.getOperand(2))) { 11878 uint32_t Sel = getConstantPermuteMask(Mask); 11879 if (!Sel) 11880 return SDValue(); 11881 11882 // Select 0xc for all zero bytes 11883 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c); 11884 SDLoc DL(N); 11885 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), 11886 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32)); 11887 } 11888 } 11889 11890 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> 11891 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) 11892 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) { 11893 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); 11894 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); 11895 11896 SDValue X = LHS.getOperand(0); 11897 SDValue Y = RHS.getOperand(0); 11898 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X || 11899 !isTypeLegal(X.getValueType())) 11900 return SDValue(); 11901 11902 if (LCC == ISD::SETO) { 11903 if (X != LHS.getOperand(1)) 11904 return SDValue(); 11905 11906 if (RCC == ISD::SETUNE) { 11907 const ConstantFPSDNode *C1 = 11908 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1)); 11909 if (!C1 || !C1->isInfinity() || C1->isNegative()) 11910 return SDValue(); 11911 11912 const uint32_t Mask = SIInstrFlags::N_NORMAL | 11913 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO | 11914 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL | 11915 SIInstrFlags::P_NORMAL; 11916 11917 static_assert( 11918 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN | 11919 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) & 11920 0x3ff) == Mask, 11921 "mask not equal"); 11922 11923 SDLoc DL(N); 11924 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X, 11925 DAG.getConstant(Mask, DL, MVT::i32)); 11926 } 11927 } 11928 } 11929 11930 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS) 11931 std::swap(LHS, RHS); 11932 11933 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS && 11934 RHS.hasOneUse()) { 11935 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); 11936 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | 11937 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan 11938 // | n_nan) 11939 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); 11940 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask && 11941 (RHS.getOperand(0) == LHS.getOperand(0) && 11942 LHS.getOperand(0) == LHS.getOperand(1))) { 11943 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN; 11944 unsigned NewMask = LCC == ISD::SETO ? 
Mask->getZExtValue() & ~OrdMask 11945 : Mask->getZExtValue() & OrdMask; 11946 11947 SDLoc DL(N); 11948 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0), 11949 DAG.getConstant(NewMask, DL, MVT::i32)); 11950 } 11951 } 11952 11953 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND || 11954 LHS.getOpcode() == ISD::SIGN_EXTEND)) { 11955 // and x, (sext cc from i1) => select cc, x, 0 11956 if (RHS.getOpcode() != ISD::SIGN_EXTEND) 11957 std::swap(LHS, RHS); 11958 if (isBoolSGPR(RHS.getOperand(0))) 11959 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS, 11960 DAG.getConstant(0, SDLoc(N), MVT::i32)); 11961 } 11962 11963 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) 11964 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 11965 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && 11966 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { 11967 uint32_t LHSMask = getPermuteMask(LHS); 11968 uint32_t RHSMask = getPermuteMask(RHS); 11969 if (LHSMask != ~0u && RHSMask != ~0u) { 11970 // Canonicalize the expression in an attempt to have fewer unique masks 11971 // and therefore fewer registers used to hold the masks. 11972 if (LHSMask > RHSMask) { 11973 std::swap(LHSMask, RHSMask); 11974 std::swap(LHS, RHS); 11975 } 11976 11977 // Select 0xc for each lane used from source operand. Zero has 0xc mask 11978 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range. 11979 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; 11980 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; 11981 11982 // Check of we need to combine values from two sources within a byte. 11983 if (!(LHSUsedLanes & RHSUsedLanes) && 11984 // If we select high and lower word keep it for SDWA. 11985 // TODO: teach SDWA to work with v_perm_b32 and remove the check. 11986 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) { 11987 // Each byte in each mask is either selector mask 0-3, or has higher 11988 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for 11989 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise 11990 // mask which is not 0xff wins. By anding both masks we have a correct 11991 // result except that 0x0c shall be corrected to give 0x0c only. 11992 uint32_t Mask = LHSMask & RHSMask; 11993 for (unsigned I = 0; I < 32; I += 8) { 11994 uint32_t ByteSel = 0xff << I; 11995 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c) 11996 Mask &= (0x0c << I) & 0xffffffff; 11997 } 11998 11999 // Add 4 to each active LHS lane. It will not affect any existing 0xff 12000 // or 0x0c. 12001 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404); 12002 SDLoc DL(N); 12003 12004 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), 12005 RHS.getOperand(0), 12006 DAG.getConstant(Sel, DL, MVT::i32)); 12007 } 12008 } 12009 } 12010 12011 return SDValue(); 12012 } 12013 12014 // A key component of v_perm is a mapping between byte position of the src 12015 // operands, and the byte position of the dest. To provide such, we need: 1. the 12016 // node that provides x byte of the dest of the OR, and 2. the byte of the node 12017 // used to provide that x byte. 
calculateByteProvider finds which node provides 12018 // a certain byte of the dest of the OR, and calculateSrcByte takes that node, 12019 // and finds an ultimate src and byte position For example: The supported 12020 // LoadCombine pattern for vector loads is as follows 12021 // t1 12022 // or 12023 // / \ 12024 // t2 t3 12025 // zext shl 12026 // | | \ 12027 // t4 t5 16 12028 // or anyext 12029 // / \ | 12030 // t6 t7 t8 12031 // srl shl or 12032 // / | / \ / \ 12033 // t9 t10 t11 t12 t13 t14 12034 // trunc* 8 trunc* 8 and and 12035 // | | / | | \ 12036 // t15 t16 t17 t18 t19 t20 12037 // trunc* 255 srl -256 12038 // | / \ 12039 // t15 t15 16 12040 // 12041 // *In this example, the truncs are from i32->i16 12042 // 12043 // calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3 12044 // respectively. calculateSrcByte would find (given node) -> ultimate src & 12045 // byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3. 12046 // After finding the mapping, we can combine the tree into vperm t15, t16, 12047 // 0x05000407 12048 12049 // Find the source and byte position from a node. 12050 // \p DestByte is the byte position of the dest of the or that the src 12051 // ultimately provides. \p SrcIndex is the byte of the src that maps to this 12052 // dest of the or byte. \p Depth tracks how many recursive iterations we have 12053 // performed. 12054 static const std::optional<ByteProvider<SDValue>> 12055 calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0, 12056 unsigned Depth = 0) { 12057 // We may need to recursively traverse a series of SRLs 12058 if (Depth >= 6) 12059 return std::nullopt; 12060 12061 if (Op.getValueSizeInBits() < 8) 12062 return std::nullopt; 12063 12064 if (Op.getValueType().isVector()) 12065 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex); 12066 12067 switch (Op->getOpcode()) { 12068 case ISD::TRUNCATE: { 12069 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); 12070 } 12071 12072 case ISD::SIGN_EXTEND: 12073 case ISD::ZERO_EXTEND: 12074 case ISD::SIGN_EXTEND_INREG: { 12075 SDValue NarrowOp = Op->getOperand(0); 12076 auto NarrowVT = NarrowOp.getValueType(); 12077 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) { 12078 auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); 12079 NarrowVT = VTSign->getVT(); 12080 } 12081 if (!NarrowVT.isByteSized()) 12082 return std::nullopt; 12083 uint64_t NarrowByteWidth = NarrowVT.getStoreSize(); 12084 12085 if (SrcIndex >= NarrowByteWidth) 12086 return std::nullopt; 12087 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); 12088 } 12089 12090 case ISD::SRA: 12091 case ISD::SRL: { 12092 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 12093 if (!ShiftOp) 12094 return std::nullopt; 12095 12096 uint64_t BitShift = ShiftOp->getZExtValue(); 12097 12098 if (BitShift % 8 != 0) 12099 return std::nullopt; 12100 12101 SrcIndex += BitShift / 8; 12102 12103 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); 12104 } 12105 12106 default: { 12107 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex); 12108 } 12109 } 12110 llvm_unreachable("fully handled switch"); 12111 } 12112 12113 // For a byte position in the result of an Or, traverse the tree and find the 12114 // node (and the byte of the node) which ultimately provides this {Or, 12115 // BytePosition}. \p Op is the operand we are currently examining. 
\p Index is 12116 // the byte position of the Op that corresponds with the originally requested 12117 // byte of the Or \p Depth tracks how many recursive iterations we have 12118 // performed. \p StartingIndex is the originally requested byte of the Or 12119 static const std::optional<ByteProvider<SDValue>> 12120 calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, 12121 unsigned StartingIndex = 0) { 12122 // Finding Src tree of RHS of or typically requires at least 1 additional 12123 // depth 12124 if (Depth > 6) 12125 return std::nullopt; 12126 12127 unsigned BitWidth = Op.getScalarValueSizeInBits(); 12128 if (BitWidth % 8 != 0) 12129 return std::nullopt; 12130 if (Index > BitWidth / 8 - 1) 12131 return std::nullopt; 12132 12133 bool IsVec = Op.getValueType().isVector(); 12134 switch (Op.getOpcode()) { 12135 case ISD::OR: { 12136 if (IsVec) 12137 return std::nullopt; 12138 12139 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1, 12140 StartingIndex); 12141 if (!RHS) 12142 return std::nullopt; 12143 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1, 12144 StartingIndex); 12145 if (!LHS) 12146 return std::nullopt; 12147 // A well formed Or will have two ByteProviders for each byte, one of which 12148 // is constant zero 12149 if (!LHS->isConstantZero() && !RHS->isConstantZero()) 12150 return std::nullopt; 12151 if (!LHS || LHS->isConstantZero()) 12152 return RHS; 12153 if (!RHS || RHS->isConstantZero()) 12154 return LHS; 12155 return std::nullopt; 12156 } 12157 12158 case ISD::AND: { 12159 if (IsVec) 12160 return std::nullopt; 12161 12162 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 12163 if (!BitMaskOp) 12164 return std::nullopt; 12165 12166 uint32_t BitMask = BitMaskOp->getZExtValue(); 12167 // Bits we expect for our StartingIndex 12168 uint32_t IndexMask = 0xFF << (Index * 8); 12169 12170 if ((IndexMask & BitMask) != IndexMask) { 12171 // If the result of the and partially provides the byte, then it 12172 // is not well formatted 12173 if (IndexMask & BitMask) 12174 return std::nullopt; 12175 return ByteProvider<SDValue>::getConstantZero(); 12176 } 12177 12178 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index); 12179 } 12180 12181 case ISD::FSHR: { 12182 if (IsVec) 12183 return std::nullopt; 12184 12185 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW)) 12186 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2)); 12187 if (!ShiftOp || Op.getValueType().isVector()) 12188 return std::nullopt; 12189 12190 uint64_t BitsProvided = Op.getValueSizeInBits(); 12191 if (BitsProvided % 8 != 0) 12192 return std::nullopt; 12193 12194 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided); 12195 if (BitShift % 8) 12196 return std::nullopt; 12197 12198 uint64_t ConcatSizeInBytes = BitsProvided / 4; 12199 uint64_t ByteShift = BitShift / 8; 12200 12201 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes; 12202 uint64_t BytesProvided = BitsProvided / 8; 12203 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 
0 : 1);
    NewIndex %= BytesProvided;
    return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
  }

  case ISD::SRA:
  case ISD::SRL: {
    if (IsVec)
      return std::nullopt;

    auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8)
      return std::nullopt;

    auto BitsProvided = Op.getScalarValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BytesProvided = BitsProvided / 8;
    uint64_t ByteShift = BitShift / 8;
    // The dest of the shift has good bytes in the range
    // [0, BytesProvided - ByteShift). If the byte we are trying to provide
    // (as tracked by Index) falls in this range, then the SRL provides the
    // byte. The byte of interest of the src of the SRL is Index + ByteShift.
    return BytesProvided - ByteShift > Index
               ? calculateSrcByte(Op->getOperand(0), StartingIndex,
                                  Index + ByteShift)
               : ByteProvider<SDValue>::getConstantZero();
  }

  case ISD::SHL: {
    if (IsVec)
      return std::nullopt;

    auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    uint64_t ByteShift = BitShift / 8;

    // If we are shifting by more bytes than the index we are trying to
    // provide, then that byte is a shifted-in zero. Otherwise the byte is not
    // definitively 0, and the corresponding byte of interest is
    // Index - ByteShift of the src.
    return Index < ByteShift
               ? ByteProvider<SDValue>::getConstantZero()
               : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
                                       Depth + 1, StartingIndex);
  }
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND_INREG:
  case ISD::AssertZext:
  case ISD::AssertSext: {
    if (IsVec)
      return std::nullopt;

    SDValue NarrowOp = Op->getOperand(0);
    unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
        Op->getOpcode() == ISD::AssertZext ||
        Op->getOpcode() == ISD::AssertSext) {
      auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
      NarrowBitWidth = VTSign->getVT().getSizeInBits();
    }
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    if (Index >= NarrowByteWidth)
      return Op.getOpcode() == ISD::ZERO_EXTEND
                 ?
std::optional<ByteProvider<SDValue>>( 12283 ByteProvider<SDValue>::getConstantZero()) 12284 : std::nullopt; 12285 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex); 12286 } 12287 12288 case ISD::TRUNCATE: { 12289 if (IsVec) 12290 return std::nullopt; 12291 12292 uint64_t NarrowByteWidth = BitWidth / 8; 12293 12294 if (NarrowByteWidth >= Index) { 12295 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1, 12296 StartingIndex); 12297 } 12298 12299 return std::nullopt; 12300 } 12301 12302 case ISD::CopyFromReg: { 12303 if (BitWidth / 8 > Index) 12304 return calculateSrcByte(Op, StartingIndex, Index); 12305 12306 return std::nullopt; 12307 } 12308 12309 case ISD::LOAD: { 12310 auto *L = cast<LoadSDNode>(Op.getNode()); 12311 12312 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); 12313 if (NarrowBitWidth % 8 != 0) 12314 return std::nullopt; 12315 uint64_t NarrowByteWidth = NarrowBitWidth / 8; 12316 12317 // If the width of the load does not reach byte we are trying to provide for 12318 // and it is not a ZEXTLOAD, then the load does not provide for the byte in 12319 // question 12320 if (Index >= NarrowByteWidth) { 12321 return L->getExtensionType() == ISD::ZEXTLOAD 12322 ? std::optional<ByteProvider<SDValue>>( 12323 ByteProvider<SDValue>::getConstantZero()) 12324 : std::nullopt; 12325 } 12326 12327 if (NarrowByteWidth > Index) { 12328 return calculateSrcByte(Op, StartingIndex, Index); 12329 } 12330 12331 return std::nullopt; 12332 } 12333 12334 case ISD::BSWAP: { 12335 if (IsVec) 12336 return std::nullopt; 12337 12338 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1, 12339 Depth + 1, StartingIndex); 12340 } 12341 12342 case ISD::EXTRACT_VECTOR_ELT: { 12343 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 12344 if (!IdxOp) 12345 return std::nullopt; 12346 auto VecIdx = IdxOp->getZExtValue(); 12347 auto ScalarSize = Op.getScalarValueSizeInBits(); 12348 if (ScalarSize < 32) 12349 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index; 12350 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0), 12351 StartingIndex, Index); 12352 } 12353 12354 case AMDGPUISD::PERM: { 12355 if (IsVec) 12356 return std::nullopt; 12357 12358 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2)); 12359 if (!PermMask) 12360 return std::nullopt; 12361 12362 auto IdxMask = 12363 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8); 12364 if (IdxMask > 0x07 && IdxMask != 0x0c) 12365 return std::nullopt; 12366 12367 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1); 12368 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask; 12369 12370 return IdxMask != 0x0c ? 
calculateSrcByte(NextOp, StartingIndex, NextIndex)
                           : ByteProvider<SDValue>(
                                 ByteProvider<SDValue>::getConstantZero());
  }

  default: {
    return std::nullopt;
  }
  }

  llvm_unreachable("fully handled switch");
}

// Returns true if the Operand is a scalar that has been extended (or
// extending-loaded) from a 16-bit scalar value.
static bool isExtendedFrom16Bits(SDValue &Operand) {

  switch (Operand.getOpcode()) {
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND: {
    auto OpVT = Operand.getOperand(0).getValueType();
    return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
  }
  case ISD::LOAD: {
    LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
    auto ExtType = L->getExtensionType();
    if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
        ExtType == ISD::EXTLOAD) {
      auto MemVT = L->getMemoryVT();
      return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
    }
    return L->getMemoryVT().getSizeInBits() == 16;
  }
  default:
    return false;
  }
}

// Returns true if the mask matches consecutive bytes, and the first byte
// begins at an even byte offset from the 0th byte, so the pair can be
// addressed as a 16-bit operand.
static bool addresses16Bits(int Mask) {
  int Low8 = Mask & 0xff;
  int Hi8 = (Mask & 0xff00) >> 8;

  assert(Low8 < 8 && Hi8 < 8);
  // Are the bytes contiguous in the order of increasing addresses.
  bool IsConsecutive = (Hi8 - Low8 == 1);
  // Is the first byte at a location that is aligned for 16-bit instructions?
  // A counterexample is taking 2 consecutive bytes starting at the 8th bit.
  // In this case, we would still need code to extract the 16-bit operand, so
  // it is better to use the byte-wise v_perm.
  bool Is16Aligned = !(Low8 % 2);

  return IsConsecutive && Is16Aligned;
}

// Do not lower into v_perm if the operands are actually 16 bits
// and the selected bits (based on PermMask) correspond with two
// easily addressable 16-bit operands.
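// For example (illustrative values): if both operands are (extended from)
// 16 bits and PermMask is 0x01000504, then in matchPERM's encoding (selectors
// 4-7 address Op, 0-3 address OtherOp) the low half of the result is the low
// 16 bits of Op and the high half is the low 16 bits of OtherOp. Both halves
// are whole 16-bit pieces, so 16-bit operations are preferable to v_perm and
// this returns false.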
12429 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, 12430 SDValue &OtherOp) { 12431 int Low16 = PermMask & 0xffff; 12432 int Hi16 = (PermMask & 0xffff0000) >> 16; 12433 12434 auto TempOp = peekThroughBitcasts(Op); 12435 auto TempOtherOp = peekThroughBitcasts(OtherOp); 12436 12437 auto OpIs16Bit = 12438 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp); 12439 if (!OpIs16Bit) 12440 return true; 12441 12442 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 || 12443 isExtendedFrom16Bits(TempOtherOp); 12444 if (!OtherOpIs16Bit) 12445 return true; 12446 12447 // Do we cleanly address both 12448 return !addresses16Bits(Low16) || !addresses16Bits(Hi16); 12449 } 12450 12451 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, 12452 unsigned DWordOffset) { 12453 SDValue Ret; 12454 12455 auto TypeSize = Src.getValueSizeInBits().getFixedValue(); 12456 // ByteProvider must be at least 8 bits 12457 assert(Src.getValueSizeInBits().isKnownMultipleOf(8)); 12458 12459 if (TypeSize <= 32) 12460 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32); 12461 12462 if (Src.getValueType().isVector()) { 12463 auto ScalarTySize = Src.getScalarValueSizeInBits(); 12464 auto ScalarTy = Src.getValueType().getScalarType(); 12465 if (ScalarTySize == 32) { 12466 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src, 12467 DAG.getConstant(DWordOffset, SL, MVT::i32)); 12468 } 12469 if (ScalarTySize > 32) { 12470 Ret = DAG.getNode( 12471 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src, 12472 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32)); 12473 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32)); 12474 if (ShiftVal) 12475 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret, 12476 DAG.getConstant(ShiftVal, SL, MVT::i32)); 12477 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32); 12478 } 12479 12480 assert(ScalarTySize < 32); 12481 auto NumElements = TypeSize / ScalarTySize; 12482 auto Trunc32Elements = (ScalarTySize * NumElements) / 32; 12483 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize; 12484 auto NumElementsIn32 = 32 / ScalarTySize; 12485 auto NumAvailElements = DWordOffset < Trunc32Elements 12486 ? NumElementsIn32 12487 : NumElements - NormalizedTrunc; 12488 12489 SmallVector<SDValue, 4> VecSrcs; 12490 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32, 12491 NumAvailElements); 12492 12493 Ret = DAG.getBuildVector( 12494 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL, 12495 VecSrcs); 12496 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32); 12497 } 12498 12499 /// Scalar Type 12500 auto ShiftVal = 32 * DWordOffset; 12501 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src, 12502 DAG.getConstant(ShiftVal, SL, MVT::i32)); 12503 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32); 12504 } 12505 12506 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 12507 SelectionDAG &DAG = DCI.DAG; 12508 [[maybe_unused]] EVT VT = N->getValueType(0); 12509 SmallVector<ByteProvider<SDValue>, 8> PermNodes; 12510 12511 // VT is known to be MVT::i32, so we need to provide 4 bytes. 
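  // For example (illustrative values): if bytes 0 and 1 of the OR come from
  // bytes 2 and 3 of a 32-bit value x, and bytes 2 and 3 come from bytes 0
  // and 1 of another 32-bit value y, the loop below assembles
  // PermMask = 0x01000706 and the combine emits perm x, y, 0x01000706.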
  assert(VT == MVT::i32);
  for (int i = 0; i < 4; i++) {
    // Find the ByteProvider that provides the ith byte of the result of the OR
    std::optional<ByteProvider<SDValue>> P =
        calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
    // TODO support constantZero
    if (!P || P->isConstantZero())
      return SDValue();

    PermNodes.push_back(*P);
  }
  if (PermNodes.size() != 4)
    return SDValue();

  std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
  std::optional<std::pair<unsigned, unsigned>> SecondSrc;
  uint64_t PermMask = 0x00000000;
  for (size_t i = 0; i < PermNodes.size(); i++) {
    auto PermOp = PermNodes[i];
    // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
    // by sizeof(Src2) = 4
    int SrcByteAdjust = 4;

    // If the Src uses a byte from a different DWORD, then it corresponds
    // with a different source
    if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
        ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
      if (SecondSrc)
        if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
            ((PermOp.SrcOffset / 4) != SecondSrc->second))
          return SDValue();

      // Set the index of the second distinct Src node
      SecondSrc = {i, PermNodes[i].SrcOffset / 4};
      assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
      SrcByteAdjust = 0;
    }
    assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
    assert(!DAG.getDataLayout().isBigEndian());
    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
  }
  SDLoc DL(N);
  SDValue Op = *PermNodes[FirstSrc.first].Src;
  Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
  assert(Op.getValueSizeInBits() == 32);

  // Check that we are not just extracting the bytes in order from an op
  if (!SecondSrc) {
    int Low16 = PermMask & 0xffff;
    int Hi16 = (PermMask & 0xffff0000) >> 16;

    bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
    bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);

    // The perm op would really just produce Op. So combine into Op
    if (WellFormedLow && WellFormedHi)
      return DAG.getBitcast(MVT::getIntegerVT(32), Op);
  }

  SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;

  if (SecondSrc) {
    OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
    assert(OtherOp.getValueSizeInBits() == 32);
  }

  if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {

    assert(Op.getValueType().isByteSized() &&
           OtherOp.getValueType().isByteSized());

    // If the ultimate src is less than 32 bits, then we will only be
    // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
    // calculateByteProvider would not have returned Op as source if we
    // used a byte that is outside its ValueType. Thus, we are free to
    // ANY_EXTEND as the extended bits are don't-cares.
12588 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32); 12589 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32); 12590 12591 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp, 12592 DAG.getConstant(PermMask, DL, MVT::i32)); 12593 } 12594 return SDValue(); 12595 } 12596 12597 SDValue SITargetLowering::performOrCombine(SDNode *N, 12598 DAGCombinerInfo &DCI) const { 12599 SelectionDAG &DAG = DCI.DAG; 12600 SDValue LHS = N->getOperand(0); 12601 SDValue RHS = N->getOperand(1); 12602 12603 EVT VT = N->getValueType(0); 12604 if (VT == MVT::i1) { 12605 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) 12606 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && 12607 RHS.getOpcode() == AMDGPUISD::FP_CLASS) { 12608 SDValue Src = LHS.getOperand(0); 12609 if (Src != RHS.getOperand(0)) 12610 return SDValue(); 12611 12612 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); 12613 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); 12614 if (!CLHS || !CRHS) 12615 return SDValue(); 12616 12617 // Only 10 bits are used. 12618 static const uint32_t MaxMask = 0x3ff; 12619 12620 uint32_t NewMask = 12621 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; 12622 SDLoc DL(N); 12623 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src, 12624 DAG.getConstant(NewMask, DL, MVT::i32)); 12625 } 12626 12627 return SDValue(); 12628 } 12629 12630 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) 12631 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() && 12632 LHS.getOpcode() == AMDGPUISD::PERM && 12633 isa<ConstantSDNode>(LHS.getOperand(2))) { 12634 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1)); 12635 if (!Sel) 12636 return SDValue(); 12637 12638 Sel |= LHS.getConstantOperandVal(2); 12639 SDLoc DL(N); 12640 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), 12641 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32)); 12642 } 12643 12644 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) 12645 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 12646 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && 12647 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { 12648 12649 // If all the uses of an or need to extract the individual elements, do not 12650 // attempt to lower into v_perm 12651 auto usesCombinedOperand = [](SDNode *OrUse) { 12652 // If we have any non-vectorized use, then it is a candidate for v_perm 12653 if (OrUse->getOpcode() != ISD::BITCAST || 12654 !OrUse->getValueType(0).isVector()) 12655 return true; 12656 12657 // If we have any non-vectorized use, then it is a candidate for v_perm 12658 for (auto *VUser : OrUse->users()) { 12659 if (!VUser->getValueType(0).isVector()) 12660 return true; 12661 12662 // If the use of a vector is a store, then combining via a v_perm 12663 // is beneficial. 12664 // TODO -- whitelist more uses 12665 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg}) 12666 if (VUser->getOpcode() == VectorwiseOp) 12667 return true; 12668 } 12669 return false; 12670 }; 12671 12672 if (!any_of(N->users(), usesCombinedOperand)) 12673 return SDValue(); 12674 12675 uint32_t LHSMask = getPermuteMask(LHS); 12676 uint32_t RHSMask = getPermuteMask(RHS); 12677 12678 if (LHSMask != ~0u && RHSMask != ~0u) { 12679 // Canonicalize the expression in an attempt to have fewer unique masks 12680 // and therefore fewer registers used to hold the masks. 
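      // For example (illustrative values): a mask 0x030c010c (supplying
      // result bytes 1 and 3) and a mask 0x0c020c00 (supplying bytes 0 and 2)
      // use disjoint lanes and are merged below into the single selector
      // 0x07020500.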
12681 if (LHSMask > RHSMask) { 12682 std::swap(LHSMask, RHSMask); 12683 std::swap(LHS, RHS); 12684 } 12685 12686 // Select 0xc for each lane used from source operand. Zero has 0xc mask 12687 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range. 12688 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; 12689 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; 12690 12691 // Check of we need to combine values from two sources within a byte. 12692 if (!(LHSUsedLanes & RHSUsedLanes) && 12693 // If we select high and lower word keep it for SDWA. 12694 // TODO: teach SDWA to work with v_perm_b32 and remove the check. 12695 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) { 12696 // Kill zero bytes selected by other mask. Zero value is 0xc. 12697 LHSMask &= ~RHSUsedLanes; 12698 RHSMask &= ~LHSUsedLanes; 12699 // Add 4 to each active LHS lane 12700 LHSMask |= LHSUsedLanes & 0x04040404; 12701 // Combine masks 12702 uint32_t Sel = LHSMask | RHSMask; 12703 SDLoc DL(N); 12704 12705 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), 12706 RHS.getOperand(0), 12707 DAG.getConstant(Sel, DL, MVT::i32)); 12708 } 12709 } 12710 if (LHSMask == ~0u || RHSMask == ~0u) { 12711 if (SDValue Perm = matchPERM(N, DCI)) 12712 return Perm; 12713 } 12714 } 12715 12716 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps()) 12717 return SDValue(); 12718 12719 // TODO: This could be a generic combine with a predicate for extracting the 12720 // high half of an integer being free. 12721 12722 // (or i64:x, (zero_extend i32:y)) -> 12723 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x))) 12724 if (LHS.getOpcode() == ISD::ZERO_EXTEND && 12725 RHS.getOpcode() != ISD::ZERO_EXTEND) 12726 std::swap(LHS, RHS); 12727 12728 if (RHS.getOpcode() == ISD::ZERO_EXTEND) { 12729 SDValue ExtSrc = RHS.getOperand(0); 12730 EVT SrcVT = ExtSrc.getValueType(); 12731 if (SrcVT == MVT::i32) { 12732 SDLoc SL(N); 12733 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG); 12734 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc); 12735 12736 DCI.AddToWorklist(LowOr.getNode()); 12737 DCI.AddToWorklist(HiBits.getNode()); 12738 12739 SDValue Vec = 12740 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits); 12741 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 12742 } 12743 } 12744 12745 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 12746 if (CRHS) { 12747 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, 12748 N->getOperand(0), CRHS)) 12749 return Split; 12750 } 12751 12752 return SDValue(); 12753 } 12754 12755 SDValue SITargetLowering::performXorCombine(SDNode *N, 12756 DAGCombinerInfo &DCI) const { 12757 if (SDValue RV = reassociateScalarOps(N, DCI.DAG)) 12758 return RV; 12759 12760 SDValue LHS = N->getOperand(0); 12761 SDValue RHS = N->getOperand(1); 12762 12763 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); 12764 SelectionDAG &DAG = DCI.DAG; 12765 12766 EVT VT = N->getValueType(0); 12767 if (CRHS && VT == MVT::i64) { 12768 if (SDValue Split = 12769 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS)) 12770 return Split; 12771 } 12772 12773 // Make sure to apply the 64-bit constant splitting fold before trying to fold 12774 // fneg-like xors into 64-bit select. 12775 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) { 12776 // This looks like an fneg, try to fold as a source modifier. 
12777 if (CRHS && CRHS->getAPIntValue().isSignMask() && 12778 shouldFoldFNegIntoSrc(N, LHS)) { 12779 // xor (select c, a, b), 0x80000000 -> 12780 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b))) 12781 SDLoc DL(N); 12782 SDValue CastLHS = 12783 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1)); 12784 SDValue CastRHS = 12785 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2)); 12786 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS); 12787 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS); 12788 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32, 12789 LHS->getOperand(0), FNegLHS, FNegRHS); 12790 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect); 12791 } 12792 } 12793 12794 return SDValue(); 12795 } 12796 12797 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, 12798 DAGCombinerInfo &DCI) const { 12799 if (!Subtarget->has16BitInsts() || 12800 DCI.getDAGCombineLevel() < AfterLegalizeDAG) 12801 return SDValue(); 12802 12803 EVT VT = N->getValueType(0); 12804 if (VT != MVT::i32) 12805 return SDValue(); 12806 12807 SDValue Src = N->getOperand(0); 12808 if (Src.getValueType() != MVT::i16) 12809 return SDValue(); 12810 12811 return SDValue(); 12812 } 12813 12814 SDValue 12815 SITargetLowering::performSignExtendInRegCombine(SDNode *N, 12816 DAGCombinerInfo &DCI) const { 12817 SDValue Src = N->getOperand(0); 12818 auto *VTSign = cast<VTSDNode>(N->getOperand(1)); 12819 12820 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them 12821 // with s_buffer_load_i8 and s_buffer_load_i16 respectively. 12822 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE && 12823 VTSign->getVT() == MVT::i8) || 12824 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT && 12825 VTSign->getVT() == MVT::i16))) { 12826 assert(Subtarget->hasScalarSubwordLoads() && 12827 "s_buffer_load_{u8, i8} are supported " 12828 "in GFX12 (or newer) architectures."); 12829 EVT VT = Src.getValueType(); 12830 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE) 12831 ? AMDGPUISD::SBUFFER_LOAD_BYTE 12832 : AMDGPUISD::SBUFFER_LOAD_SHORT; 12833 SDLoc DL(N); 12834 SDVTList ResList = DCI.DAG.getVTList(MVT::i32); 12835 SDValue Ops[] = { 12836 Src.getOperand(0), // source register 12837 Src.getOperand(1), // offset 12838 Src.getOperand(2) // cachePolicy 12839 }; 12840 auto *M = cast<MemSDNode>(Src); 12841 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode( 12842 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand()); 12843 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad); 12844 return LoadVal; 12845 } 12846 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE && 12847 VTSign->getVT() == MVT::i8) || 12848 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT && 12849 VTSign->getVT() == MVT::i16)) && 12850 Src.hasOneUse()) { 12851 auto *M = cast<MemSDNode>(Src); 12852 SDValue Ops[] = {Src.getOperand(0), // Chain 12853 Src.getOperand(1), // rsrc 12854 Src.getOperand(2), // vindex 12855 Src.getOperand(3), // voffset 12856 Src.getOperand(4), // soffset 12857 Src.getOperand(5), // offset 12858 Src.getOperand(6), Src.getOperand(7)}; 12859 // replace with BUFFER_LOAD_BYTE/SHORT 12860 SDVTList ResList = 12861 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType()); 12862 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) 12863 ? 
AMDGPUISD::BUFFER_LOAD_BYTE 12864 : AMDGPUISD::BUFFER_LOAD_SHORT; 12865 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode( 12866 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand()); 12867 return DCI.DAG.getMergeValues( 12868 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N)); 12869 } 12870 return SDValue(); 12871 } 12872 12873 SDValue SITargetLowering::performClassCombine(SDNode *N, 12874 DAGCombinerInfo &DCI) const { 12875 SelectionDAG &DAG = DCI.DAG; 12876 SDValue Mask = N->getOperand(1); 12877 12878 // fp_class x, 0 -> false 12879 if (isNullConstant(Mask)) 12880 return DAG.getConstant(0, SDLoc(N), MVT::i1); 12881 12882 if (N->getOperand(0).isUndef()) 12883 return DAG.getUNDEF(MVT::i1); 12884 12885 return SDValue(); 12886 } 12887 12888 SDValue SITargetLowering::performRcpCombine(SDNode *N, 12889 DAGCombinerInfo &DCI) const { 12890 EVT VT = N->getValueType(0); 12891 SDValue N0 = N->getOperand(0); 12892 12893 if (N0.isUndef()) { 12894 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()), 12895 SDLoc(N), VT); 12896 } 12897 12898 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP || 12899 N0.getOpcode() == ISD::SINT_TO_FP)) { 12900 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0, 12901 N->getFlags()); 12902 } 12903 12904 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here. 12905 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) && 12906 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) { 12907 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0), 12908 N->getFlags()); 12909 } 12910 12911 return AMDGPUTargetLowering::performRcpCombine(N, DCI); 12912 } 12913 12914 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, 12915 unsigned MaxDepth) const { 12916 unsigned Opcode = Op.getOpcode(); 12917 if (Opcode == ISD::FCANONICALIZE) 12918 return true; 12919 12920 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) { 12921 const auto &F = CFP->getValueAPF(); 12922 if (F.isNaN() && F.isSignaling()) 12923 return false; 12924 if (!F.isDenormal()) 12925 return true; 12926 12927 DenormalMode Mode = 12928 DAG.getMachineFunction().getDenormalMode(F.getSemantics()); 12929 return Mode == DenormalMode::getIEEE(); 12930 } 12931 12932 // If source is a result of another standard FP operation it is already in 12933 // canonical form. 12934 if (MaxDepth == 0) 12935 return false; 12936 12937 switch (Opcode) { 12938 // These will flush denorms if required. 
12939 case ISD::FADD: 12940 case ISD::FSUB: 12941 case ISD::FMUL: 12942 case ISD::FCEIL: 12943 case ISD::FFLOOR: 12944 case ISD::FMA: 12945 case ISD::FMAD: 12946 case ISD::FSQRT: 12947 case ISD::FDIV: 12948 case ISD::FREM: 12949 case ISD::FP_ROUND: 12950 case ISD::FP_EXTEND: 12951 case ISD::FP16_TO_FP: 12952 case ISD::FP_TO_FP16: 12953 case ISD::BF16_TO_FP: 12954 case ISD::FP_TO_BF16: 12955 case ISD::FLDEXP: 12956 case AMDGPUISD::FMUL_LEGACY: 12957 case AMDGPUISD::FMAD_FTZ: 12958 case AMDGPUISD::RCP: 12959 case AMDGPUISD::RSQ: 12960 case AMDGPUISD::RSQ_CLAMP: 12961 case AMDGPUISD::RCP_LEGACY: 12962 case AMDGPUISD::RCP_IFLAG: 12963 case AMDGPUISD::LOG: 12964 case AMDGPUISD::EXP: 12965 case AMDGPUISD::DIV_SCALE: 12966 case AMDGPUISD::DIV_FMAS: 12967 case AMDGPUISD::DIV_FIXUP: 12968 case AMDGPUISD::FRACT: 12969 case AMDGPUISD::CVT_PKRTZ_F16_F32: 12970 case AMDGPUISD::CVT_F32_UBYTE0: 12971 case AMDGPUISD::CVT_F32_UBYTE1: 12972 case AMDGPUISD::CVT_F32_UBYTE2: 12973 case AMDGPUISD::CVT_F32_UBYTE3: 12974 case AMDGPUISD::FP_TO_FP16: 12975 case AMDGPUISD::SIN_HW: 12976 case AMDGPUISD::COS_HW: 12977 return true; 12978 12979 // It can/will be lowered or combined as a bit operation. 12980 // Need to check their input recursively to handle. 12981 case ISD::FNEG: 12982 case ISD::FABS: 12983 case ISD::FCOPYSIGN: 12984 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); 12985 12986 case ISD::AND: 12987 if (Op.getValueType() == MVT::i32) { 12988 // Be careful as we only know it is a bitcast floating point type. It 12989 // could be f32, v2f16, we have no way of knowing. Luckily the constant 12990 // value that we optimize for, which comes up in fp32 to bf16 conversions, 12991 // is valid to optimize for all types. 12992 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 12993 if (RHS->getZExtValue() == 0xffff0000) { 12994 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); 12995 } 12996 } 12997 } 12998 break; 12999 13000 case ISD::FSIN: 13001 case ISD::FCOS: 13002 case ISD::FSINCOS: 13003 return Op.getValueType().getScalarType() != MVT::f16; 13004 13005 case ISD::FMINNUM: 13006 case ISD::FMAXNUM: 13007 case ISD::FMINNUM_IEEE: 13008 case ISD::FMAXNUM_IEEE: 13009 case ISD::FMINIMUM: 13010 case ISD::FMAXIMUM: 13011 case AMDGPUISD::CLAMP: 13012 case AMDGPUISD::FMED3: 13013 case AMDGPUISD::FMAX3: 13014 case AMDGPUISD::FMIN3: 13015 case AMDGPUISD::FMAXIMUM3: 13016 case AMDGPUISD::FMINIMUM3: { 13017 // FIXME: Shouldn't treat the generic operations different based these. 13018 // However, we aren't really required to flush the result from 13019 // minnum/maxnum.. 13020 13021 // snans will be quieted, so we only need to worry about denormals. 13022 if (Subtarget->supportsMinMaxDenormModes() || 13023 // FIXME: denormalsEnabledForType is broken for dynamic 13024 denormalsEnabledForType(DAG, Op.getValueType())) 13025 return true; 13026 13027 // Flushing may be required. 13028 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such 13029 // targets need to check their input recursively. 13030 13031 // FIXME: Does this apply with clamp? It's implemented with max. 
13032 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) { 13033 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1)) 13034 return false; 13035 } 13036 13037 return true; 13038 } 13039 case ISD::SELECT: { 13040 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) && 13041 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1); 13042 } 13043 case ISD::BUILD_VECTOR: { 13044 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { 13045 SDValue SrcOp = Op.getOperand(i); 13046 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1)) 13047 return false; 13048 } 13049 13050 return true; 13051 } 13052 case ISD::EXTRACT_VECTOR_ELT: 13053 case ISD::EXTRACT_SUBVECTOR: { 13054 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); 13055 } 13056 case ISD::INSERT_VECTOR_ELT: { 13057 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) && 13058 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1); 13059 } 13060 case ISD::UNDEF: 13061 // Could be anything. 13062 return false; 13063 13064 case ISD::BITCAST: 13065 // TODO: This is incorrect as it loses track of the operand's type. We may 13066 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the 13067 // same bits that are canonicalized in one type need not be in the other. 13068 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); 13069 case ISD::TRUNCATE: { 13070 // Hack round the mess we make when legalizing extract_vector_elt 13071 if (Op.getValueType() == MVT::i16) { 13072 SDValue TruncSrc = Op.getOperand(0); 13073 if (TruncSrc.getValueType() == MVT::i32 && 13074 TruncSrc.getOpcode() == ISD::BITCAST && 13075 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) { 13076 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1); 13077 } 13078 } 13079 return false; 13080 } 13081 case ISD::INTRINSIC_WO_CHAIN: { 13082 unsigned IntrinsicID = Op.getConstantOperandVal(0); 13083 // TODO: Handle more intrinsics 13084 switch (IntrinsicID) { 13085 case Intrinsic::amdgcn_cvt_pkrtz: 13086 case Intrinsic::amdgcn_cubeid: 13087 case Intrinsic::amdgcn_frexp_mant: 13088 case Intrinsic::amdgcn_fdot2: 13089 case Intrinsic::amdgcn_rcp: 13090 case Intrinsic::amdgcn_rsq: 13091 case Intrinsic::amdgcn_rsq_clamp: 13092 case Intrinsic::amdgcn_rcp_legacy: 13093 case Intrinsic::amdgcn_rsq_legacy: 13094 case Intrinsic::amdgcn_trig_preop: 13095 case Intrinsic::amdgcn_log: 13096 case Intrinsic::amdgcn_exp2: 13097 case Intrinsic::amdgcn_sqrt: 13098 return true; 13099 default: 13100 break; 13101 } 13102 13103 break; 13104 } 13105 default: 13106 break; 13107 } 13108 13109 // FIXME: denormalsEnabledForType is broken for dynamic 13110 return denormalsEnabledForType(DAG, Op.getValueType()) && 13111 DAG.isKnownNeverSNaN(Op); 13112 } 13113 13114 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF, 13115 unsigned MaxDepth) const { 13116 const MachineRegisterInfo &MRI = MF.getRegInfo(); 13117 MachineInstr *MI = MRI.getVRegDef(Reg); 13118 unsigned Opcode = MI->getOpcode(); 13119 13120 if (Opcode == AMDGPU::G_FCANONICALIZE) 13121 return true; 13122 13123 std::optional<FPValueAndVReg> FCR; 13124 // Constant splat (can be padded with undef) or scalar constant. 
13125 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) { 13126 if (FCR->Value.isSignaling()) 13127 return false; 13128 if (!FCR->Value.isDenormal()) 13129 return true; 13130 13131 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics()); 13132 return Mode == DenormalMode::getIEEE(); 13133 } 13134 13135 if (MaxDepth == 0) 13136 return false; 13137 13138 switch (Opcode) { 13139 case AMDGPU::G_FADD: 13140 case AMDGPU::G_FSUB: 13141 case AMDGPU::G_FMUL: 13142 case AMDGPU::G_FCEIL: 13143 case AMDGPU::G_FFLOOR: 13144 case AMDGPU::G_FRINT: 13145 case AMDGPU::G_FNEARBYINT: 13146 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND: 13147 case AMDGPU::G_INTRINSIC_TRUNC: 13148 case AMDGPU::G_INTRINSIC_ROUNDEVEN: 13149 case AMDGPU::G_FMA: 13150 case AMDGPU::G_FMAD: 13151 case AMDGPU::G_FSQRT: 13152 case AMDGPU::G_FDIV: 13153 case AMDGPU::G_FREM: 13154 case AMDGPU::G_FPOW: 13155 case AMDGPU::G_FPEXT: 13156 case AMDGPU::G_FLOG: 13157 case AMDGPU::G_FLOG2: 13158 case AMDGPU::G_FLOG10: 13159 case AMDGPU::G_FPTRUNC: 13160 case AMDGPU::G_AMDGPU_RCP_IFLAG: 13161 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: 13162 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: 13163 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: 13164 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: 13165 return true; 13166 case AMDGPU::G_FNEG: 13167 case AMDGPU::G_FABS: 13168 case AMDGPU::G_FCOPYSIGN: 13169 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1); 13170 case AMDGPU::G_FMINNUM: 13171 case AMDGPU::G_FMAXNUM: 13172 case AMDGPU::G_FMINNUM_IEEE: 13173 case AMDGPU::G_FMAXNUM_IEEE: 13174 case AMDGPU::G_FMINIMUM: 13175 case AMDGPU::G_FMAXIMUM: { 13176 if (Subtarget->supportsMinMaxDenormModes() || 13177 // FIXME: denormalsEnabledForType is broken for dynamic 13178 denormalsEnabledForType(MRI.getType(Reg), MF)) 13179 return true; 13180 13181 [[fallthrough]]; 13182 } 13183 case AMDGPU::G_BUILD_VECTOR: 13184 for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) 13185 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1)) 13186 return false; 13187 return true; 13188 case AMDGPU::G_INTRINSIC: 13189 case AMDGPU::G_INTRINSIC_CONVERGENT: 13190 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) { 13191 case Intrinsic::amdgcn_fmul_legacy: 13192 case Intrinsic::amdgcn_fmad_ftz: 13193 case Intrinsic::amdgcn_sqrt: 13194 case Intrinsic::amdgcn_fmed3: 13195 case Intrinsic::amdgcn_sin: 13196 case Intrinsic::amdgcn_cos: 13197 case Intrinsic::amdgcn_log: 13198 case Intrinsic::amdgcn_exp2: 13199 case Intrinsic::amdgcn_log_clamp: 13200 case Intrinsic::amdgcn_rcp: 13201 case Intrinsic::amdgcn_rcp_legacy: 13202 case Intrinsic::amdgcn_rsq: 13203 case Intrinsic::amdgcn_rsq_clamp: 13204 case Intrinsic::amdgcn_rsq_legacy: 13205 case Intrinsic::amdgcn_div_scale: 13206 case Intrinsic::amdgcn_div_fmas: 13207 case Intrinsic::amdgcn_div_fixup: 13208 case Intrinsic::amdgcn_fract: 13209 case Intrinsic::amdgcn_cvt_pkrtz: 13210 case Intrinsic::amdgcn_cubeid: 13211 case Intrinsic::amdgcn_cubema: 13212 case Intrinsic::amdgcn_cubesc: 13213 case Intrinsic::amdgcn_cubetc: 13214 case Intrinsic::amdgcn_frexp_mant: 13215 case Intrinsic::amdgcn_fdot2: 13216 case Intrinsic::amdgcn_trig_preop: 13217 return true; 13218 default: 13219 break; 13220 } 13221 13222 [[fallthrough]]; 13223 default: 13224 return false; 13225 } 13226 13227 llvm_unreachable("invalid operation"); 13228 } 13229 13230 // Constant fold canonicalize. 
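// For example, an f32 signaling NaN constant is replaced with the canonical
// quiet NaN (0x7fc00000), and a denormal constant becomes +/-0.0 when the
// denormal mode for its type is preserve-sign.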
13231 SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG, 13232 const SDLoc &SL, EVT VT, 13233 const APFloat &C) const { 13234 // Flush denormals to 0 if not enabled. 13235 if (C.isDenormal()) { 13236 DenormalMode Mode = 13237 DAG.getMachineFunction().getDenormalMode(C.getSemantics()); 13238 if (Mode == DenormalMode::getPreserveSign()) { 13239 return DAG.getConstantFP( 13240 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT); 13241 } 13242 13243 if (Mode != DenormalMode::getIEEE()) 13244 return SDValue(); 13245 } 13246 13247 if (C.isNaN()) { 13248 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics()); 13249 if (C.isSignaling()) { 13250 // Quiet a signaling NaN. 13251 // FIXME: Is this supposed to preserve payload bits? 13252 return DAG.getConstantFP(CanonicalQNaN, SL, VT); 13253 } 13254 13255 // Make sure it is the canonical NaN bitpattern. 13256 // 13257 // TODO: Can we use -1 as the canonical NaN value since it's an inline 13258 // immediate? 13259 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt()) 13260 return DAG.getConstantFP(CanonicalQNaN, SL, VT); 13261 } 13262 13263 // Already canonical. 13264 return DAG.getConstantFP(C, SL, VT); 13265 } 13266 13267 static bool vectorEltWillFoldAway(SDValue Op) { 13268 return Op.isUndef() || isa<ConstantFPSDNode>(Op); 13269 } 13270 13271 SDValue 13272 SITargetLowering::performFCanonicalizeCombine(SDNode *N, 13273 DAGCombinerInfo &DCI) const { 13274 SelectionDAG &DAG = DCI.DAG; 13275 SDValue N0 = N->getOperand(0); 13276 EVT VT = N->getValueType(0); 13277 13278 // fcanonicalize undef -> qnan 13279 if (N0.isUndef()) { 13280 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics()); 13281 return DAG.getConstantFP(QNaN, SDLoc(N), VT); 13282 } 13283 13284 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) { 13285 EVT VT = N->getValueType(0); 13286 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF()); 13287 } 13288 13289 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x), 13290 // (fcanonicalize k) 13291 // 13292 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0 13293 13294 // TODO: This could be better with wider vectors that will be split to v2f16, 13295 // and to consider uses since there aren't that many packed operations. 13296 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 && 13297 isTypeLegal(MVT::v2f16)) { 13298 SDLoc SL(N); 13299 SDValue NewElts[2]; 13300 SDValue Lo = N0.getOperand(0); 13301 SDValue Hi = N0.getOperand(1); 13302 EVT EltVT = Lo.getValueType(); 13303 13304 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) { 13305 for (unsigned I = 0; I != 2; ++I) { 13306 SDValue Op = N0.getOperand(I); 13307 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) { 13308 NewElts[I] = 13309 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF()); 13310 } else if (Op.isUndef()) { 13311 // Handled below based on what the other operand is. 13312 NewElts[I] = Op; 13313 } else { 13314 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op); 13315 } 13316 } 13317 13318 // If one half is undef, and one is constant, prefer a splat vector rather 13319 // than the normal qNaN. If it's a register, prefer 0.0 since that's 13320 // cheaper to use and may be free with a packed operation. 13321 if (NewElts[0].isUndef()) { 13322 if (isa<ConstantFPSDNode>(NewElts[1])) 13323 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) 13324 ? 
NewElts[1] 13325 : DAG.getConstantFP(0.0f, SL, EltVT); 13326 } 13327 13328 if (NewElts[1].isUndef()) { 13329 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) 13330 ? NewElts[0] 13331 : DAG.getConstantFP(0.0f, SL, EltVT); 13332 } 13333 13334 return DAG.getBuildVector(VT, SL, NewElts); 13335 } 13336 } 13337 13338 return SDValue(); 13339 } 13340 13341 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { 13342 switch (Opc) { 13343 case ISD::FMAXNUM: 13344 case ISD::FMAXNUM_IEEE: 13345 return AMDGPUISD::FMAX3; 13346 case ISD::FMAXIMUM: 13347 return AMDGPUISD::FMAXIMUM3; 13348 case ISD::SMAX: 13349 return AMDGPUISD::SMAX3; 13350 case ISD::UMAX: 13351 return AMDGPUISD::UMAX3; 13352 case ISD::FMINNUM: 13353 case ISD::FMINNUM_IEEE: 13354 return AMDGPUISD::FMIN3; 13355 case ISD::FMINIMUM: 13356 return AMDGPUISD::FMINIMUM3; 13357 case ISD::SMIN: 13358 return AMDGPUISD::SMIN3; 13359 case ISD::UMIN: 13360 return AMDGPUISD::UMIN3; 13361 default: 13362 llvm_unreachable("Not a min/max opcode"); 13363 } 13364 } 13365 13366 SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG, 13367 const SDLoc &SL, SDValue Src, 13368 SDValue MinVal, 13369 SDValue MaxVal, 13370 bool Signed) const { 13371 13372 // med3 comes from 13373 // min(max(x, K0), K1), K0 < K1 13374 // max(min(x, K0), K1), K1 < K0 13375 // 13376 // "MinVal" and "MaxVal" respectively refer to the rhs of the 13377 // min/max op. 13378 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal); 13379 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal); 13380 13381 if (!MinK || !MaxK) 13382 return SDValue(); 13383 13384 if (Signed) { 13385 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue())) 13386 return SDValue(); 13387 } else { 13388 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue())) 13389 return SDValue(); 13390 } 13391 13392 EVT VT = MinK->getValueType(0); 13393 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3; 13394 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) 13395 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal); 13396 13397 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is 13398 // not available, but this is unlikely to be profitable as constants 13399 // will often need to be materialized & extended, especially on 13400 // pre-GFX10 where VOP3 instructions couldn't take literal operands. 13401 return SDValue(); 13402 } 13403 13404 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) { 13405 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) 13406 return C; 13407 13408 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) { 13409 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode()) 13410 return C; 13411 } 13412 13413 return nullptr; 13414 } 13415 13416 SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, 13417 const SDLoc &SL, SDValue Op0, 13418 SDValue Op1) const { 13419 ConstantFPSDNode *K1 = getSplatConstantFP(Op1); 13420 if (!K1) 13421 return SDValue(); 13422 13423 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1)); 13424 if (!K0) 13425 return SDValue(); 13426 13427 // Ordered >= (although NaN inputs should have folded away by now). 13428 if (K0->getValueAPF() > K1->getValueAPF()) 13429 return SDValue(); 13430 13431 const MachineFunction &MF = DAG.getMachineFunction(); 13432 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 13433 13434 // TODO: Check IEEE bit enabled? 
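  // Examples (illustrative): fminnum(fmaxnum(x, 0.0), 1.0) -> clamp(x) under
  // dx10_clamp; fminnum(fmaxnum(x, 2.0), 4.0) -> fmed3(x, 2.0, 4.0) for f32
  // when x is known not to be a signaling NaN.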
13435 EVT VT = Op0.getValueType(); 13436 if (Info->getMode().DX10Clamp) { 13437 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the 13438 // hardware fmed3 behavior converting to a min. 13439 // FIXME: Should this be allowing -0.0? 13440 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0)) 13441 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0)); 13442 } 13443 13444 // med3 for f16 is only available on gfx9+, and not available for v2f16. 13445 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) { 13446 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a 13447 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would 13448 // then give the other result, which is different from med3 with a NaN 13449 // input. 13450 SDValue Var = Op0.getOperand(0); 13451 if (!DAG.isKnownNeverSNaN(Var)) 13452 return SDValue(); 13453 13454 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 13455 13456 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) && 13457 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) { 13458 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var, 13459 SDValue(K0, 0), SDValue(K1, 0)); 13460 } 13461 } 13462 13463 return SDValue(); 13464 } 13465 13466 /// \return true if the subtarget supports minimum3 and maximum3 with the given 13467 /// base min/max opcode \p Opc for type \p VT. 13468 static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, 13469 EVT VT) { 13470 switch (Opc) { 13471 case ISD::FMINNUM: 13472 case ISD::FMAXNUM: 13473 case ISD::FMINNUM_IEEE: 13474 case ISD::FMAXNUM_IEEE: 13475 case AMDGPUISD::FMIN_LEGACY: 13476 case AMDGPUISD::FMAX_LEGACY: 13477 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()); 13478 case ISD::FMINIMUM: 13479 case ISD::FMAXIMUM: 13480 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) || 13481 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()); 13482 case ISD::SMAX: 13483 case ISD::SMIN: 13484 case ISD::UMAX: 13485 case ISD::UMIN: 13486 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16()); 13487 default: 13488 return false; 13489 } 13490 13491 llvm_unreachable("not a min/max opcode"); 13492 } 13493 13494 SDValue SITargetLowering::performMinMaxCombine(SDNode *N, 13495 DAGCombinerInfo &DCI) const { 13496 SelectionDAG &DAG = DCI.DAG; 13497 13498 EVT VT = N->getValueType(0); 13499 unsigned Opc = N->getOpcode(); 13500 SDValue Op0 = N->getOperand(0); 13501 SDValue Op1 = N->getOperand(1); 13502 13503 // Only do this if the inner op has one use since this will just increases 13504 // register pressure for no benefit. 13505 13506 if (supportsMin3Max3(*Subtarget, Opc, VT)) { 13507 // max(max(a, b), c) -> max3(a, b, c) 13508 // min(min(a, b), c) -> min3(a, b, c) 13509 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { 13510 SDLoc DL(N); 13511 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0), 13512 Op0.getOperand(0), Op0.getOperand(1), Op1); 13513 } 13514 13515 // Try commuted. 
13516 // max(a, max(b, c)) -> max3(a, b, c) 13517 // min(a, min(b, c)) -> min3(a, b, c) 13518 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { 13519 SDLoc DL(N); 13520 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0), 13521 Op0, Op1.getOperand(0), Op1.getOperand(1)); 13522 } 13523 } 13524 13525 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) 13526 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0) 13527 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { 13528 if (SDValue Med3 = performIntMed3ImmCombine( 13529 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true)) 13530 return Med3; 13531 } 13532 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) { 13533 if (SDValue Med3 = performIntMed3ImmCombine( 13534 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true)) 13535 return Med3; 13536 } 13537 13538 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { 13539 if (SDValue Med3 = performIntMed3ImmCombine( 13540 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false)) 13541 return Med3; 13542 } 13543 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) { 13544 if (SDValue Med3 = performIntMed3ImmCombine( 13545 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false)) 13546 return Med3; 13547 } 13548 13549 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) 13550 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || 13551 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) || 13552 (Opc == AMDGPUISD::FMIN_LEGACY && 13553 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && 13554 (VT == MVT::f32 || VT == MVT::f64 || 13555 (VT == MVT::f16 && Subtarget->has16BitInsts()) || 13556 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) && 13557 Op0.hasOneUse()) { 13558 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) 13559 return Res; 13560 } 13561 13562 return SDValue(); 13563 } 13564 13565 static bool isClampZeroToOne(SDValue A, SDValue B) { 13566 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) { 13567 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) { 13568 // FIXME: Should this be allowing -0.0? 13569 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) || 13570 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0)); 13571 } 13572 } 13573 13574 return false; 13575 } 13576 13577 // FIXME: Should only worry about snans for version with chain. 13578 SDValue SITargetLowering::performFMed3Combine(SDNode *N, 13579 DAGCombinerInfo &DCI) const { 13580 EVT VT = N->getValueType(0); 13581 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and 13582 // NaNs. With a NaN input, the order of the operands may change the result. 13583 13584 SelectionDAG &DAG = DCI.DAG; 13585 SDLoc SL(N); 13586 13587 SDValue Src0 = N->getOperand(0); 13588 SDValue Src1 = N->getOperand(1); 13589 SDValue Src2 = N->getOperand(2); 13590 13591 if (isClampZeroToOne(Src0, Src1)) { 13592 // const_a, const_b, x -> clamp is safe in all cases including signaling 13593 // nans. 13594 // FIXME: Should this be allowing -0.0? 13595 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2); 13596 } 13597 13598 const MachineFunction &MF = DAG.getMachineFunction(); 13599 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 13600 13601 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother 13602 // handling no dx10-clamp? 
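  // For example, with dx10_clamp enabled, fmed3(1.0, x, 0.0) is reordered so
  // the constants come last and then folds to clamp(x).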
13603 if (Info->getMode().DX10Clamp) { 13604 // If NaNs is clamped to 0, we are free to reorder the inputs. 13605 13606 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) 13607 std::swap(Src0, Src1); 13608 13609 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2)) 13610 std::swap(Src1, Src2); 13611 13612 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) 13613 std::swap(Src0, Src1); 13614 13615 if (isClampZeroToOne(Src1, Src2)) 13616 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0); 13617 } 13618 13619 return SDValue(); 13620 } 13621 13622 SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, 13623 DAGCombinerInfo &DCI) const { 13624 SDValue Src0 = N->getOperand(0); 13625 SDValue Src1 = N->getOperand(1); 13626 if (Src0.isUndef() && Src1.isUndef()) 13627 return DCI.DAG.getUNDEF(N->getValueType(0)); 13628 return SDValue(); 13629 } 13630 13631 // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be 13632 // expanded into a set of cmp/select instructions. 13633 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize, 13634 unsigned NumElem, 13635 bool IsDivergentIdx, 13636 const GCNSubtarget *Subtarget) { 13637 if (UseDivergentRegisterIndexing) 13638 return false; 13639 13640 unsigned VecSize = EltSize * NumElem; 13641 13642 // Sub-dword vectors of size 2 dword or less have better implementation. 13643 if (VecSize <= 64 && EltSize < 32) 13644 return false; 13645 13646 // Always expand the rest of sub-dword instructions, otherwise it will be 13647 // lowered via memory. 13648 if (EltSize < 32) 13649 return true; 13650 13651 // Always do this if var-idx is divergent, otherwise it will become a loop. 13652 if (IsDivergentIdx) 13653 return true; 13654 13655 // Large vectors would yield too many compares and v_cndmask_b32 instructions. 13656 unsigned NumInsts = NumElem /* Number of compares */ + 13657 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */; 13658 13659 // On some architectures (GFX9) movrel is not available and it's better 13660 // to expand. 13661 if (Subtarget->useVGPRIndexMode()) 13662 return NumInsts <= 16; 13663 13664 // If movrel is available, use it instead of expanding for vector of 8 13665 // elements. 
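  // For example, an 8 x i32 vector costs 8 compares + 8 v_cndmask_b32, i.e.
  // NumInsts == 16, which is just above the movrel threshold below.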
13666 if (Subtarget->hasMovrel()) 13667 return NumInsts <= 15; 13668 13669 return true; 13670 } 13671 13672 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const { 13673 SDValue Idx = N->getOperand(N->getNumOperands() - 1); 13674 if (isa<ConstantSDNode>(Idx)) 13675 return false; 13676 13677 SDValue Vec = N->getOperand(0); 13678 EVT VecVT = Vec.getValueType(); 13679 EVT EltVT = VecVT.getVectorElementType(); 13680 unsigned EltSize = EltVT.getSizeInBits(); 13681 unsigned NumElem = VecVT.getVectorNumElements(); 13682 13683 return SITargetLowering::shouldExpandVectorDynExt( 13684 EltSize, NumElem, Idx->isDivergent(), getSubtarget()); 13685 } 13686 13687 SDValue 13688 SITargetLowering::performExtractVectorEltCombine(SDNode *N, 13689 DAGCombinerInfo &DCI) const { 13690 SDValue Vec = N->getOperand(0); 13691 SelectionDAG &DAG = DCI.DAG; 13692 13693 EVT VecVT = Vec.getValueType(); 13694 EVT VecEltVT = VecVT.getVectorElementType(); 13695 EVT ResVT = N->getValueType(0); 13696 13697 unsigned VecSize = VecVT.getSizeInBits(); 13698 unsigned VecEltSize = VecEltVT.getSizeInBits(); 13699 13700 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) && 13701 allUsesHaveSourceMods(N)) { 13702 SDLoc SL(N); 13703 SDValue Idx = N->getOperand(1); 13704 SDValue Elt = 13705 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx); 13706 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt); 13707 } 13708 13709 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx) 13710 // => 13711 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx) 13712 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx) 13713 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt 13714 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) { 13715 SDLoc SL(N); 13716 SDValue Idx = N->getOperand(1); 13717 unsigned Opc = Vec.getOpcode(); 13718 13719 switch (Opc) { 13720 default: 13721 break; 13722 // TODO: Support other binary operations. 13723 case ISD::FADD: 13724 case ISD::FSUB: 13725 case ISD::FMUL: 13726 case ISD::ADD: 13727 case ISD::UMIN: 13728 case ISD::UMAX: 13729 case ISD::SMIN: 13730 case ISD::SMAX: 13731 case ISD::FMAXNUM: 13732 case ISD::FMINNUM: 13733 case ISD::FMAXNUM_IEEE: 13734 case ISD::FMINNUM_IEEE: 13735 case ISD::FMAXIMUM: 13736 case ISD::FMINIMUM: { 13737 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, 13738 Vec.getOperand(0), Idx); 13739 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, 13740 Vec.getOperand(1), Idx); 13741 13742 DCI.AddToWorklist(Elt0.getNode()); 13743 DCI.AddToWorklist(Elt1.getNode()); 13744 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags()); 13745 } 13746 } 13747 } 13748 13749 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx) 13750 if (shouldExpandVectorDynExt(N)) { 13751 SDLoc SL(N); 13752 SDValue Idx = N->getOperand(1); 13753 SDValue V; 13754 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { 13755 SDValue IC = DAG.getVectorIdxConstant(I, SL); 13756 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC); 13757 if (I == 0) 13758 V = Elt; 13759 else 13760 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ); 13761 } 13762 return V; 13763 } 13764 13765 if (!DCI.isBeforeLegalize()) 13766 return SDValue(); 13767 13768 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit 13769 // elements. This exposes more load reduction opportunities by replacing 13770 // multiple small extract_vector_elements with a single 32-bit extract. 
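  // For example, extracting element 5 of a loaded v8i16 becomes: bitcast the
  // vector to 32-bit elements, extract 32-bit element 2 (bit index 80 / 32),
  // shift right by the leftover 16 bits, and truncate to i16.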
13771 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13772 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() && 13773 VecSize > 32 && VecSize % 32 == 0 && Idx) { 13774 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT); 13775 13776 unsigned BitIndex = Idx->getZExtValue() * VecEltSize; 13777 unsigned EltIdx = BitIndex / 32; 13778 unsigned LeftoverBitIdx = BitIndex % 32; 13779 SDLoc SL(N); 13780 13781 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec); 13782 DCI.AddToWorklist(Cast.getNode()); 13783 13784 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast, 13785 DAG.getConstant(EltIdx, SL, MVT::i32)); 13786 DCI.AddToWorklist(Elt.getNode()); 13787 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt, 13788 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32)); 13789 DCI.AddToWorklist(Srl.getNode()); 13790 13791 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger(); 13792 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl); 13793 DCI.AddToWorklist(Trunc.getNode()); 13794 13795 if (VecEltVT == ResVT) { 13796 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc); 13797 } 13798 13799 assert(ResVT.isScalarInteger()); 13800 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT); 13801 } 13802 13803 return SDValue(); 13804 } 13805 13806 SDValue 13807 SITargetLowering::performInsertVectorEltCombine(SDNode *N, 13808 DAGCombinerInfo &DCI) const { 13809 SDValue Vec = N->getOperand(0); 13810 SDValue Idx = N->getOperand(2); 13811 EVT VecVT = Vec.getValueType(); 13812 EVT EltVT = VecVT.getVectorElementType(); 13813 13814 // INSERT_VECTOR_ELT (<n x e>, var-idx) 13815 // => BUILD_VECTOR n x select (e, const-idx) 13816 if (!shouldExpandVectorDynExt(N)) 13817 return SDValue(); 13818 13819 SelectionDAG &DAG = DCI.DAG; 13820 SDLoc SL(N); 13821 SDValue Ins = N->getOperand(1); 13822 EVT IdxVT = Idx.getValueType(); 13823 13824 SmallVector<SDValue, 16> Ops; 13825 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { 13826 SDValue IC = DAG.getConstant(I, SL, IdxVT); 13827 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC); 13828 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ); 13829 Ops.push_back(V); 13830 } 13831 13832 return DAG.getBuildVector(VecVT, SL, Ops); 13833 } 13834 13835 /// Return the source of an fp_extend from f16 to f32, or a converted FP 13836 /// constant. 
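/// Returns an empty SDValue otherwise, e.g. for an f32 constant that is not
/// exactly representable in f16.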
13837 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) { 13838 if (Src.getOpcode() == ISD::FP_EXTEND && 13839 Src.getOperand(0).getValueType() == MVT::f16) { 13840 return Src.getOperand(0); 13841 } 13842 13843 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) { 13844 APFloat Val = CFP->getValueAPF(); 13845 bool LosesInfo = true; 13846 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo); 13847 if (!LosesInfo) 13848 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16); 13849 } 13850 13851 return SDValue(); 13852 } 13853 13854 SDValue SITargetLowering::performFPRoundCombine(SDNode *N, 13855 DAGCombinerInfo &DCI) const { 13856 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() && 13857 "combine only useful on gfx8"); 13858 13859 SDValue TruncSrc = N->getOperand(0); 13860 EVT VT = N->getValueType(0); 13861 if (VT != MVT::f16) 13862 return SDValue(); 13863 13864 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 || 13865 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse()) 13866 return SDValue(); 13867 13868 SelectionDAG &DAG = DCI.DAG; 13869 SDLoc SL(N); 13870 13871 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3, 13872 // and expanding it with min/max saves 1 instruction vs. casting to f32 and 13873 // casting back. 13874 13875 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) => 13876 // fmin(fmax(a, b), fmax(fmin(a, b), c)) 13877 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0)); 13878 if (!A) 13879 return SDValue(); 13880 13881 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1)); 13882 if (!B) 13883 return SDValue(); 13884 13885 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2)); 13886 if (!C) 13887 return SDValue(); 13888 13889 // This changes signaling nan behavior. If an input is a signaling nan, it 13890 // would have been quieted by the fpext originally. We don't care because 13891 // these are unconstrained ops. If we needed to insert quieting canonicalizes 13892 // we would be worse off than just doing the promotion. 13893 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B); 13894 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B); 13895 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C); 13896 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1); 13897 } 13898 13899 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, 13900 const SDNode *N0, 13901 const SDNode *N1) const { 13902 EVT VT = N0->getValueType(0); 13903 13904 // Only do this if we are not trying to support denormals. v_mad_f32 does not 13905 // support denormals ever. 
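  // FMAD is only returned when the relevant denormals are flushed and FMAD is
  // legal for this type; otherwise FMA is returned when contraction is allowed
  // and FMA is fast here, and 0 means no fused opcode should be used.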
13906 if (((VT == MVT::f32 && 13907 denormalModeIsFlushAllF32(DAG.getMachineFunction())) || 13908 (VT == MVT::f16 && Subtarget->hasMadF16() && 13909 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) && 13910 isOperationLegal(ISD::FMAD, VT)) 13911 return ISD::FMAD; 13912 13913 const TargetOptions &Options = DAG.getTarget().Options; 13914 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || 13915 (N0->getFlags().hasAllowContract() && 13916 N1->getFlags().hasAllowContract())) && 13917 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) { 13918 return ISD::FMA; 13919 } 13920 13921 return 0; 13922 } 13923 13924 // For a reassociatable opcode perform: 13925 // op x, (op y, z) -> op (op x, z), y, if x and z are uniform 13926 SDValue SITargetLowering::reassociateScalarOps(SDNode *N, 13927 SelectionDAG &DAG) const { 13928 EVT VT = N->getValueType(0); 13929 if (VT != MVT::i32 && VT != MVT::i64) 13930 return SDValue(); 13931 13932 if (DAG.isBaseWithConstantOffset(SDValue(N, 0))) 13933 return SDValue(); 13934 13935 unsigned Opc = N->getOpcode(); 13936 SDValue Op0 = N->getOperand(0); 13937 SDValue Op1 = N->getOperand(1); 13938 13939 if (!(Op0->isDivergent() ^ Op1->isDivergent())) 13940 return SDValue(); 13941 13942 if (Op0->isDivergent()) 13943 std::swap(Op0, Op1); 13944 13945 if (Op1.getOpcode() != Opc || !Op1.hasOneUse()) 13946 return SDValue(); 13947 13948 SDValue Op2 = Op1.getOperand(1); 13949 Op1 = Op1.getOperand(0); 13950 if (!(Op1->isDivergent() ^ Op2->isDivergent())) 13951 return SDValue(); 13952 13953 if (Op1->isDivergent()) 13954 std::swap(Op1, Op2); 13955 13956 SDLoc SL(N); 13957 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1); 13958 return DAG.getNode(Opc, SL, VT, Add1, Op2); 13959 } 13960 13961 static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, 13962 SDValue N0, SDValue N1, SDValue N2, bool Signed) { 13963 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32; 13964 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1); 13965 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2); 13966 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad); 13967 } 13968 13969 // Fold 13970 // y = lshr i64 x, 32 13971 // res = add (mul i64 y, Const), x where "Const" is a 64-bit constant 13972 // with Const.hi == -1 13973 // To 13974 // res = mad_u64_u32 y.lo ,Const.lo, x.lo 13975 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, 13976 SDValue MulLHS, SDValue MulRHS, 13977 SDValue AddRHS) { 13978 if (MulRHS.getOpcode() == ISD::SRL) 13979 std::swap(MulLHS, MulRHS); 13980 13981 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL) 13982 return SDValue(); 13983 13984 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1)); 13985 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 || 13986 MulLHS.getOperand(0) != AddRHS) 13987 return SDValue(); 13988 13989 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode()); 13990 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1)) 13991 return SDValue(); 13992 13993 SDValue ConstMul = 13994 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32); 13995 return getMad64_32(DAG, SL, MVT::i64, 13996 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul, 13997 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false); 13998 } 13999 14000 // Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high 14001 // multiplies, if any. 
14002 // 14003 // Full 64-bit multiplies that feed into an addition are lowered here instead 14004 // of using the generic expansion. The generic expansion ends up with 14005 // a tree of ADD nodes that prevents us from using the "add" part of the 14006 // MAD instruction. The expansion produced here results in a chain of ADDs 14007 // instead of a tree. 14008 SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, 14009 DAGCombinerInfo &DCI) const { 14010 assert(N->getOpcode() == ISD::ADD); 14011 14012 SelectionDAG &DAG = DCI.DAG; 14013 EVT VT = N->getValueType(0); 14014 SDLoc SL(N); 14015 SDValue LHS = N->getOperand(0); 14016 SDValue RHS = N->getOperand(1); 14017 14018 if (VT.isVector()) 14019 return SDValue(); 14020 14021 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall 14022 // result in scalar registers for uniform values. 14023 if (!N->isDivergent() && Subtarget->hasSMulHi()) 14024 return SDValue(); 14025 14026 unsigned NumBits = VT.getScalarSizeInBits(); 14027 if (NumBits <= 32 || NumBits > 64) 14028 return SDValue(); 14029 14030 if (LHS.getOpcode() != ISD::MUL) { 14031 assert(RHS.getOpcode() == ISD::MUL); 14032 std::swap(LHS, RHS); 14033 } 14034 14035 // Avoid the fold if it would unduly increase the number of multiplies due to 14036 // multiple uses, except on hardware with full-rate multiply-add (which is 14037 // part of full-rate 64-bit ops). 14038 if (!Subtarget->hasFullRate64Ops()) { 14039 unsigned NumUsers = 0; 14040 for (SDNode *User : LHS->users()) { 14041 // There is a use that does not feed into addition, so the multiply can't 14042 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. 14043 if (User->getOpcode() != ISD::ADD) 14044 return SDValue(); 14045 14046 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer 14047 // MUL + 3xADD + 3xADDC over 3xMAD. 14048 ++NumUsers; 14049 if (NumUsers >= 3) 14050 return SDValue(); 14051 } 14052 } 14053 14054 SDValue MulLHS = LHS.getOperand(0); 14055 SDValue MulRHS = LHS.getOperand(1); 14056 SDValue AddRHS = RHS; 14057 14058 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS)) 14059 return FoldedMAD; 14060 14061 // Always check whether operands are small unsigned values, since that 14062 // knowledge is useful in more cases. Check for small signed values only if 14063 // doing so can unlock a shorter code sequence. 14064 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32; 14065 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32; 14066 14067 bool MulSignedLo = false; 14068 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) { 14069 MulSignedLo = 14070 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32; 14071 } 14072 14073 // The operands and final result all have the same number of bits. If 14074 // operands need to be extended, they can be extended with garbage. The 14075 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is 14076 // truncated away in the end. 14077 if (VT != MVT::i64) { 14078 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS); 14079 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS); 14080 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS); 14081 } 14082 14083 // The basic code generated is conceptually straightforward. 
Pseudo code: 14084 // 14085 // accum = mad_64_32 lhs.lo, rhs.lo, accum 14086 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi 14087 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi 14088 // 14089 // The second and third lines are optional, depending on whether the factors 14090 // are {sign,zero}-extended or not. 14091 // 14092 // The actual DAG is noisier than the pseudo code, but only due to 14093 // instructions that disassemble values into low and high parts, and 14094 // assemble the final result. 14095 SDValue One = DAG.getConstant(1, SL, MVT::i32); 14096 14097 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS); 14098 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS); 14099 SDValue Accum = 14100 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo); 14101 14102 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) { 14103 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32); 14104 14105 if (!MulLHSUnsigned32) { 14106 auto MulLHSHi = 14107 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One); 14108 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo); 14109 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi); 14110 } 14111 14112 if (!MulRHSUnsigned32) { 14113 auto MulRHSHi = 14114 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One); 14115 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi); 14116 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi); 14117 } 14118 14119 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi}); 14120 Accum = DAG.getBitcast(MVT::i64, Accum); 14121 } 14122 14123 if (VT != MVT::i64) 14124 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum); 14125 return Accum; 14126 } 14127 14128 SDValue 14129 SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, 14130 DAGCombinerInfo &DCI) const { 14131 SDValue RHS = N->getOperand(1); 14132 auto *CRHS = dyn_cast<ConstantSDNode>(RHS); 14133 if (!CRHS) 14134 return SDValue(); 14135 14136 // TODO: Worth using computeKnownBits? Maybe expensive since it's so 14137 // common. 14138 uint64_t Val = CRHS->getZExtValue(); 14139 if (countr_zero(Val) >= 32) { 14140 SelectionDAG &DAG = DCI.DAG; 14141 SDLoc SL(N); 14142 SDValue LHS = N->getOperand(0); 14143 14144 // Avoid carry machinery if we know the low half of the add does not 14145 // contribute to the final result. 14146 // 14147 // add i64:x, K if computeTrailingZeros(K) >= 32 14148 // => build_pair (add x.hi, K.hi), x.lo 14149 14150 // Breaking the 64-bit add here with this strange constant is unlikely 14151 // to interfere with addressing mode patterns. 14152 14153 SDValue Hi = getHiHalf64(LHS, DAG); 14154 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); 14155 SDValue AddHi = 14156 DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); 14157 14158 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); 14159 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); 14160 } 14161 14162 return SDValue(); 14163 } 14164 14165 // Collect the ultimate src of each of the mul node's operands, and confirm 14166 // each operand is 8 bytes. 
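// That is, calculateByteProvider must succeed for byte 0 of the operand, and
// byte 1, if it exists, must be known to be zero.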
14167 static std::optional<ByteProvider<SDValue>> 14168 handleMulOperand(const SDValue &MulOperand) { 14169 auto Byte0 = calculateByteProvider(MulOperand, 0, 0); 14170 if (!Byte0 || Byte0->isConstantZero()) { 14171 return std::nullopt; 14172 } 14173 auto Byte1 = calculateByteProvider(MulOperand, 1, 0); 14174 if (Byte1 && !Byte1->isConstantZero()) { 14175 return std::nullopt; 14176 } 14177 return Byte0; 14178 } 14179 14180 static unsigned addPermMasks(unsigned First, unsigned Second) { 14181 unsigned FirstCs = First & 0x0c0c0c0c; 14182 unsigned SecondCs = Second & 0x0c0c0c0c; 14183 unsigned FirstNoCs = First & ~0x0c0c0c0c; 14184 unsigned SecondNoCs = Second & ~0x0c0c0c0c; 14185 14186 assert((FirstCs & 0xFF) | (SecondCs & 0xFF)); 14187 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00)); 14188 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000)); 14189 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000)); 14190 14191 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs); 14192 } 14193 14194 struct DotSrc { 14195 SDValue SrcOp; 14196 int64_t PermMask; 14197 int64_t DWordOffset; 14198 }; 14199 14200 static void placeSources(ByteProvider<SDValue> &Src0, 14201 ByteProvider<SDValue> &Src1, 14202 SmallVectorImpl<DotSrc> &Src0s, 14203 SmallVectorImpl<DotSrc> &Src1s, int Step) { 14204 14205 assert(Src0.Src.has_value() && Src1.Src.has_value()); 14206 // Src0s and Src1s are empty, just place arbitrarily. 14207 if (Step == 0) { 14208 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c, 14209 Src0.SrcOffset / 4}); 14210 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c, 14211 Src1.SrcOffset / 4}); 14212 return; 14213 } 14214 14215 for (int BPI = 0; BPI < 2; BPI++) { 14216 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1}; 14217 if (BPI == 1) { 14218 BPP = {Src1, Src0}; 14219 } 14220 unsigned ZeroMask = 0x0c0c0c0c; 14221 unsigned FMask = 0xFF << (8 * (3 - Step)); 14222 14223 unsigned FirstMask = 14224 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask); 14225 unsigned SecondMask = 14226 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask); 14227 // Attempt to find Src vector which contains our SDValue, if so, add our 14228 // perm mask to the existing one. If we are unable to find a match for the 14229 // first SDValue, attempt to find match for the second. 14230 int FirstGroup = -1; 14231 for (int I = 0; I < 2; I++) { 14232 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s; 14233 auto MatchesFirst = [&BPP](DotSrc &IterElt) { 14234 return IterElt.SrcOp == *BPP.first.Src && 14235 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4)); 14236 }; 14237 14238 auto *Match = llvm::find_if(Srcs, MatchesFirst); 14239 if (Match != Srcs.end()) { 14240 Match->PermMask = addPermMasks(FirstMask, Match->PermMask); 14241 FirstGroup = I; 14242 break; 14243 } 14244 } 14245 if (FirstGroup != -1) { 14246 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? 
Src0s : Src1s; 14247 auto MatchesSecond = [&BPP](DotSrc &IterElt) { 14248 return IterElt.SrcOp == *BPP.second.Src && 14249 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4)); 14250 }; 14251 auto *Match = llvm::find_if(Srcs, MatchesSecond); 14252 if (Match != Srcs.end()) { 14253 Match->PermMask = addPermMasks(SecondMask, Match->PermMask); 14254 } else 14255 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4}); 14256 return; 14257 } 14258 } 14259 14260 // If we have made it here, then we could not find a match in Src0s or Src1s 14261 // for either Src0 or Src1, so just place them arbitrarily. 14262 14263 unsigned ZeroMask = 0x0c0c0c0c; 14264 unsigned FMask = 0xFF << (8 * (3 - Step)); 14265 14266 Src0s.push_back( 14267 {*Src0.Src, 14268 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)), 14269 Src0.SrcOffset / 4}); 14270 Src1s.push_back( 14271 {*Src1.Src, 14272 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)), 14273 Src1.SrcOffset / 4}); 14274 } 14275 14276 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, 14277 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned, 14278 bool IsAny) { 14279 14280 // If we just have one source, just permute it accordingly. 14281 if (Srcs.size() == 1) { 14282 auto *Elt = Srcs.begin(); 14283 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset); 14284 14285 // v_perm will produce the original value 14286 if (Elt->PermMask == 0x3020100) 14287 return EltOp; 14288 14289 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp, 14290 DAG.getConstant(Elt->PermMask, SL, MVT::i32)); 14291 } 14292 14293 auto *FirstElt = Srcs.begin(); 14294 auto *SecondElt = std::next(FirstElt); 14295 14296 SmallVector<SDValue, 2> Perms; 14297 14298 // If we have multiple sources in the chain, combine them via perms (using 14299 // calculated perm mask) and Ors. 14300 while (true) { 14301 auto FirstMask = FirstElt->PermMask; 14302 auto SecondMask = SecondElt->PermMask; 14303 14304 unsigned FirstCs = FirstMask & 0x0c0c0c0c; 14305 unsigned FirstPlusFour = FirstMask | 0x04040404; 14306 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any 14307 // original 0x0C. 14308 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs; 14309 14310 auto PermMask = addPermMasks(FirstMask, SecondMask); 14311 auto FirstVal = 14312 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset); 14313 auto SecondVal = 14314 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset); 14315 14316 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal, 14317 SecondVal, 14318 DAG.getConstant(PermMask, SL, MVT::i32))); 14319 14320 FirstElt = std::next(SecondElt); 14321 if (FirstElt == Srcs.end()) 14322 break; 14323 14324 SecondElt = std::next(FirstElt); 14325 // If we only have a FirstElt, then just combine that into the cumulative 14326 // source node. 14327 if (SecondElt == Srcs.end()) { 14328 auto EltOp = 14329 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset); 14330 14331 Perms.push_back( 14332 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp, 14333 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32))); 14334 break; 14335 } 14336 } 14337 14338 assert(Perms.size() == 1 || Perms.size() == 2); 14339 return Perms.size() == 2 14340 ? 
             DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
14341              : Perms[0];
14342 }
14343
14344 static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14345   for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14346     EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14347     auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14348     EntryMask += ZeroMask;
14349   }
14350 }
14351
14352 static bool isMul(const SDValue Op) {
14353   auto Opcode = Op.getOpcode();
14354
14355   return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14356           Opcode == AMDGPUISD::MUL_I24);
14357 }
14358
14359 static std::optional<bool>
14360 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
14361                        ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14362                        const SDValue &S1Op, const SelectionDAG &DAG) {
14363   // If both ops are i8s (pre legalize-dag), then the signedness semantics
14364   // of the dot4 are irrelevant.
14365   if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14366     return false;
14367
14368   auto Known0 = DAG.computeKnownBits(S0Op, 0);
14369   bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14370   bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14371   auto Known1 = DAG.computeKnownBits(S1Op, 0);
14372   bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14373   bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14374
14375   assert(!(S0IsUnsigned && S0IsSigned));
14376   assert(!(S1IsUnsigned && S1IsSigned));
14377
14378   // There are 9 possible permutations of
14379   // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}.
14380
14381   // In two permutations, the sign bits are known to be the same for both Ops,
14382   // so simply return Signed / Unsigned corresponding to the MSB.
14383
14384   if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14385     return S0IsSigned;
14386
14387   // In another two permutations, the sign bits are known to be opposite. In
14388   // this case return std::nullopt to indicate a bad match.
14389
14390   if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14391     return std::nullopt;
14392
14393   // In the remaining five permutations, we don't know the value of the sign
14394   // bit for at least one Op. Since we have a valid ByteProvider, we know that
14395   // the upper bits must be extension bits. Thus, the only ways for the sign
14396   // bit to be unknown are if it was sign-extended from an unknown value, or if
14397   // it was any-extended. In either case, it is correct to use the signed
14398   // version of the dot4 semantics.
14399
14400   // In two such permutations, we know the sign bit is set for
14401   // one op, and the other is unknown. It is okay to use the signed version of
14402   // dot4.
14403   if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14404       ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14405     return true;
14406
14407   // In one such permutation, we don't know either of the sign bits. It is okay
14408   // to use the signed version of dot4.
14409   if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14410     return true;
14411
14412   // In two such permutations, we know the sign bit is unset for
14413   // one op, and the other is unknown. Return std::nullopt to indicate a
14414   // bad match.
14415 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) || 14416 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned)))) 14417 return std::nullopt; 14418 14419 llvm_unreachable("Fully covered condition"); 14420 } 14421 14422 SDValue SITargetLowering::performAddCombine(SDNode *N, 14423 DAGCombinerInfo &DCI) const { 14424 SelectionDAG &DAG = DCI.DAG; 14425 EVT VT = N->getValueType(0); 14426 SDLoc SL(N); 14427 SDValue LHS = N->getOperand(0); 14428 SDValue RHS = N->getOperand(1); 14429 14430 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) { 14431 if (Subtarget->hasMad64_32()) { 14432 if (SDValue Folded = tryFoldToMad64_32(N, DCI)) 14433 return Folded; 14434 } 14435 } 14436 14437 if (SDValue V = reassociateScalarOps(N, DAG)) { 14438 return V; 14439 } 14440 14441 if (VT == MVT::i64) { 14442 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) 14443 return Folded; 14444 } 14445 14446 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() && 14447 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) { 14448 SDValue TempNode(N, 0); 14449 std::optional<bool> IsSigned; 14450 SmallVector<DotSrc, 4> Src0s; 14451 SmallVector<DotSrc, 4> Src1s; 14452 SmallVector<SDValue, 4> Src2s; 14453 14454 // Match the v_dot4 tree, while collecting src nodes. 14455 int ChainLength = 0; 14456 for (int I = 0; I < 4; I++) { 14457 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1; 14458 if (MulIdx == -1) 14459 break; 14460 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0)); 14461 if (!Src0) 14462 break; 14463 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1)); 14464 if (!Src1) 14465 break; 14466 14467 auto IterIsSigned = checkDot4MulSignedness( 14468 TempNode->getOperand(MulIdx), *Src0, *Src1, 14469 TempNode->getOperand(MulIdx)->getOperand(0), 14470 TempNode->getOperand(MulIdx)->getOperand(1), DAG); 14471 if (!IterIsSigned) 14472 break; 14473 if (!IsSigned) 14474 IsSigned = *IterIsSigned; 14475 if (*IterIsSigned != *IsSigned) 14476 break; 14477 placeSources(*Src0, *Src1, Src0s, Src1s, I); 14478 auto AddIdx = 1 - MulIdx; 14479 // Allow the special case where add (add (mul24, 0), mul24) became -> 14480 // add (mul24, mul24). 14481 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) { 14482 Src2s.push_back(TempNode->getOperand(AddIdx)); 14483 auto Src0 = 14484 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0)); 14485 if (!Src0) 14486 break; 14487 auto Src1 = 14488 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1)); 14489 if (!Src1) 14490 break; 14491 auto IterIsSigned = checkDot4MulSignedness( 14492 TempNode->getOperand(AddIdx), *Src0, *Src1, 14493 TempNode->getOperand(AddIdx)->getOperand(0), 14494 TempNode->getOperand(AddIdx)->getOperand(1), DAG); 14495 if (!IterIsSigned) 14496 break; 14497 assert(IsSigned); 14498 if (*IterIsSigned != *IsSigned) 14499 break; 14500 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1); 14501 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32)); 14502 ChainLength = I + 2; 14503 break; 14504 } 14505 14506 TempNode = TempNode->getOperand(AddIdx); 14507 Src2s.push_back(TempNode); 14508 ChainLength = I + 1; 14509 if (TempNode->getNumOperands() < 2) 14510 break; 14511 LHS = TempNode->getOperand(0); 14512 RHS = TempNode->getOperand(1); 14513 } 14514 14515 if (ChainLength < 2) 14516 return SDValue(); 14517 14518 // Masks were constructed with assumption that we would find a chain of 14519 // length 4. 
If not, then we need to 0 out the MSB bits (via perm mask of 14520 // 0x0c) so they do not affect dot calculation. 14521 if (ChainLength < 4) { 14522 fixMasks(Src0s, ChainLength); 14523 fixMasks(Src1s, ChainLength); 14524 } 14525 14526 SDValue Src0, Src1; 14527 14528 // If we are just using a single source for both, and have permuted the 14529 // bytes consistently, we can just use the sources without permuting 14530 // (commutation). 14531 bool UseOriginalSrc = false; 14532 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 && 14533 Src0s.begin()->PermMask == Src1s.begin()->PermMask && 14534 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 && 14535 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) { 14536 SmallVector<unsigned, 4> SrcBytes; 14537 auto Src0Mask = Src0s.begin()->PermMask; 14538 SrcBytes.push_back(Src0Mask & 0xFF000000); 14539 bool UniqueEntries = true; 14540 for (auto I = 1; I < 4; I++) { 14541 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8)); 14542 14543 if (is_contained(SrcBytes, NextByte)) { 14544 UniqueEntries = false; 14545 break; 14546 } 14547 SrcBytes.push_back(NextByte); 14548 } 14549 14550 if (UniqueEntries) { 14551 UseOriginalSrc = true; 14552 14553 auto *FirstElt = Src0s.begin(); 14554 auto FirstEltOp = 14555 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset); 14556 14557 auto *SecondElt = Src1s.begin(); 14558 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp, 14559 SecondElt->DWordOffset); 14560 14561 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL, 14562 MVT::getIntegerVT(32)); 14563 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL, 14564 MVT::getIntegerVT(32)); 14565 } 14566 } 14567 14568 if (!UseOriginalSrc) { 14569 Src0 = resolveSources(DAG, SL, Src0s, false, true); 14570 Src1 = resolveSources(DAG, SL, Src1s, false, true); 14571 } 14572 14573 assert(IsSigned); 14574 SDValue Src2 = 14575 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32); 14576 14577 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4 14578 : Intrinsic::amdgcn_udot4, 14579 SL, MVT::i64); 14580 14581 assert(!VT.isVector()); 14582 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0, 14583 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1)); 14584 14585 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT); 14586 } 14587 14588 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) 14589 return SDValue(); 14590 14591 // add x, zext (setcc) => uaddo_carry x, 0, setcc 14592 // add x, sext (setcc) => usubo_carry x, 0, setcc 14593 unsigned Opc = LHS.getOpcode(); 14594 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND || 14595 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY) 14596 std::swap(RHS, LHS); 14597 14598 Opc = RHS.getOpcode(); 14599 switch (Opc) { 14600 default: 14601 break; 14602 case ISD::ZERO_EXTEND: 14603 case ISD::SIGN_EXTEND: 14604 case ISD::ANY_EXTEND: { 14605 auto Cond = RHS.getOperand(0); 14606 // If this won't be a real VOPC output, we would still need to insert an 14607 // extra instruction anyway. 14608 if (!isBoolSGPR(Cond)) 14609 break; 14610 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); 14611 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond}; 14612 Opc = (Opc == ISD::SIGN_EXTEND) ? 
ISD::USUBO_CARRY : ISD::UADDO_CARRY; 14613 return DAG.getNode(Opc, SL, VTList, Args); 14614 } 14615 case ISD::UADDO_CARRY: { 14616 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc 14617 if (!isNullConstant(RHS.getOperand(1))) 14618 break; 14619 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)}; 14620 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args); 14621 } 14622 } 14623 return SDValue(); 14624 } 14625 14626 SDValue SITargetLowering::performSubCombine(SDNode *N, 14627 DAGCombinerInfo &DCI) const { 14628 SelectionDAG &DAG = DCI.DAG; 14629 EVT VT = N->getValueType(0); 14630 14631 if (VT == MVT::i64) { 14632 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) 14633 return Folded; 14634 } 14635 14636 if (VT != MVT::i32) 14637 return SDValue(); 14638 14639 SDLoc SL(N); 14640 SDValue LHS = N->getOperand(0); 14641 SDValue RHS = N->getOperand(1); 14642 14643 // sub x, zext (setcc) => usubo_carry x, 0, setcc 14644 // sub x, sext (setcc) => uaddo_carry x, 0, setcc 14645 unsigned Opc = RHS.getOpcode(); 14646 switch (Opc) { 14647 default: 14648 break; 14649 case ISD::ZERO_EXTEND: 14650 case ISD::SIGN_EXTEND: 14651 case ISD::ANY_EXTEND: { 14652 auto Cond = RHS.getOperand(0); 14653 // If this won't be a real VOPC output, we would still need to insert an 14654 // extra instruction anyway. 14655 if (!isBoolSGPR(Cond)) 14656 break; 14657 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); 14658 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond}; 14659 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY; 14660 return DAG.getNode(Opc, SL, VTList, Args); 14661 } 14662 } 14663 14664 if (LHS.getOpcode() == ISD::USUBO_CARRY) { 14665 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc 14666 if (!isNullConstant(LHS.getOperand(1))) 14667 return SDValue(); 14668 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)}; 14669 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args); 14670 } 14671 return SDValue(); 14672 } 14673 14674 SDValue 14675 SITargetLowering::performAddCarrySubCarryCombine(SDNode *N, 14676 DAGCombinerInfo &DCI) const { 14677 14678 if (N->getValueType(0) != MVT::i32) 14679 return SDValue(); 14680 14681 if (!isNullConstant(N->getOperand(1))) 14682 return SDValue(); 14683 14684 SelectionDAG &DAG = DCI.DAG; 14685 SDValue LHS = N->getOperand(0); 14686 14687 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc 14688 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc 14689 unsigned LHSOpc = LHS.getOpcode(); 14690 unsigned Opc = N->getOpcode(); 14691 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) || 14692 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) { 14693 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)}; 14694 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args); 14695 } 14696 return SDValue(); 14697 } 14698 14699 SDValue SITargetLowering::performFAddCombine(SDNode *N, 14700 DAGCombinerInfo &DCI) const { 14701 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 14702 return SDValue(); 14703 14704 SelectionDAG &DAG = DCI.DAG; 14705 EVT VT = N->getValueType(0); 14706 14707 SDLoc SL(N); 14708 SDValue LHS = N->getOperand(0); 14709 SDValue RHS = N->getOperand(1); 14710 14711 // These should really be instruction patterns, but writing patterns with 14712 // source modifiers is a pain. 
14713 14714 // fadd (fadd (a, a), b) -> mad 2.0, a, b 14715 if (LHS.getOpcode() == ISD::FADD) { 14716 SDValue A = LHS.getOperand(0); 14717 if (A == LHS.getOperand(1)) { 14718 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode()); 14719 if (FusedOp != 0) { 14720 const SDValue Two = DAG.getConstantFP(2.0, SL, VT); 14721 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS); 14722 } 14723 } 14724 } 14725 14726 // fadd (b, fadd (a, a)) -> mad 2.0, a, b 14727 if (RHS.getOpcode() == ISD::FADD) { 14728 SDValue A = RHS.getOperand(0); 14729 if (A == RHS.getOperand(1)) { 14730 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode()); 14731 if (FusedOp != 0) { 14732 const SDValue Two = DAG.getConstantFP(2.0, SL, VT); 14733 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS); 14734 } 14735 } 14736 } 14737 14738 return SDValue(); 14739 } 14740 14741 SDValue SITargetLowering::performFSubCombine(SDNode *N, 14742 DAGCombinerInfo &DCI) const { 14743 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 14744 return SDValue(); 14745 14746 SelectionDAG &DAG = DCI.DAG; 14747 SDLoc SL(N); 14748 EVT VT = N->getValueType(0); 14749 assert(!VT.isVector()); 14750 14751 // Try to get the fneg to fold into the source modifier. This undoes generic 14752 // DAG combines and folds them into the mad. 14753 // 14754 // Only do this if we are not trying to support denormals. v_mad_f32 does 14755 // not support denormals ever. 14756 SDValue LHS = N->getOperand(0); 14757 SDValue RHS = N->getOperand(1); 14758 if (LHS.getOpcode() == ISD::FADD) { 14759 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) 14760 SDValue A = LHS.getOperand(0); 14761 if (A == LHS.getOperand(1)) { 14762 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode()); 14763 if (FusedOp != 0) { 14764 const SDValue Two = DAG.getConstantFP(2.0, SL, VT); 14765 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 14766 14767 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS); 14768 } 14769 } 14770 } 14771 14772 if (RHS.getOpcode() == ISD::FADD) { 14773 // (fsub c, (fadd a, a)) -> mad -2.0, a, c 14774 14775 SDValue A = RHS.getOperand(0); 14776 if (A == RHS.getOperand(1)) { 14777 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode()); 14778 if (FusedOp != 0) { 14779 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT); 14780 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS); 14781 } 14782 } 14783 } 14784 14785 return SDValue(); 14786 } 14787 14788 SDValue SITargetLowering::performFDivCombine(SDNode *N, 14789 DAGCombinerInfo &DCI) const { 14790 SelectionDAG &DAG = DCI.DAG; 14791 SDLoc SL(N); 14792 EVT VT = N->getValueType(0); 14793 if (VT != MVT::f16 || !Subtarget->has16BitInsts()) 14794 return SDValue(); 14795 14796 SDValue LHS = N->getOperand(0); 14797 SDValue RHS = N->getOperand(1); 14798 14799 SDNodeFlags Flags = N->getFlags(); 14800 SDNodeFlags RHSFlags = RHS->getFlags(); 14801 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() || 14802 !RHS->hasOneUse()) 14803 return SDValue(); 14804 14805 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { 14806 bool IsNegative = false; 14807 if (CLHS->isExactlyValue(1.0) || 14808 (IsNegative = CLHS->isExactlyValue(-1.0))) { 14809 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16 14810 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16 14811 if (RHS.getOpcode() == ISD::FSQRT) { 14812 // TODO: Or in RHS flags, somehow missing from SDNodeFlags 14813 SDValue Rsq = 14814 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags); 14815 return IsNegative ? 
DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq; 14816 } 14817 } 14818 } 14819 14820 return SDValue(); 14821 } 14822 14823 SDValue SITargetLowering::performFMulCombine(SDNode *N, 14824 DAGCombinerInfo &DCI) const { 14825 SelectionDAG &DAG = DCI.DAG; 14826 EVT VT = N->getValueType(0); 14827 EVT ScalarVT = VT.getScalarType(); 14828 EVT IntVT = VT.changeElementType(MVT::i32); 14829 14830 SDValue LHS = N->getOperand(0); 14831 SDValue RHS = N->getOperand(1); 14832 14833 // It is cheaper to realize i32 inline constants as compared against 14834 // materializing f16 or f64 (or even non-inline f32) values, 14835 // possible via ldexp usage, as shown below : 14836 // 14837 // Given : A = 2^a & B = 2^b ; where a and b are integers. 14838 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) ) 14839 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) ) 14840 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) && 14841 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) { 14842 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1)); 14843 if (!TrueNode) 14844 return SDValue(); 14845 const ConstantFPSDNode *FalseNode = 14846 isConstOrConstSplatFP(RHS.getOperand(2)); 14847 if (!FalseNode) 14848 return SDValue(); 14849 14850 if (TrueNode->isNegative() != FalseNode->isNegative()) 14851 return SDValue(); 14852 14853 // For f32, only non-inline constants should be transformed. 14854 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 14855 if (ScalarVT == MVT::f32 && 14856 TII->isInlineConstant(TrueNode->getValueAPF()) && 14857 TII->isInlineConstant(FalseNode->getValueAPF())) 14858 return SDValue(); 14859 14860 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs(); 14861 if (TrueNodeExpVal == INT_MIN) 14862 return SDValue(); 14863 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs(); 14864 if (FalseNodeExpVal == INT_MIN) 14865 return SDValue(); 14866 14867 SDLoc SL(N); 14868 SDValue SelectNode = 14869 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0), 14870 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT), 14871 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT)); 14872 14873 LHS = TrueNode->isNegative() 14874 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags()) 14875 : LHS; 14876 14877 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags()); 14878 } 14879 14880 return SDValue(); 14881 } 14882 14883 SDValue SITargetLowering::performFMACombine(SDNode *N, 14884 DAGCombinerInfo &DCI) const { 14885 SelectionDAG &DAG = DCI.DAG; 14886 EVT VT = N->getValueType(0); 14887 SDLoc SL(N); 14888 14889 if (!Subtarget->hasDot10Insts() || VT != MVT::f32) 14890 return SDValue(); 14891 14892 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) -> 14893 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)) 14894 SDValue Op1 = N->getOperand(0); 14895 SDValue Op2 = N->getOperand(1); 14896 SDValue FMA = N->getOperand(2); 14897 14898 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND || 14899 Op2.getOpcode() != ISD::FP_EXTEND) 14900 return SDValue(); 14901 14902 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, 14903 // regardless of the denorm mode setting. Therefore, 14904 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2. 
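  // The fold below also requires that the two products read opposite lanes of
  // the same pair of v2f16 vectors; otherwise the match is rejected.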
14905 const TargetOptions &Options = DAG.getTarget().Options; 14906 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || 14907 (N->getFlags().hasAllowContract() && 14908 FMA->getFlags().hasAllowContract())) { 14909 Op1 = Op1.getOperand(0); 14910 Op2 = Op2.getOperand(0); 14911 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 14912 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 14913 return SDValue(); 14914 14915 SDValue Vec1 = Op1.getOperand(0); 14916 SDValue Idx1 = Op1.getOperand(1); 14917 SDValue Vec2 = Op2.getOperand(0); 14918 14919 SDValue FMAOp1 = FMA.getOperand(0); 14920 SDValue FMAOp2 = FMA.getOperand(1); 14921 SDValue FMAAcc = FMA.getOperand(2); 14922 14923 if (FMAOp1.getOpcode() != ISD::FP_EXTEND || 14924 FMAOp2.getOpcode() != ISD::FP_EXTEND) 14925 return SDValue(); 14926 14927 FMAOp1 = FMAOp1.getOperand(0); 14928 FMAOp2 = FMAOp2.getOperand(0); 14929 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 14930 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) 14931 return SDValue(); 14932 14933 SDValue Vec3 = FMAOp1.getOperand(0); 14934 SDValue Vec4 = FMAOp2.getOperand(0); 14935 SDValue Idx2 = FMAOp1.getOperand(1); 14936 14937 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) || 14938 // Idx1 and Idx2 cannot be the same. 14939 Idx1 == Idx2) 14940 return SDValue(); 14941 14942 if (Vec1 == Vec2 || Vec3 == Vec4) 14943 return SDValue(); 14944 14945 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16) 14946 return SDValue(); 14947 14948 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) { 14949 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc, 14950 DAG.getTargetConstant(0, SL, MVT::i1)); 14951 } 14952 } 14953 return SDValue(); 14954 } 14955 14956 SDValue SITargetLowering::performSetCCCombine(SDNode *N, 14957 DAGCombinerInfo &DCI) const { 14958 SelectionDAG &DAG = DCI.DAG; 14959 SDLoc SL(N); 14960 14961 SDValue LHS = N->getOperand(0); 14962 SDValue RHS = N->getOperand(1); 14963 EVT VT = LHS.getValueType(); 14964 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); 14965 14966 auto *CRHS = dyn_cast<ConstantSDNode>(RHS); 14967 if (!CRHS) { 14968 CRHS = dyn_cast<ConstantSDNode>(LHS); 14969 if (CRHS) { 14970 std::swap(LHS, RHS); 14971 CC = getSetCCSwappedOperands(CC); 14972 } 14973 } 14974 14975 if (CRHS) { 14976 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND && 14977 isBoolSGPR(LHS.getOperand(0))) { 14978 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 14979 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc 14980 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 14981 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc 14982 if ((CRHS->isAllOnes() && 14983 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) || 14984 (CRHS->isZero() && 14985 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) 14986 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), 14987 DAG.getAllOnesConstant(SL, MVT::i1)); 14988 if ((CRHS->isAllOnes() && 14989 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || 14990 (CRHS->isZero() && 14991 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT))) 14992 return LHS.getOperand(0); 14993 } 14994 14995 const APInt &CRHSVal = CRHS->getAPIntValue(); 14996 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && 14997 LHS.getOpcode() == ISD::SELECT && 14998 isa<ConstantSDNode>(LHS.getOperand(1)) && 14999 isa<ConstantSDNode>(LHS.getOperand(2)) && 15000 LHS.getConstantOperandVal(1) != 
LHS.getConstantOperandVal(2) && 15001 isBoolSGPR(LHS.getOperand(0))) { 15002 // Given CT != FT: 15003 // setcc (select cc, CT, CF), CF, eq => xor cc, -1 15004 // setcc (select cc, CT, CF), CF, ne => cc 15005 // setcc (select cc, CT, CF), CT, ne => xor cc, -1 15006 // setcc (select cc, CT, CF), CT, eq => cc 15007 const APInt &CT = LHS.getConstantOperandAPInt(1); 15008 const APInt &CF = LHS.getConstantOperandAPInt(2); 15009 15010 if ((CF == CRHSVal && CC == ISD::SETEQ) || 15011 (CT == CRHSVal && CC == ISD::SETNE)) 15012 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), 15013 DAG.getAllOnesConstant(SL, MVT::i1)); 15014 if ((CF == CRHSVal && CC == ISD::SETNE) || 15015 (CT == CRHSVal && CC == ISD::SETEQ)) 15016 return LHS.getOperand(0); 15017 } 15018 } 15019 15020 if (VT != MVT::f32 && VT != MVT::f64 && 15021 (!Subtarget->has16BitInsts() || VT != MVT::f16)) 15022 return SDValue(); 15023 15024 // Match isinf/isfinite pattern 15025 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) 15026 // (fcmp one (fabs x), inf) -> (fp_class x, 15027 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero) 15028 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && 15029 LHS.getOpcode() == ISD::FABS) { 15030 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 15031 if (!CRHS) 15032 return SDValue(); 15033 15034 const APFloat &APF = CRHS->getValueAPF(); 15035 if (APF.isInfinity() && !APF.isNegative()) { 15036 const unsigned IsInfMask = 15037 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; 15038 const unsigned IsFiniteMask = 15039 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL | 15040 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL | 15041 SIInstrFlags::P_SUBNORMAL; 15042 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask; 15043 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), 15044 DAG.getConstant(Mask, SL, MVT::i32)); 15045 } 15046 } 15047 15048 return SDValue(); 15049 } 15050 15051 SDValue 15052 SITargetLowering::performCvtF32UByteNCombine(SDNode *N, 15053 DAGCombinerInfo &DCI) const { 15054 SelectionDAG &DAG = DCI.DAG; 15055 SDLoc SL(N); 15056 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; 15057 15058 SDValue Src = N->getOperand(0); 15059 SDValue Shift = N->getOperand(0); 15060 15061 // TODO: Extend type shouldn't matter (assuming legal types). 
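  // Offset selects which of the four source bytes this CVT_F32_UBYTEn reads;
  // a constant shift amount below is folded into that byte index.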
15062 if (Shift.getOpcode() == ISD::ZERO_EXTEND) 15063 Shift = Shift.getOperand(0); 15064 15065 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) { 15066 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x 15067 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x 15068 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x 15069 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x 15070 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x 15071 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) { 15072 SDValue Shifted = DAG.getZExtOrTrunc( 15073 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32); 15074 15075 unsigned ShiftOffset = 8 * Offset; 15076 if (Shift.getOpcode() == ISD::SHL) 15077 ShiftOffset -= C->getZExtValue(); 15078 else 15079 ShiftOffset += C->getZExtValue(); 15080 15081 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) { 15082 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL, 15083 MVT::f32, Shifted); 15084 } 15085 } 15086 } 15087 15088 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15089 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); 15090 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) { 15091 // We simplified Src. If this node is not dead, visit it again so it is 15092 // folded properly. 15093 if (N->getOpcode() != ISD::DELETED_NODE) 15094 DCI.AddToWorklist(N); 15095 return SDValue(N, 0); 15096 } 15097 15098 // Handle (or x, (srl y, 8)) pattern when known bits are zero. 15099 if (SDValue DemandedSrc = 15100 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG)) 15101 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc); 15102 15103 return SDValue(); 15104 } 15105 15106 SDValue SITargetLowering::performClampCombine(SDNode *N, 15107 DAGCombinerInfo &DCI) const { 15108 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); 15109 if (!CSrc) 15110 return SDValue(); 15111 15112 const MachineFunction &MF = DCI.DAG.getMachineFunction(); 15113 const APFloat &F = CSrc->getValueAPF(); 15114 APFloat Zero = APFloat::getZero(F.getSemantics()); 15115 if (F < Zero || 15116 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) { 15117 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); 15118 } 15119 15120 APFloat One(F.getSemantics(), "1.0"); 15121 if (F > One) 15122 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); 15123 15124 return SDValue(CSrc, 0); 15125 } 15126 15127 SDValue SITargetLowering::PerformDAGCombine(SDNode *N, 15128 DAGCombinerInfo &DCI) const { 15129 switch (N->getOpcode()) { 15130 case ISD::ADD: 15131 case ISD::SUB: 15132 case ISD::SHL: 15133 case ISD::SRL: 15134 case ISD::SRA: 15135 case ISD::AND: 15136 case ISD::OR: 15137 case ISD::XOR: 15138 case ISD::MUL: 15139 case ISD::SETCC: 15140 case ISD::SELECT: 15141 case ISD::SMIN: 15142 case ISD::SMAX: 15143 case ISD::UMIN: 15144 case ISD::UMAX: 15145 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI)) 15146 return Res; 15147 break; 15148 default: 15149 break; 15150 } 15151 15152 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None) 15153 return SDValue(); 15154 15155 switch (N->getOpcode()) { 15156 case ISD::ADD: 15157 return performAddCombine(N, DCI); 15158 case ISD::SUB: 15159 return performSubCombine(N, DCI); 15160 case ISD::UADDO_CARRY: 15161 case ISD::USUBO_CARRY: 15162 return performAddCarrySubCarryCombine(N, DCI); 15163 case ISD::FADD: 15164 return performFAddCombine(N, DCI); 15165 case ISD::FSUB: 15166 return performFSubCombine(N, DCI); 15167 case 
ISD::FDIV: 15168 return performFDivCombine(N, DCI); 15169 case ISD::FMUL: 15170 return performFMulCombine(N, DCI); 15171 case ISD::SETCC: 15172 return performSetCCCombine(N, DCI); 15173 case ISD::FMAXNUM: 15174 case ISD::FMINNUM: 15175 case ISD::FMAXNUM_IEEE: 15176 case ISD::FMINNUM_IEEE: 15177 case ISD::FMAXIMUM: 15178 case ISD::FMINIMUM: 15179 case ISD::SMAX: 15180 case ISD::SMIN: 15181 case ISD::UMAX: 15182 case ISD::UMIN: 15183 case AMDGPUISD::FMIN_LEGACY: 15184 case AMDGPUISD::FMAX_LEGACY: 15185 return performMinMaxCombine(N, DCI); 15186 case ISD::FMA: 15187 return performFMACombine(N, DCI); 15188 case ISD::AND: 15189 return performAndCombine(N, DCI); 15190 case ISD::OR: 15191 return performOrCombine(N, DCI); 15192 case ISD::FSHR: { 15193 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 15194 if (N->getValueType(0) == MVT::i32 && N->isDivergent() && 15195 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { 15196 return matchPERM(N, DCI); 15197 } 15198 break; 15199 } 15200 case ISD::XOR: 15201 return performXorCombine(N, DCI); 15202 case ISD::ZERO_EXTEND: 15203 return performZeroExtendCombine(N, DCI); 15204 case ISD::SIGN_EXTEND_INREG: 15205 return performSignExtendInRegCombine(N, DCI); 15206 case AMDGPUISD::FP_CLASS: 15207 return performClassCombine(N, DCI); 15208 case ISD::FCANONICALIZE: 15209 return performFCanonicalizeCombine(N, DCI); 15210 case AMDGPUISD::RCP: 15211 return performRcpCombine(N, DCI); 15212 case ISD::FLDEXP: 15213 case AMDGPUISD::FRACT: 15214 case AMDGPUISD::RSQ: 15215 case AMDGPUISD::RCP_LEGACY: 15216 case AMDGPUISD::RCP_IFLAG: 15217 case AMDGPUISD::RSQ_CLAMP: { 15218 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted 15219 SDValue Src = N->getOperand(0); 15220 if (Src.isUndef()) 15221 return Src; 15222 break; 15223 } 15224 case ISD::SINT_TO_FP: 15225 case ISD::UINT_TO_FP: 15226 return performUCharToFloatCombine(N, DCI); 15227 case ISD::FCOPYSIGN: 15228 return performFCopySignCombine(N, DCI); 15229 case AMDGPUISD::CVT_F32_UBYTE0: 15230 case AMDGPUISD::CVT_F32_UBYTE1: 15231 case AMDGPUISD::CVT_F32_UBYTE2: 15232 case AMDGPUISD::CVT_F32_UBYTE3: 15233 return performCvtF32UByteNCombine(N, DCI); 15234 case AMDGPUISD::FMED3: 15235 return performFMed3Combine(N, DCI); 15236 case AMDGPUISD::CVT_PKRTZ_F16_F32: 15237 return performCvtPkRTZCombine(N, DCI); 15238 case AMDGPUISD::CLAMP: 15239 return performClampCombine(N, DCI); 15240 case ISD::SCALAR_TO_VECTOR: { 15241 SelectionDAG &DAG = DCI.DAG; 15242 EVT VT = N->getValueType(0); 15243 15244 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x)) 15245 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) { 15246 SDLoc SL(N); 15247 SDValue Src = N->getOperand(0); 15248 EVT EltVT = Src.getValueType(); 15249 if (EltVT != MVT::i16) 15250 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src); 15251 15252 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src); 15253 return DAG.getNode(ISD::BITCAST, SL, VT, Ext); 15254 } 15255 15256 break; 15257 } 15258 case ISD::EXTRACT_VECTOR_ELT: 15259 return performExtractVectorEltCombine(N, DCI); 15260 case ISD::INSERT_VECTOR_ELT: 15261 return performInsertVectorEltCombine(N, DCI); 15262 case ISD::FP_ROUND: 15263 return performFPRoundCombine(N, DCI); 15264 case ISD::LOAD: { 15265 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI)) 15266 return Widened; 15267 [[fallthrough]]; 15268 } 15269 default: { 15270 if (!DCI.isBeforeLegalize()) { 15271 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N)) 15272 return 
performMemSDNodeCombine(MemNode, DCI); 15273 } 15274 15275 break; 15276 } 15277 } 15278 15279 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 15280 } 15281 15282 /// Helper function for adjustWritemask 15283 static unsigned SubIdx2Lane(unsigned Idx) { 15284 switch (Idx) { 15285 default: 15286 return ~0u; 15287 case AMDGPU::sub0: 15288 return 0; 15289 case AMDGPU::sub1: 15290 return 1; 15291 case AMDGPU::sub2: 15292 return 2; 15293 case AMDGPU::sub3: 15294 return 3; 15295 case AMDGPU::sub4: 15296 return 4; // Possible with TFE/LWE 15297 } 15298 } 15299 15300 /// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions 15301 SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, 15302 SelectionDAG &DAG) const { 15303 unsigned Opcode = Node->getMachineOpcode(); 15304 15305 // Subtract 1 because the vdata output is not a MachineSDNode operand. 15306 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1; 15307 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx)) 15308 return Node; // not implemented for D16 15309 15310 SDNode *Users[5] = {nullptr}; 15311 unsigned Lane = 0; 15312 unsigned DmaskIdx = 15313 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1; 15314 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); 15315 unsigned NewDmask = 0; 15316 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1; 15317 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1; 15318 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) || 15319 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx))) 15320 ? true 15321 : false; 15322 unsigned TFCLane = 0; 15323 bool HasChain = Node->getNumValues() > 1; 15324 15325 if (OldDmask == 0) { 15326 // These are folded out, but on the chance it happens don't assert. 15327 return Node; 15328 } 15329 15330 unsigned OldBitsSet = llvm::popcount(OldDmask); 15331 // Work out which is the TFE/LWE lane if that is enabled. 15332 if (UsesTFC) { 15333 TFCLane = OldBitsSet; 15334 } 15335 15336 // Try to figure out the used register components 15337 for (SDUse &Use : Node->uses()) { 15338 15339 // Don't look at users of the chain. 15340 if (Use.getResNo() != 0) 15341 continue; 15342 15343 SDNode *User = Use.getUser(); 15344 15345 // Abort if we can't understand the usage 15346 if (!User->isMachineOpcode() || 15347 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) 15348 return Node; 15349 15350 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used. 15351 // Note that subregs are packed, i.e. Lane==0 is the first bit set 15352 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit 15353 // set, etc. 15354 Lane = SubIdx2Lane(User->getConstantOperandVal(1)); 15355 if (Lane == ~0u) 15356 return Node; 15357 15358 // Check if the use is for the TFE/LWE generated result at VGPRn+1. 15359 if (UsesTFC && Lane == TFCLane) { 15360 Users[Lane] = User; 15361 } else { 15362 // Set which texture component corresponds to the lane. 15363 unsigned Comp; 15364 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) { 15365 Comp = llvm::countr_zero(Dmask); 15366 Dmask &= ~(1 << Comp); 15367 } 15368 15369 // Abort if we have more than one user per component. 15370 if (Users[Lane]) 15371 return Node; 15372 15373 Users[Lane] = User; 15374 NewDmask |= 1 << Comp; 15375 } 15376 } 15377 15378 // Don't allow 0 dmask, as hardware assumes one channel enabled. 
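  // A NewDmask of 0 means no data component of the result is read; this can
  // happen, for instance, when the only remaining use is the TFE/LWE status
  // value.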
15379 bool NoChannels = !NewDmask; 15380 if (NoChannels) { 15381 if (!UsesTFC) { 15382 // No uses of the result and not using TFC. Then do nothing. 15383 return Node; 15384 } 15385 // If the original dmask has one channel - then nothing to do 15386 if (OldBitsSet == 1) 15387 return Node; 15388 // Use an arbitrary dmask - required for the instruction to work 15389 NewDmask = 1; 15390 } 15391 // Abort if there's no change 15392 if (NewDmask == OldDmask) 15393 return Node; 15394 15395 unsigned BitsSet = llvm::popcount(NewDmask); 15396 15397 // Check for TFE or LWE - increase the number of channels by one to account 15398 // for the extra return value 15399 // This will need adjustment for D16 if this is also included in 15400 // adjustWriteMask (this function) but at present D16 are excluded. 15401 unsigned NewChannels = BitsSet + UsesTFC; 15402 15403 int NewOpcode = 15404 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels); 15405 assert(NewOpcode != -1 && 15406 NewOpcode != static_cast<int>(Node->getMachineOpcode()) && 15407 "failed to find equivalent MIMG op"); 15408 15409 // Adjust the writemask in the node 15410 SmallVector<SDValue, 12> Ops; 15411 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx); 15412 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); 15413 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end()); 15414 15415 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT(); 15416 15417 MVT ResultVT = NewChannels == 1 15418 ? SVT 15419 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 15420 : NewChannels == 5 ? 8 15421 : NewChannels); 15422 SDVTList NewVTList = 15423 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT); 15424 15425 MachineSDNode *NewNode = 15426 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops); 15427 15428 if (HasChain) { 15429 // Update chain. 15430 DAG.setNodeMemRefs(NewNode, Node->memoperands()); 15431 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1)); 15432 } 15433 15434 if (NewChannels == 1) { 15435 assert(Node->hasNUsesOfValue(1, 0)); 15436 SDNode *Copy = 15437 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node), 15438 Users[Lane]->getValueType(0), SDValue(NewNode, 0)); 15439 DAG.ReplaceAllUsesWith(Users[Lane], Copy); 15440 return nullptr; 15441 } 15442 15443 // Update the users of the node with the new indices 15444 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) { 15445 SDNode *User = Users[i]; 15446 if (!User) { 15447 // Handle the special case of NoChannels. We set NewDmask to 1 above, but 15448 // Users[0] is still nullptr because channel 0 doesn't really have a use. 
      if (i || !NoChannels)
        continue;
    } else {
      SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
      SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
      if (NewUser != User) {
        DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
        DAG.RemoveDeadNode(User);
      }
    }

    switch (Idx) {
    default:
      break;
    case AMDGPU::sub0:
      Idx = AMDGPU::sub1;
      break;
    case AMDGPU::sub1:
      Idx = AMDGPU::sub2;
      break;
    case AMDGPU::sub2:
      Idx = AMDGPU::sub3;
      break;
    case AMDGPU::sub3:
      Idx = AMDGPU::sub4;
      break;
    }
  }

  DAG.RemoveDeadNode(Node);
  return nullptr;
}

static bool isFrameIndexOp(SDValue Op) {
  if (Op.getOpcode() == ISD::AssertZext)
    Op = Op.getOperand(0);

  return isa<FrameIndexSDNode>(Op);
}

/// Legalize target-independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
SDNode *
SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                SelectionDAG &DAG) const {
  if (Node->getOpcode() == ISD::CopyToReg) {
    RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
    SDValue SrcVal = Node->getOperand(2);

    // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    // to try understanding copies to physical registers.
    if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
      SDLoc SL(Node);
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
      SDValue VReg = DAG.getRegister(
          MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);

      SDNode *Glued = Node->getGluedNode();
      SDValue ToVReg = DAG.getCopyToReg(
          Node->getOperand(0), SL, VReg, SrcVal,
          SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
      SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
                                             VReg, ToVReg.getValue(1));
      DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
      DAG.RemoveDeadNode(Node);
      return ToResultReg.getNode();
    }
  }

  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isFrameIndexOp(Node->getOperand(i))) {
      Ops.push_back(Node->getOperand(i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                                             Node->getOperand(i).getValueType(),
                                             Node->getOperand(i)),
                          0));
  }

  return DAG.UpdateNodeOperands(Node, Ops);
}

/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
15538 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, 15539 SelectionDAG &DAG) const { 15540 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 15541 unsigned Opcode = Node->getMachineOpcode(); 15542 15543 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() && 15544 !TII->isGather4(Opcode) && 15545 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) { 15546 return adjustWritemask(Node, DAG); 15547 } 15548 15549 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) { 15550 legalizeTargetIndependentNode(Node, DAG); 15551 return Node; 15552 } 15553 15554 switch (Opcode) { 15555 case AMDGPU::V_DIV_SCALE_F32_e64: 15556 case AMDGPU::V_DIV_SCALE_F64_e64: { 15557 // Satisfy the operand register constraint when one of the inputs is 15558 // undefined. Ordinarily each undef value will have its own implicit_def of 15559 // a vreg, so force these to use a single register. 15560 SDValue Src0 = Node->getOperand(1); 15561 SDValue Src1 = Node->getOperand(3); 15562 SDValue Src2 = Node->getOperand(5); 15563 15564 if ((Src0.isMachineOpcode() && 15565 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) && 15566 (Src0 == Src1 || Src0 == Src2)) 15567 break; 15568 15569 MVT VT = Src0.getValueType().getSimpleVT(); 15570 const TargetRegisterClass *RC = 15571 getRegClassFor(VT, Src0.getNode()->isDivergent()); 15572 15573 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 15574 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT); 15575 15576 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg, 15577 Src0, SDValue()); 15578 15579 // src0 must be the same register as src1 or src2, even if the value is 15580 // undefined, so make sure we don't violate this constraint. 15581 if (Src0.isMachineOpcode() && 15582 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) { 15583 if (Src1.isMachineOpcode() && 15584 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) 15585 Src0 = Src1; 15586 else if (Src2.isMachineOpcode() && 15587 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) 15588 Src0 = Src2; 15589 else { 15590 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF); 15591 Src0 = UndefReg; 15592 Src1 = UndefReg; 15593 } 15594 } else 15595 break; 15596 15597 SmallVector<SDValue, 9> Ops(Node->ops()); 15598 Ops[1] = Src0; 15599 Ops[3] = Src1; 15600 Ops[5] = Src2; 15601 Ops.push_back(ImpDef.getValue(1)); 15602 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); 15603 } 15604 default: 15605 break; 15606 } 15607 15608 return Node; 15609 } 15610 15611 // Any MIMG instructions that use tfe or lwe require an initialization of the 15612 // result register that will be written in the case of a memory access failure. 15613 // The required code is also added to tie this init code to the result of the 15614 // img instruction. 
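//
// As a rough sketch only (the virtual register names below are illustrative
// placeholders, not what is emitted verbatim): with PRTStrictNull enabled and
// two dwords to initialize, the inserted sequence looks approximately like
//
//   %init     = IMPLICIT_DEF
//   %zero0    = V_MOV_B32_e32 0
//   %tmp      = INSERT_SUBREG %init, %zero0, %subreg.sub0
//   %zero1    = V_MOV_B32_e32 0
//   %dst_init = INSERT_SUBREG %tmp, %zero1, %subreg.sub1
//
// and %dst_init is then added as an implicit operand tied to the image
// instruction's vdata result.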
void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  MachineBasicBlock &MBB = *MI.getParent();

  int DstIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
  unsigned InitIdx = 0;

  if (TII->isImage(MI)) {
    MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
    MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
    MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);

    if (!TFE && !LWE) // intersect_ray
      return;

    unsigned TFEVal = TFE ? TFE->getImm() : 0;
    unsigned LWEVal = LWE ? LWE->getImm() : 0;
    unsigned D16Val = D16 ? D16->getImm() : 0;

    if (!TFEVal && !LWEVal)
      return;

    // At least one of TFE or LWE is non-zero.
    // We have to insert a suitable initialization of the result value and
    // tie this to the dest of the image instruction.

    // Calculate which dword we have to initialize to 0.
    MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);

    // Check that the dmask operand was found.
    assert(MO_Dmask && "Expected dmask operand in instruction");

    unsigned dmask = MO_Dmask->getImm();
    // Determine the number of active lanes, taking into account the
    // Gather4 special case.
    unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);

    bool Packed = !Subtarget->hasUnpackedD16VMem();

    InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;

    // Abandon the attempt if the dst size isn't large enough. This is in fact
    // an error, but it is picked up elsewhere and reported correctly.
    uint32_t DstSize =
        TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
    if (DstSize < InitIdx)
      return;
  } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
    InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
  } else {
    return;
  }

  const DebugLoc &DL = MI.getDebugLoc();

  // Create a register for the initialization value.
  Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
  unsigned NewDst = 0; // The final initialized value will be in here.

  // If the PRTStrictNull feature is enabled (the default), then initialize
  // all the result registers to 0, otherwise just the error indication
  // register (VGPRn+1).
  unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
  unsigned CurrIdx = Subtarget->usePRTStrictNull() ?
0 : (InitIdx - 1); 15683 15684 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst); 15685 for (; SizeLeft; SizeLeft--, CurrIdx++) { 15686 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); 15687 // Initialize dword 15688 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 15689 // clang-format off 15690 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg) 15691 .addImm(0); 15692 // clang-format on 15693 // Insert into the super-reg 15694 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst) 15695 .addReg(PrevDst) 15696 .addReg(SubReg) 15697 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx)); 15698 15699 PrevDst = NewDst; 15700 } 15701 15702 // Add as an implicit operand 15703 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true)); 15704 15705 // Tie the just added implicit operand to the dst 15706 MI.tieOperands(DstIdx, MI.getNumOperands() - 1); 15707 } 15708 15709 /// Assign the register class depending on the number of 15710 /// bits set in the writemask 15711 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 15712 SDNode *Node) const { 15713 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 15714 15715 MachineFunction *MF = MI.getParent()->getParent(); 15716 MachineRegisterInfo &MRI = MF->getRegInfo(); 15717 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 15718 15719 if (TII->isVOP3(MI.getOpcode())) { 15720 // Make sure constant bus requirements are respected. 15721 TII->legalizeOperandsVOP3(MRI, MI); 15722 15723 // Prefer VGPRs over AGPRs in mAI instructions where possible. 15724 // This saves a chain-copy of registers and better balance register 15725 // use between vgpr and agpr as agpr tuples tend to be big. 15726 if (!MI.getDesc().operands().empty()) { 15727 unsigned Opc = MI.getOpcode(); 15728 bool HasAGPRs = Info->mayNeedAGPRs(); 15729 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 15730 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 15731 for (auto I : 15732 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), 15733 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) { 15734 if (I == -1) 15735 break; 15736 if ((I == Src2Idx) && (HasAGPRs)) 15737 break; 15738 MachineOperand &Op = MI.getOperand(I); 15739 if (!Op.isReg() || !Op.getReg().isVirtual()) 15740 continue; 15741 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg()); 15742 if (!TRI->hasAGPRs(RC)) 15743 continue; 15744 auto *Src = MRI.getUniqueVRegDef(Op.getReg()); 15745 if (!Src || !Src->isCopy() || 15746 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg())) 15747 continue; 15748 auto *NewRC = TRI->getEquivalentVGPRClass(RC); 15749 // All uses of agpr64 and agpr32 can also accept vgpr except for 15750 // v_accvgpr_read, but we do not produce agpr reads during selection, 15751 // so no use checks are needed. 15752 MRI.setRegClass(Op.getReg(), NewRC); 15753 } 15754 15755 if (TII->isMAI(MI)) { 15756 // The ordinary src0, src1, src2 were legalized above. 15757 // 15758 // We have to also legalize the appended v_mfma_ld_scale_b32 operands, 15759 // as a separate instruction. 
15760 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 15761 AMDGPU::OpName::scale_src0); 15762 if (Src0Idx != -1) { 15763 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 15764 AMDGPU::OpName::scale_src1); 15765 if (TII->usesConstantBus(MRI, MI, Src0Idx) && 15766 TII->usesConstantBus(MRI, MI, Src1Idx)) 15767 TII->legalizeOpWithMove(MI, Src1Idx); 15768 } 15769 } 15770 15771 if (!HasAGPRs) 15772 return; 15773 15774 // Resolve the rest of AV operands to AGPRs. 15775 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) { 15776 if (Src2->isReg() && Src2->getReg().isVirtual()) { 15777 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg()); 15778 if (TRI->isVectorSuperClass(RC)) { 15779 auto *NewRC = TRI->getEquivalentAGPRClass(RC); 15780 MRI.setRegClass(Src2->getReg(), NewRC); 15781 if (Src2->isTied()) 15782 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC); 15783 } 15784 } 15785 } 15786 } 15787 15788 return; 15789 } 15790 15791 if (TII->isImage(MI)) 15792 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr); 15793 } 15794 15795 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, 15796 uint64_t Val) { 15797 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); 15798 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); 15799 } 15800 15801 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, 15802 const SDLoc &DL, 15803 SDValue Ptr) const { 15804 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); 15805 15806 // Build the half of the subregister with the constants before building the 15807 // full 128-bit register. If we are building multiple resource descriptors, 15808 // this will allow CSEing of the 2-component register. 15809 const SDValue Ops0[] = { 15810 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), 15811 buildSMovImm32(DAG, DL, 0), 15812 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 15813 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), 15814 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)}; 15815 15816 SDValue SubRegHi = SDValue( 15817 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0); 15818 15819 // Combine the constants and the pointer. 15820 const SDValue Ops1[] = { 15821 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr, 15822 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi, 15823 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)}; 15824 15825 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); 15826 } 15827 15828 /// Return a resource descriptor with the 'Add TID' bit enabled 15829 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] 15830 /// of the resource descriptor) to create an offset, which is added to 15831 /// the resource pointer. 
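///
/// Informal sketch of the dword layout this helper assembles (see the
/// REG_SEQUENCE built below for the authoritative encoding):
///
///   sub0 = pointer bits [31:0]
///   sub1 = pointer bits [63:32], OR'd with RsrcDword1 (which carries the
///          stride and 'Add TID' fields)
///   sub2 = RsrcDword2And3 bits [31:0]
///   sub3 = RsrcDword2And3 bits [63:32]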
15832 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, 15833 SDValue Ptr, uint32_t RsrcDword1, 15834 uint64_t RsrcDword2And3) const { 15835 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); 15836 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); 15837 if (RsrcDword1) { 15838 PtrHi = 15839 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, 15840 DAG.getConstant(RsrcDword1, DL, MVT::i32)), 15841 0); 15842 } 15843 15844 SDValue DataLo = 15845 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); 15846 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); 15847 15848 const SDValue Ops[] = { 15849 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), 15850 PtrLo, 15851 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), 15852 PtrHi, 15853 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), 15854 DataLo, 15855 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), 15856 DataHi, 15857 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)}; 15858 15859 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); 15860 } 15861 15862 //===----------------------------------------------------------------------===// 15863 // SI Inline Assembly Support 15864 //===----------------------------------------------------------------------===// 15865 15866 std::pair<unsigned, const TargetRegisterClass *> 15867 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, 15868 StringRef Constraint, 15869 MVT VT) const { 15870 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_); 15871 15872 const TargetRegisterClass *RC = nullptr; 15873 if (Constraint.size() == 1) { 15874 const unsigned BitWidth = VT.getSizeInBits(); 15875 switch (Constraint[0]) { 15876 default: 15877 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 15878 case 's': 15879 case 'r': 15880 switch (BitWidth) { 15881 case 16: 15882 RC = &AMDGPU::SReg_32RegClass; 15883 break; 15884 case 64: 15885 RC = &AMDGPU::SGPR_64RegClass; 15886 break; 15887 default: 15888 RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth); 15889 if (!RC) 15890 return std::pair(0U, nullptr); 15891 break; 15892 } 15893 break; 15894 case 'v': 15895 switch (BitWidth) { 15896 case 16: 15897 RC = &AMDGPU::VGPR_32RegClass; 15898 break; 15899 default: 15900 RC = TRI->getVGPRClassForBitWidth(BitWidth); 15901 if (!RC) 15902 return std::pair(0U, nullptr); 15903 break; 15904 } 15905 break; 15906 case 'a': 15907 if (!Subtarget->hasMAIInsts()) 15908 break; 15909 switch (BitWidth) { 15910 case 16: 15911 RC = &AMDGPU::AGPR_32RegClass; 15912 break; 15913 default: 15914 RC = TRI->getAGPRClassForBitWidth(BitWidth); 15915 if (!RC) 15916 return std::pair(0U, nullptr); 15917 break; 15918 } 15919 break; 15920 } 15921 // We actually support i128, i16 and f16 as inline parameters 15922 // even if they are not reported as legal 15923 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 || 15924 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16)) 15925 return std::pair(0U, RC); 15926 } 15927 15928 if (Constraint.starts_with("{") && Constraint.ends_with("}")) { 15929 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2); 15930 if (RegName.consume_front("v")) { 15931 RC = &AMDGPU::VGPR_32RegClass; 15932 } else if (RegName.consume_front("s")) { 15933 RC = &AMDGPU::SGPR_32RegClass; 15934 } else if (RegName.consume_front("a")) { 15935 RC = &AMDGPU::AGPR_32RegClass; 15936 } 15937 15938 if (RC) { 15939 
uint32_t Idx; 15940 if (RegName.consume_front("[")) { 15941 uint32_t End; 15942 bool Failed = RegName.consumeInteger(10, Idx); 15943 Failed |= !RegName.consume_front(":"); 15944 Failed |= RegName.consumeInteger(10, End); 15945 Failed |= !RegName.consume_back("]"); 15946 if (!Failed) { 15947 uint32_t Width = (End - Idx + 1) * 32; 15948 // Prohibit constraints for register ranges with a width that does not 15949 // match the required type. 15950 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits()) 15951 return std::pair(0U, nullptr); 15952 MCRegister Reg = RC->getRegister(Idx); 15953 if (SIRegisterInfo::isVGPRClass(RC)) 15954 RC = TRI->getVGPRClassForBitWidth(Width); 15955 else if (SIRegisterInfo::isSGPRClass(RC)) 15956 RC = TRI->getSGPRClassForBitWidth(Width); 15957 else if (SIRegisterInfo::isAGPRClass(RC)) 15958 RC = TRI->getAGPRClassForBitWidth(Width); 15959 if (RC) { 15960 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC); 15961 if (!Reg) { 15962 // The register class does not contain the requested register, 15963 // e.g., because it is an SGPR pair that would violate alignment 15964 // requirements. 15965 return std::pair(0U, nullptr); 15966 } 15967 return std::pair(Reg, RC); 15968 } 15969 } 15970 } else { 15971 // Check for lossy scalar/vector conversions. 15972 if (VT.isVector() && VT.getSizeInBits() != 32) 15973 return std::pair(0U, nullptr); 15974 bool Failed = RegName.getAsInteger(10, Idx); 15975 if (!Failed && Idx < RC->getNumRegs()) 15976 return std::pair(RC->getRegister(Idx), RC); 15977 } 15978 } 15979 } 15980 15981 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 15982 if (Ret.first) 15983 Ret.second = TRI->getPhysRegBaseClass(Ret.first); 15984 15985 return Ret; 15986 } 15987 15988 static bool isImmConstraint(StringRef Constraint) { 15989 if (Constraint.size() == 1) { 15990 switch (Constraint[0]) { 15991 default: 15992 break; 15993 case 'I': 15994 case 'J': 15995 case 'A': 15996 case 'B': 15997 case 'C': 15998 return true; 15999 } 16000 } else if (Constraint == "DA" || Constraint == "DB") { 16001 return true; 16002 } 16003 return false; 16004 } 16005 16006 SITargetLowering::ConstraintType 16007 SITargetLowering::getConstraintType(StringRef Constraint) const { 16008 if (Constraint.size() == 1) { 16009 switch (Constraint[0]) { 16010 default: 16011 break; 16012 case 's': 16013 case 'v': 16014 case 'a': 16015 return C_RegisterClass; 16016 } 16017 } 16018 if (isImmConstraint(Constraint)) { 16019 return C_Other; 16020 } 16021 return TargetLowering::getConstraintType(Constraint); 16022 } 16023 16024 static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) { 16025 if (!AMDGPU::isInlinableIntLiteral(Val)) { 16026 Val = Val & maskTrailingOnes<uint64_t>(Size); 16027 } 16028 return Val; 16029 } 16030 16031 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op, 16032 StringRef Constraint, 16033 std::vector<SDValue> &Ops, 16034 SelectionDAG &DAG) const { 16035 if (isImmConstraint(Constraint)) { 16036 uint64_t Val; 16037 if (getAsmOperandConstVal(Op, Val) && 16038 checkAsmConstraintVal(Op, Constraint, Val)) { 16039 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits()); 16040 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64)); 16041 } 16042 } else { 16043 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 16044 } 16045 } 16046 16047 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const { 16048 unsigned Size = Op.getScalarValueSizeInBits(); 16049 if (Size > 64) 16050 return false; 
16051 16052 if (Size == 16 && !Subtarget->has16BitInsts()) 16053 return false; 16054 16055 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 16056 Val = C->getSExtValue(); 16057 return true; 16058 } 16059 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) { 16060 Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); 16061 return true; 16062 } 16063 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) { 16064 if (Size != 16 || Op.getNumOperands() != 2) 16065 return false; 16066 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef()) 16067 return false; 16068 if (ConstantSDNode *C = V->getConstantSplatNode()) { 16069 Val = C->getSExtValue(); 16070 return true; 16071 } 16072 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) { 16073 Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); 16074 return true; 16075 } 16076 } 16077 16078 return false; 16079 } 16080 16081 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint, 16082 uint64_t Val) const { 16083 if (Constraint.size() == 1) { 16084 switch (Constraint[0]) { 16085 case 'I': 16086 return AMDGPU::isInlinableIntLiteral(Val); 16087 case 'J': 16088 return isInt<16>(Val); 16089 case 'A': 16090 return checkAsmConstraintValA(Op, Val); 16091 case 'B': 16092 return isInt<32>(Val); 16093 case 'C': 16094 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) || 16095 AMDGPU::isInlinableIntLiteral(Val); 16096 default: 16097 break; 16098 } 16099 } else if (Constraint.size() == 2) { 16100 if (Constraint == "DA") { 16101 int64_t HiBits = static_cast<int32_t>(Val >> 32); 16102 int64_t LoBits = static_cast<int32_t>(Val); 16103 return checkAsmConstraintValA(Op, HiBits, 32) && 16104 checkAsmConstraintValA(Op, LoBits, 32); 16105 } 16106 if (Constraint == "DB") { 16107 return true; 16108 } 16109 } 16110 llvm_unreachable("Invalid asm constraint"); 16111 } 16112 16113 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val, 16114 unsigned MaxSize) const { 16115 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize); 16116 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm(); 16117 if (Size == 16) { 16118 MVT VT = Op.getSimpleValueType(); 16119 switch (VT.SimpleTy) { 16120 default: 16121 return false; 16122 case MVT::i16: 16123 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi); 16124 case MVT::f16: 16125 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi); 16126 case MVT::bf16: 16127 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi); 16128 case MVT::v2i16: 16129 return AMDGPU::getInlineEncodingV2I16(Val).has_value(); 16130 case MVT::v2f16: 16131 return AMDGPU::getInlineEncodingV2F16(Val).has_value(); 16132 case MVT::v2bf16: 16133 return AMDGPU::getInlineEncodingV2BF16(Val).has_value(); 16134 } 16135 } 16136 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) || 16137 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) 16138 return true; 16139 return false; 16140 } 16141 16142 static int getAlignedAGPRClassID(unsigned UnalignedClassID) { 16143 switch (UnalignedClassID) { 16144 case AMDGPU::VReg_64RegClassID: 16145 return AMDGPU::VReg_64_Align2RegClassID; 16146 case AMDGPU::VReg_96RegClassID: 16147 return AMDGPU::VReg_96_Align2RegClassID; 16148 case AMDGPU::VReg_128RegClassID: 16149 return AMDGPU::VReg_128_Align2RegClassID; 16150 case AMDGPU::VReg_160RegClassID: 16151 return AMDGPU::VReg_160_Align2RegClassID; 16152 case AMDGPU::VReg_192RegClassID: 16153 return AMDGPU::VReg_192_Align2RegClassID; 16154 case 
AMDGPU::VReg_224RegClassID: 16155 return AMDGPU::VReg_224_Align2RegClassID; 16156 case AMDGPU::VReg_256RegClassID: 16157 return AMDGPU::VReg_256_Align2RegClassID; 16158 case AMDGPU::VReg_288RegClassID: 16159 return AMDGPU::VReg_288_Align2RegClassID; 16160 case AMDGPU::VReg_320RegClassID: 16161 return AMDGPU::VReg_320_Align2RegClassID; 16162 case AMDGPU::VReg_352RegClassID: 16163 return AMDGPU::VReg_352_Align2RegClassID; 16164 case AMDGPU::VReg_384RegClassID: 16165 return AMDGPU::VReg_384_Align2RegClassID; 16166 case AMDGPU::VReg_512RegClassID: 16167 return AMDGPU::VReg_512_Align2RegClassID; 16168 case AMDGPU::VReg_1024RegClassID: 16169 return AMDGPU::VReg_1024_Align2RegClassID; 16170 case AMDGPU::AReg_64RegClassID: 16171 return AMDGPU::AReg_64_Align2RegClassID; 16172 case AMDGPU::AReg_96RegClassID: 16173 return AMDGPU::AReg_96_Align2RegClassID; 16174 case AMDGPU::AReg_128RegClassID: 16175 return AMDGPU::AReg_128_Align2RegClassID; 16176 case AMDGPU::AReg_160RegClassID: 16177 return AMDGPU::AReg_160_Align2RegClassID; 16178 case AMDGPU::AReg_192RegClassID: 16179 return AMDGPU::AReg_192_Align2RegClassID; 16180 case AMDGPU::AReg_256RegClassID: 16181 return AMDGPU::AReg_256_Align2RegClassID; 16182 case AMDGPU::AReg_512RegClassID: 16183 return AMDGPU::AReg_512_Align2RegClassID; 16184 case AMDGPU::AReg_1024RegClassID: 16185 return AMDGPU::AReg_1024_Align2RegClassID; 16186 default: 16187 return -1; 16188 } 16189 } 16190 16191 // Figure out which registers should be reserved for stack access. Only after 16192 // the function is legalized do we know all of the non-spill stack objects or if 16193 // calls are present. 16194 void SITargetLowering::finalizeLowering(MachineFunction &MF) const { 16195 MachineRegisterInfo &MRI = MF.getRegInfo(); 16196 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 16197 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 16198 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 16199 const SIInstrInfo *TII = ST.getInstrInfo(); 16200 16201 if (Info->isEntryFunction()) { 16202 // Callable functions have fixed registers used for stack access. 16203 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); 16204 } 16205 16206 // TODO: Move this logic to getReservedRegs() 16207 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling. 16208 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); 16209 Register SReg = ST.isWave32() 16210 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1) 16211 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2, 16212 &AMDGPU::SGPR_64RegClass); 16213 Info->setSGPRForEXECCopy(SReg); 16214 16215 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), 16216 Info->getStackPtrOffsetReg())); 16217 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG) 16218 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); 16219 16220 // We need to worry about replacing the default register with itself in case 16221 // of MIR testcases missing the MFI. 
16222 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG) 16223 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg()); 16224 16225 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG) 16226 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); 16227 16228 Info->limitOccupancy(MF); 16229 16230 if (ST.isWave32() && !MF.empty()) { 16231 for (auto &MBB : MF) { 16232 for (auto &MI : MBB) { 16233 TII->fixImplicitOperands(MI); 16234 } 16235 } 16236 } 16237 16238 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned 16239 // classes if required. Ideally the register class constraints would differ 16240 // per-subtarget, but there's no easy way to achieve that right now. This is 16241 // not a problem for VGPRs because the correctly aligned VGPR class is implied 16242 // from using them as the register class for legal types. 16243 if (ST.needsAlignedVGPRs()) { 16244 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { 16245 const Register Reg = Register::index2VirtReg(I); 16246 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); 16247 if (!RC) 16248 continue; 16249 int NewClassID = getAlignedAGPRClassID(RC->getID()); 16250 if (NewClassID != -1) 16251 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID)); 16252 } 16253 } 16254 16255 TargetLoweringBase::finalizeLowering(MF); 16256 } 16257 16258 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 16259 KnownBits &Known, 16260 const APInt &DemandedElts, 16261 const SelectionDAG &DAG, 16262 unsigned Depth) const { 16263 Known.resetAll(); 16264 unsigned Opc = Op.getOpcode(); 16265 switch (Opc) { 16266 case ISD::INTRINSIC_WO_CHAIN: { 16267 unsigned IID = Op.getConstantOperandVal(0); 16268 switch (IID) { 16269 case Intrinsic::amdgcn_mbcnt_lo: 16270 case Intrinsic::amdgcn_mbcnt_hi: { 16271 const GCNSubtarget &ST = 16272 DAG.getMachineFunction().getSubtarget<GCNSubtarget>(); 16273 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at 16274 // most 31 + src1. 16275 Known.Zero.setBitsFrom( 16276 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5); 16277 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1); 16278 Known = KnownBits::add(Known, Known2); 16279 return; 16280 } 16281 } 16282 break; 16283 } 16284 } 16285 return AMDGPUTargetLowering::computeKnownBitsForTargetNode( 16286 Op, Known, DemandedElts, DAG, Depth); 16287 } 16288 16289 void SITargetLowering::computeKnownBitsForFrameIndex( 16290 const int FI, KnownBits &Known, const MachineFunction &MF) const { 16291 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF); 16292 16293 // Set the high bits to zero based on the maximum allowed scratch size per 16294 // wave. We can't use vaddr in MUBUF instructions if we don't know the address 16295 // calculation won't overflow, so assume the sign bit is never set. 
16296 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex()); 16297 } 16298 16299 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, 16300 KnownBits &Known, unsigned Dim) { 16301 unsigned MaxValue = 16302 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim); 16303 Known.Zero.setHighBits(llvm::countl_zero(MaxValue)); 16304 } 16305 16306 void SITargetLowering::computeKnownBitsForTargetInstr( 16307 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts, 16308 const MachineRegisterInfo &MRI, unsigned Depth) const { 16309 const MachineInstr *MI = MRI.getVRegDef(R); 16310 switch (MI->getOpcode()) { 16311 case AMDGPU::G_INTRINSIC: 16312 case AMDGPU::G_INTRINSIC_CONVERGENT: { 16313 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID(); 16314 switch (IID) { 16315 case Intrinsic::amdgcn_workitem_id_x: 16316 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0); 16317 break; 16318 case Intrinsic::amdgcn_workitem_id_y: 16319 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1); 16320 break; 16321 case Intrinsic::amdgcn_workitem_id_z: 16322 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2); 16323 break; 16324 case Intrinsic::amdgcn_mbcnt_lo: 16325 case Intrinsic::amdgcn_mbcnt_hi: { 16326 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at 16327 // most 31 + src1. 16328 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo 16329 ? getSubtarget()->getWavefrontSizeLog2() 16330 : 5); 16331 KnownBits Known2; 16332 KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts, 16333 Depth + 1); 16334 Known = KnownBits::add(Known, Known2); 16335 break; 16336 } 16337 case Intrinsic::amdgcn_groupstaticsize: { 16338 // We can report everything over the maximum size as 0. We can't report 16339 // based on the actual size because we don't know if it's accurate or not 16340 // at any given point. 16341 Known.Zero.setHighBits( 16342 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize())); 16343 break; 16344 } 16345 } 16346 break; 16347 } 16348 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 16349 Known.Zero.setHighBits(24); 16350 break; 16351 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 16352 Known.Zero.setHighBits(16); 16353 break; 16354 case AMDGPU::G_AMDGPU_SMED3: 16355 case AMDGPU::G_AMDGPU_UMED3: { 16356 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs(); 16357 16358 KnownBits Known2; 16359 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1); 16360 if (Known2.isUnknown()) 16361 break; 16362 16363 KnownBits Known1; 16364 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1); 16365 if (Known1.isUnknown()) 16366 break; 16367 16368 KnownBits Known0; 16369 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1); 16370 if (Known0.isUnknown()) 16371 break; 16372 16373 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling. 16374 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero; 16375 Known.One = Known0.One & Known1.One & Known2.One; 16376 break; 16377 } 16378 } 16379 } 16380 16381 Align SITargetLowering::computeKnownAlignForTargetInstr( 16382 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI, 16383 unsigned Depth) const { 16384 const MachineInstr *MI = MRI.getVRegDef(R); 16385 if (auto *GI = dyn_cast<GIntrinsic>(MI)) { 16386 // FIXME: Can this move to generic code? What about the case where the call 16387 // site specifies a lower alignment? 
    Intrinsic::ID IID = GI->getIntrinsicID();
    LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
    AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
    if (MaybeAlign RetAlign = Attrs.getRetAlignment())
      return *RetAlign;
  }
  return Align(1);
}

Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
  const Align CacheLineAlign = Align(64);

  // Pre-GFX10 targets did not benefit from loop alignment.
  if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;

  // On GFX10 the I$ consists of 4 x 64-byte cache lines.
  // By default the prefetcher keeps one cache line behind and reads two ahead.
  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
  // behind and one ahead.
  // Therefore we can benefit from aligning loop headers if the loop fits in
  // 192 bytes.
  // If the loop fits in 64 bytes it always spans no more than two cache lines
  // and does not need alignment.
  // Otherwise, if the loop is at most 128 bytes we do not need to modify the
  // prefetch settings; if it is at most 192 bytes we need two lines behind.

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If an inner loop block is aligned, assume on average half of the
    // alignment size is added as nops.
    if (MBB != Header)
      LoopSize += MBB->getAlignment().value() / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of the parent loops is surrounded by prefetch instructions, do not
  // insert new ones for the inner loop, which would reset the parent's
  // settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  if (Pre && Exit) {
    auto PreTerm = Pre->getFirstTerminator();
    if (PreTerm == Pre->begin() ||
        std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
          .addImm(1); // prefetch 2 lines behind PC

    auto ExitHead = Exit->getFirstNonDebugInstr();
    if (ExitHead == Exit->end() ||
        ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
          .addImm(2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}

LLVM_ATTRIBUTE_UNUSED
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
  assert(N->getOpcode() == ISD::CopyFromReg);
  do {
    // Follow the chain until we find an INLINEASM node.
16476 N = N->getOperand(0).getNode(); 16477 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR) 16478 return true; 16479 } while (N->getOpcode() == ISD::CopyFromReg); 16480 return false; 16481 } 16482 16483 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, 16484 FunctionLoweringInfo *FLI, 16485 UniformityInfo *UA) const { 16486 switch (N->getOpcode()) { 16487 case ISD::CopyFromReg: { 16488 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1)); 16489 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo(); 16490 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); 16491 Register Reg = R->getReg(); 16492 16493 // FIXME: Why does this need to consider isLiveIn? 16494 if (Reg.isPhysical() || MRI.isLiveIn(Reg)) 16495 return !TRI->isSGPRReg(MRI, Reg); 16496 16497 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg())) 16498 return UA->isDivergent(V); 16499 16500 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N)); 16501 return !TRI->isSGPRReg(MRI, Reg); 16502 } 16503 case ISD::LOAD: { 16504 const LoadSDNode *L = cast<LoadSDNode>(N); 16505 unsigned AS = L->getAddressSpace(); 16506 // A flat load may access private memory. 16507 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; 16508 } 16509 case ISD::CALLSEQ_END: 16510 return true; 16511 case ISD::INTRINSIC_WO_CHAIN: 16512 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0)); 16513 case ISD::INTRINSIC_W_CHAIN: 16514 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1)); 16515 case AMDGPUISD::ATOMIC_CMP_SWAP: 16516 case AMDGPUISD::BUFFER_ATOMIC_SWAP: 16517 case AMDGPUISD::BUFFER_ATOMIC_ADD: 16518 case AMDGPUISD::BUFFER_ATOMIC_SUB: 16519 case AMDGPUISD::BUFFER_ATOMIC_SMIN: 16520 case AMDGPUISD::BUFFER_ATOMIC_UMIN: 16521 case AMDGPUISD::BUFFER_ATOMIC_SMAX: 16522 case AMDGPUISD::BUFFER_ATOMIC_UMAX: 16523 case AMDGPUISD::BUFFER_ATOMIC_AND: 16524 case AMDGPUISD::BUFFER_ATOMIC_OR: 16525 case AMDGPUISD::BUFFER_ATOMIC_XOR: 16526 case AMDGPUISD::BUFFER_ATOMIC_INC: 16527 case AMDGPUISD::BUFFER_ATOMIC_DEC: 16528 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: 16529 case AMDGPUISD::BUFFER_ATOMIC_CSUB: 16530 case AMDGPUISD::BUFFER_ATOMIC_FADD: 16531 case AMDGPUISD::BUFFER_ATOMIC_FMIN: 16532 case AMDGPUISD::BUFFER_ATOMIC_FMAX: 16533 // Target-specific read-modify-write atomics are sources of divergence. 16534 return true; 16535 default: 16536 if (auto *A = dyn_cast<AtomicSDNode>(N)) { 16537 // Generic read-modify-write atomics are sources of divergence. 
16538 return A->readMem() && A->writeMem(); 16539 } 16540 return false; 16541 } 16542 } 16543 16544 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG, 16545 EVT VT) const { 16546 switch (VT.getScalarType().getSimpleVT().SimpleTy) { 16547 case MVT::f32: 16548 return !denormalModeIsFlushAllF32(DAG.getMachineFunction()); 16549 case MVT::f64: 16550 case MVT::f16: 16551 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction()); 16552 default: 16553 return false; 16554 } 16555 } 16556 16557 bool SITargetLowering::denormalsEnabledForType( 16558 LLT Ty, const MachineFunction &MF) const { 16559 switch (Ty.getScalarSizeInBits()) { 16560 case 32: 16561 return !denormalModeIsFlushAllF32(MF); 16562 case 64: 16563 case 16: 16564 return !denormalModeIsFlushAllF64F16(MF); 16565 default: 16566 return false; 16567 } 16568 } 16569 16570 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, 16571 const SelectionDAG &DAG, 16572 bool SNaN, 16573 unsigned Depth) const { 16574 if (Op.getOpcode() == AMDGPUISD::CLAMP) { 16575 const MachineFunction &MF = DAG.getMachineFunction(); 16576 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 16577 16578 if (Info->getMode().DX10Clamp) 16579 return true; // Clamped to 0. 16580 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); 16581 } 16582 16583 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, SNaN, 16584 Depth); 16585 } 16586 16587 // On older subtargets, global FP atomic instructions have a hardcoded FP mode 16588 // and do not support FP32 denormals, and only support v2f16/f64 denormals. 16589 static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) { 16590 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode")) 16591 return true; 16592 16593 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics(); 16594 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt); 16595 if (DenormMode == DenormalMode::getPreserveSign()) 16596 return true; 16597 16598 // TODO: Remove this. 16599 return RMW->getFunction() 16600 ->getFnAttribute("amdgpu-unsafe-fp-atomics") 16601 .getValueAsBool(); 16602 } 16603 16604 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) { 16605 LLVMContext &Ctx = RMW->getContext(); 16606 StringRef SS = Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or(""); 16607 StringRef MemScope = SS.empty() ? StringRef("system") : SS; 16608 16609 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW) 16610 << "Hardware instruction generated for atomic " 16611 << RMW->getOperationName(RMW->getOperation()) 16612 << " operation at memory scope " << MemScope; 16613 } 16614 16615 static bool isV2F16OrV2BF16(Type *Ty) { 16616 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { 16617 Type *EltTy = VT->getElementType(); 16618 return VT->getNumElements() == 2 && 16619 (EltTy->isHalfTy() || EltTy->isBFloatTy()); 16620 } 16621 16622 return false; 16623 } 16624 16625 static bool isV2F16(Type *Ty) { 16626 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty); 16627 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy(); 16628 } 16629 16630 static bool isV2BF16(Type *Ty) { 16631 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty); 16632 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy(); 16633 } 16634 16635 /// \return true if atomicrmw integer ops work for the type. 
16636 static bool isAtomicRMWLegalIntTy(Type *Ty) { 16637 if (auto *IT = dyn_cast<IntegerType>(Ty)) { 16638 unsigned BW = IT->getBitWidth(); 16639 return BW == 32 || BW == 64; 16640 } 16641 16642 return false; 16643 } 16644 16645 /// \return true if this atomicrmw xchg type can be selected. 16646 static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) { 16647 Type *Ty = RMW->getType(); 16648 if (isAtomicRMWLegalIntTy(Ty)) 16649 return true; 16650 16651 if (PointerType *PT = dyn_cast<PointerType>(Ty)) { 16652 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout(); 16653 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace()); 16654 return BW == 32 || BW == 64; 16655 } 16656 16657 if (Ty->isFloatTy() || Ty->isDoubleTy()) 16658 return true; 16659 16660 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) { 16661 return VT->getNumElements() == 2 && 16662 VT->getElementType()->getPrimitiveSizeInBits() == 16; 16663 } 16664 16665 return false; 16666 } 16667 16668 /// \returns true if it's valid to emit a native instruction for \p RMW, based 16669 /// on the properties of the target memory. 16670 static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, 16671 const AtomicRMWInst *RMW, 16672 bool HasSystemScope) { 16673 // The remote/fine-grained access logic is different from the integer 16674 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support, 16675 // fine-grained access does not work, even for a device local allocation. 16676 // 16677 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local 16678 // allocations work. 16679 if (HasSystemScope) { 16680 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() && 16681 RMW->hasMetadata("amdgpu.no.remote.memory")) 16682 return true; 16683 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics()) 16684 return true; 16685 16686 return RMW->hasMetadata("amdgpu.no.fine.grained.memory"); 16687 } 16688 16689 /// \return Action to perform on AtomicRMWInsts for integer operations. 16690 static TargetLowering::AtomicExpansionKind 16691 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) { 16692 return isAtomicRMWLegalIntTy(RMW->getType()) 16693 ? TargetLowering::AtomicExpansionKind::None 16694 : TargetLowering::AtomicExpansionKind::CmpXChg; 16695 } 16696 16697 /// Return if a flat address space atomicrmw can access private memory. 16698 static bool flatInstrMayAccessPrivate(const Instruction *I) { 16699 const MDNode *NoaliasAddrSpaceMD = 16700 I->getMetadata(LLVMContext::MD_noalias_addrspace); 16701 if (!NoaliasAddrSpaceMD) 16702 return true; 16703 16704 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E; 16705 ++I) { 16706 auto *Low = mdconst::extract<ConstantInt>( 16707 NoaliasAddrSpaceMD->getOperand(2 * I + 0)); 16708 if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) { 16709 auto *High = mdconst::extract<ConstantInt>( 16710 NoaliasAddrSpaceMD->getOperand(2 * I + 1)); 16711 return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS); 16712 } 16713 } 16714 16715 return true; 16716 } 16717 16718 TargetLowering::AtomicExpansionKind 16719 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { 16720 unsigned AS = RMW->getPointerAddressSpace(); 16721 if (AS == AMDGPUAS::PRIVATE_ADDRESS) 16722 return AtomicExpansionKind::NotAtomic; 16723 16724 // 64-bit flat atomics that dynamically reside in private memory will silently 16725 // be dropped. 
16726 // 16727 // Note that we will emit a new copy of the original atomic in the expansion, 16728 // which will be incrementally relegalized. 16729 const DataLayout &DL = RMW->getFunction()->getDataLayout(); 16730 if (AS == AMDGPUAS::FLAT_ADDRESS && 16731 DL.getTypeSizeInBits(RMW->getType()) == 64 && 16732 flatInstrMayAccessPrivate(RMW)) 16733 return AtomicExpansionKind::Expand; 16734 16735 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) { 16736 OptimizationRemarkEmitter ORE(RMW->getFunction()); 16737 ORE.emit([=]() { 16738 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request."; 16739 }); 16740 return Kind; 16741 }; 16742 16743 auto SSID = RMW->getSyncScopeID(); 16744 bool HasSystemScope = 16745 SSID == SyncScope::System || 16746 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"); 16747 16748 auto Op = RMW->getOperation(); 16749 switch (Op) { 16750 case AtomicRMWInst::Xchg: { 16751 // PCIe supports add and xchg for system atomics. 16752 return isAtomicRMWLegalXChgTy(RMW) 16753 ? TargetLowering::AtomicExpansionKind::None 16754 : TargetLowering::AtomicExpansionKind::CmpXChg; 16755 } 16756 case AtomicRMWInst::Add: 16757 case AtomicRMWInst::And: 16758 case AtomicRMWInst::UIncWrap: 16759 case AtomicRMWInst::UDecWrap: 16760 return atomicSupportedIfLegalIntType(RMW); 16761 case AtomicRMWInst::Sub: 16762 case AtomicRMWInst::Or: 16763 case AtomicRMWInst::Xor: { 16764 // Atomic sub/or/xor do not work over PCI express, but atomic add 16765 // does. InstCombine transforms these with 0 to or, so undo that. 16766 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) { 16767 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand()); 16768 ConstVal && ConstVal->isNullValue()) 16769 return AtomicExpansionKind::Expand; 16770 } 16771 16772 return atomicSupportedIfLegalIntType(RMW); 16773 } 16774 case AtomicRMWInst::FAdd: { 16775 Type *Ty = RMW->getType(); 16776 16777 // TODO: Handle REGION_ADDRESS 16778 if (AS == AMDGPUAS::LOCAL_ADDRESS) { 16779 // DS F32 FP atomics do respect the denormal mode, but the rounding mode 16780 // is fixed to round-to-nearest-even. 16781 // 16782 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to 16783 // round-to-nearest-even. 16784 // 16785 // We ignore the rounding mode problem, even in strictfp. The C++ standard 16786 // suggests it is OK if the floating-point mode may not match the calling 16787 // thread. 16788 if (Ty->isFloatTy()) { 16789 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None 16790 : AtomicExpansionKind::CmpXChg; 16791 } 16792 16793 if (Ty->isDoubleTy()) { 16794 // Ignores denormal mode, but we don't consider flushing mandatory. 16795 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None 16796 : AtomicExpansionKind::CmpXChg; 16797 } 16798 16799 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty)) 16800 return AtomicExpansionKind::None; 16801 16802 return AtomicExpansionKind::CmpXChg; 16803 } 16804 16805 // LDS atomics respect the denormal mode from the mode register. 16806 // 16807 // Traditionally f32 global/buffer memory atomics would unconditionally 16808 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never 16809 // flush. 16810 // 16811 // On targets with flat atomic fadd, denormals would flush depending on 16812 // whether the target address resides in LDS or global memory. We consider 16813 // this flat-maybe-flush as will-flush. 
16814 if (Ty->isFloatTy() && 16815 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() && 16816 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW)) 16817 return AtomicExpansionKind::CmpXChg; 16818 16819 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are 16820 // safe. The message phrasing also should be better. 16821 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) { 16822 if (AS == AMDGPUAS::FLAT_ADDRESS) { 16823 // gfx940, gfx12 16824 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty)) 16825 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16826 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) { 16827 // gfx90a, gfx940, gfx12 16828 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty)) 16829 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16830 16831 // gfx940, gfx12 16832 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty)) 16833 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16834 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) { 16835 // gfx90a, gfx940, gfx12 16836 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty)) 16837 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16838 16839 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for 16840 // buffer. gfx12 does have the buffer version. 16841 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty)) 16842 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16843 } 16844 16845 // global and flat atomic fadd f64: gfx90a, gfx940. 16846 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy()) 16847 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16848 16849 if (AS != AMDGPUAS::FLAT_ADDRESS) { 16850 if (Ty->isFloatTy()) { 16851 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, 16852 // gfx11+. 16853 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) 16854 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16855 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+. 16856 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) 16857 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16858 } else { 16859 // gfx908 16860 if (RMW->use_empty() && 16861 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && 16862 isV2F16(Ty)) 16863 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16864 } 16865 } 16866 16867 // flat atomic fadd f32: gfx940, gfx11+. 16868 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) { 16869 if (Subtarget->hasFlatAtomicFaddF32Inst()) 16870 return ReportUnsafeHWInst(AtomicExpansionKind::None); 16871 16872 // If it is in flat address space, and the type is float, we will try to 16873 // expand it, if the target supports global and lds atomic fadd. The 16874 // reason we need that is, in the expansion, we emit the check of 16875 // address space. If it is in global address space, we emit the global 16876 // atomic fadd; if it is in shared address space, we emit the LDS atomic 16877 // fadd. 16878 if (Subtarget->hasLDSFPAtomicAddF32()) { 16879 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) 16880 return AtomicExpansionKind::Expand; 16881 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) 16882 return AtomicExpansionKind::Expand; 16883 } 16884 } 16885 } 16886 16887 return AtomicExpansionKind::CmpXChg; 16888 } 16889 case AtomicRMWInst::FMin: 16890 case AtomicRMWInst::FMax: { 16891 Type *Ty = RMW->getType(); 16892 16893 // LDS float and double fmin/fmax were always supported. 
    if (AS == AMDGPUAS::LOCAL_ADDRESS) {
      return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
                                                 : AtomicExpansionKind::CmpXChg;
    }

    if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
      // For flat and global cases:
      // float, double in gfx7. Manual claims denormal support.
      // Removed in gfx8.
      // float, double restored in gfx10.
      // double removed again in gfx11, so only f32 for gfx11/gfx12.
      //
      // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
      // no f32.
      if (AS == AMDGPUAS::FLAT_ADDRESS) {
        if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
        if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
                 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
        if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
        if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      }
    }

    return AtomicExpansionKind::CmpXChg;
  }
  case AtomicRMWInst::Min:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::UMax: {
    if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
        AS == AMDGPUAS::BUFFER_FAT_POINTER) {
      // Always expand system scope min/max atomics.
      if (HasSystemScope)
        return AtomicExpansionKind::CmpXChg;
    }

    return atomicSupportedIfLegalIntType(RMW);
  }
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::FSub:
  default:
    return AtomicExpansionKind::CmpXChg;
  }

  llvm_unreachable("covered atomicrmw op switch");
}

TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
             ? AtomicExpansionKind::NotAtomic
             : AtomicExpansionKind::None;
}

TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
             ? AtomicExpansionKind::NotAtomic
             : AtomicExpansionKind::None;
}

TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
  unsigned AddrSpace = CmpX->getPointerAddressSpace();
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return AtomicExpansionKind::NotAtomic;

  if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
    return AtomicExpansionKind::None;

  const DataLayout &DL = CmpX->getDataLayout();

  Type *ValTy = CmpX->getNewValOperand()->getType();

  // If a 64-bit flat atomic may alias private, we need to avoid using the
  // atomic in the private case.
  return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
                                           : AtomicExpansionKind::None;
}

const TargetRegisterClass *
SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
  const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
    return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
                                 : &AMDGPU::SReg_32RegClass;
  if (!TRI->isSGPRClass(RC) && !isDivergent)
    return TRI->getEquivalentSGPRClass(RC);
  if (TRI->isSGPRClass(RC) && isDivergent)
    return TRI->getEquivalentVGPRClass(RC);

  return RC;
}

// FIXME: This is a workaround for DivergenceAnalysis not understanding always
// uniform values (as produced by the mask results of control flow intrinsics)
// used outside of divergent blocks. The phi users need to also be treated as
// always uniform.
//
// FIXME: DA is no longer in use. Does this still apply to UniformityAnalysis?
static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
                      unsigned WaveSize) {
  // FIXME: We assume we never cast the mask results of a control flow
  // intrinsic.
  // Early exit if the type won't be consistent as a compile time hack.
  IntegerType *IT = dyn_cast<IntegerType>(V->getType());
  if (!IT || IT->getBitWidth() != WaveSize)
    return false;

  if (!isa<Instruction>(V))
    return false;
  if (!Visited.insert(V).second)
    return false;
  bool Result = false;
  for (const auto *U : V->users()) {
    if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
      if (V == U->getOperand(1)) {
        switch (Intrinsic->getIntrinsicID()) {
        default:
          Result = false;
          break;
        case Intrinsic::amdgcn_if_break:
        case Intrinsic::amdgcn_if:
        case Intrinsic::amdgcn_else:
          Result = true;
          break;
        }
      }
      if (V == U->getOperand(0)) {
        switch (Intrinsic->getIntrinsicID()) {
        default:
          Result = false;
          break;
        case Intrinsic::amdgcn_end_cf:
        case Intrinsic::amdgcn_loop:
          Result = true;
          break;
        }
      }
    } else {
      Result = hasCFUser(U, Visited, WaveSize);
    }
    if (Result)
      break;
  }
  return Result;
}

bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
                                               const Value *V) const {
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm()) {
      // FIXME: This cannot give a correct answer. This should only trigger in
      // the case where inline asm returns mixed SGPR and VGPR results, used
      // outside the defining block. We don't have a specific result to
      // consider, so this assumes that if any output is an SGPR, the overall
      // register also needs to be an SGPR.
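      //
      // Illustrative example (not taken from a real test): a wave-level mask
      // produced by inline asm with an SGPR output constraint, e.g.
      //   %mask = call i64 asm "; def $0", "=s"()
      // and used by a phi in another block should stay in an SGPR, so we
      // return true as soon as any output constraint maps to an SGPR class.
      //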
      const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
      TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
          MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
      for (auto &TC : TargetConstraints) {
        if (TC.Type == InlineAsm::isOutput) {
          ComputeConstraintToUse(TC, SDValue());
          const TargetRegisterClass *RC =
              getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
                                           TC.ConstraintVT)
                  .second;
          if (RC && SIRI->isSGPRClass(RC))
            return true;
        }
      }
    }
  }
  SmallPtrSet<const Value *, 16> Visited;
  return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
}

bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
  for (SDUse &Use : N->uses()) {
    if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
      if (getBasePtrIndex(M) == Use.getOperandNo())
        return true;
    }
  }
  return false;
}

bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
                                           SDValue N1) const {
  if (!N0.hasOneUse())
    return false;
  // Take care of the opportunity to keep N0 uniform.
  if (N0->isDivergent() || !N1->isDivergent())
    return true;
  // Check if we have a good chance to form the memory access pattern with the
  // base and offset.
  return (DAG.isBaseWithConstantOffset(N0) &&
          hasMemSDNodeUser(*N0->user_begin()));
}

bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                           Register N0, Register N1) const {
  return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}

MachineMemOperand::Flags
SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
  // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a
  // load.
  MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
  if (I.getMetadata("amdgpu.noclobber"))
    Flags |= MONoClobber;
  if (I.getMetadata("amdgpu.last.use"))
    Flags |= MOLastUse;
  return Flags;
}

bool SITargetLowering::checkForPhysRegDependency(
    SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
    const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
  if (User->getOpcode() != ISD::CopyToReg)
    return false;
  if (!Def->isMachineOpcode())
    return false;
  MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
  if (!MDef)
    return false;

  unsigned ResNo = User->getOperand(Op).getResNo();
  if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
    return false;
  const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
  if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
    PhysReg = AMDGPU::SCC;
    const TargetRegisterClass *RC =
        TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
    Cost = RC->getCopyCost();
    return true;
  }
  return false;
}

/// Check if it is profitable to hoist an instruction in then/else to if.
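///
/// For example, with contract fast-math flags on both instructions and a
/// target where FMA is faster than a separate multiply and add,
///   %m = fmul contract float %a, %b
///   %r = fadd contract float %m, %c
/// should stay together so the pair can later form an FMA, so hoisting %m away
/// from its user is reported as not profitable (an illustrative reading of the
/// FMul case below, not an exhaustive rule).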
bool SITargetLowering::isProfitableToHoist(Instruction *I) const {
  if (!I->hasOneUse())
    return true;

  Instruction *User = I->user_back();
  // TODO: Add more patterns that are not profitable to hoist and
  // handle modifiers such as fabs and fneg
  switch (I->getOpcode()) {
  case Instruction::FMul: {
    if (User->getOpcode() != Instruction::FSub &&
        User->getOpcode() != Instruction::FAdd)
      return true;

    const TargetOptions &Options = getTargetMachine().Options;

    return ((!I->hasAllowContract() || !User->hasAllowContract()) &&
            Options.AllowFPOpFusion != FPOpFusion::Fast &&
            !Options.UnsafeFPMath) ||
           !isFMAFasterThanFMulAndFAdd(*I->getFunction(), User->getType());
  }
  default:
    return true;
  }
  return true;
}

void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
    Instruction *AI) const {
  // Given: atomicrmw fadd ptr %addr, float %val ordering
  //
  // With this expansion we produce the following code:
  //   [...]
  //   %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
  //   br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
  //
  // atomicrmw.shared:
  //   %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
  //   %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
  //                                   float %val ordering
  //   br label %atomicrmw.phi
  //
  // atomicrmw.check.private:
  //   %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
  //   br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
  //
  // atomicrmw.private:
  //   %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
  //   %loaded.private = load float, ptr addrspace(5) %cast.private
  //   %val.new = fadd float %loaded.private, %val
  //   store float %val.new, ptr addrspace(5) %cast.private
  //   br label %atomicrmw.phi
  //
  // atomicrmw.global:
  //   %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
  //   %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
  //                                   float %val ordering
  //   br label %atomicrmw.phi
  //
  // atomicrmw.phi:
  //   %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
  //                           [ %loaded.private, %atomicrmw.private ],
  //                           [ %loaded.global, %atomicrmw.global ]
  //   br label %atomicrmw.end
  //
  // atomicrmw.end:
  //   [...]
  //
  // For 64-bit atomics which may reside in private memory, we perform a
  // simpler version that only inserts the private check, and uses the flat
  // operation.

  IRBuilder<> Builder(AI);
  LLVMContext &Ctx = Builder.getContext();

  auto *RMW = dyn_cast<AtomicRMWInst>(AI);
  const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
                                : AtomicCmpXchgInst::getPointerOperandIndex();
  Value *Addr = AI->getOperand(PtrOpIdx);

  /// TODO: Only need to check private, then emit flat-known-not private (no
  /// need for shared block, or cast to global).
  AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);

  Align Alignment;
  if (RMW)
    Alignment = RMW->getAlign();
  else if (CX)
    Alignment = CX->getAlign();
  else
    llvm_unreachable("unhandled atomic operation");

  // FullFlatEmulation is true if we need to issue the private, shared, and
  // global cases.
  //
  // If this is false, we are only dealing with the flat-targeting-private case,
  // where we only insert a check for private and still use the flat instruction
  // for global and shared.

  bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
                           Subtarget->hasAtomicFaddInsts() &&
                           RMW->getType()->isFloatTy();

  // If the return value isn't used, do not introduce a false use in the phi.
  bool ReturnValueIsUsed = !AI->use_empty();

  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
  BasicBlock *SharedBB = nullptr;

  BasicBlock *CheckPrivateBB = BB;
  if (FullFlatEmulation) {
    SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
    CheckPrivateBB =
        BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
  }

  BasicBlock *PrivateBB =
      BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
  BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
  BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);

  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);

  Value *LoadedShared = nullptr;
  if (FullFlatEmulation) {
    CallInst *IsShared = Builder.CreateIntrinsic(
        Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
    Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
    Builder.SetInsertPoint(SharedBB);
    Value *CastToLocal = Builder.CreateAddrSpaceCast(
        Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));

    Instruction *Clone = AI->clone();
    Clone->insertInto(SharedBB, SharedBB->end());
    Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
    LoadedShared = Clone;

    Builder.CreateBr(PhiBB);
    Builder.SetInsertPoint(CheckPrivateBB);
  }

  CallInst *IsPrivate = Builder.CreateIntrinsic(
      Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
  Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);

  Builder.SetInsertPoint(PrivateBB);

  Value *CastToPrivate = Builder.CreateAddrSpaceCast(
      Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));

  Value *LoadedPrivate;
  if (RMW) {
    LoadedPrivate = Builder.CreateAlignedLoad(
        RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");

    Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
                                        LoadedPrivate, RMW->getValOperand());

    Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
  } else {
    auto [ResultLoad, Equal] =
        buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
                          CX->getNewValOperand(), CX->getAlign());

    Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
                                              ResultLoad, 0);
    LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
  }

  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(GlobalBB);

  // Continue using a flat instruction if we only emitted the check for
  // private.
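  //
  // In that case the result is shaped roughly like the following (an
  // illustrative sketch for a 64-bit atomicrmw; the names are made up for this
  // comment):
  //
  //   %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
  //   br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
  // atomicrmw.private:
  //   ; non-atomic load/op/store on the addrspace(5) cast of %addr
  //   br label %atomicrmw.phi
  // atomicrmw.global:
  //   %loaded.global = atomicrmw <op> ptr %addr, i64 %val ordering
  //   ; annotated below with !noalias.addrspace excluding addrspace(5)
  //   br label %atomicrmw.phi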
  Instruction *LoadedGlobal = AI;
  if (FullFlatEmulation) {
    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
        Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
    AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
  }

  AI->removeFromParent();
  AI->insertInto(GlobalBB, GlobalBB->end());

  // The new atomicrmw may go through another round of legalization later.
  if (!FullFlatEmulation) {
    // We inserted the runtime check already, make sure we do not try to
    // re-expand this.
    // TODO: Should union with any existing metadata.
    MDBuilder MDB(F->getContext());
    MDNode *RangeNotPrivate =
        MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
                        APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
    LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
                              RangeNotPrivate);
  }

  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(PhiBB);

  if (ReturnValueIsUsed) {
    PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
    AI->replaceAllUsesWith(Loaded);
    if (FullFlatEmulation)
      Loaded->addIncoming(LoadedShared, SharedBB);
    Loaded->addIncoming(LoadedPrivate, PrivateBB);
    Loaded->addIncoming(LoadedGlobal, GlobalBB);
    Loaded->takeName(AI);
  }

  Builder.CreateBr(ExitBB);
}

void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  AtomicRMWInst::BinOp Op = AI->getOperation();

  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
      Op == AtomicRMWInst::Xor) {
    if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
        ConstVal && ConstVal->isNullValue()) {
      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
      AI->setOperation(AtomicRMWInst::Add);

      // We may still need the private-alias-flat handling below.

      // TODO: Skip this for cases where we cannot access remote memory.
    }
  }

  // The non-flat expansions should only perform the de-canonicalization of
  // identity values.
  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
    return;

  emitExpandAtomicAddrSpacePredicate(AI);
}

void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
  emitExpandAtomicAddrSpacePredicate(CI);
}

LoadInst *
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  IRBuilder<> Builder(AI);
  auto Order = AI->getOrdering();

  // This optimization removes the store aspect of the atomicrmw, but the cache
  // must still be flushed if the original atomic ordering had release
  // semantics. That flush is not necessarily a fence; a release fence just
  // happens to also perform it. To stay conservative, avoid replacing an
  // atomicrmw that has release semantics.
  if (isReleaseOrStronger(Order))
    return nullptr;

  LoadInst *LI = Builder.CreateAlignedLoad(
      AI->getType(), AI->getPointerOperand(), AI->getAlign());
  LI->setAtomic(Order, AI->getSyncScopeID());
  LI->copyMetadata(*AI);
  LI->takeName(AI);
  AI->replaceAllUsesWith(LI);
  AI->eraseFromParent();
  return LI;
}
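// As an illustration of the idempotent-RMW path above (a sketch, not a test
// case): an operation such as
//   %old = atomicrmw or ptr %p, i32 0 acquire
// has no visible store side effect, so it can be rewritten as
//   %old = load atomic i32, ptr %p acquire, align 4
// whereas the same operation with release (or stronger) ordering is left
// untouched.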