1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// This is the parent TargetLowering class for hardware code gen 11 /// targets. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "AMDGPUISelLowering.h" 16 #include "AMDGPU.h" 17 #include "AMDGPUInstrInfo.h" 18 #include "AMDGPUMachineFunction.h" 19 #include "AMDGPUMemoryUtils.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "llvm/CodeGen/Analysis.h" 22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 23 #include "llvm/CodeGen/MachineFrameInfo.h" 24 #include "llvm/IR/DiagnosticInfo.h" 25 #include "llvm/IR/IntrinsicsAMDGPU.h" 26 #include "llvm/Support/CommandLine.h" 27 #include "llvm/Support/KnownBits.h" 28 #include "llvm/Target/TargetMachine.h" 29 30 using namespace llvm; 31 32 #include "AMDGPUGenCallingConv.inc" 33 34 static cl::opt<bool> AMDGPUBypassSlowDiv( 35 "amdgpu-bypass-slow-div", 36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"), 37 cl::init(true)); 38 39 // Find a larger type to do a load / store of a vector with. 40 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { 41 unsigned StoreSize = VT.getStoreSizeInBits(); 42 if (StoreSize <= 32) 43 return EVT::getIntegerVT(Ctx, StoreSize); 44 45 if (StoreSize % 32 == 0) 46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); 47 48 return VT; 49 } 50 51 unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) { 52 return DAG.computeKnownBits(Op).countMaxActiveBits(); 53 } 54 55 unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) { 56 // In order for this to be a signed 24-bit value, bit 23, must 57 // be a sign bit. 58 return DAG.ComputeMaxSignificantBits(Op); 59 } 60 61 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, 62 const AMDGPUSubtarget &STI) 63 : TargetLowering(TM), Subtarget(&STI) { 64 // Always lower memset, memcpy, and memmove intrinsics to load/store 65 // instructions, rather then generating calls to memset, mempcy or memmove. 66 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U; 67 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U; 68 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U; 69 70 // Enable ganging up loads and stores in the memcpy DAG lowering. 71 MaxGluedStoresPerMemcpy = 16; 72 73 // Lower floating point store/load to integer store/load to reduce the number 74 // of patterns in tablegen. 75 setOperationAction(ISD::LOAD, MVT::f32, Promote); 76 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); 77 78 setOperationAction(ISD::LOAD, MVT::v2f32, Promote); 79 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); 80 81 setOperationAction(ISD::LOAD, MVT::v3f32, Promote); 82 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32); 83 84 setOperationAction(ISD::LOAD, MVT::v4f32, Promote); 85 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); 86 87 setOperationAction(ISD::LOAD, MVT::v5f32, Promote); 88 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32); 89 90 setOperationAction(ISD::LOAD, MVT::v6f32, Promote); 91 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32); 92 93 setOperationAction(ISD::LOAD, MVT::v7f32, Promote); 94 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32); 95 96 setOperationAction(ISD::LOAD, MVT::v8f32, Promote); 97 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); 98 99 setOperationAction(ISD::LOAD, MVT::v9f32, Promote); 100 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32); 101 102 setOperationAction(ISD::LOAD, MVT::v10f32, Promote); 103 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32); 104 105 setOperationAction(ISD::LOAD, MVT::v11f32, Promote); 106 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32); 107 108 setOperationAction(ISD::LOAD, MVT::v12f32, Promote); 109 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32); 110 111 setOperationAction(ISD::LOAD, MVT::v16f32, Promote); 112 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); 113 114 setOperationAction(ISD::LOAD, MVT::v32f32, Promote); 115 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32); 116 117 setOperationAction(ISD::LOAD, MVT::i64, Promote); 118 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); 119 120 setOperationAction(ISD::LOAD, MVT::v2i64, Promote); 121 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); 122 123 setOperationAction(ISD::LOAD, MVT::f64, Promote); 124 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32); 125 126 setOperationAction(ISD::LOAD, MVT::v2f64, Promote); 127 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32); 128 129 setOperationAction(ISD::LOAD, MVT::v3i64, Promote); 130 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32); 131 132 setOperationAction(ISD::LOAD, MVT::v4i64, Promote); 133 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32); 134 135 setOperationAction(ISD::LOAD, MVT::v3f64, Promote); 136 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32); 137 138 setOperationAction(ISD::LOAD, MVT::v4f64, Promote); 139 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32); 140 141 setOperationAction(ISD::LOAD, MVT::v8i64, Promote); 142 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32); 143 144 setOperationAction(ISD::LOAD, MVT::v8f64, Promote); 145 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32); 146 147 setOperationAction(ISD::LOAD, MVT::v16i64, Promote); 148 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32); 149 150 setOperationAction(ISD::LOAD, MVT::v16f64, Promote); 151 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32); 152 153 setOperationAction(ISD::LOAD, MVT::i128, Promote); 154 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32); 155 156 // TODO: Would be better to consume as directly legal 157 setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote); 158 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32); 159 160 setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote); 161 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64); 162 163 setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote); 164 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16); 165 166 setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote); 167 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16); 168 169 setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote); 170 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32); 171 172 setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote); 173 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64); 174 175 setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote); 176 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16); 177 178 setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote); 179 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16); 180 181 // There are no 64-bit extloads. These should be done as a 32-bit extload and 182 // an extension to 64-bit. 183 for (MVT VT : MVT::integer_valuetypes()) 184 setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT, 185 Expand); 186 187 for (MVT VT : MVT::integer_valuetypes()) { 188 if (VT == MVT::i64) 189 continue; 190 191 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) { 192 setLoadExtAction(Op, VT, MVT::i1, Promote); 193 setLoadExtAction(Op, VT, MVT::i8, Legal); 194 setLoadExtAction(Op, VT, MVT::i16, Legal); 195 setLoadExtAction(Op, VT, MVT::i32, Expand); 196 } 197 } 198 199 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) 200 for (auto MemVT : 201 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16}) 202 setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT, 203 Expand); 204 205 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); 206 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand); 207 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); 208 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand); 209 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand); 210 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand); 211 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); 212 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand); 213 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); 214 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand); 215 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand); 216 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand); 217 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand); 218 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand); 219 220 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); 221 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); 222 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand); 223 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); 224 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand); 225 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand); 226 227 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); 228 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand); 229 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); 230 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand); 231 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand); 232 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand); 233 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); 234 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand); 235 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); 236 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand); 237 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand); 238 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand); 239 240 setOperationAction(ISD::STORE, MVT::f32, Promote); 241 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); 242 243 setOperationAction(ISD::STORE, MVT::v2f32, Promote); 244 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); 245 246 setOperationAction(ISD::STORE, MVT::v3f32, Promote); 247 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32); 248 249 setOperationAction(ISD::STORE, MVT::v4f32, Promote); 250 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); 251 252 setOperationAction(ISD::STORE, MVT::v5f32, Promote); 253 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32); 254 255 setOperationAction(ISD::STORE, MVT::v6f32, Promote); 256 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32); 257 258 setOperationAction(ISD::STORE, MVT::v7f32, Promote); 259 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32); 260 261 setOperationAction(ISD::STORE, MVT::v8f32, Promote); 262 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); 263 264 setOperationAction(ISD::STORE, MVT::v9f32, Promote); 265 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32); 266 267 setOperationAction(ISD::STORE, MVT::v10f32, Promote); 268 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32); 269 270 setOperationAction(ISD::STORE, MVT::v11f32, Promote); 271 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32); 272 273 setOperationAction(ISD::STORE, MVT::v12f32, Promote); 274 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32); 275 276 setOperationAction(ISD::STORE, MVT::v16f32, Promote); 277 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); 278 279 setOperationAction(ISD::STORE, MVT::v32f32, Promote); 280 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32); 281 282 setOperationAction(ISD::STORE, MVT::i64, Promote); 283 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); 284 285 setOperationAction(ISD::STORE, MVT::v2i64, Promote); 286 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); 287 288 setOperationAction(ISD::STORE, MVT::f64, Promote); 289 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32); 290 291 setOperationAction(ISD::STORE, MVT::v2f64, Promote); 292 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32); 293 294 setOperationAction(ISD::STORE, MVT::v3i64, Promote); 295 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32); 296 297 setOperationAction(ISD::STORE, MVT::v3f64, Promote); 298 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32); 299 300 setOperationAction(ISD::STORE, MVT::v4i64, Promote); 301 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32); 302 303 setOperationAction(ISD::STORE, MVT::v4f64, Promote); 304 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32); 305 306 setOperationAction(ISD::STORE, MVT::v8i64, Promote); 307 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32); 308 309 setOperationAction(ISD::STORE, MVT::v8f64, Promote); 310 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32); 311 312 setOperationAction(ISD::STORE, MVT::v16i64, Promote); 313 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32); 314 315 setOperationAction(ISD::STORE, MVT::v16f64, Promote); 316 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32); 317 318 setOperationAction(ISD::STORE, MVT::i128, Promote); 319 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32); 320 321 setTruncStoreAction(MVT::i64, MVT::i1, Expand); 322 setTruncStoreAction(MVT::i64, MVT::i8, Expand); 323 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 324 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 325 326 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); 327 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand); 328 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand); 329 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); 330 331 setTruncStoreAction(MVT::f32, MVT::bf16, Expand); 332 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 333 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand); 334 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); 335 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand); 336 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand); 337 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand); 338 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); 339 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand); 340 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); 341 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand); 342 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand); 343 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand); 344 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand); 345 346 setTruncStoreAction(MVT::f64, MVT::bf16, Expand); 347 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 348 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 349 350 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); 351 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand); 352 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); 353 354 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand); 355 356 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand); 357 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand); 358 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand); 359 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand); 360 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand); 361 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand); 362 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand); 363 364 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand); 365 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand); 366 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand); 367 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand); 368 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand); 369 370 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand); 371 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand); 372 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand); 373 374 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand); 375 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand); 376 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand); 377 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand); 378 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand); 379 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand); 380 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand); 381 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand); 382 383 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal); 384 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal); 385 386 setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand); 387 388 // For R600, this is totally unsupported, just custom lower to produce an 389 // error. 390 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 391 392 // Library functions. These default to Expand, but we have instructions 393 // for them. 394 setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR, 395 ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM}, 396 MVT::f32, Legal); 397 398 setOperationAction(ISD::FLOG2, MVT::f32, Custom); 399 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom); 400 setOperationAction({ISD::LROUND, ISD::LLROUND}, 401 {MVT::f16, MVT::f32, MVT::f64}, Expand); 402 403 setOperationAction( 404 {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32, 405 Custom); 406 407 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom); 408 409 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom); 410 411 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64}, 412 Expand); 413 414 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom); 415 416 if (Subtarget->has16BitInsts()) 417 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal); 418 else { 419 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal); 420 setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom); 421 } 422 423 setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16, 424 Custom); 425 426 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches 427 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by 428 // default unless marked custom/legal. 429 setOperationAction(ISD::IS_FPCLASS, 430 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, 431 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32, 432 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, 433 MVT::v16f64}, 434 Custom); 435 436 if (isTypeLegal(MVT::f16)) 437 setOperationAction(ISD::IS_FPCLASS, 438 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16}, 439 Custom); 440 441 // Expand to fneg + fadd. 442 setOperationAction(ISD::FSUB, MVT::f64, Expand); 443 444 setOperationAction(ISD::CONCAT_VECTORS, 445 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32, 446 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32, 447 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32, 448 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32, 449 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32}, 450 Custom); 451 452 setOperationAction( 453 ISD::EXTRACT_SUBVECTOR, 454 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32, 455 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32, 456 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32, 457 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, 458 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, 459 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64, 460 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64}, 461 Custom); 462 463 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 464 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom); 465 466 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; 467 for (MVT VT : ScalarIntVTs) { 468 // These should use [SU]DIVREM, so set them to expand 469 setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT, 470 Expand); 471 472 // GPU does not have divrem function for signed or unsigned. 473 setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom); 474 475 // GPU does not have [S|U]MUL_LOHI functions as a single instruction. 476 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand); 477 478 setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand); 479 480 // AMDGPU uses ADDC/SUBC/ADDE/SUBE 481 setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal); 482 } 483 484 // The hardware supports 32-bit FSHR, but not FSHL. 485 setOperationAction(ISD::FSHR, MVT::i32, Legal); 486 487 // The hardware supports 32-bit ROTR, but not ROTL. 488 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand); 489 setOperationAction(ISD::ROTR, MVT::i64, Expand); 490 491 setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand); 492 493 setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand); 494 setOperationAction( 495 {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, 496 MVT::i64, Custom); 497 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); 498 499 setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32, 500 Legal); 501 502 setOperationAction( 503 {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, 504 MVT::i64, Custom); 505 506 for (auto VT : {MVT::i8, MVT::i16}) 507 setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom); 508 509 static const MVT::SimpleValueType VectorIntTypes[] = { 510 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32, 511 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32}; 512 513 for (MVT VT : VectorIntTypes) { 514 // Expand the following operations for the current type by default. 515 setOperationAction({ISD::ADD, ISD::AND, ISD::FP_TO_SINT, 516 ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU, 517 ISD::MULHS, ISD::OR, ISD::SHL, 518 ISD::SRA, ISD::SRL, ISD::ROTL, 519 ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP, 520 ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV, 521 ISD::SREM, ISD::UREM, ISD::SMUL_LOHI, 522 ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM, 523 ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC, 524 ISD::XOR, ISD::BSWAP, ISD::CTPOP, 525 ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE, 526 ISD::SETCC, ISD::ADDRSPACECAST}, 527 VT, Expand); 528 } 529 530 static const MVT::SimpleValueType FloatVectorTypes[] = { 531 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, 532 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32}; 533 534 for (MVT VT : FloatVectorTypes) { 535 setOperationAction( 536 {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM, 537 ISD::FADD, ISD::FCEIL, ISD::FCOS, 538 ISD::FDIV, ISD::FEXP2, ISD::FEXP, 539 ISD::FEXP10, ISD::FLOG2, ISD::FREM, 540 ISD::FLOG, ISD::FLOG10, ISD::FPOW, 541 ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL, 542 ISD::FMA, ISD::FRINT, ISD::FNEARBYINT, 543 ISD::FSQRT, ISD::FSIN, ISD::FSUB, 544 ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC, 545 ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC, 546 ISD::FCANONICALIZE, ISD::FROUNDEVEN}, 547 VT, Expand); 548 } 549 550 // This causes using an unrolled select operation rather than expansion with 551 // bit operations. This is in general better, but the alternative using BFI 552 // instructions may be better if the select sources are SGPRs. 553 setOperationAction(ISD::SELECT, MVT::v2f32, Promote); 554 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32); 555 556 setOperationAction(ISD::SELECT, MVT::v3f32, Promote); 557 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32); 558 559 setOperationAction(ISD::SELECT, MVT::v4f32, Promote); 560 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32); 561 562 setOperationAction(ISD::SELECT, MVT::v5f32, Promote); 563 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32); 564 565 setOperationAction(ISD::SELECT, MVT::v6f32, Promote); 566 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32); 567 568 setOperationAction(ISD::SELECT, MVT::v7f32, Promote); 569 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32); 570 571 setOperationAction(ISD::SELECT, MVT::v9f32, Promote); 572 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32); 573 574 setOperationAction(ISD::SELECT, MVT::v10f32, Promote); 575 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32); 576 577 setOperationAction(ISD::SELECT, MVT::v11f32, Promote); 578 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32); 579 580 setOperationAction(ISD::SELECT, MVT::v12f32, Promote); 581 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32); 582 583 setSchedulingPreference(Sched::RegPressure); 584 setJumpIsExpensive(true); 585 586 // FIXME: This is only partially true. If we have to do vector compares, any 587 // SGPR pair can be a condition register. If we have a uniform condition, we 588 // are better off doing SALU operations, where there is only one SCC. For now, 589 // we don't have a way of knowing during instruction selection if a condition 590 // will be uniform and we always use vector compares. Assume we are using 591 // vector compares until that is fixed. 592 setHasMultipleConditionRegisters(true); 593 594 setMinCmpXchgSizeInBits(32); 595 setSupportsUnalignedAtomics(false); 596 597 PredictableSelectIsExpensive = false; 598 599 // We want to find all load dependencies for long chains of stores to enable 600 // merging into very wide vectors. The problem is with vectors with > 4 601 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16 602 // vectors are a legal type, even though we have to split the loads 603 // usually. When we can more precisely specify load legality per address 604 // space, we should be able to make FindBetterChain/MergeConsecutiveStores 605 // smarter so that they can figure out what to do in 2 iterations without all 606 // N > 4 stores on the same chain. 607 GatherAllAliasesMaxDepth = 16; 608 609 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry 610 // about these during lowering. 611 MaxStoresPerMemcpy = 0xffffffff; 612 MaxStoresPerMemmove = 0xffffffff; 613 MaxStoresPerMemset = 0xffffffff; 614 615 // The expansion for 64-bit division is enormous. 616 if (AMDGPUBypassSlowDiv) 617 addBypassSlowDiv(64, 32); 618 619 setTargetDAGCombine({ISD::BITCAST, ISD::SHL, 620 ISD::SRA, ISD::SRL, 621 ISD::TRUNCATE, ISD::MUL, 622 ISD::SMUL_LOHI, ISD::UMUL_LOHI, 623 ISD::MULHU, ISD::MULHS, 624 ISD::SELECT, ISD::SELECT_CC, 625 ISD::STORE, ISD::FADD, 626 ISD::FSUB, ISD::FNEG, 627 ISD::FABS, ISD::AssertZext, 628 ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN}); 629 630 setMaxAtomicSizeInBitsSupported(64); 631 setMaxDivRemBitWidthSupported(64); 632 setMaxLargeFPConvertBitWidthSupported(64); 633 } 634 635 bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const { 636 if (getTargetMachine().Options.NoSignedZerosFPMath) 637 return true; 638 639 const auto Flags = Op.getNode()->getFlags(); 640 if (Flags.hasNoSignedZeros()) 641 return true; 642 643 return false; 644 } 645 646 //===----------------------------------------------------------------------===// 647 // Target Information 648 //===----------------------------------------------------------------------===// 649 650 LLVM_READNONE 651 static bool fnegFoldsIntoOpcode(unsigned Opc) { 652 switch (Opc) { 653 case ISD::FADD: 654 case ISD::FSUB: 655 case ISD::FMUL: 656 case ISD::FMA: 657 case ISD::FMAD: 658 case ISD::FMINNUM: 659 case ISD::FMAXNUM: 660 case ISD::FMINNUM_IEEE: 661 case ISD::FMAXNUM_IEEE: 662 case ISD::FMINIMUM: 663 case ISD::FMAXIMUM: 664 case ISD::SELECT: 665 case ISD::FSIN: 666 case ISD::FTRUNC: 667 case ISD::FRINT: 668 case ISD::FNEARBYINT: 669 case ISD::FROUNDEVEN: 670 case ISD::FCANONICALIZE: 671 case AMDGPUISD::RCP: 672 case AMDGPUISD::RCP_LEGACY: 673 case AMDGPUISD::RCP_IFLAG: 674 case AMDGPUISD::SIN_HW: 675 case AMDGPUISD::FMUL_LEGACY: 676 case AMDGPUISD::FMIN_LEGACY: 677 case AMDGPUISD::FMAX_LEGACY: 678 case AMDGPUISD::FMED3: 679 // TODO: handle llvm.amdgcn.fma.legacy 680 return true; 681 case ISD::BITCAST: 682 llvm_unreachable("bitcast is special cased"); 683 default: 684 return false; 685 } 686 } 687 688 static bool fnegFoldsIntoOp(const SDNode *N) { 689 unsigned Opc = N->getOpcode(); 690 if (Opc == ISD::BITCAST) { 691 // TODO: Is there a benefit to checking the conditions performFNegCombine 692 // does? We don't for the other cases. 693 SDValue BCSrc = N->getOperand(0); 694 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) { 695 return BCSrc.getNumOperands() == 2 && 696 BCSrc.getOperand(1).getValueSizeInBits() == 32; 697 } 698 699 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32; 700 } 701 702 return fnegFoldsIntoOpcode(Opc); 703 } 704 705 /// \p returns true if the operation will definitely need to use a 64-bit 706 /// encoding, and thus will use a VOP3 encoding regardless of the source 707 /// modifiers. 708 LLVM_READONLY 709 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) { 710 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) || 711 VT == MVT::f64; 712 } 713 714 /// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the 715 /// type for ISD::SELECT. 716 LLVM_READONLY 717 static bool selectSupportsSourceMods(const SDNode *N) { 718 // TODO: Only applies if select will be vector 719 return N->getValueType(0) == MVT::f32; 720 } 721 722 // Most FP instructions support source modifiers, but this could be refined 723 // slightly. 724 LLVM_READONLY 725 static bool hasSourceMods(const SDNode *N) { 726 if (isa<MemSDNode>(N)) 727 return false; 728 729 switch (N->getOpcode()) { 730 case ISD::CopyToReg: 731 case ISD::FDIV: 732 case ISD::FREM: 733 case ISD::INLINEASM: 734 case ISD::INLINEASM_BR: 735 case AMDGPUISD::DIV_SCALE: 736 case ISD::INTRINSIC_W_CHAIN: 737 738 // TODO: Should really be looking at the users of the bitcast. These are 739 // problematic because bitcasts are used to legalize all stores to integer 740 // types. 741 case ISD::BITCAST: 742 return false; 743 case ISD::INTRINSIC_WO_CHAIN: { 744 switch (N->getConstantOperandVal(0)) { 745 case Intrinsic::amdgcn_interp_p1: 746 case Intrinsic::amdgcn_interp_p2: 747 case Intrinsic::amdgcn_interp_mov: 748 case Intrinsic::amdgcn_interp_p1_f16: 749 case Intrinsic::amdgcn_interp_p2_f16: 750 return false; 751 default: 752 return true; 753 } 754 } 755 case ISD::SELECT: 756 return selectSupportsSourceMods(N); 757 default: 758 return true; 759 } 760 } 761 762 bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N, 763 unsigned CostThreshold) { 764 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus 765 // it is truly free to use a source modifier in all cases. If there are 766 // multiple users but for each one will necessitate using VOP3, there will be 767 // a code size increase. Try to avoid increasing code size unless we know it 768 // will save on the instruction count. 769 unsigned NumMayIncreaseSize = 0; 770 MVT VT = N->getValueType(0).getScalarType().getSimpleVT(); 771 772 assert(!N->use_empty()); 773 774 // XXX - Should this limit number of uses to check? 775 for (const SDNode *U : N->users()) { 776 if (!hasSourceMods(U)) 777 return false; 778 779 if (!opMustUseVOP3Encoding(U, VT)) { 780 if (++NumMayIncreaseSize > CostThreshold) 781 return false; 782 } 783 } 784 785 return true; 786 } 787 788 EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, 789 ISD::NodeType ExtendKind) const { 790 assert(!VT.isVector() && "only scalar expected"); 791 792 // Round to the next multiple of 32-bits. 793 unsigned Size = VT.getSizeInBits(); 794 if (Size <= 32) 795 return MVT::i32; 796 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32)); 797 } 798 799 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const { 800 return MVT::i32; 801 } 802 803 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { 804 return true; 805 } 806 807 // The backend supports 32 and 64 bit floating point immediates. 808 // FIXME: Why are we reporting vectors of FP immediates as legal? 809 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, 810 bool ForCodeSize) const { 811 EVT ScalarVT = VT.getScalarType(); 812 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 || 813 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts())); 814 } 815 816 // We don't want to shrink f64 / f32 constants. 817 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { 818 EVT ScalarVT = VT.getScalarType(); 819 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); 820 } 821 822 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, 823 ISD::LoadExtType ExtTy, 824 EVT NewVT) const { 825 // TODO: This may be worth removing. Check regression tests for diffs. 826 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT)) 827 return false; 828 829 unsigned NewSize = NewVT.getStoreSizeInBits(); 830 831 // If we are reducing to a 32-bit load or a smaller multi-dword load, 832 // this is always better. 833 if (NewSize >= 32) 834 return true; 835 836 EVT OldVT = N->getValueType(0); 837 unsigned OldSize = OldVT.getStoreSizeInBits(); 838 839 MemSDNode *MN = cast<MemSDNode>(N); 840 unsigned AS = MN->getAddressSpace(); 841 // Do not shrink an aligned scalar load to sub-dword. 842 // Scalar engine cannot do sub-dword loads. 843 // TODO: Update this for GFX12 which does have scalar sub-dword loads. 844 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) && 845 (AS == AMDGPUAS::CONSTANT_ADDRESS || 846 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || 847 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS && 848 MN->isInvariant())) && 849 AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand())) 850 return false; 851 852 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar 853 // extloads, so doing one requires using a buffer_load. In cases where we 854 // still couldn't use a scalar load, using the wider load shouldn't really 855 // hurt anything. 856 857 // If the old size already had to be an extload, there's no harm in continuing 858 // to reduce the width. 859 return (OldSize < 32); 860 } 861 862 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy, 863 const SelectionDAG &DAG, 864 const MachineMemOperand &MMO) const { 865 866 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits()); 867 868 if (LoadTy.getScalarType() == MVT::i32) 869 return false; 870 871 unsigned LScalarSize = LoadTy.getScalarSizeInBits(); 872 unsigned CastScalarSize = CastTy.getScalarSizeInBits(); 873 874 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32)) 875 return false; 876 877 unsigned Fast = 0; 878 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 879 CastTy, MMO, &Fast) && 880 Fast; 881 } 882 883 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also 884 // profitable with the expansion for 64-bit since it's generally good to 885 // speculate things. 886 bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const { 887 return true; 888 } 889 890 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { 891 return true; 892 } 893 894 bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const { 895 switch (N->getOpcode()) { 896 case ISD::EntryToken: 897 case ISD::TokenFactor: 898 return true; 899 case ISD::INTRINSIC_WO_CHAIN: { 900 unsigned IntrID = N->getConstantOperandVal(0); 901 return AMDGPU::isIntrinsicAlwaysUniform(IntrID); 902 } 903 case ISD::INTRINSIC_W_CHAIN: { 904 unsigned IntrID = N->getConstantOperandVal(1); 905 return AMDGPU::isIntrinsicAlwaysUniform(IntrID); 906 } 907 case ISD::LOAD: 908 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() == 909 AMDGPUAS::CONSTANT_ADDRESS_32BIT) 910 return true; 911 return false; 912 case AMDGPUISD::SETCC: // ballot-style instruction 913 return true; 914 } 915 return false; 916 } 917 918 SDValue AMDGPUTargetLowering::getNegatedExpression( 919 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, 920 NegatibleCost &Cost, unsigned Depth) const { 921 922 switch (Op.getOpcode()) { 923 case ISD::FMA: 924 case ISD::FMAD: { 925 // Negating a fma is not free if it has users without source mods. 926 if (!allUsesHaveSourceMods(Op.getNode())) 927 return SDValue(); 928 break; 929 } 930 case AMDGPUISD::RCP: { 931 SDValue Src = Op.getOperand(0); 932 EVT VT = Op.getValueType(); 933 SDLoc SL(Op); 934 935 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations, 936 ForCodeSize, Cost, Depth + 1); 937 if (NegSrc) 938 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags()); 939 return SDValue(); 940 } 941 default: 942 break; 943 } 944 945 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations, 946 ForCodeSize, Cost, Depth); 947 } 948 949 //===---------------------------------------------------------------------===// 950 // Target Properties 951 //===---------------------------------------------------------------------===// 952 953 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { 954 assert(VT.isFloatingPoint()); 955 956 // Packed operations do not have a fabs modifier. 957 return VT == MVT::f32 || VT == MVT::f64 || 958 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16)); 959 } 960 961 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { 962 assert(VT.isFloatingPoint()); 963 // Report this based on the end legalized type. 964 VT = VT.getScalarType(); 965 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16; 966 } 967 968 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, 969 unsigned NumElem, 970 unsigned AS) const { 971 return true; 972 } 973 974 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const { 975 // There are few operations which truly have vector input operands. Any vector 976 // operation is going to involve operations on each component, and a 977 // build_vector will be a copy per element, so it always makes sense to use a 978 // build_vector input in place of the extracted element to avoid a copy into a 979 // super register. 980 // 981 // We should probably only do this if all users are extracts only, but this 982 // should be the common case. 983 return true; 984 } 985 986 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { 987 // Truncate is just accessing a subregister. 988 989 unsigned SrcSize = Source.getSizeInBits(); 990 unsigned DestSize = Dest.getSizeInBits(); 991 992 return DestSize < SrcSize && DestSize % 32 == 0 ; 993 } 994 995 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { 996 // Truncate is just accessing a subregister. 997 998 unsigned SrcSize = Source->getScalarSizeInBits(); 999 unsigned DestSize = Dest->getScalarSizeInBits(); 1000 1001 if (DestSize== 16 && Subtarget->has16BitInsts()) 1002 return SrcSize >= 32; 1003 1004 return DestSize < SrcSize && DestSize % 32 == 0; 1005 } 1006 1007 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { 1008 unsigned SrcSize = Src->getScalarSizeInBits(); 1009 unsigned DestSize = Dest->getScalarSizeInBits(); 1010 1011 if (SrcSize == 16 && Subtarget->has16BitInsts()) 1012 return DestSize >= 32; 1013 1014 return SrcSize == 32 && DestSize == 64; 1015 } 1016 1017 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { 1018 // Any register load of a 64-bit value really requires 2 32-bit moves. For all 1019 // practical purposes, the extra mov 0 to load a 64-bit is free. As used, 1020 // this will enable reducing 64-bit operations the 32-bit, which is always 1021 // good. 1022 1023 if (Src == MVT::i16) 1024 return Dest == MVT::i32 ||Dest == MVT::i64 ; 1025 1026 return Src == MVT::i32 && Dest == MVT::i64; 1027 } 1028 1029 bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT, 1030 EVT DestVT) const { 1031 switch (N->getOpcode()) { 1032 case ISD::ADD: 1033 case ISD::SUB: 1034 case ISD::SHL: 1035 case ISD::SRL: 1036 case ISD::SRA: 1037 case ISD::AND: 1038 case ISD::OR: 1039 case ISD::XOR: 1040 case ISD::MUL: 1041 case ISD::SETCC: 1042 case ISD::SELECT: 1043 if (Subtarget->has16BitInsts() && 1044 (DestVT.isVector() ? !Subtarget->hasVOP3PInsts() : true)) { 1045 // Don't narrow back down to i16 if promoted to i32 already. 1046 if (!N->isDivergent() && DestVT.isInteger() && 1047 DestVT.getScalarSizeInBits() > 1 && 1048 DestVT.getScalarSizeInBits() <= 16 && 1049 SrcVT.getScalarSizeInBits() > 16) { 1050 return false; 1051 } 1052 } 1053 return true; 1054 default: 1055 break; 1056 } 1057 1058 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a 1059 // limited number of native 64-bit operations. Shrinking an operation to fit 1060 // in a single 32-bit register should always be helpful. As currently used, 1061 // this is much less general than the name suggests, and is only used in 1062 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is 1063 // not profitable, and may actually be harmful. 1064 if (isa<LoadSDNode>(N)) 1065 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32; 1066 1067 return true; 1068 } 1069 1070 bool AMDGPUTargetLowering::isDesirableToCommuteWithShift( 1071 const SDNode* N, CombineLevel Level) const { 1072 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA || 1073 N->getOpcode() == ISD::SRL) && 1074 "Expected shift op"); 1075 1076 SDValue ShiftLHS = N->getOperand(0); 1077 if (!ShiftLHS->hasOneUse()) 1078 return false; 1079 1080 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND && 1081 !ShiftLHS.getOperand(0)->hasOneUse()) 1082 return false; 1083 1084 // Always commute pre-type legalization and right shifts. 1085 // We're looking for shl(or(x,y),z) patterns. 1086 if (Level < CombineLevel::AfterLegalizeTypes || 1087 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR) 1088 return true; 1089 1090 // If only user is a i32 right-shift, then don't destroy a BFE pattern. 1091 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() && 1092 (N->user_begin()->getOpcode() == ISD::SRA || 1093 N->user_begin()->getOpcode() == ISD::SRL)) 1094 return false; 1095 1096 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns. 1097 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) { 1098 if (LHS.getOpcode() != ISD::SHL) 1099 return false; 1100 auto *RHSLd = dyn_cast<LoadSDNode>(RHS); 1101 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0)); 1102 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); 1103 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD && 1104 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() && 1105 RHSLd->getExtensionType() == ISD::ZEXTLOAD; 1106 }; 1107 SDValue LHS = N->getOperand(0).getOperand(0); 1108 SDValue RHS = N->getOperand(0).getOperand(1); 1109 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS)); 1110 } 1111 1112 //===---------------------------------------------------------------------===// 1113 // TargetLowering Callbacks 1114 //===---------------------------------------------------------------------===// 1115 1116 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, 1117 bool IsVarArg) { 1118 switch (CC) { 1119 case CallingConv::AMDGPU_VS: 1120 case CallingConv::AMDGPU_GS: 1121 case CallingConv::AMDGPU_PS: 1122 case CallingConv::AMDGPU_CS: 1123 case CallingConv::AMDGPU_HS: 1124 case CallingConv::AMDGPU_ES: 1125 case CallingConv::AMDGPU_LS: 1126 return CC_AMDGPU; 1127 case CallingConv::AMDGPU_CS_Chain: 1128 case CallingConv::AMDGPU_CS_ChainPreserve: 1129 return CC_AMDGPU_CS_CHAIN; 1130 case CallingConv::C: 1131 case CallingConv::Fast: 1132 case CallingConv::Cold: 1133 return CC_AMDGPU_Func; 1134 case CallingConv::AMDGPU_Gfx: 1135 return CC_SI_Gfx; 1136 case CallingConv::AMDGPU_KERNEL: 1137 case CallingConv::SPIR_KERNEL: 1138 default: 1139 report_fatal_error("Unsupported calling convention for call"); 1140 } 1141 } 1142 1143 CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, 1144 bool IsVarArg) { 1145 switch (CC) { 1146 case CallingConv::AMDGPU_KERNEL: 1147 case CallingConv::SPIR_KERNEL: 1148 llvm_unreachable("kernels should not be handled here"); 1149 case CallingConv::AMDGPU_VS: 1150 case CallingConv::AMDGPU_GS: 1151 case CallingConv::AMDGPU_PS: 1152 case CallingConv::AMDGPU_CS: 1153 case CallingConv::AMDGPU_CS_Chain: 1154 case CallingConv::AMDGPU_CS_ChainPreserve: 1155 case CallingConv::AMDGPU_HS: 1156 case CallingConv::AMDGPU_ES: 1157 case CallingConv::AMDGPU_LS: 1158 return RetCC_SI_Shader; 1159 case CallingConv::AMDGPU_Gfx: 1160 return RetCC_SI_Gfx; 1161 case CallingConv::C: 1162 case CallingConv::Fast: 1163 case CallingConv::Cold: 1164 return RetCC_AMDGPU_Func; 1165 default: 1166 report_fatal_error("Unsupported calling convention."); 1167 } 1168 } 1169 1170 /// The SelectionDAGBuilder will automatically promote function arguments 1171 /// with illegal types. However, this does not work for the AMDGPU targets 1172 /// since the function arguments are stored in memory as these illegal types. 1173 /// In order to handle this properly we need to get the original types sizes 1174 /// from the LLVM IR Function and fixup the ISD:InputArg values before 1175 /// passing them to AnalyzeFormalArguments() 1176 1177 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting 1178 /// input values across multiple registers. Each item in the Ins array 1179 /// represents a single value that will be stored in registers. Ins[x].VT is 1180 /// the value type of the value that will be stored in the register, so 1181 /// whatever SDNode we lower the argument to needs to be this type. 1182 /// 1183 /// In order to correctly lower the arguments we need to know the size of each 1184 /// argument. Since Ins[x].VT gives us the size of the register that will 1185 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type 1186 /// for the original function argument so that we can deduce the correct memory 1187 /// type to use for Ins[x]. In most cases the correct memory type will be 1188 /// Ins[x].ArgVT. However, this will not always be the case. If, for example, 1189 /// we have a kernel argument of type v8i8, this argument will be split into 1190 /// 8 parts and each part will be represented by its own item in the Ins array. 1191 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of 1192 /// the argument before it was split. From this, we deduce that the memory type 1193 /// for each individual part is i8. We pass the memory type as LocVT to the 1194 /// calling convention analysis function and the register type (Ins[x].VT) as 1195 /// the ValVT. 1196 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( 1197 CCState &State, 1198 const SmallVectorImpl<ISD::InputArg> &Ins) const { 1199 const MachineFunction &MF = State.getMachineFunction(); 1200 const Function &Fn = MF.getFunction(); 1201 LLVMContext &Ctx = Fn.getParent()->getContext(); 1202 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); 1203 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(); 1204 CallingConv::ID CC = Fn.getCallingConv(); 1205 1206 Align MaxAlign = Align(1); 1207 uint64_t ExplicitArgOffset = 0; 1208 const DataLayout &DL = Fn.getDataLayout(); 1209 1210 unsigned InIndex = 0; 1211 1212 for (const Argument &Arg : Fn.args()) { 1213 const bool IsByRef = Arg.hasByRefAttr(); 1214 Type *BaseArgTy = Arg.getType(); 1215 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy; 1216 Align Alignment = DL.getValueOrABITypeAlignment( 1217 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy); 1218 MaxAlign = std::max(Alignment, MaxAlign); 1219 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy); 1220 1221 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset; 1222 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize; 1223 1224 // We're basically throwing away everything passed into us and starting over 1225 // to get accurate in-memory offsets. The "PartOffset" is completely useless 1226 // to us as computed in Ins. 1227 // 1228 // We also need to figure out what type legalization is trying to do to get 1229 // the correct memory offsets. 1230 1231 SmallVector<EVT, 16> ValueVTs; 1232 SmallVector<uint64_t, 16> Offsets; 1233 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset); 1234 1235 for (unsigned Value = 0, NumValues = ValueVTs.size(); 1236 Value != NumValues; ++Value) { 1237 uint64_t BasePartOffset = Offsets[Value]; 1238 1239 EVT ArgVT = ValueVTs[Value]; 1240 EVT MemVT = ArgVT; 1241 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT); 1242 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT); 1243 1244 if (NumRegs == 1) { 1245 // This argument is not split, so the IR type is the memory type. 1246 if (ArgVT.isExtended()) { 1247 // We have an extended type, like i24, so we should just use the 1248 // register type. 1249 MemVT = RegisterVT; 1250 } else { 1251 MemVT = ArgVT; 1252 } 1253 } else if (ArgVT.isVector() && RegisterVT.isVector() && 1254 ArgVT.getScalarType() == RegisterVT.getScalarType()) { 1255 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements()); 1256 // We have a vector value which has been split into a vector with 1257 // the same scalar type, but fewer elements. This should handle 1258 // all the floating-point vector types. 1259 MemVT = RegisterVT; 1260 } else if (ArgVT.isVector() && 1261 ArgVT.getVectorNumElements() == NumRegs) { 1262 // This arg has been split so that each element is stored in a separate 1263 // register. 1264 MemVT = ArgVT.getScalarType(); 1265 } else if (ArgVT.isExtended()) { 1266 // We have an extended type, like i65. 1267 MemVT = RegisterVT; 1268 } else { 1269 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs; 1270 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0); 1271 if (RegisterVT.isInteger()) { 1272 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits); 1273 } else if (RegisterVT.isVector()) { 1274 assert(!RegisterVT.getScalarType().isFloatingPoint()); 1275 unsigned NumElements = RegisterVT.getVectorNumElements(); 1276 assert(MemoryBits % NumElements == 0); 1277 // This vector type has been split into another vector type with 1278 // a different elements size. 1279 EVT ScalarVT = EVT::getIntegerVT(State.getContext(), 1280 MemoryBits / NumElements); 1281 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements); 1282 } else { 1283 llvm_unreachable("cannot deduce memory type."); 1284 } 1285 } 1286 1287 // Convert one element vectors to scalar. 1288 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1) 1289 MemVT = MemVT.getScalarType(); 1290 1291 // Round up vec3/vec5 argument. 1292 if (MemVT.isVector() && !MemVT.isPow2VectorType()) { 1293 MemVT = MemVT.getPow2VectorType(State.getContext()); 1294 } else if (!MemVT.isSimple() && !MemVT.isVector()) { 1295 MemVT = MemVT.getRoundIntegerType(State.getContext()); 1296 } 1297 1298 unsigned PartOffset = 0; 1299 for (unsigned i = 0; i != NumRegs; ++i) { 1300 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT, 1301 BasePartOffset + PartOffset, 1302 MemVT.getSimpleVT(), 1303 CCValAssign::Full)); 1304 PartOffset += MemVT.getStoreSize(); 1305 } 1306 } 1307 } 1308 } 1309 1310 SDValue AMDGPUTargetLowering::LowerReturn( 1311 SDValue Chain, CallingConv::ID CallConv, 1312 bool isVarArg, 1313 const SmallVectorImpl<ISD::OutputArg> &Outs, 1314 const SmallVectorImpl<SDValue> &OutVals, 1315 const SDLoc &DL, SelectionDAG &DAG) const { 1316 // FIXME: Fails for r600 tests 1317 //assert(!isVarArg && Outs.empty() && OutVals.empty() && 1318 // "wave terminate should not have return values"); 1319 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain); 1320 } 1321 1322 //===---------------------------------------------------------------------===// 1323 // Target specific lowering 1324 //===---------------------------------------------------------------------===// 1325 1326 /// Selects the correct CCAssignFn for a given CallingConvention value. 1327 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC, 1328 bool IsVarArg) { 1329 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg); 1330 } 1331 1332 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, 1333 bool IsVarArg) { 1334 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg); 1335 } 1336 1337 SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain, 1338 SelectionDAG &DAG, 1339 MachineFrameInfo &MFI, 1340 int ClobberedFI) const { 1341 SmallVector<SDValue, 8> ArgChains; 1342 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); 1343 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; 1344 1345 // Include the original chain at the beginning of the list. When this is 1346 // used by target LowerCall hooks, this helps legalize find the 1347 // CALLSEQ_BEGIN node. 1348 ArgChains.push_back(Chain); 1349 1350 // Add a chain value for each stack argument corresponding 1351 for (SDNode *U : DAG.getEntryNode().getNode()->users()) { 1352 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) { 1353 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) { 1354 if (FI->getIndex() < 0) { 1355 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); 1356 int64_t InLastByte = InFirstByte; 1357 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; 1358 1359 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || 1360 (FirstByte <= InFirstByte && InFirstByte <= LastByte)) 1361 ArgChains.push_back(SDValue(L, 1)); 1362 } 1363 } 1364 } 1365 } 1366 1367 // Build a tokenfactor for all the chains. 1368 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); 1369 } 1370 1371 SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI, 1372 SmallVectorImpl<SDValue> &InVals, 1373 StringRef Reason) const { 1374 SDValue Callee = CLI.Callee; 1375 SelectionDAG &DAG = CLI.DAG; 1376 1377 const Function &Fn = DAG.getMachineFunction().getFunction(); 1378 1379 StringRef FuncName("<unknown>"); 1380 1381 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee)) 1382 FuncName = G->getSymbol(); 1383 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 1384 FuncName = G->getGlobal()->getName(); 1385 1386 DiagnosticInfoUnsupported NoCalls( 1387 Fn, Reason + FuncName, CLI.DL.getDebugLoc()); 1388 DAG.getContext()->diagnose(NoCalls); 1389 1390 if (!CLI.IsTailCall) { 1391 for (ISD::InputArg &Arg : CLI.Ins) 1392 InVals.push_back(DAG.getUNDEF(Arg.VT)); 1393 } 1394 1395 return DAG.getEntryNode(); 1396 } 1397 1398 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, 1399 SmallVectorImpl<SDValue> &InVals) const { 1400 return lowerUnhandledCall(CLI, InVals, "unsupported call to function "); 1401 } 1402 1403 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 1404 SelectionDAG &DAG) const { 1405 const Function &Fn = DAG.getMachineFunction().getFunction(); 1406 1407 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca", 1408 SDLoc(Op).getDebugLoc()); 1409 DAG.getContext()->diagnose(NoDynamicAlloca); 1410 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)}; 1411 return DAG.getMergeValues(Ops, SDLoc()); 1412 } 1413 1414 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, 1415 SelectionDAG &DAG) const { 1416 switch (Op.getOpcode()) { 1417 default: 1418 Op->print(errs(), &DAG); 1419 llvm_unreachable("Custom lowering code for this " 1420 "instruction is not implemented yet!"); 1421 break; 1422 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); 1423 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 1424 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 1425 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); 1426 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); 1427 case ISD::FREM: return LowerFREM(Op, DAG); 1428 case ISD::FCEIL: return LowerFCEIL(Op, DAG); 1429 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); 1430 case ISD::FRINT: return LowerFRINT(Op, DAG); 1431 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); 1432 case ISD::FROUNDEVEN: 1433 return LowerFROUNDEVEN(Op, DAG); 1434 case ISD::FROUND: return LowerFROUND(Op, DAG); 1435 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); 1436 case ISD::FLOG2: 1437 return LowerFLOG2(Op, DAG); 1438 case ISD::FLOG: 1439 case ISD::FLOG10: 1440 return LowerFLOGCommon(Op, DAG); 1441 case ISD::FEXP: 1442 case ISD::FEXP10: 1443 return lowerFEXP(Op, DAG); 1444 case ISD::FEXP2: 1445 return lowerFEXP2(Op, DAG); 1446 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 1447 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 1448 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); 1449 case ISD::FP_TO_SINT: 1450 case ISD::FP_TO_UINT: 1451 return LowerFP_TO_INT(Op, DAG); 1452 case ISD::CTTZ: 1453 case ISD::CTTZ_ZERO_UNDEF: 1454 case ISD::CTLZ: 1455 case ISD::CTLZ_ZERO_UNDEF: 1456 return LowerCTLZ_CTTZ(Op, DAG); 1457 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 1458 } 1459 return Op; 1460 } 1461 1462 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, 1463 SmallVectorImpl<SDValue> &Results, 1464 SelectionDAG &DAG) const { 1465 switch (N->getOpcode()) { 1466 case ISD::SIGN_EXTEND_INREG: 1467 // Different parts of legalization seem to interpret which type of 1468 // sign_extend_inreg is the one to check for custom lowering. The extended 1469 // from type is what really matters, but some places check for custom 1470 // lowering of the result type. This results in trying to use 1471 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do 1472 // nothing here and let the illegal result integer be handled normally. 1473 return; 1474 case ISD::FLOG2: 1475 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG)) 1476 Results.push_back(Lowered); 1477 return; 1478 case ISD::FLOG: 1479 case ISD::FLOG10: 1480 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG)) 1481 Results.push_back(Lowered); 1482 return; 1483 case ISD::FEXP2: 1484 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG)) 1485 Results.push_back(Lowered); 1486 return; 1487 case ISD::FEXP: 1488 case ISD::FEXP10: 1489 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG)) 1490 Results.push_back(Lowered); 1491 return; 1492 case ISD::CTLZ: 1493 case ISD::CTLZ_ZERO_UNDEF: 1494 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG)) 1495 Results.push_back(Lowered); 1496 return; 1497 default: 1498 return; 1499 } 1500 } 1501 1502 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, 1503 SDValue Op, 1504 SelectionDAG &DAG) const { 1505 1506 const DataLayout &DL = DAG.getDataLayout(); 1507 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); 1508 const GlobalValue *GV = G->getGlobal(); 1509 1510 if (!MFI->isModuleEntryFunction()) { 1511 if (std::optional<uint32_t> Address = 1512 AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) { 1513 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType()); 1514 } 1515 } 1516 1517 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 1518 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) { 1519 if (!MFI->isModuleEntryFunction() && 1520 GV->getName() != "llvm.amdgcn.module.lds" && 1521 !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) { 1522 SDLoc DL(Op); 1523 const Function &Fn = DAG.getMachineFunction().getFunction(); 1524 DiagnosticInfoUnsupported BadLDSDecl( 1525 Fn, "local memory global used by non-kernel function", 1526 DL.getDebugLoc(), DS_Warning); 1527 DAG.getContext()->diagnose(BadLDSDecl); 1528 1529 // We currently don't have a way to correctly allocate LDS objects that 1530 // aren't directly associated with a kernel. We do force inlining of 1531 // functions that use local objects. However, if these dead functions are 1532 // not eliminated, we don't want a compile time error. Just emit a warning 1533 // and a trap, since there should be no callable path here. 1534 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode()); 1535 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 1536 Trap, DAG.getRoot()); 1537 DAG.setRoot(OutputChain); 1538 return DAG.getUNDEF(Op.getValueType()); 1539 } 1540 1541 // XXX: What does the value of G->getOffset() mean? 1542 assert(G->getOffset() == 0 && 1543 "Do not know what to do with an non-zero offset"); 1544 1545 // TODO: We could emit code to handle the initialization somewhere. 1546 // We ignore the initializer for now and legalize it to allow selection. 1547 // The initializer will anyway get errored out during assembly emission. 1548 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV)); 1549 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType()); 1550 } 1551 return SDValue(); 1552 } 1553 1554 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, 1555 SelectionDAG &DAG) const { 1556 SmallVector<SDValue, 8> Args; 1557 SDLoc SL(Op); 1558 1559 EVT VT = Op.getValueType(); 1560 if (VT.getVectorElementType().getSizeInBits() < 32) { 1561 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits(); 1562 if (OpBitSize >= 32 && OpBitSize % 32 == 0) { 1563 unsigned NewNumElt = OpBitSize / 32; 1564 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32 1565 : EVT::getVectorVT(*DAG.getContext(), 1566 MVT::i32, NewNumElt); 1567 for (const SDUse &U : Op->ops()) { 1568 SDValue In = U.get(); 1569 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In); 1570 if (NewNumElt > 1) 1571 DAG.ExtractVectorElements(NewIn, Args); 1572 else 1573 Args.push_back(NewIn); 1574 } 1575 1576 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, 1577 NewNumElt * Op.getNumOperands()); 1578 SDValue BV = DAG.getBuildVector(NewVT, SL, Args); 1579 return DAG.getNode(ISD::BITCAST, SL, VT, BV); 1580 } 1581 } 1582 1583 for (const SDUse &U : Op->ops()) 1584 DAG.ExtractVectorElements(U.get(), Args); 1585 1586 return DAG.getBuildVector(Op.getValueType(), SL, Args); 1587 } 1588 1589 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, 1590 SelectionDAG &DAG) const { 1591 SDLoc SL(Op); 1592 SmallVector<SDValue, 8> Args; 1593 unsigned Start = Op.getConstantOperandVal(1); 1594 EVT VT = Op.getValueType(); 1595 EVT SrcVT = Op.getOperand(0).getValueType(); 1596 1597 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) { 1598 unsigned NumElt = VT.getVectorNumElements(); 1599 unsigned NumSrcElt = SrcVT.getVectorNumElements(); 1600 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types"); 1601 1602 // Extract 32-bit registers at a time. 1603 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2); 1604 EVT NewVT = NumElt == 2 1605 ? MVT::i32 1606 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2); 1607 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0)); 1608 1609 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2); 1610 if (NumElt == 2) 1611 Tmp = Args[0]; 1612 else 1613 Tmp = DAG.getBuildVector(NewVT, SL, Args); 1614 1615 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp); 1616 } 1617 1618 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, 1619 VT.getVectorNumElements()); 1620 1621 return DAG.getBuildVector(Op.getValueType(), SL, Args); 1622 } 1623 1624 // TODO: Handle fabs too 1625 static SDValue peekFNeg(SDValue Val) { 1626 if (Val.getOpcode() == ISD::FNEG) 1627 return Val.getOperand(0); 1628 1629 return Val; 1630 } 1631 1632 static SDValue peekFPSignOps(SDValue Val) { 1633 if (Val.getOpcode() == ISD::FNEG) 1634 Val = Val.getOperand(0); 1635 if (Val.getOpcode() == ISD::FABS) 1636 Val = Val.getOperand(0); 1637 if (Val.getOpcode() == ISD::FCOPYSIGN) 1638 Val = Val.getOperand(0); 1639 return Val; 1640 } 1641 1642 SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl( 1643 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, 1644 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const { 1645 SelectionDAG &DAG = DCI.DAG; 1646 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 1647 switch (CCOpcode) { 1648 case ISD::SETOEQ: 1649 case ISD::SETONE: 1650 case ISD::SETUNE: 1651 case ISD::SETNE: 1652 case ISD::SETUEQ: 1653 case ISD::SETEQ: 1654 case ISD::SETFALSE: 1655 case ISD::SETFALSE2: 1656 case ISD::SETTRUE: 1657 case ISD::SETTRUE2: 1658 case ISD::SETUO: 1659 case ISD::SETO: 1660 break; 1661 case ISD::SETULE: 1662 case ISD::SETULT: { 1663 if (LHS == True) 1664 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); 1665 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); 1666 } 1667 case ISD::SETOLE: 1668 case ISD::SETOLT: 1669 case ISD::SETLE: 1670 case ISD::SETLT: { 1671 // Ordered. Assume ordered for undefined. 1672 1673 // Only do this after legalization to avoid interfering with other combines 1674 // which might occur. 1675 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && 1676 !DCI.isCalledByLegalizer()) 1677 return SDValue(); 1678 1679 // We need to permute the operands to get the correct NaN behavior. The 1680 // selected operand is the second one based on the failing compare with NaN, 1681 // so permute it based on the compare type the hardware uses. 1682 if (LHS == True) 1683 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); 1684 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); 1685 } 1686 case ISD::SETUGE: 1687 case ISD::SETUGT: { 1688 if (LHS == True) 1689 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); 1690 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); 1691 } 1692 case ISD::SETGT: 1693 case ISD::SETGE: 1694 case ISD::SETOGE: 1695 case ISD::SETOGT: { 1696 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && 1697 !DCI.isCalledByLegalizer()) 1698 return SDValue(); 1699 1700 if (LHS == True) 1701 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); 1702 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); 1703 } 1704 case ISD::SETCC_INVALID: 1705 llvm_unreachable("Invalid setcc condcode!"); 1706 } 1707 return SDValue(); 1708 } 1709 1710 /// Generate Min/Max node 1711 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT, 1712 SDValue LHS, SDValue RHS, 1713 SDValue True, SDValue False, 1714 SDValue CC, 1715 DAGCombinerInfo &DCI) const { 1716 if ((LHS == True && RHS == False) || (LHS == False && RHS == True)) 1717 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI); 1718 1719 SelectionDAG &DAG = DCI.DAG; 1720 1721 // If we can't directly match this, try to see if we can fold an fneg to 1722 // match. 1723 1724 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 1725 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False); 1726 SDValue NegTrue = peekFNeg(True); 1727 1728 // Undo the combine foldFreeOpFromSelect does if it helps us match the 1729 // fmin/fmax. 1730 // 1731 // select (fcmp olt (lhs, K)), (fneg lhs), -K 1732 // -> fneg (fmin_legacy lhs, K) 1733 // 1734 // TODO: Use getNegatedExpression 1735 if (LHS == NegTrue && CFalse && CRHS) { 1736 APFloat NegRHS = neg(CRHS->getValueAPF()); 1737 if (NegRHS == CFalse->getValueAPF()) { 1738 SDValue Combined = 1739 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI); 1740 if (Combined) 1741 return DAG.getNode(ISD::FNEG, DL, VT, Combined); 1742 return SDValue(); 1743 } 1744 } 1745 1746 return SDValue(); 1747 } 1748 1749 std::pair<SDValue, SDValue> 1750 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const { 1751 SDLoc SL(Op); 1752 1753 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); 1754 1755 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 1756 const SDValue One = DAG.getConstant(1, SL, MVT::i32); 1757 1758 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); 1759 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); 1760 1761 return std::pair(Lo, Hi); 1762 } 1763 1764 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const { 1765 SDLoc SL(Op); 1766 1767 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); 1768 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 1769 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); 1770 } 1771 1772 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const { 1773 SDLoc SL(Op); 1774 1775 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); 1776 const SDValue One = DAG.getConstant(1, SL, MVT::i32); 1777 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); 1778 } 1779 1780 // Split a vector type into two parts. The first part is a power of two vector. 1781 // The second part is whatever is left over, and is a scalar if it would 1782 // otherwise be a 1-vector. 1783 std::pair<EVT, EVT> 1784 AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const { 1785 EVT LoVT, HiVT; 1786 EVT EltVT = VT.getVectorElementType(); 1787 unsigned NumElts = VT.getVectorNumElements(); 1788 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2); 1789 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts); 1790 HiVT = NumElts - LoNumElts == 1 1791 ? EltVT 1792 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts); 1793 return std::pair(LoVT, HiVT); 1794 } 1795 1796 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be 1797 // scalar. 1798 std::pair<SDValue, SDValue> 1799 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL, 1800 const EVT &LoVT, const EVT &HiVT, 1801 SelectionDAG &DAG) const { 1802 assert(LoVT.getVectorNumElements() + 1803 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <= 1804 N.getValueType().getVectorNumElements() && 1805 "More vector elements requested than available!"); 1806 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, 1807 DAG.getVectorIdxConstant(0, DL)); 1808 SDValue Hi = DAG.getNode( 1809 HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL, 1810 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL)); 1811 return std::pair(Lo, Hi); 1812 } 1813 1814 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, 1815 SelectionDAG &DAG) const { 1816 LoadSDNode *Load = cast<LoadSDNode>(Op); 1817 EVT VT = Op.getValueType(); 1818 SDLoc SL(Op); 1819 1820 1821 // If this is a 2 element vector, we really want to scalarize and not create 1822 // weird 1 element vectors. 1823 if (VT.getVectorNumElements() == 2) { 1824 SDValue Ops[2]; 1825 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG); 1826 return DAG.getMergeValues(Ops, SL); 1827 } 1828 1829 SDValue BasePtr = Load->getBasePtr(); 1830 EVT MemVT = Load->getMemoryVT(); 1831 1832 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); 1833 1834 EVT LoVT, HiVT; 1835 EVT LoMemVT, HiMemVT; 1836 SDValue Lo, Hi; 1837 1838 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG); 1839 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG); 1840 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG); 1841 1842 unsigned Size = LoMemVT.getStoreSize(); 1843 Align BaseAlign = Load->getAlign(); 1844 Align HiAlign = commonAlignment(BaseAlign, Size); 1845 1846 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, 1847 Load->getChain(), BasePtr, SrcValue, LoMemVT, 1848 BaseAlign, Load->getMemOperand()->getFlags()); 1849 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size)); 1850 SDValue HiLoad = 1851 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), 1852 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), 1853 HiMemVT, HiAlign, Load->getMemOperand()->getFlags()); 1854 1855 SDValue Join; 1856 if (LoVT == HiVT) { 1857 // This is the case that the vector is power of two so was evenly split. 1858 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad); 1859 } else { 1860 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad, 1861 DAG.getVectorIdxConstant(0, SL)); 1862 Join = DAG.getNode( 1863 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL, 1864 VT, Join, HiLoad, 1865 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL)); 1866 } 1867 1868 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other, 1869 LoLoad.getValue(1), HiLoad.getValue(1))}; 1870 1871 return DAG.getMergeValues(Ops, SL); 1872 } 1873 1874 SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op, 1875 SelectionDAG &DAG) const { 1876 LoadSDNode *Load = cast<LoadSDNode>(Op); 1877 EVT VT = Op.getValueType(); 1878 SDValue BasePtr = Load->getBasePtr(); 1879 EVT MemVT = Load->getMemoryVT(); 1880 SDLoc SL(Op); 1881 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); 1882 Align BaseAlign = Load->getAlign(); 1883 unsigned NumElements = MemVT.getVectorNumElements(); 1884 1885 // Widen from vec3 to vec4 when the load is at least 8-byte aligned 1886 // or 16-byte fully dereferenceable. Otherwise, split the vector load. 1887 if (NumElements != 3 || 1888 (BaseAlign < Align(8) && 1889 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout()))) 1890 return SplitVectorLoad(Op, DAG); 1891 1892 assert(NumElements == 3); 1893 1894 EVT WideVT = 1895 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4); 1896 EVT WideMemVT = 1897 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4); 1898 SDValue WideLoad = DAG.getExtLoad( 1899 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue, 1900 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags()); 1901 return DAG.getMergeValues( 1902 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad, 1903 DAG.getVectorIdxConstant(0, SL)), 1904 WideLoad.getValue(1)}, 1905 SL); 1906 } 1907 1908 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, 1909 SelectionDAG &DAG) const { 1910 StoreSDNode *Store = cast<StoreSDNode>(Op); 1911 SDValue Val = Store->getValue(); 1912 EVT VT = Val.getValueType(); 1913 1914 // If this is a 2 element vector, we really want to scalarize and not create 1915 // weird 1 element vectors. 1916 if (VT.getVectorNumElements() == 2) 1917 return scalarizeVectorStore(Store, DAG); 1918 1919 EVT MemVT = Store->getMemoryVT(); 1920 SDValue Chain = Store->getChain(); 1921 SDValue BasePtr = Store->getBasePtr(); 1922 SDLoc SL(Op); 1923 1924 EVT LoVT, HiVT; 1925 EVT LoMemVT, HiMemVT; 1926 SDValue Lo, Hi; 1927 1928 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG); 1929 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG); 1930 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG); 1931 1932 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize()); 1933 1934 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); 1935 Align BaseAlign = Store->getAlign(); 1936 unsigned Size = LoMemVT.getStoreSize(); 1937 Align HiAlign = commonAlignment(BaseAlign, Size); 1938 1939 SDValue LoStore = 1940 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign, 1941 Store->getMemOperand()->getFlags()); 1942 SDValue HiStore = 1943 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), 1944 HiMemVT, HiAlign, Store->getMemOperand()->getFlags()); 1945 1946 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); 1947 } 1948 1949 // This is a shortcut for integer division because we have fast i32<->f32 1950 // conversions, and fast f32 reciprocal instructions. The fractional part of a 1951 // float is enough to accurately represent up to a 24-bit signed integer. 1952 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, 1953 bool Sign) const { 1954 SDLoc DL(Op); 1955 EVT VT = Op.getValueType(); 1956 SDValue LHS = Op.getOperand(0); 1957 SDValue RHS = Op.getOperand(1); 1958 MVT IntVT = MVT::i32; 1959 MVT FltVT = MVT::f32; 1960 1961 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS); 1962 if (LHSSignBits < 9) 1963 return SDValue(); 1964 1965 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS); 1966 if (RHSSignBits < 9) 1967 return SDValue(); 1968 1969 unsigned BitSize = VT.getSizeInBits(); 1970 unsigned SignBits = std::min(LHSSignBits, RHSSignBits); 1971 unsigned DivBits = BitSize - SignBits; 1972 if (Sign) 1973 ++DivBits; 1974 1975 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; 1976 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; 1977 1978 SDValue jq = DAG.getConstant(1, DL, IntVT); 1979 1980 if (Sign) { 1981 // char|short jq = ia ^ ib; 1982 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); 1983 1984 // jq = jq >> (bitsize - 2) 1985 jq = DAG.getNode(ISD::SRA, DL, VT, jq, 1986 DAG.getConstant(BitSize - 2, DL, VT)); 1987 1988 // jq = jq | 0x1 1989 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT)); 1990 } 1991 1992 // int ia = (int)LHS; 1993 SDValue ia = LHS; 1994 1995 // int ib, (int)RHS; 1996 SDValue ib = RHS; 1997 1998 // float fa = (float)ia; 1999 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); 2000 2001 // float fb = (float)ib; 2002 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); 2003 2004 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, 2005 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); 2006 2007 // fq = trunc(fq); 2008 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq); 2009 2010 // float fqneg = -fq; 2011 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); 2012 2013 MachineFunction &MF = DAG.getMachineFunction(); 2014 2015 bool UseFmadFtz = false; 2016 if (Subtarget->isGCN()) { 2017 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2018 UseFmadFtz = 2019 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign(); 2020 } 2021 2022 // float fr = mad(fqneg, fb, fa); 2023 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA 2024 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ 2025 : (unsigned)ISD::FMAD; 2026 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa); 2027 2028 // int iq = (int)fq; 2029 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); 2030 2031 // fr = fabs(fr); 2032 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr); 2033 2034 // fb = fabs(fb); 2035 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb); 2036 2037 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2038 2039 // int cv = fr >= fb; 2040 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); 2041 2042 // jq = (cv ? jq : 0); 2043 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT)); 2044 2045 // dst = iq + jq; 2046 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); 2047 2048 // Rem needs compensation, it's easier to recompute it 2049 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); 2050 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); 2051 2052 // Truncate to number of bits this divide really is. 2053 if (Sign) { 2054 SDValue InRegSize 2055 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits)); 2056 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize); 2057 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize); 2058 } else { 2059 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT); 2060 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask); 2061 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask); 2062 } 2063 2064 return DAG.getMergeValues({ Div, Rem }, DL); 2065 } 2066 2067 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, 2068 SelectionDAG &DAG, 2069 SmallVectorImpl<SDValue> &Results) const { 2070 SDLoc DL(Op); 2071 EVT VT = Op.getValueType(); 2072 2073 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64"); 2074 2075 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); 2076 2077 SDValue One = DAG.getConstant(1, DL, HalfVT); 2078 SDValue Zero = DAG.getConstant(0, DL, HalfVT); 2079 2080 //HiLo split 2081 SDValue LHS_Lo, LHS_Hi; 2082 SDValue LHS = Op.getOperand(0); 2083 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT); 2084 2085 SDValue RHS_Lo, RHS_Hi; 2086 SDValue RHS = Op.getOperand(1); 2087 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT); 2088 2089 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && 2090 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { 2091 2092 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), 2093 LHS_Lo, RHS_Lo); 2094 2095 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero}); 2096 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero}); 2097 2098 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV)); 2099 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM)); 2100 return; 2101 } 2102 2103 if (isTypeLegal(MVT::i64)) { 2104 // The algorithm here is based on ideas from "Software Integer Division", 2105 // Tom Rodeheffer, August 2008. 2106 2107 MachineFunction &MF = DAG.getMachineFunction(); 2108 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2109 2110 // Compute denominator reciprocal. 2111 unsigned FMAD = 2112 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA 2113 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign() 2114 ? (unsigned)ISD::FMAD 2115 : (unsigned)AMDGPUISD::FMAD_FTZ; 2116 2117 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo); 2118 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi); 2119 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi, 2120 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32), 2121 Cvt_Lo); 2122 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1); 2123 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp, 2124 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32)); 2125 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1, 2126 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32)); 2127 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2); 2128 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc, 2129 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32), 2130 Mul1); 2131 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2); 2132 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc); 2133 SDValue Rcp64 = DAG.getBitcast(VT, 2134 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi})); 2135 2136 SDValue Zero64 = DAG.getConstant(0, DL, VT); 2137 SDValue One64 = DAG.getConstant(1, DL, VT); 2138 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1); 2139 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1); 2140 2141 // First round of UNR (Unsigned integer Newton-Raphson). 2142 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS); 2143 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64); 2144 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1); 2145 SDValue Mulhi1_Lo, Mulhi1_Hi; 2146 std::tie(Mulhi1_Lo, Mulhi1_Hi) = 2147 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT); 2148 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo, 2149 Mulhi1_Lo, Zero1); 2150 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi, 2151 Mulhi1_Hi, Add1_Lo.getValue(1)); 2152 SDValue Add1 = DAG.getBitcast(VT, 2153 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi})); 2154 2155 // Second round of UNR. 2156 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1); 2157 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2); 2158 SDValue Mulhi2_Lo, Mulhi2_Hi; 2159 std::tie(Mulhi2_Lo, Mulhi2_Hi) = 2160 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT); 2161 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo, 2162 Mulhi2_Lo, Zero1); 2163 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi, 2164 Mulhi2_Hi, Add2_Lo.getValue(1)); 2165 SDValue Add2 = DAG.getBitcast(VT, 2166 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi})); 2167 2168 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2); 2169 2170 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3); 2171 2172 SDValue Mul3_Lo, Mul3_Hi; 2173 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT); 2174 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo, 2175 Mul3_Lo, Zero1); 2176 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi, 2177 Mul3_Hi, Sub1_Lo.getValue(1)); 2178 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi); 2179 SDValue Sub1 = DAG.getBitcast(VT, 2180 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi})); 2181 2182 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT); 2183 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero, 2184 ISD::SETUGE); 2185 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero, 2186 ISD::SETUGE); 2187 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ); 2188 2189 // TODO: Here and below portions of the code can be enclosed into if/endif. 2190 // Currently control flow is unconditional and we have 4 selects after 2191 // potential endif to substitute PHIs. 2192 2193 // if C3 != 0 ... 2194 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo, 2195 RHS_Lo, Zero1); 2196 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi, 2197 RHS_Hi, Sub1_Lo.getValue(1)); 2198 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi, 2199 Zero, Sub2_Lo.getValue(1)); 2200 SDValue Sub2 = DAG.getBitcast(VT, 2201 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi})); 2202 2203 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64); 2204 2205 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero, 2206 ISD::SETUGE); 2207 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero, 2208 ISD::SETUGE); 2209 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ); 2210 2211 // if (C6 != 0) 2212 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64); 2213 2214 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo, 2215 RHS_Lo, Zero1); 2216 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi, 2217 RHS_Hi, Sub2_Lo.getValue(1)); 2218 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi, 2219 Zero, Sub3_Lo.getValue(1)); 2220 SDValue Sub3 = DAG.getBitcast(VT, 2221 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi})); 2222 2223 // endif C6 2224 // endif C3 2225 2226 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE); 2227 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE); 2228 2229 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE); 2230 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE); 2231 2232 Results.push_back(Div); 2233 Results.push_back(Rem); 2234 2235 return; 2236 } 2237 2238 // r600 expandion. 2239 // Get Speculative values 2240 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); 2241 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); 2242 2243 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ); 2244 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero}); 2245 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM); 2246 2247 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ); 2248 SDValue DIV_Lo = Zero; 2249 2250 const unsigned halfBitWidth = HalfVT.getSizeInBits(); 2251 2252 for (unsigned i = 0; i < halfBitWidth; ++i) { 2253 const unsigned bitPos = halfBitWidth - i - 1; 2254 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT); 2255 // Get value of high bit 2256 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); 2257 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One); 2258 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); 2259 2260 // Shift 2261 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT)); 2262 // Add LHS high bit 2263 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); 2264 2265 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT); 2266 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE); 2267 2268 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); 2269 2270 // Update REM 2271 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); 2272 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); 2273 } 2274 2275 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi}); 2276 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV); 2277 Results.push_back(DIV); 2278 Results.push_back(REM); 2279 } 2280 2281 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, 2282 SelectionDAG &DAG) const { 2283 SDLoc DL(Op); 2284 EVT VT = Op.getValueType(); 2285 2286 if (VT == MVT::i64) { 2287 SmallVector<SDValue, 2> Results; 2288 LowerUDIVREM64(Op, DAG, Results); 2289 return DAG.getMergeValues(Results, DL); 2290 } 2291 2292 if (VT == MVT::i32) { 2293 if (SDValue Res = LowerDIVREM24(Op, DAG, false)) 2294 return Res; 2295 } 2296 2297 SDValue X = Op.getOperand(0); 2298 SDValue Y = Op.getOperand(1); 2299 2300 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2301 // algorithm used here. 2302 2303 // Initial estimate of inv(y). 2304 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y); 2305 2306 // One round of UNR. 2307 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y); 2308 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z); 2309 Z = DAG.getNode(ISD::ADD, DL, VT, Z, 2310 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ)); 2311 2312 // Quotient/remainder estimate. 2313 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z); 2314 SDValue R = 2315 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y)); 2316 2317 // First quotient/remainder refinement. 2318 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2319 SDValue One = DAG.getConstant(1, DL, VT); 2320 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE); 2321 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond, 2322 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q); 2323 R = DAG.getNode(ISD::SELECT, DL, VT, Cond, 2324 DAG.getNode(ISD::SUB, DL, VT, R, Y), R); 2325 2326 // Second quotient/remainder refinement. 2327 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE); 2328 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond, 2329 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q); 2330 R = DAG.getNode(ISD::SELECT, DL, VT, Cond, 2331 DAG.getNode(ISD::SUB, DL, VT, R, Y), R); 2332 2333 return DAG.getMergeValues({Q, R}, DL); 2334 } 2335 2336 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, 2337 SelectionDAG &DAG) const { 2338 SDLoc DL(Op); 2339 EVT VT = Op.getValueType(); 2340 2341 SDValue LHS = Op.getOperand(0); 2342 SDValue RHS = Op.getOperand(1); 2343 2344 SDValue Zero = DAG.getConstant(0, DL, VT); 2345 SDValue NegOne = DAG.getAllOnesConstant(DL, VT); 2346 2347 if (VT == MVT::i32) { 2348 if (SDValue Res = LowerDIVREM24(Op, DAG, true)) 2349 return Res; 2350 } 2351 2352 if (VT == MVT::i64 && 2353 DAG.ComputeNumSignBits(LHS) > 32 && 2354 DAG.ComputeNumSignBits(RHS) > 32) { 2355 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); 2356 2357 //HiLo split 2358 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); 2359 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); 2360 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), 2361 LHS_Lo, RHS_Lo); 2362 SDValue Res[2] = { 2363 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), 2364 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) 2365 }; 2366 return DAG.getMergeValues(Res, DL); 2367 } 2368 2369 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); 2370 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); 2371 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); 2372 SDValue RSign = LHSign; // Remainder sign is the same as LHS 2373 2374 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); 2375 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); 2376 2377 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); 2378 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); 2379 2380 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); 2381 SDValue Rem = Div.getValue(1); 2382 2383 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); 2384 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); 2385 2386 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); 2387 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); 2388 2389 SDValue Res[2] = { 2390 Div, 2391 Rem 2392 }; 2393 return DAG.getMergeValues(Res, DL); 2394 } 2395 2396 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x) 2397 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { 2398 SDLoc SL(Op); 2399 EVT VT = Op.getValueType(); 2400 auto Flags = Op->getFlags(); 2401 SDValue X = Op.getOperand(0); 2402 SDValue Y = Op.getOperand(1); 2403 2404 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags); 2405 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags); 2406 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags); 2407 // TODO: For f32 use FMAD instead if !hasFastFMA32? 2408 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags); 2409 } 2410 2411 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { 2412 SDLoc SL(Op); 2413 SDValue Src = Op.getOperand(0); 2414 2415 // result = trunc(src) 2416 // if (src > 0.0 && src != result) 2417 // result += 1.0 2418 2419 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 2420 2421 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); 2422 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); 2423 2424 EVT SetCCVT = 2425 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); 2426 2427 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT); 2428 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); 2429 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); 2430 2431 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); 2432 // TODO: Should this propagate fast-math-flags? 2433 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); 2434 } 2435 2436 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, 2437 SelectionDAG &DAG) { 2438 const unsigned FractBits = 52; 2439 const unsigned ExpBits = 11; 2440 2441 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, 2442 Hi, 2443 DAG.getConstant(FractBits - 32, SL, MVT::i32), 2444 DAG.getConstant(ExpBits, SL, MVT::i32)); 2445 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, 2446 DAG.getConstant(1023, SL, MVT::i32)); 2447 2448 return Exp; 2449 } 2450 2451 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { 2452 SDLoc SL(Op); 2453 SDValue Src = Op.getOperand(0); 2454 2455 assert(Op.getValueType() == MVT::f64); 2456 2457 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 2458 2459 // Extract the upper half, since this is where we will find the sign and 2460 // exponent. 2461 SDValue Hi = getHiHalf64(Src, DAG); 2462 2463 SDValue Exp = extractF64Exponent(Hi, SL, DAG); 2464 2465 const unsigned FractBits = 52; 2466 2467 // Extract the sign bit. 2468 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32); 2469 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); 2470 2471 // Extend back to 64-bits. 2472 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit}); 2473 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); 2474 2475 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); 2476 const SDValue FractMask 2477 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64); 2478 2479 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); 2480 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); 2481 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); 2482 2483 EVT SetCCVT = 2484 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); 2485 2486 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32); 2487 2488 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); 2489 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); 2490 2491 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0); 2492 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1); 2493 2494 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); 2495 } 2496 2497 SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op, 2498 SelectionDAG &DAG) const { 2499 SDLoc SL(Op); 2500 SDValue Src = Op.getOperand(0); 2501 2502 assert(Op.getValueType() == MVT::f64); 2503 2504 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2505 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); 2506 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); 2507 2508 // TODO: Should this propagate fast-math-flags? 2509 2510 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); 2511 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); 2512 2513 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); 2514 2515 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2516 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); 2517 2518 EVT SetCCVT = 2519 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); 2520 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); 2521 2522 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); 2523 } 2524 2525 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, 2526 SelectionDAG &DAG) const { 2527 // FNEARBYINT and FRINT are the same, except in their handling of FP 2528 // exceptions. Those aren't really meaningful for us, and OpenCL only has 2529 // rint, so just treat them as equivalent. 2530 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(), 2531 Op.getOperand(0)); 2532 } 2533 2534 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { 2535 auto VT = Op.getValueType(); 2536 auto Arg = Op.getOperand(0u); 2537 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg); 2538 } 2539 2540 // XXX - May require not supporting f32 denormals? 2541 2542 // Don't handle v2f16. The extra instructions to scalarize and repack around the 2543 // compare and vselect end up producing worse code than scalarizing the whole 2544 // operation. 2545 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { 2546 SDLoc SL(Op); 2547 SDValue X = Op.getOperand(0); 2548 EVT VT = Op.getValueType(); 2549 2550 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X); 2551 2552 // TODO: Should this propagate fast-math-flags? 2553 2554 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T); 2555 2556 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff); 2557 2558 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT); 2559 const SDValue One = DAG.getConstantFP(1.0, SL, VT); 2560 2561 EVT SetCCVT = 2562 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2563 2564 const SDValue Half = DAG.getConstantFP(0.5, SL, VT); 2565 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); 2566 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero); 2567 2568 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X); 2569 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset); 2570 } 2571 2572 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { 2573 SDLoc SL(Op); 2574 SDValue Src = Op.getOperand(0); 2575 2576 // result = trunc(src); 2577 // if (src < 0.0 && src != result) 2578 // result += -1.0. 2579 2580 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 2581 2582 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); 2583 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64); 2584 2585 EVT SetCCVT = 2586 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64); 2587 2588 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); 2589 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); 2590 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); 2591 2592 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); 2593 // TODO: Should this propagate fast-math-flags? 2594 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); 2595 } 2596 2597 /// Return true if it's known that \p Src can never be an f32 denormal value. 2598 static bool valueIsKnownNeverF32Denorm(SDValue Src) { 2599 switch (Src.getOpcode()) { 2600 case ISD::FP_EXTEND: 2601 return Src.getOperand(0).getValueType() == MVT::f16; 2602 case ISD::FP16_TO_FP: 2603 case ISD::FFREXP: 2604 return true; 2605 case ISD::INTRINSIC_WO_CHAIN: { 2606 unsigned IntrinsicID = Src.getConstantOperandVal(0); 2607 switch (IntrinsicID) { 2608 case Intrinsic::amdgcn_frexp_mant: 2609 return true; 2610 default: 2611 return false; 2612 } 2613 } 2614 default: 2615 return false; 2616 } 2617 2618 llvm_unreachable("covered opcode switch"); 2619 } 2620 2621 bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG, 2622 SDNodeFlags Flags) { 2623 if (Flags.hasApproximateFuncs()) 2624 return true; 2625 auto &Options = DAG.getTarget().Options; 2626 return Options.UnsafeFPMath || Options.ApproxFuncFPMath; 2627 } 2628 2629 bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG, 2630 SDValue Src, 2631 SDNodeFlags Flags) { 2632 return !valueIsKnownNeverF32Denorm(Src) && 2633 DAG.getMachineFunction() 2634 .getDenormalMode(APFloat::IEEEsingle()) 2635 .Input != DenormalMode::PreserveSign; 2636 } 2637 2638 SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG, 2639 SDValue Src, 2640 SDNodeFlags Flags) const { 2641 SDLoc SL(Src); 2642 EVT VT = Src.getValueType(); 2643 const fltSemantics &Semantics = VT.getFltSemantics(); 2644 SDValue SmallestNormal = 2645 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT); 2646 2647 // Want to scale denormals up, but negatives and 0 work just as well on the 2648 // scaled path. 2649 SDValue IsLtSmallestNormal = DAG.getSetCC( 2650 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src, 2651 SmallestNormal, ISD::SETOLT); 2652 2653 return IsLtSmallestNormal; 2654 } 2655 2656 SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src, 2657 SDNodeFlags Flags) const { 2658 SDLoc SL(Src); 2659 EVT VT = Src.getValueType(); 2660 const fltSemantics &Semantics = VT.getFltSemantics(); 2661 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT); 2662 2663 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags); 2664 SDValue IsFinite = DAG.getSetCC( 2665 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs, 2666 Inf, ISD::SETOLT); 2667 return IsFinite; 2668 } 2669 2670 /// If denormal handling is required return the scaled input to FLOG2, and the 2671 /// check for denormal range. Otherwise, return null values. 2672 std::pair<SDValue, SDValue> 2673 AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, 2674 SDValue Src, SDNodeFlags Flags) const { 2675 if (!needsDenormHandlingF32(DAG, Src, Flags)) 2676 return {}; 2677 2678 MVT VT = MVT::f32; 2679 const fltSemantics &Semantics = APFloat::IEEEsingle(); 2680 SDValue SmallestNormal = 2681 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT); 2682 2683 SDValue IsLtSmallestNormal = DAG.getSetCC( 2684 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src, 2685 SmallestNormal, ISD::SETOLT); 2686 2687 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT); 2688 SDValue One = DAG.getConstantFP(1.0, SL, VT); 2689 SDValue ScaleFactor = 2690 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags); 2691 2692 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags); 2693 return {ScaledInput, IsLtSmallestNormal}; 2694 } 2695 2696 SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const { 2697 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals. 2698 // If we have to handle denormals, scale up the input and adjust the result. 2699 2700 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0) 2701 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0) 2702 2703 SDLoc SL(Op); 2704 EVT VT = Op.getValueType(); 2705 SDValue Src = Op.getOperand(0); 2706 SDNodeFlags Flags = Op->getFlags(); 2707 2708 if (VT == MVT::f16) { 2709 // Nothing in half is a denormal when promoted to f32. 2710 assert(!Subtarget->has16BitInsts()); 2711 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags); 2712 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags); 2713 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log, 2714 DAG.getTargetConstant(0, SL, MVT::i32), Flags); 2715 } 2716 2717 auto [ScaledInput, IsLtSmallestNormal] = 2718 getScaledLogInput(DAG, SL, Src, Flags); 2719 if (!ScaledInput) 2720 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags); 2721 2722 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags); 2723 2724 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT); 2725 SDValue Zero = DAG.getConstantFP(0.0, SL, VT); 2726 SDValue ResultOffset = 2727 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero); 2728 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags); 2729 } 2730 2731 static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, 2732 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) { 2733 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags); 2734 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags); 2735 } 2736 2737 SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, 2738 SelectionDAG &DAG) const { 2739 SDValue X = Op.getOperand(0); 2740 EVT VT = Op.getValueType(); 2741 SDNodeFlags Flags = Op->getFlags(); 2742 SDLoc DL(Op); 2743 2744 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10; 2745 assert(IsLog10 || Op.getOpcode() == ISD::FLOG); 2746 2747 const auto &Options = getTargetMachine().Options; 2748 if (VT == MVT::f16 || Flags.hasApproximateFuncs() || 2749 Options.ApproxFuncFPMath || Options.UnsafeFPMath) { 2750 2751 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { 2752 // Log and multiply in f32 is good enough for f16. 2753 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags); 2754 } 2755 2756 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags); 2757 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { 2758 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered, 2759 DAG.getTargetConstant(0, DL, MVT::i32), Flags); 2760 } 2761 2762 return Lowered; 2763 } 2764 2765 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags); 2766 if (ScaledInput) 2767 X = ScaledInput; 2768 2769 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags); 2770 2771 SDValue R; 2772 if (Subtarget->hasFastFMAF32()) { 2773 // c+cc are ln(2)/ln(10) to more than 49 bits 2774 const float c_log10 = 0x1.344134p-2f; 2775 const float cc_log10 = 0x1.09f79ep-26f; 2776 2777 // c + cc is ln(2) to more than 49 bits 2778 const float c_log = 0x1.62e42ep-1f; 2779 const float cc_log = 0x1.efa39ep-25f; 2780 2781 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT); 2782 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT); 2783 2784 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags); 2785 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags); 2786 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags); 2787 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags); 2788 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags); 2789 } else { 2790 // ch+ct is ln(2)/ln(10) to more than 36 bits 2791 const float ch_log10 = 0x1.344000p-2f; 2792 const float ct_log10 = 0x1.3509f6p-18f; 2793 2794 // ch + ct is ln(2) to more than 36 bits 2795 const float ch_log = 0x1.62e000p-1f; 2796 const float ct_log = 0x1.0bfbe8p-15f; 2797 2798 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT); 2799 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT); 2800 2801 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y); 2802 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32); 2803 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst); 2804 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt); 2805 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags); 2806 2807 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags); 2808 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags); 2809 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags); 2810 R = getMad(DAG, DL, VT, YH, CH, Mad1); 2811 } 2812 2813 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && 2814 (Flags.hasNoInfs() || Options.NoInfsFPMath); 2815 2816 // TODO: Check if known finite from source value. 2817 if (!IsFiniteOnly) { 2818 SDValue IsFinite = getIsFinite(DAG, Y, Flags); 2819 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags); 2820 } 2821 2822 if (IsScaled) { 2823 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT); 2824 SDValue ShiftK = 2825 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT); 2826 SDValue Shift = 2827 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags); 2828 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags); 2829 } 2830 2831 return R; 2832 } 2833 2834 SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const { 2835 return LowerFLOGCommon(Op, DAG); 2836 } 2837 2838 // Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a 2839 // promote f16 operation. 2840 SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL, 2841 SelectionDAG &DAG, bool IsLog10, 2842 SDNodeFlags Flags) const { 2843 EVT VT = Src.getValueType(); 2844 unsigned LogOp = 2845 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2; 2846 2847 double Log2BaseInverted = 2848 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2; 2849 2850 if (VT == MVT::f32) { 2851 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags); 2852 if (ScaledInput) { 2853 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags); 2854 SDValue ScaledResultOffset = 2855 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT); 2856 2857 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT); 2858 2859 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled, 2860 ScaledResultOffset, Zero, Flags); 2861 2862 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT); 2863 2864 if (Subtarget->hasFastFMAF32()) 2865 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset, 2866 Flags); 2867 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags); 2868 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset); 2869 } 2870 } 2871 2872 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags); 2873 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT); 2874 2875 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand, 2876 Flags); 2877 } 2878 2879 SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const { 2880 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals. 2881 // If we have to handle denormals, scale up the input and adjust the result. 2882 2883 SDLoc SL(Op); 2884 EVT VT = Op.getValueType(); 2885 SDValue Src = Op.getOperand(0); 2886 SDNodeFlags Flags = Op->getFlags(); 2887 2888 if (VT == MVT::f16) { 2889 // Nothing in half is a denormal when promoted to f32. 2890 assert(!Subtarget->has16BitInsts()); 2891 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags); 2892 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags); 2893 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log, 2894 DAG.getTargetConstant(0, SL, MVT::i32), Flags); 2895 } 2896 2897 assert(VT == MVT::f32); 2898 2899 if (!needsDenormHandlingF32(DAG, Src, Flags)) 2900 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags); 2901 2902 // bool needs_scaling = x < -0x1.f80000p+6f; 2903 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f); 2904 2905 // -nextafter(128.0, -1) 2906 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT); 2907 2908 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2909 2910 SDValue NeedsScaling = 2911 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT); 2912 2913 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT); 2914 SDValue Zero = DAG.getConstantFP(0.0, SL, VT); 2915 2916 SDValue AddOffset = 2917 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero); 2918 2919 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags); 2920 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags); 2921 2922 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT); 2923 SDValue One = DAG.getConstantFP(1.0, SL, VT); 2924 SDValue ResultScale = 2925 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One); 2926 2927 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags); 2928 } 2929 2930 SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL, 2931 SelectionDAG &DAG, 2932 SDNodeFlags Flags) const { 2933 EVT VT = X.getValueType(); 2934 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT); 2935 2936 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) { 2937 // exp2(M_LOG2E_F * f); 2938 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags); 2939 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP 2940 : (unsigned)ISD::FEXP2, 2941 SL, VT, Mul, Flags); 2942 } 2943 2944 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2945 2946 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT); 2947 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT); 2948 2949 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT); 2950 2951 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags); 2952 2953 SDValue AdjustedX = 2954 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X); 2955 2956 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags); 2957 2958 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags); 2959 2960 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT); 2961 SDValue AdjustedResult = 2962 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags); 2963 2964 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2, 2965 Flags); 2966 } 2967 2968 /// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be 2969 /// handled correctly. 2970 SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL, 2971 SelectionDAG &DAG, 2972 SDNodeFlags Flags) const { 2973 const EVT VT = X.getValueType(); 2974 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2; 2975 2976 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) { 2977 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f); 2978 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT); 2979 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT); 2980 2981 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags); 2982 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags); 2983 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags); 2984 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags); 2985 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1); 2986 } 2987 2988 // bool s = x < -0x1.2f7030p+5f; 2989 // x += s ? 0x1.0p+5f : 0.0f; 2990 // exp10 = exp2(x * 0x1.a92000p+1f) * 2991 // exp2(x * 0x1.4f0978p-11f) * 2992 // (s ? 0x1.9f623ep-107f : 1.0f); 2993 2994 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2995 2996 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT); 2997 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT); 2998 2999 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT); 3000 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags); 3001 SDValue AdjustedX = 3002 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X); 3003 3004 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT); 3005 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT); 3006 3007 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags); 3008 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags); 3009 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags); 3010 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags); 3011 3012 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags); 3013 3014 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT); 3015 SDValue AdjustedResult = 3016 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags); 3017 3018 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps, 3019 Flags); 3020 } 3021 3022 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { 3023 EVT VT = Op.getValueType(); 3024 SDLoc SL(Op); 3025 SDValue X = Op.getOperand(0); 3026 SDNodeFlags Flags = Op->getFlags(); 3027 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10; 3028 3029 if (VT.getScalarType() == MVT::f16) { 3030 // v_exp_f16 (fmul x, log2e) 3031 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast? 3032 return lowerFEXPUnsafe(X, SL, DAG, Flags); 3033 3034 if (VT.isVector()) 3035 return SDValue(); 3036 3037 // exp(f16 x) -> 3038 // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) 3039 3040 // Nothing in half is a denormal when promoted to f32. 3041 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags); 3042 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags); 3043 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered, 3044 DAG.getTargetConstant(0, SL, MVT::i32), Flags); 3045 } 3046 3047 assert(VT == MVT::f32); 3048 3049 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying 3050 // library behavior. Also, is known-not-daz source sufficient? 3051 if (allowApproxFunc(DAG, Flags)) { 3052 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags) 3053 : lowerFEXPUnsafe(X, SL, DAG, Flags); 3054 } 3055 3056 // Algorithm: 3057 // 3058 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) 3059 // 3060 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer 3061 // n = 64*m + j, 0 <= j < 64 3062 // 3063 // e^x = 2^((64*m + j + f)/64) 3064 // = (2^m) * (2^(j/64)) * 2^(f/64) 3065 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) 3066 // 3067 // f = x*(64/ln(2)) - n 3068 // r = f*(ln(2)/64) = x - n*(ln(2)/64) 3069 // 3070 // e^x = (2^m) * (2^(j/64)) * e^r 3071 // 3072 // (2^(j/64)) is precomputed 3073 // 3074 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3075 // e^r = 1 + q 3076 // 3077 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3078 // 3079 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) 3080 SDNodeFlags FlagsNoContract = Flags; 3081 FlagsNoContract.setAllowContract(false); 3082 3083 SDValue PH, PL; 3084 if (Subtarget->hasFastFMAF32()) { 3085 const float c_exp = numbers::log2ef; 3086 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits 3087 const float c_exp10 = 0x1.a934f0p+1f; 3088 const float cc_exp10 = 0x1.2f346ep-24f; 3089 3090 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT); 3091 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT); 3092 3093 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags); 3094 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags); 3095 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags); 3096 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags); 3097 } else { 3098 const float ch_exp = 0x1.714000p+0f; 3099 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits 3100 3101 const float ch_exp10 = 0x1.a92000p+1f; 3102 const float cl_exp10 = 0x1.4f0978p-11f; 3103 3104 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT); 3105 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT); 3106 3107 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X); 3108 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32); 3109 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst); 3110 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt); 3111 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags); 3112 3113 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags); 3114 3115 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags); 3116 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags); 3117 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags); 3118 } 3119 3120 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags); 3121 3122 // It is unsafe to contract this fsub into the PH multiply. 3123 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract); 3124 3125 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags); 3126 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E); 3127 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags); 3128 3129 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags); 3130 3131 SDValue UnderflowCheckConst = 3132 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT); 3133 3134 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 3135 SDValue Zero = DAG.getConstantFP(0.0, SL, VT); 3136 SDValue Underflow = 3137 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT); 3138 3139 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R); 3140 const auto &Options = getTargetMachine().Options; 3141 3142 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) { 3143 SDValue OverflowCheckConst = 3144 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT); 3145 SDValue Overflow = 3146 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT); 3147 SDValue Inf = 3148 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT); 3149 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R); 3150 } 3151 3152 return R; 3153 } 3154 3155 static bool isCtlzOpc(unsigned Opc) { 3156 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; 3157 } 3158 3159 static bool isCttzOpc(unsigned Opc) { 3160 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; 3161 } 3162 3163 SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op, 3164 SelectionDAG &DAG) const { 3165 auto SL = SDLoc(Op); 3166 auto Opc = Op.getOpcode(); 3167 auto Arg = Op.getOperand(0u); 3168 auto ResultVT = Op.getValueType(); 3169 3170 if (ResultVT != MVT::i8 && ResultVT != MVT::i16) 3171 return {}; 3172 3173 assert(isCtlzOpc(Opc)); 3174 assert(ResultVT == Arg.getValueType()); 3175 3176 const uint64_t NumBits = ResultVT.getFixedSizeInBits(); 3177 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32); 3178 SDValue NewOp; 3179 3180 if (Opc == ISD::CTLZ_ZERO_UNDEF) { 3181 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg); 3182 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits); 3183 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp); 3184 } else { 3185 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg); 3186 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp); 3187 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits); 3188 } 3189 3190 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp); 3191 } 3192 3193 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const { 3194 SDLoc SL(Op); 3195 SDValue Src = Op.getOperand(0); 3196 3197 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode())); 3198 bool Ctlz = isCtlzOpc(Op.getOpcode()); 3199 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32; 3200 3201 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF || 3202 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF; 3203 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64; 3204 3205 if (Src.getValueType() == MVT::i32 || Is64BitScalar) { 3206 // (ctlz hi:lo) -> (umin (ffbh src), 32) 3207 // (cttz hi:lo) -> (umin (ffbl src), 32) 3208 // (ctlz_zero_undef src) -> (ffbh src) 3209 // (cttz_zero_undef src) -> (ffbl src) 3210 3211 // 64-bit scalar version produce 32-bit result 3212 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64) 3213 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64) 3214 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src) 3215 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src) 3216 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src); 3217 if (!ZeroUndef) { 3218 const SDValue ConstVal = DAG.getConstant( 3219 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32); 3220 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal); 3221 } 3222 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr); 3223 } 3224 3225 SDValue Lo, Hi; 3226 std::tie(Lo, Hi) = split64BitValue(Src, DAG); 3227 3228 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo); 3229 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi); 3230 3231 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64) 3232 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64) 3233 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) 3234 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) 3235 3236 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT; 3237 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32); 3238 if (Ctlz) 3239 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32); 3240 else 3241 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32); 3242 3243 SDValue NewOpr; 3244 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi); 3245 if (!ZeroUndef) { 3246 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32); 3247 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64); 3248 } 3249 3250 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr); 3251 } 3252 3253 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, 3254 bool Signed) const { 3255 // The regular method converting a 64-bit integer to float roughly consists of 3256 // 2 steps: normalization and rounding. In fact, after normalization, the 3257 // conversion from a 64-bit integer to a float is essentially the same as the 3258 // one from a 32-bit integer. The only difference is that it has more 3259 // trailing bits to be rounded. To leverage the native 32-bit conversion, a 3260 // 64-bit integer could be preprocessed and fit into a 32-bit integer then 3261 // converted into the correct float number. The basic steps for the unsigned 3262 // conversion are illustrated in the following pseudo code: 3263 // 3264 // f32 uitofp(i64 u) { 3265 // i32 hi, lo = split(u); 3266 // // Only count the leading zeros in hi as we have native support of the 3267 // // conversion from i32 to f32. If hi is all 0s, the conversion is 3268 // // reduced to a 32-bit one automatically. 3269 // i32 shamt = clz(hi); // Return 32 if hi is all 0s. 3270 // u <<= shamt; 3271 // hi, lo = split(u); 3272 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo. 3273 // // convert it as a 32-bit integer and scale the result back. 3274 // return uitofp(hi) * 2^(32 - shamt); 3275 // } 3276 // 3277 // The signed one follows the same principle but uses 'ffbh_i32' to count its 3278 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is 3279 // converted instead followed by negation based its sign bit. 3280 3281 SDLoc SL(Op); 3282 SDValue Src = Op.getOperand(0); 3283 3284 SDValue Lo, Hi; 3285 std::tie(Lo, Hi) = split64BitValue(Src, DAG); 3286 SDValue Sign; 3287 SDValue ShAmt; 3288 if (Signed && Subtarget->isGCN()) { 3289 // We also need to consider the sign bit in Lo if Hi has just sign bits, 3290 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into 3291 // account. That is, the maximal shift is 3292 // - 32 if Lo and Hi have opposite signs; 3293 // - 33 if Lo and Hi have the same sign. 3294 // 3295 // Or, MaxShAmt = 33 + OppositeSign, where 3296 // 3297 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is 3298 // - -1 if Lo and Hi have opposite signs; and 3299 // - 0 otherwise. 3300 // 3301 // All in all, ShAmt is calculated as 3302 // 3303 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1. 3304 // 3305 // or 3306 // 3307 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31). 3308 // 3309 // to reduce the critical path. 3310 SDValue OppositeSign = DAG.getNode( 3311 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi), 3312 DAG.getConstant(31, SL, MVT::i32)); 3313 SDValue MaxShAmt = 3314 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32), 3315 OppositeSign); 3316 // Count the leading sign bits. 3317 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi); 3318 // Different from unsigned conversion, the shift should be one bit less to 3319 // preserve the sign bit. 3320 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt, 3321 DAG.getConstant(1, SL, MVT::i32)); 3322 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt); 3323 } else { 3324 if (Signed) { 3325 // Without 'ffbh_i32', only leading zeros could be counted. Take the 3326 // absolute value first. 3327 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src, 3328 DAG.getConstant(63, SL, MVT::i64)); 3329 SDValue Abs = 3330 DAG.getNode(ISD::XOR, SL, MVT::i64, 3331 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign); 3332 std::tie(Lo, Hi) = split64BitValue(Abs, DAG); 3333 } 3334 // Count the leading zeros. 3335 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi); 3336 // The shift amount for signed integers is [0, 32]. 3337 } 3338 // Normalize the given 64-bit integer. 3339 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt); 3340 // Split it again. 3341 std::tie(Lo, Hi) = split64BitValue(Norm, DAG); 3342 // Calculate the adjust bit for rounding. 3343 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo) 3344 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32, 3345 DAG.getConstant(1, SL, MVT::i32), Lo); 3346 // Get the 32-bit normalized integer. 3347 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust); 3348 // Convert the normalized 32-bit integer into f32. 3349 unsigned Opc = 3350 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; 3351 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm); 3352 3353 // Finally, need to scale back the converted floating number as the original 3354 // 64-bit integer is converted as a 32-bit one. 3355 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32), 3356 ShAmt); 3357 // On GCN, use LDEXP directly. 3358 if (Subtarget->isGCN()) 3359 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt); 3360 3361 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent 3362 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit 3363 // exponent is enough to avoid overflowing into the sign bit. 3364 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt, 3365 DAG.getConstant(23, SL, MVT::i32)); 3366 SDValue IVal = 3367 DAG.getNode(ISD::ADD, SL, MVT::i32, 3368 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp); 3369 if (Signed) { 3370 // Set the sign bit. 3371 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32, 3372 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign), 3373 DAG.getConstant(31, SL, MVT::i32)); 3374 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign); 3375 } 3376 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal); 3377 } 3378 3379 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, 3380 bool Signed) const { 3381 SDLoc SL(Op); 3382 SDValue Src = Op.getOperand(0); 3383 3384 SDValue Lo, Hi; 3385 std::tie(Lo, Hi) = split64BitValue(Src, DAG); 3386 3387 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, 3388 SL, MVT::f64, Hi); 3389 3390 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); 3391 3392 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi, 3393 DAG.getConstant(32, SL, MVT::i32)); 3394 // TODO: Should this propagate fast-math-flags? 3395 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); 3396 } 3397 3398 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, 3399 SelectionDAG &DAG) const { 3400 // TODO: Factor out code common with LowerSINT_TO_FP. 3401 EVT DestVT = Op.getValueType(); 3402 SDValue Src = Op.getOperand(0); 3403 EVT SrcVT = Src.getValueType(); 3404 3405 if (SrcVT == MVT::i16) { 3406 if (DestVT == MVT::f16) 3407 return Op; 3408 SDLoc DL(Op); 3409 3410 // Promote src to i32 3411 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src); 3412 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext); 3413 } 3414 3415 if (DestVT == MVT::bf16) { 3416 SDLoc SL(Op); 3417 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src); 3418 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true); 3419 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag); 3420 } 3421 3422 if (SrcVT != MVT::i64) 3423 return Op; 3424 3425 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { 3426 SDLoc DL(Op); 3427 3428 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); 3429 SDValue FPRoundFlag = 3430 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true); 3431 SDValue FPRound = 3432 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); 3433 3434 return FPRound; 3435 } 3436 3437 if (DestVT == MVT::f32) 3438 return LowerINT_TO_FP32(Op, DAG, false); 3439 3440 assert(DestVT == MVT::f64); 3441 return LowerINT_TO_FP64(Op, DAG, false); 3442 } 3443 3444 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, 3445 SelectionDAG &DAG) const { 3446 EVT DestVT = Op.getValueType(); 3447 3448 SDValue Src = Op.getOperand(0); 3449 EVT SrcVT = Src.getValueType(); 3450 3451 if (SrcVT == MVT::i16) { 3452 if (DestVT == MVT::f16) 3453 return Op; 3454 3455 SDLoc DL(Op); 3456 // Promote src to i32 3457 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src); 3458 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext); 3459 } 3460 3461 if (DestVT == MVT::bf16) { 3462 SDLoc SL(Op); 3463 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src); 3464 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true); 3465 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag); 3466 } 3467 3468 if (SrcVT != MVT::i64) 3469 return Op; 3470 3471 // TODO: Factor out code common with LowerUINT_TO_FP. 3472 3473 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { 3474 SDLoc DL(Op); 3475 SDValue Src = Op.getOperand(0); 3476 3477 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); 3478 SDValue FPRoundFlag = 3479 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true); 3480 SDValue FPRound = 3481 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); 3482 3483 return FPRound; 3484 } 3485 3486 if (DestVT == MVT::f32) 3487 return LowerINT_TO_FP32(Op, DAG, true); 3488 3489 assert(DestVT == MVT::f64); 3490 return LowerINT_TO_FP64(Op, DAG, true); 3491 } 3492 3493 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, 3494 bool Signed) const { 3495 SDLoc SL(Op); 3496 3497 SDValue Src = Op.getOperand(0); 3498 EVT SrcVT = Src.getValueType(); 3499 3500 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64); 3501 3502 // The basic idea of converting a floating point number into a pair of 32-bit 3503 // integers is illustrated as follows: 3504 // 3505 // tf := trunc(val); 3506 // hif := floor(tf * 2^-32); 3507 // lof := tf - hif * 2^32; // lof is always positive due to floor. 3508 // hi := fptoi(hif); 3509 // lo := fptoi(lof); 3510 // 3511 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src); 3512 SDValue Sign; 3513 if (Signed && SrcVT == MVT::f32) { 3514 // However, a 32-bit floating point number has only 23 bits mantissa and 3515 // it's not enough to hold all the significant bits of `lof` if val is 3516 // negative. To avoid the loss of precision, We need to take the absolute 3517 // value after truncating and flip the result back based on the original 3518 // signedness. 3519 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32, 3520 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc), 3521 DAG.getConstant(31, SL, MVT::i32)); 3522 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc); 3523 } 3524 3525 SDValue K0, K1; 3526 if (SrcVT == MVT::f64) { 3527 K0 = DAG.getConstantFP( 3528 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL, 3529 SrcVT); 3530 K1 = DAG.getConstantFP( 3531 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL, 3532 SrcVT); 3533 } else { 3534 K0 = DAG.getConstantFP( 3535 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT); 3536 K1 = DAG.getConstantFP( 3537 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT); 3538 } 3539 // TODO: Should this propagate fast-math-flags? 3540 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0); 3541 3542 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul); 3543 3544 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc); 3545 3546 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT 3547 : ISD::FP_TO_UINT, 3548 SL, MVT::i32, FloorMul); 3549 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); 3550 3551 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, 3552 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi})); 3553 3554 if (Signed && SrcVT == MVT::f32) { 3555 assert(Sign); 3556 // Flip the result based on the signedness, which is either all 0s or 1s. 3557 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64, 3558 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign})); 3559 // r := xor(r, sign) - sign; 3560 Result = 3561 DAG.getNode(ISD::SUB, SL, MVT::i64, 3562 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign); 3563 } 3564 3565 return Result; 3566 } 3567 3568 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const { 3569 SDLoc DL(Op); 3570 SDValue N0 = Op.getOperand(0); 3571 3572 // Convert to target node to get known bits 3573 if (N0.getValueType() == MVT::f32) 3574 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0); 3575 3576 if (getTargetMachine().Options.UnsafeFPMath) { 3577 // There is a generic expand for FP_TO_FP16 with unsafe fast math. 3578 return SDValue(); 3579 } 3580 3581 assert(N0.getSimpleValueType() == MVT::f64); 3582 3583 // f64 -> f16 conversion using round-to-nearest-even rounding mode. 3584 const unsigned ExpMask = 0x7ff; 3585 const unsigned ExpBiasf64 = 1023; 3586 const unsigned ExpBiasf16 = 15; 3587 SDValue Zero = DAG.getConstant(0, DL, MVT::i32); 3588 SDValue One = DAG.getConstant(1, DL, MVT::i32); 3589 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0); 3590 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U, 3591 DAG.getConstant(32, DL, MVT::i64)); 3592 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32); 3593 U = DAG.getZExtOrTrunc(U, DL, MVT::i32); 3594 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 3595 DAG.getConstant(20, DL, MVT::i64)); 3596 E = DAG.getNode(ISD::AND, DL, MVT::i32, E, 3597 DAG.getConstant(ExpMask, DL, MVT::i32)); 3598 // Subtract the fp64 exponent bias (1023) to get the real exponent and 3599 // add the f16 bias (15) to get the biased exponent for the f16 format. 3600 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E, 3601 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32)); 3602 3603 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 3604 DAG.getConstant(8, DL, MVT::i32)); 3605 M = DAG.getNode(ISD::AND, DL, MVT::i32, M, 3606 DAG.getConstant(0xffe, DL, MVT::i32)); 3607 3608 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH, 3609 DAG.getConstant(0x1ff, DL, MVT::i32)); 3610 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U); 3611 3612 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ); 3613 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set); 3614 3615 // (M != 0 ? 0x0200 : 0) | 0x7c00; 3616 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32, 3617 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32), 3618 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32)); 3619 3620 // N = M | (E << 12); 3621 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M, 3622 DAG.getNode(ISD::SHL, DL, MVT::i32, E, 3623 DAG.getConstant(12, DL, MVT::i32))); 3624 3625 // B = clamp(1-E, 0, 13); 3626 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32, 3627 One, E); 3628 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero); 3629 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B, 3630 DAG.getConstant(13, DL, MVT::i32)); 3631 3632 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M, 3633 DAG.getConstant(0x1000, DL, MVT::i32)); 3634 3635 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B); 3636 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B); 3637 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE); 3638 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1); 3639 3640 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT); 3641 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V, 3642 DAG.getConstant(0x7, DL, MVT::i32)); 3643 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V, 3644 DAG.getConstant(2, DL, MVT::i32)); 3645 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32), 3646 One, Zero, ISD::SETEQ); 3647 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32), 3648 One, Zero, ISD::SETGT); 3649 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1); 3650 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1); 3651 3652 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32), 3653 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT); 3654 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32), 3655 I, V, ISD::SETEQ); 3656 3657 // Extract the sign bit. 3658 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH, 3659 DAG.getConstant(16, DL, MVT::i32)); 3660 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign, 3661 DAG.getConstant(0x8000, DL, MVT::i32)); 3662 3663 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V); 3664 return DAG.getZExtOrTrunc(V, DL, Op.getValueType()); 3665 } 3666 3667 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op, 3668 SelectionDAG &DAG) const { 3669 SDValue Src = Op.getOperand(0); 3670 unsigned OpOpcode = Op.getOpcode(); 3671 EVT SrcVT = Src.getValueType(); 3672 EVT DestVT = Op.getValueType(); 3673 3674 // Will be selected natively 3675 if (SrcVT == MVT::f16 && DestVT == MVT::i16) 3676 return Op; 3677 3678 if (SrcVT == MVT::bf16) { 3679 SDLoc DL(Op); 3680 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); 3681 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc); 3682 } 3683 3684 // Promote i16 to i32 3685 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { 3686 SDLoc DL(Op); 3687 3688 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src); 3689 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32); 3690 } 3691 3692 if (DestVT != MVT::i64) 3693 return Op; 3694 3695 if (SrcVT == MVT::f16 || 3696 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) { 3697 SDLoc DL(Op); 3698 3699 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src); 3700 unsigned Ext = 3701 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3702 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32); 3703 } 3704 3705 if (SrcVT == MVT::f32 || SrcVT == MVT::f64) 3706 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT); 3707 3708 return SDValue(); 3709 } 3710 3711 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 3712 SelectionDAG &DAG) const { 3713 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 3714 MVT VT = Op.getSimpleValueType(); 3715 MVT ScalarVT = VT.getScalarType(); 3716 3717 assert(VT.isVector()); 3718 3719 SDValue Src = Op.getOperand(0); 3720 SDLoc DL(Op); 3721 3722 // TODO: Don't scalarize on Evergreen? 3723 unsigned NElts = VT.getVectorNumElements(); 3724 SmallVector<SDValue, 8> Args; 3725 DAG.ExtractVectorElements(Src, Args, 0, NElts); 3726 3727 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); 3728 for (unsigned I = 0; I < NElts; ++I) 3729 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); 3730 3731 return DAG.getBuildVector(VT, DL, Args); 3732 } 3733 3734 //===----------------------------------------------------------------------===// 3735 // Custom DAG optimizations 3736 //===----------------------------------------------------------------------===// 3737 3738 static bool isU24(SDValue Op, SelectionDAG &DAG) { 3739 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24; 3740 } 3741 3742 static bool isI24(SDValue Op, SelectionDAG &DAG) { 3743 EVT VT = Op.getValueType(); 3744 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated 3745 // as unsigned 24-bit values. 3746 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24; 3747 } 3748 3749 static SDValue simplifyMul24(SDNode *Node24, 3750 TargetLowering::DAGCombinerInfo &DCI) { 3751 SelectionDAG &DAG = DCI.DAG; 3752 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 3753 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN; 3754 3755 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0); 3756 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1); 3757 unsigned NewOpcode = Node24->getOpcode(); 3758 if (IsIntrin) { 3759 unsigned IID = Node24->getConstantOperandVal(0); 3760 switch (IID) { 3761 case Intrinsic::amdgcn_mul_i24: 3762 NewOpcode = AMDGPUISD::MUL_I24; 3763 break; 3764 case Intrinsic::amdgcn_mul_u24: 3765 NewOpcode = AMDGPUISD::MUL_U24; 3766 break; 3767 case Intrinsic::amdgcn_mulhi_i24: 3768 NewOpcode = AMDGPUISD::MULHI_I24; 3769 break; 3770 case Intrinsic::amdgcn_mulhi_u24: 3771 NewOpcode = AMDGPUISD::MULHI_U24; 3772 break; 3773 default: 3774 llvm_unreachable("Expected 24-bit mul intrinsic"); 3775 } 3776 } 3777 3778 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24); 3779 3780 // First try to simplify using SimplifyMultipleUseDemandedBits which allows 3781 // the operands to have other uses, but will only perform simplifications that 3782 // involve bypassing some nodes for this user. 3783 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG); 3784 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG); 3785 if (DemandedLHS || DemandedRHS) 3786 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(), 3787 DemandedLHS ? DemandedLHS : LHS, 3788 DemandedRHS ? DemandedRHS : RHS); 3789 3790 // Now try SimplifyDemandedBits which can simplify the nodes used by our 3791 // operands if this node is the only user. 3792 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI)) 3793 return SDValue(Node24, 0); 3794 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI)) 3795 return SDValue(Node24, 0); 3796 3797 return SDValue(); 3798 } 3799 3800 template <typename IntTy> 3801 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, 3802 uint32_t Width, const SDLoc &DL) { 3803 if (Width + Offset < 32) { 3804 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); 3805 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); 3806 if constexpr (std::is_signed_v<IntTy>) { 3807 return DAG.getSignedConstant(Result, DL, MVT::i32); 3808 } else { 3809 return DAG.getConstant(Result, DL, MVT::i32); 3810 } 3811 } 3812 3813 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); 3814 } 3815 3816 static bool hasVolatileUser(SDNode *Val) { 3817 for (SDNode *U : Val->users()) { 3818 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) { 3819 if (M->isVolatile()) 3820 return true; 3821 } 3822 } 3823 3824 return false; 3825 } 3826 3827 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const { 3828 // i32 vectors are the canonical memory type. 3829 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT)) 3830 return false; 3831 3832 if (!VT.isByteSized()) 3833 return false; 3834 3835 unsigned Size = VT.getStoreSize(); 3836 3837 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector()) 3838 return false; 3839 3840 if (Size == 3 || (Size > 4 && (Size % 4 != 0))) 3841 return false; 3842 3843 return true; 3844 } 3845 3846 // Replace load of an illegal type with a store of a bitcast to a friendlier 3847 // type. 3848 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, 3849 DAGCombinerInfo &DCI) const { 3850 if (!DCI.isBeforeLegalize()) 3851 return SDValue(); 3852 3853 LoadSDNode *LN = cast<LoadSDNode>(N); 3854 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN)) 3855 return SDValue(); 3856 3857 SDLoc SL(N); 3858 SelectionDAG &DAG = DCI.DAG; 3859 EVT VT = LN->getMemoryVT(); 3860 3861 unsigned Size = VT.getStoreSize(); 3862 Align Alignment = LN->getAlign(); 3863 if (Alignment < Size && isTypeLegal(VT)) { 3864 unsigned IsFast; 3865 unsigned AS = LN->getAddressSpace(); 3866 3867 // Expand unaligned loads earlier than legalization. Due to visitation order 3868 // problems during legalization, the emitted instructions to pack and unpack 3869 // the bytes again are not eliminated in the case of an unaligned copy. 3870 if (!allowsMisalignedMemoryAccesses( 3871 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) { 3872 if (VT.isVector()) 3873 return SplitVectorLoad(SDValue(LN, 0), DAG); 3874 3875 SDValue Ops[2]; 3876 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); 3877 3878 return DAG.getMergeValues(Ops, SDLoc(N)); 3879 } 3880 3881 if (!IsFast) 3882 return SDValue(); 3883 } 3884 3885 if (!shouldCombineMemoryType(VT)) 3886 return SDValue(); 3887 3888 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); 3889 3890 SDValue NewLoad 3891 = DAG.getLoad(NewVT, SL, LN->getChain(), 3892 LN->getBasePtr(), LN->getMemOperand()); 3893 3894 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad); 3895 DCI.CombineTo(N, BC, NewLoad.getValue(1)); 3896 return SDValue(N, 0); 3897 } 3898 3899 // Replace store of an illegal type with a store of a bitcast to a friendlier 3900 // type. 3901 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, 3902 DAGCombinerInfo &DCI) const { 3903 if (!DCI.isBeforeLegalize()) 3904 return SDValue(); 3905 3906 StoreSDNode *SN = cast<StoreSDNode>(N); 3907 if (!SN->isSimple() || !ISD::isNormalStore(SN)) 3908 return SDValue(); 3909 3910 EVT VT = SN->getMemoryVT(); 3911 unsigned Size = VT.getStoreSize(); 3912 3913 SDLoc SL(N); 3914 SelectionDAG &DAG = DCI.DAG; 3915 Align Alignment = SN->getAlign(); 3916 if (Alignment < Size && isTypeLegal(VT)) { 3917 unsigned IsFast; 3918 unsigned AS = SN->getAddressSpace(); 3919 3920 // Expand unaligned stores earlier than legalization. Due to visitation 3921 // order problems during legalization, the emitted instructions to pack and 3922 // unpack the bytes again are not eliminated in the case of an unaligned 3923 // copy. 3924 if (!allowsMisalignedMemoryAccesses( 3925 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) { 3926 if (VT.isVector()) 3927 return SplitVectorStore(SDValue(SN, 0), DAG); 3928 3929 return expandUnalignedStore(SN, DAG); 3930 } 3931 3932 if (!IsFast) 3933 return SDValue(); 3934 } 3935 3936 if (!shouldCombineMemoryType(VT)) 3937 return SDValue(); 3938 3939 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); 3940 SDValue Val = SN->getValue(); 3941 3942 //DCI.AddToWorklist(Val.getNode()); 3943 3944 bool OtherUses = !Val.hasOneUse(); 3945 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val); 3946 if (OtherUses) { 3947 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal); 3948 DAG.ReplaceAllUsesOfValueWith(Val, CastBack); 3949 } 3950 3951 return DAG.getStore(SN->getChain(), SL, CastVal, 3952 SN->getBasePtr(), SN->getMemOperand()); 3953 } 3954 3955 // FIXME: This should go in generic DAG combiner with an isTruncateFree check, 3956 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU 3957 // issues. 3958 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, 3959 DAGCombinerInfo &DCI) const { 3960 SelectionDAG &DAG = DCI.DAG; 3961 SDValue N0 = N->getOperand(0); 3962 3963 // (vt2 (assertzext (truncate vt0:x), vt1)) -> 3964 // (vt2 (truncate (assertzext vt0:x, vt1))) 3965 if (N0.getOpcode() == ISD::TRUNCATE) { 3966 SDValue N1 = N->getOperand(1); 3967 EVT ExtVT = cast<VTSDNode>(N1)->getVT(); 3968 SDLoc SL(N); 3969 3970 SDValue Src = N0.getOperand(0); 3971 EVT SrcVT = Src.getValueType(); 3972 if (SrcVT.bitsGE(ExtVT)) { 3973 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1); 3974 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg); 3975 } 3976 } 3977 3978 return SDValue(); 3979 } 3980 3981 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( 3982 SDNode *N, DAGCombinerInfo &DCI) const { 3983 unsigned IID = N->getConstantOperandVal(0); 3984 switch (IID) { 3985 case Intrinsic::amdgcn_mul_i24: 3986 case Intrinsic::amdgcn_mul_u24: 3987 case Intrinsic::amdgcn_mulhi_i24: 3988 case Intrinsic::amdgcn_mulhi_u24: 3989 return simplifyMul24(N, DCI); 3990 case Intrinsic::amdgcn_fract: 3991 case Intrinsic::amdgcn_rsq: 3992 case Intrinsic::amdgcn_rcp_legacy: 3993 case Intrinsic::amdgcn_rsq_legacy: 3994 case Intrinsic::amdgcn_rsq_clamp: { 3995 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted 3996 SDValue Src = N->getOperand(1); 3997 return Src.isUndef() ? Src : SDValue(); 3998 } 3999 case Intrinsic::amdgcn_frexp_exp: { 4000 // frexp_exp (fneg x) -> frexp_exp x 4001 // frexp_exp (fabs x) -> frexp_exp x 4002 // frexp_exp (fneg (fabs x)) -> frexp_exp x 4003 SDValue Src = N->getOperand(1); 4004 SDValue PeekSign = peekFPSignOps(Src); 4005 if (PeekSign == Src) 4006 return SDValue(); 4007 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign), 4008 0); 4009 } 4010 default: 4011 return SDValue(); 4012 } 4013 } 4014 4015 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the 4016 /// binary operation \p Opc to it with the corresponding constant operands. 4017 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( 4018 DAGCombinerInfo &DCI, const SDLoc &SL, 4019 unsigned Opc, SDValue LHS, 4020 uint32_t ValLo, uint32_t ValHi) const { 4021 SelectionDAG &DAG = DCI.DAG; 4022 SDValue Lo, Hi; 4023 std::tie(Lo, Hi) = split64BitValue(LHS, DAG); 4024 4025 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32); 4026 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32); 4027 4028 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS); 4029 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS); 4030 4031 // Re-visit the ands. It's possible we eliminated one of them and it could 4032 // simplify the vector. 4033 DCI.AddToWorklist(Lo.getNode()); 4034 DCI.AddToWorklist(Hi.getNode()); 4035 4036 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd}); 4037 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 4038 } 4039 4040 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, 4041 DAGCombinerInfo &DCI) const { 4042 EVT VT = N->getValueType(0); 4043 4044 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 4045 if (!RHS) 4046 return SDValue(); 4047 4048 SDValue LHS = N->getOperand(0); 4049 unsigned RHSVal = RHS->getZExtValue(); 4050 if (!RHSVal) 4051 return LHS; 4052 4053 SDLoc SL(N); 4054 SelectionDAG &DAG = DCI.DAG; 4055 4056 switch (LHS->getOpcode()) { 4057 default: 4058 break; 4059 case ISD::ZERO_EXTEND: 4060 case ISD::SIGN_EXTEND: 4061 case ISD::ANY_EXTEND: { 4062 SDValue X = LHS->getOperand(0); 4063 4064 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 && 4065 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) { 4066 // Prefer build_vector as the canonical form if packed types are legal. 4067 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x 4068 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL, 4069 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) }); 4070 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); 4071 } 4072 4073 // shl (ext x) => zext (shl x), if shift does not overflow int 4074 if (VT != MVT::i64) 4075 break; 4076 KnownBits Known = DAG.computeKnownBits(X); 4077 unsigned LZ = Known.countMinLeadingZeros(); 4078 if (LZ < RHSVal) 4079 break; 4080 EVT XVT = X.getValueType(); 4081 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0)); 4082 return DAG.getZExtOrTrunc(Shl, SL, VT); 4083 } 4084 } 4085 4086 if (VT != MVT::i64) 4087 return SDValue(); 4088 4089 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32)) 4090 4091 // On some subtargets, 64-bit shift is a quarter rate instruction. In the 4092 // common case, splitting this into a move and a 32-bit shift is faster and 4093 // the same code size. 4094 if (RHSVal < 32) 4095 return SDValue(); 4096 4097 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32); 4098 4099 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); 4100 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt); 4101 4102 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 4103 4104 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift}); 4105 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); 4106 } 4107 4108 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, 4109 DAGCombinerInfo &DCI) const { 4110 if (N->getValueType(0) != MVT::i64) 4111 return SDValue(); 4112 4113 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 4114 if (!RHS) 4115 return SDValue(); 4116 4117 SelectionDAG &DAG = DCI.DAG; 4118 SDLoc SL(N); 4119 unsigned RHSVal = RHS->getZExtValue(); 4120 4121 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31) 4122 if (RHSVal == 32) { 4123 SDValue Hi = getHiHalf64(N->getOperand(0), DAG); 4124 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi, 4125 DAG.getConstant(31, SL, MVT::i32)); 4126 4127 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift}); 4128 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec); 4129 } 4130 4131 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31) 4132 if (RHSVal == 63) { 4133 SDValue Hi = getHiHalf64(N->getOperand(0), DAG); 4134 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi, 4135 DAG.getConstant(31, SL, MVT::i32)); 4136 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift}); 4137 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec); 4138 } 4139 4140 return SDValue(); 4141 } 4142 4143 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, 4144 DAGCombinerInfo &DCI) const { 4145 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); 4146 if (!RHS) 4147 return SDValue(); 4148 4149 EVT VT = N->getValueType(0); 4150 SDValue LHS = N->getOperand(0); 4151 unsigned ShiftAmt = RHS->getZExtValue(); 4152 SelectionDAG &DAG = DCI.DAG; 4153 SDLoc SL(N); 4154 4155 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1) 4156 // this improves the ability to match BFE patterns in isel. 4157 if (LHS.getOpcode() == ISD::AND) { 4158 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) { 4159 unsigned MaskIdx, MaskLen; 4160 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) && 4161 MaskIdx == ShiftAmt) { 4162 return DAG.getNode( 4163 ISD::AND, SL, VT, 4164 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)), 4165 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1))); 4166 } 4167 } 4168 } 4169 4170 if (VT != MVT::i64) 4171 return SDValue(); 4172 4173 if (ShiftAmt < 32) 4174 return SDValue(); 4175 4176 // srl i64:x, C for C >= 32 4177 // => 4178 // build_pair (srl hi_32(x), C - 32), 0 4179 SDValue Zero = DAG.getConstant(0, SL, MVT::i32); 4180 4181 SDValue Hi = getHiHalf64(LHS, DAG); 4182 4183 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32); 4184 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst); 4185 4186 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero}); 4187 4188 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair); 4189 } 4190 4191 SDValue AMDGPUTargetLowering::performTruncateCombine( 4192 SDNode *N, DAGCombinerInfo &DCI) const { 4193 SDLoc SL(N); 4194 SelectionDAG &DAG = DCI.DAG; 4195 EVT VT = N->getValueType(0); 4196 SDValue Src = N->getOperand(0); 4197 4198 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x) 4199 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) { 4200 SDValue Vec = Src.getOperand(0); 4201 if (Vec.getOpcode() == ISD::BUILD_VECTOR) { 4202 SDValue Elt0 = Vec.getOperand(0); 4203 EVT EltVT = Elt0.getValueType(); 4204 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) { 4205 if (EltVT.isFloatingPoint()) { 4206 Elt0 = DAG.getNode(ISD::BITCAST, SL, 4207 EltVT.changeTypeToInteger(), Elt0); 4208 } 4209 4210 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0); 4211 } 4212 } 4213 } 4214 4215 // Equivalent of above for accessing the high element of a vector as an 4216 // integer operation. 4217 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y) 4218 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) { 4219 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) { 4220 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) { 4221 SDValue BV = stripBitcast(Src.getOperand(0)); 4222 if (BV.getOpcode() == ISD::BUILD_VECTOR && 4223 BV.getValueType().getVectorNumElements() == 2) { 4224 SDValue SrcElt = BV.getOperand(1); 4225 EVT SrcEltVT = SrcElt.getValueType(); 4226 if (SrcEltVT.isFloatingPoint()) { 4227 SrcElt = DAG.getNode(ISD::BITCAST, SL, 4228 SrcEltVT.changeTypeToInteger(), SrcElt); 4229 } 4230 4231 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt); 4232 } 4233 } 4234 } 4235 } 4236 4237 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit. 4238 // 4239 // i16 (trunc (srl i64:x, K)), K <= 16 -> 4240 // i16 (trunc (srl (i32 (trunc x), K))) 4241 if (VT.getScalarSizeInBits() < 32) { 4242 EVT SrcVT = Src.getValueType(); 4243 if (SrcVT.getScalarSizeInBits() > 32 && 4244 (Src.getOpcode() == ISD::SRL || 4245 Src.getOpcode() == ISD::SRA || 4246 Src.getOpcode() == ISD::SHL)) { 4247 SDValue Amt = Src.getOperand(1); 4248 KnownBits Known = DAG.computeKnownBits(Amt); 4249 4250 // - For left shifts, do the transform as long as the shift 4251 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31) 4252 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid 4253 // losing information stored in the high bits when truncating. 4254 const unsigned MaxCstSize = 4255 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits()); 4256 if (Known.getMaxValue().ule(MaxCstSize)) { 4257 EVT MidVT = VT.isVector() ? 4258 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4259 VT.getVectorNumElements()) : MVT::i32; 4260 4261 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout()); 4262 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT, 4263 Src.getOperand(0)); 4264 DCI.AddToWorklist(Trunc.getNode()); 4265 4266 if (Amt.getValueType() != NewShiftVT) { 4267 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT); 4268 DCI.AddToWorklist(Amt.getNode()); 4269 } 4270 4271 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT, 4272 Trunc, Amt); 4273 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift); 4274 } 4275 } 4276 } 4277 4278 return SDValue(); 4279 } 4280 4281 // We need to specifically handle i64 mul here to avoid unnecessary conversion 4282 // instructions. If we only match on the legalized i64 mul expansion, 4283 // SimplifyDemandedBits will be unable to remove them because there will be 4284 // multiple uses due to the separate mul + mulh[su]. 4285 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, 4286 SDValue N0, SDValue N1, unsigned Size, bool Signed) { 4287 if (Size <= 32) { 4288 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; 4289 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1); 4290 } 4291 4292 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24; 4293 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24; 4294 4295 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1); 4296 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1); 4297 4298 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi); 4299 } 4300 4301 /// If \p V is an add of a constant 1, returns the other operand. Otherwise 4302 /// return SDValue(). 4303 static SDValue getAddOneOp(const SDNode *V) { 4304 if (V->getOpcode() != ISD::ADD) 4305 return SDValue(); 4306 4307 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue(); 4308 } 4309 4310 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, 4311 DAGCombinerInfo &DCI) const { 4312 assert(N->getOpcode() == ISD::MUL); 4313 EVT VT = N->getValueType(0); 4314 4315 // Don't generate 24-bit multiplies on values that are in SGPRs, since 4316 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs 4317 // unnecessarily). isDivergent() is used as an approximation of whether the 4318 // value is in an SGPR. 4319 if (!N->isDivergent()) 4320 return SDValue(); 4321 4322 unsigned Size = VT.getSizeInBits(); 4323 if (VT.isVector() || Size > 64) 4324 return SDValue(); 4325 4326 SelectionDAG &DAG = DCI.DAG; 4327 SDLoc DL(N); 4328 4329 SDValue N0 = N->getOperand(0); 4330 SDValue N1 = N->getOperand(1); 4331 4332 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad 4333 // matching. 4334 4335 // mul x, (add y, 1) -> add (mul x, y), x 4336 auto IsFoldableAdd = [](SDValue V) -> SDValue { 4337 SDValue AddOp = getAddOneOp(V.getNode()); 4338 if (!AddOp) 4339 return SDValue(); 4340 4341 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool { 4342 return U->getOpcode() == ISD::MUL; 4343 })) 4344 return AddOp; 4345 4346 return SDValue(); 4347 }; 4348 4349 // FIXME: The selection pattern is not properly checking for commuted 4350 // operands, so we have to place the mul in the LHS 4351 if (SDValue MulOper = IsFoldableAdd(N0)) { 4352 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper); 4353 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1); 4354 } 4355 4356 if (SDValue MulOper = IsFoldableAdd(N1)) { 4357 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper); 4358 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0); 4359 } 4360 4361 // There are i16 integer mul/mad. 4362 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16)) 4363 return SDValue(); 4364 4365 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends 4366 // in the source into any_extends if the result of the mul is truncated. Since 4367 // we can assume the high bits are whatever we want, use the underlying value 4368 // to avoid the unknown high bits from interfering. 4369 if (N0.getOpcode() == ISD::ANY_EXTEND) 4370 N0 = N0.getOperand(0); 4371 4372 if (N1.getOpcode() == ISD::ANY_EXTEND) 4373 N1 = N1.getOperand(0); 4374 4375 SDValue Mul; 4376 4377 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { 4378 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 4379 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 4380 Mul = getMul24(DAG, DL, N0, N1, Size, false); 4381 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { 4382 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 4383 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 4384 Mul = getMul24(DAG, DL, N0, N1, Size, true); 4385 } else { 4386 return SDValue(); 4387 } 4388 4389 // We need to use sext even for MUL_U24, because MUL_U24 is used 4390 // for signed multiply of 8 and 16-bit types. 4391 return DAG.getSExtOrTrunc(Mul, DL, VT); 4392 } 4393 4394 SDValue 4395 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N, 4396 DAGCombinerInfo &DCI) const { 4397 if (N->getValueType(0) != MVT::i32) 4398 return SDValue(); 4399 4400 SelectionDAG &DAG = DCI.DAG; 4401 SDLoc DL(N); 4402 4403 bool Signed = N->getOpcode() == ISD::SMUL_LOHI; 4404 SDValue N0 = N->getOperand(0); 4405 SDValue N1 = N->getOperand(1); 4406 4407 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends 4408 // in the source into any_extends if the result of the mul is truncated. Since 4409 // we can assume the high bits are whatever we want, use the underlying value 4410 // to avoid the unknown high bits from interfering. 4411 if (N0.getOpcode() == ISD::ANY_EXTEND) 4412 N0 = N0.getOperand(0); 4413 if (N1.getOpcode() == ISD::ANY_EXTEND) 4414 N1 = N1.getOperand(0); 4415 4416 // Try to use two fast 24-bit multiplies (one for each half of the result) 4417 // instead of one slow extending multiply. 4418 unsigned LoOpcode = 0; 4419 unsigned HiOpcode = 0; 4420 if (Signed) { 4421 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { 4422 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 4423 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 4424 LoOpcode = AMDGPUISD::MUL_I24; 4425 HiOpcode = AMDGPUISD::MULHI_I24; 4426 } 4427 } else { 4428 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { 4429 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 4430 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 4431 LoOpcode = AMDGPUISD::MUL_U24; 4432 HiOpcode = AMDGPUISD::MULHI_U24; 4433 } 4434 } 4435 if (!LoOpcode) 4436 return SDValue(); 4437 4438 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1); 4439 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1); 4440 DCI.CombineTo(N, Lo, Hi); 4441 return SDValue(N, 0); 4442 } 4443 4444 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N, 4445 DAGCombinerInfo &DCI) const { 4446 EVT VT = N->getValueType(0); 4447 4448 if (!Subtarget->hasMulI24() || VT.isVector()) 4449 return SDValue(); 4450 4451 // Don't generate 24-bit multiplies on values that are in SGPRs, since 4452 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs 4453 // unnecessarily). isDivergent() is used as an approximation of whether the 4454 // value is in an SGPR. 4455 // This doesn't apply if no s_mul_hi is available (since we'll end up with a 4456 // valu op anyway) 4457 if (Subtarget->hasSMulHi() && !N->isDivergent()) 4458 return SDValue(); 4459 4460 SelectionDAG &DAG = DCI.DAG; 4461 SDLoc DL(N); 4462 4463 SDValue N0 = N->getOperand(0); 4464 SDValue N1 = N->getOperand(1); 4465 4466 if (!isI24(N0, DAG) || !isI24(N1, DAG)) 4467 return SDValue(); 4468 4469 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 4470 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 4471 4472 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1); 4473 DCI.AddToWorklist(Mulhi.getNode()); 4474 return DAG.getSExtOrTrunc(Mulhi, DL, VT); 4475 } 4476 4477 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, 4478 DAGCombinerInfo &DCI) const { 4479 EVT VT = N->getValueType(0); 4480 4481 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32) 4482 return SDValue(); 4483 4484 // Don't generate 24-bit multiplies on values that are in SGPRs, since 4485 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs 4486 // unnecessarily). isDivergent() is used as an approximation of whether the 4487 // value is in an SGPR. 4488 // This doesn't apply if no s_mul_hi is available (since we'll end up with a 4489 // valu op anyway) 4490 if (Subtarget->hasSMulHi() && !N->isDivergent()) 4491 return SDValue(); 4492 4493 SelectionDAG &DAG = DCI.DAG; 4494 SDLoc DL(N); 4495 4496 SDValue N0 = N->getOperand(0); 4497 SDValue N1 = N->getOperand(1); 4498 4499 if (!isU24(N0, DAG) || !isU24(N1, DAG)) 4500 return SDValue(); 4501 4502 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 4503 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 4504 4505 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1); 4506 DCI.AddToWorklist(Mulhi.getNode()); 4507 return DAG.getZExtOrTrunc(Mulhi, DL, VT); 4508 } 4509 4510 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG, 4511 SDValue Op, 4512 const SDLoc &DL, 4513 unsigned Opc) const { 4514 EVT VT = Op.getValueType(); 4515 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT); 4516 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() && 4517 LegalVT != MVT::i16)) 4518 return SDValue(); 4519 4520 if (VT != MVT::i32) 4521 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); 4522 4523 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op); 4524 if (VT != MVT::i32) 4525 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX); 4526 4527 return FFBX; 4528 } 4529 4530 // The native instructions return -1 on 0 input. Optimize out a select that 4531 // produces -1 on 0. 4532 // 4533 // TODO: If zero is not undef, we could also do this if the output is compared 4534 // against the bitwidth. 4535 // 4536 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly. 4537 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, 4538 SDValue LHS, SDValue RHS, 4539 DAGCombinerInfo &DCI) const { 4540 if (!isNullConstant(Cond.getOperand(1))) 4541 return SDValue(); 4542 4543 SelectionDAG &DAG = DCI.DAG; 4544 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 4545 SDValue CmpLHS = Cond.getOperand(0); 4546 4547 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x 4548 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x 4549 if (CCOpcode == ISD::SETEQ && 4550 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && 4551 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) { 4552 unsigned Opc = 4553 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32; 4554 return getFFBX_U32(DAG, CmpLHS, SL, Opc); 4555 } 4556 4557 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x 4558 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x 4559 if (CCOpcode == ISD::SETNE && 4560 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) && 4561 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) { 4562 unsigned Opc = 4563 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32; 4564 4565 return getFFBX_U32(DAG, CmpLHS, SL, Opc); 4566 } 4567 4568 return SDValue(); 4569 } 4570 4571 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, 4572 unsigned Op, 4573 const SDLoc &SL, 4574 SDValue Cond, 4575 SDValue N1, 4576 SDValue N2) { 4577 SelectionDAG &DAG = DCI.DAG; 4578 EVT VT = N1.getValueType(); 4579 4580 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, 4581 N1.getOperand(0), N2.getOperand(0)); 4582 DCI.AddToWorklist(NewSelect.getNode()); 4583 return DAG.getNode(Op, SL, VT, NewSelect); 4584 } 4585 4586 // Pull a free FP operation out of a select so it may fold into uses. 4587 // 4588 // select c, (fneg x), (fneg y) -> fneg (select c, x, y) 4589 // select c, (fneg x), k -> fneg (select c, x, (fneg k)) 4590 // 4591 // select c, (fabs x), (fabs y) -> fabs (select c, x, y) 4592 // select c, (fabs x), +k -> fabs (select c, x, k) 4593 SDValue 4594 AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, 4595 SDValue N) const { 4596 SelectionDAG &DAG = DCI.DAG; 4597 SDValue Cond = N.getOperand(0); 4598 SDValue LHS = N.getOperand(1); 4599 SDValue RHS = N.getOperand(2); 4600 4601 EVT VT = N.getValueType(); 4602 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) || 4603 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) { 4604 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode())) 4605 return SDValue(); 4606 4607 return distributeOpThroughSelect(DCI, LHS.getOpcode(), 4608 SDLoc(N), Cond, LHS, RHS); 4609 } 4610 4611 bool Inv = false; 4612 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) { 4613 std::swap(LHS, RHS); 4614 Inv = true; 4615 } 4616 4617 // TODO: Support vector constants. 4618 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); 4619 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS && 4620 !selectSupportsSourceMods(N.getNode())) { 4621 SDLoc SL(N); 4622 // If one side is an fneg/fabs and the other is a constant, we can push the 4623 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative. 4624 SDValue NewLHS = LHS.getOperand(0); 4625 SDValue NewRHS = RHS; 4626 4627 // Careful: if the neg can be folded up, don't try to pull it back down. 4628 bool ShouldFoldNeg = true; 4629 4630 if (NewLHS.hasOneUse()) { 4631 unsigned Opc = NewLHS.getOpcode(); 4632 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode())) 4633 ShouldFoldNeg = false; 4634 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL) 4635 ShouldFoldNeg = false; 4636 } 4637 4638 if (ShouldFoldNeg) { 4639 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative()) 4640 return SDValue(); 4641 4642 // We're going to be forced to use a source modifier anyway, there's no 4643 // point to pulling the negate out unless we can get a size reduction by 4644 // negating the constant. 4645 // 4646 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know 4647 // about cheaper constants. 4648 if (NewLHS.getOpcode() == ISD::FABS && 4649 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper) 4650 return SDValue(); 4651 4652 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode())) 4653 return SDValue(); 4654 4655 if (LHS.getOpcode() == ISD::FNEG) 4656 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 4657 4658 if (Inv) 4659 std::swap(NewLHS, NewRHS); 4660 4661 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, 4662 Cond, NewLHS, NewRHS); 4663 DCI.AddToWorklist(NewSelect.getNode()); 4664 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect); 4665 } 4666 } 4667 4668 return SDValue(); 4669 } 4670 4671 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, 4672 DAGCombinerInfo &DCI) const { 4673 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) 4674 return Folded; 4675 4676 SDValue Cond = N->getOperand(0); 4677 if (Cond.getOpcode() != ISD::SETCC) 4678 return SDValue(); 4679 4680 EVT VT = N->getValueType(0); 4681 SDValue LHS = Cond.getOperand(0); 4682 SDValue RHS = Cond.getOperand(1); 4683 SDValue CC = Cond.getOperand(2); 4684 4685 SDValue True = N->getOperand(1); 4686 SDValue False = N->getOperand(2); 4687 4688 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses. 4689 SelectionDAG &DAG = DCI.DAG; 4690 if (DAG.isConstantValueOfAnyType(True) && 4691 !DAG.isConstantValueOfAnyType(False)) { 4692 // Swap cmp + select pair to move constant to false input. 4693 // This will allow using VOPC cndmasks more often. 4694 // select (setcc x, y), k, x -> select (setccinv x, y), x, k 4695 4696 SDLoc SL(N); 4697 ISD::CondCode NewCC = 4698 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType()); 4699 4700 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC); 4701 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True); 4702 } 4703 4704 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) { 4705 SDValue MinMax 4706 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); 4707 // Revisit this node so we can catch min3/max3/med3 patterns. 4708 //DCI.AddToWorklist(MinMax.getNode()); 4709 return MinMax; 4710 } 4711 } 4712 4713 // There's no reason to not do this if the condition has other uses. 4714 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI); 4715 } 4716 4717 static bool isInv2Pi(const APFloat &APF) { 4718 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118)); 4719 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983)); 4720 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882)); 4721 4722 return APF.bitwiseIsEqual(KF16) || 4723 APF.bitwiseIsEqual(KF32) || 4724 APF.bitwiseIsEqual(KF64); 4725 } 4726 4727 // 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an 4728 // additional cost to negate them. 4729 TargetLowering::NegatibleCost 4730 AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const { 4731 if (C->isZero()) 4732 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive; 4733 4734 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF())) 4735 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive; 4736 4737 return NegatibleCost::Neutral; 4738 } 4739 4740 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const { 4741 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) 4742 return getConstantNegateCost(C) == NegatibleCost::Expensive; 4743 return false; 4744 } 4745 4746 bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const { 4747 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) 4748 return getConstantNegateCost(C) == NegatibleCost::Cheaper; 4749 return false; 4750 } 4751 4752 static unsigned inverseMinMax(unsigned Opc) { 4753 switch (Opc) { 4754 case ISD::FMAXNUM: 4755 return ISD::FMINNUM; 4756 case ISD::FMINNUM: 4757 return ISD::FMAXNUM; 4758 case ISD::FMAXNUM_IEEE: 4759 return ISD::FMINNUM_IEEE; 4760 case ISD::FMINNUM_IEEE: 4761 return ISD::FMAXNUM_IEEE; 4762 case ISD::FMAXIMUM: 4763 return ISD::FMINIMUM; 4764 case ISD::FMINIMUM: 4765 return ISD::FMAXIMUM; 4766 case AMDGPUISD::FMAX_LEGACY: 4767 return AMDGPUISD::FMIN_LEGACY; 4768 case AMDGPUISD::FMIN_LEGACY: 4769 return AMDGPUISD::FMAX_LEGACY; 4770 default: 4771 llvm_unreachable("invalid min/max opcode"); 4772 } 4773 } 4774 4775 /// \return true if it's profitable to try to push an fneg into its source 4776 /// instruction. 4777 bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) { 4778 // If the input has multiple uses and we can either fold the negate down, or 4779 // the other uses cannot, give up. This both prevents unprofitable 4780 // transformations and infinite loops: we won't repeatedly try to fold around 4781 // a negate that has no 'good' form. 4782 if (N0.hasOneUse()) { 4783 // This may be able to fold into the source, but at a code size cost. Don't 4784 // fold if the fold into the user is free. 4785 if (allUsesHaveSourceMods(N, 0)) 4786 return false; 4787 } else { 4788 if (fnegFoldsIntoOp(N0.getNode()) && 4789 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode()))) 4790 return false; 4791 } 4792 4793 return true; 4794 } 4795 4796 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, 4797 DAGCombinerInfo &DCI) const { 4798 SelectionDAG &DAG = DCI.DAG; 4799 SDValue N0 = N->getOperand(0); 4800 EVT VT = N->getValueType(0); 4801 4802 unsigned Opc = N0.getOpcode(); 4803 4804 if (!shouldFoldFNegIntoSrc(N, N0)) 4805 return SDValue(); 4806 4807 SDLoc SL(N); 4808 switch (Opc) { 4809 case ISD::FADD: { 4810 if (!mayIgnoreSignedZero(N0)) 4811 return SDValue(); 4812 4813 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) 4814 SDValue LHS = N0.getOperand(0); 4815 SDValue RHS = N0.getOperand(1); 4816 4817 if (LHS.getOpcode() != ISD::FNEG) 4818 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); 4819 else 4820 LHS = LHS.getOperand(0); 4821 4822 if (RHS.getOpcode() != ISD::FNEG) 4823 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 4824 else 4825 RHS = RHS.getOperand(0); 4826 4827 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags()); 4828 if (Res.getOpcode() != ISD::FADD) 4829 return SDValue(); // Op got folded away. 4830 if (!N0.hasOneUse()) 4831 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 4832 return Res; 4833 } 4834 case ISD::FMUL: 4835 case AMDGPUISD::FMUL_LEGACY: { 4836 // (fneg (fmul x, y)) -> (fmul x, (fneg y)) 4837 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y)) 4838 SDValue LHS = N0.getOperand(0); 4839 SDValue RHS = N0.getOperand(1); 4840 4841 if (LHS.getOpcode() == ISD::FNEG) 4842 LHS = LHS.getOperand(0); 4843 else if (RHS.getOpcode() == ISD::FNEG) 4844 RHS = RHS.getOperand(0); 4845 else 4846 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 4847 4848 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags()); 4849 if (Res.getOpcode() != Opc) 4850 return SDValue(); // Op got folded away. 4851 if (!N0.hasOneUse()) 4852 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 4853 return Res; 4854 } 4855 case ISD::FMA: 4856 case ISD::FMAD: { 4857 // TODO: handle llvm.amdgcn.fma.legacy 4858 if (!mayIgnoreSignedZero(N0)) 4859 return SDValue(); 4860 4861 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z)) 4862 SDValue LHS = N0.getOperand(0); 4863 SDValue MHS = N0.getOperand(1); 4864 SDValue RHS = N0.getOperand(2); 4865 4866 if (LHS.getOpcode() == ISD::FNEG) 4867 LHS = LHS.getOperand(0); 4868 else if (MHS.getOpcode() == ISD::FNEG) 4869 MHS = MHS.getOperand(0); 4870 else 4871 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS); 4872 4873 if (RHS.getOpcode() != ISD::FNEG) 4874 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 4875 else 4876 RHS = RHS.getOperand(0); 4877 4878 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS); 4879 if (Res.getOpcode() != Opc) 4880 return SDValue(); // Op got folded away. 4881 if (!N0.hasOneUse()) 4882 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 4883 return Res; 4884 } 4885 case ISD::FMAXNUM: 4886 case ISD::FMINNUM: 4887 case ISD::FMAXNUM_IEEE: 4888 case ISD::FMINNUM_IEEE: 4889 case ISD::FMINIMUM: 4890 case ISD::FMAXIMUM: 4891 case AMDGPUISD::FMAX_LEGACY: 4892 case AMDGPUISD::FMIN_LEGACY: { 4893 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y) 4894 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y) 4895 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y) 4896 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y) 4897 4898 SDValue LHS = N0.getOperand(0); 4899 SDValue RHS = N0.getOperand(1); 4900 4901 // 0 doesn't have a negated inline immediate. 4902 // TODO: This constant check should be generalized to other operations. 4903 if (isConstantCostlierToNegate(RHS)) 4904 return SDValue(); 4905 4906 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); 4907 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); 4908 unsigned Opposite = inverseMinMax(Opc); 4909 4910 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags()); 4911 if (Res.getOpcode() != Opposite) 4912 return SDValue(); // Op got folded away. 4913 if (!N0.hasOneUse()) 4914 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); 4915 return Res; 4916 } 4917 case AMDGPUISD::FMED3: { 4918 SDValue Ops[3]; 4919 for (unsigned I = 0; I < 3; ++I) 4920 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags()); 4921 4922 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags()); 4923 if (Res.getOpcode() != AMDGPUISD::FMED3) 4924 return SDValue(); // Op got folded away. 4925 4926 if (!N0.hasOneUse()) { 4927 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res); 4928 DAG.ReplaceAllUsesWith(N0, Neg); 4929 4930 for (SDNode *U : Neg->users()) 4931 DCI.AddToWorklist(U); 4932 } 4933 4934 return Res; 4935 } 4936 case ISD::FP_EXTEND: 4937 case ISD::FTRUNC: 4938 case ISD::FRINT: 4939 case ISD::FNEARBYINT: // XXX - Should fround be handled? 4940 case ISD::FROUNDEVEN: 4941 case ISD::FSIN: 4942 case ISD::FCANONICALIZE: 4943 case AMDGPUISD::RCP: 4944 case AMDGPUISD::RCP_LEGACY: 4945 case AMDGPUISD::RCP_IFLAG: 4946 case AMDGPUISD::SIN_HW: { 4947 SDValue CvtSrc = N0.getOperand(0); 4948 if (CvtSrc.getOpcode() == ISD::FNEG) { 4949 // (fneg (fp_extend (fneg x))) -> (fp_extend x) 4950 // (fneg (rcp (fneg x))) -> (rcp x) 4951 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0)); 4952 } 4953 4954 if (!N0.hasOneUse()) 4955 return SDValue(); 4956 4957 // (fneg (fp_extend x)) -> (fp_extend (fneg x)) 4958 // (fneg (rcp x)) -> (rcp (fneg x)) 4959 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); 4960 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags()); 4961 } 4962 case ISD::FP_ROUND: { 4963 SDValue CvtSrc = N0.getOperand(0); 4964 4965 if (CvtSrc.getOpcode() == ISD::FNEG) { 4966 // (fneg (fp_round (fneg x))) -> (fp_round x) 4967 return DAG.getNode(ISD::FP_ROUND, SL, VT, 4968 CvtSrc.getOperand(0), N0.getOperand(1)); 4969 } 4970 4971 if (!N0.hasOneUse()) 4972 return SDValue(); 4973 4974 // (fneg (fp_round x)) -> (fp_round (fneg x)) 4975 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc); 4976 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1)); 4977 } 4978 case ISD::FP16_TO_FP: { 4979 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal 4980 // f16, but legalization of f16 fneg ends up pulling it out of the source. 4981 // Put the fneg back as a legal source operation that can be matched later. 4982 SDLoc SL(N); 4983 4984 SDValue Src = N0.getOperand(0); 4985 EVT SrcVT = Src.getValueType(); 4986 4987 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000) 4988 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src, 4989 DAG.getConstant(0x8000, SL, SrcVT)); 4990 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg); 4991 } 4992 case ISD::SELECT: { 4993 // fneg (select c, a, b) -> select c, (fneg a), (fneg b) 4994 // TODO: Invert conditions of foldFreeOpFromSelect 4995 return SDValue(); 4996 } 4997 case ISD::BITCAST: { 4998 SDLoc SL(N); 4999 SDValue BCSrc = N0.getOperand(0); 5000 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) { 5001 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1); 5002 if (HighBits.getValueType().getSizeInBits() != 32 || 5003 !fnegFoldsIntoOp(HighBits.getNode())) 5004 return SDValue(); 5005 5006 // f64 fneg only really needs to operate on the high half of of the 5007 // register, so try to force it to an f32 operation to help make use of 5008 // source modifiers. 5009 // 5010 // 5011 // fneg (f64 (bitcast (build_vector x, y))) -> 5012 // f64 (bitcast (build_vector (bitcast i32:x to f32), 5013 // (fneg (bitcast i32:y to f32))) 5014 5015 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits); 5016 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi); 5017 SDValue CastBack = 5018 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi); 5019 5020 SmallVector<SDValue, 8> Ops(BCSrc->ops()); 5021 Ops.back() = CastBack; 5022 DCI.AddToWorklist(NegHi.getNode()); 5023 SDValue Build = 5024 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops); 5025 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build); 5026 5027 if (!N0.hasOneUse()) 5028 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result)); 5029 return Result; 5030 } 5031 5032 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 && 5033 BCSrc.hasOneUse()) { 5034 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) -> 5035 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32) 5036 5037 // TODO: Cast back result for multiple uses is beneficial in some cases. 5038 5039 SDValue LHS = 5040 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1)); 5041 SDValue RHS = 5042 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2)); 5043 5044 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS); 5045 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS); 5046 5047 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS, 5048 NegRHS); 5049 } 5050 5051 return SDValue(); 5052 } 5053 default: 5054 return SDValue(); 5055 } 5056 } 5057 5058 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N, 5059 DAGCombinerInfo &DCI) const { 5060 SelectionDAG &DAG = DCI.DAG; 5061 SDValue N0 = N->getOperand(0); 5062 5063 if (!N0.hasOneUse()) 5064 return SDValue(); 5065 5066 switch (N0.getOpcode()) { 5067 case ISD::FP16_TO_FP: { 5068 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal"); 5069 SDLoc SL(N); 5070 SDValue Src = N0.getOperand(0); 5071 EVT SrcVT = Src.getValueType(); 5072 5073 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff) 5074 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src, 5075 DAG.getConstant(0x7fff, SL, SrcVT)); 5076 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs); 5077 } 5078 default: 5079 return SDValue(); 5080 } 5081 } 5082 5083 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N, 5084 DAGCombinerInfo &DCI) const { 5085 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); 5086 if (!CFP) 5087 return SDValue(); 5088 5089 // XXX - Should this flush denormals? 5090 const APFloat &Val = CFP->getValueAPF(); 5091 APFloat One(Val.getSemantics(), "1.0"); 5092 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); 5093 } 5094 5095 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, 5096 DAGCombinerInfo &DCI) const { 5097 SelectionDAG &DAG = DCI.DAG; 5098 SDLoc DL(N); 5099 5100 switch(N->getOpcode()) { 5101 default: 5102 break; 5103 case ISD::BITCAST: { 5104 EVT DestVT = N->getValueType(0); 5105 5106 // Push casts through vector builds. This helps avoid emitting a large 5107 // number of copies when materializing floating point vector constants. 5108 // 5109 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) => 5110 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y)) 5111 if (DestVT.isVector()) { 5112 SDValue Src = N->getOperand(0); 5113 if (Src.getOpcode() == ISD::BUILD_VECTOR && 5114 (DCI.getDAGCombineLevel() < AfterLegalizeDAG || 5115 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) { 5116 EVT SrcVT = Src.getValueType(); 5117 unsigned NElts = DestVT.getVectorNumElements(); 5118 5119 if (SrcVT.getVectorNumElements() == NElts) { 5120 EVT DestEltVT = DestVT.getVectorElementType(); 5121 5122 SmallVector<SDValue, 8> CastedElts; 5123 SDLoc SL(N); 5124 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) { 5125 SDValue Elt = Src.getOperand(I); 5126 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt)); 5127 } 5128 5129 return DAG.getBuildVector(DestVT, SL, CastedElts); 5130 } 5131 } 5132 } 5133 5134 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector()) 5135 break; 5136 5137 // Fold bitcasts of constants. 5138 // 5139 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k) 5140 // TODO: Generalize and move to DAGCombiner 5141 SDValue Src = N->getOperand(0); 5142 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) { 5143 SDLoc SL(N); 5144 uint64_t CVal = C->getZExtValue(); 5145 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 5146 DAG.getConstant(Lo_32(CVal), SL, MVT::i32), 5147 DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); 5148 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV); 5149 } 5150 5151 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) { 5152 const APInt &Val = C->getValueAPF().bitcastToAPInt(); 5153 SDLoc SL(N); 5154 uint64_t CVal = Val.getZExtValue(); 5155 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 5156 DAG.getConstant(Lo_32(CVal), SL, MVT::i32), 5157 DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); 5158 5159 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec); 5160 } 5161 5162 break; 5163 } 5164 case ISD::SHL: { 5165 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 5166 break; 5167 5168 return performShlCombine(N, DCI); 5169 } 5170 case ISD::SRL: { 5171 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 5172 break; 5173 5174 return performSrlCombine(N, DCI); 5175 } 5176 case ISD::SRA: { 5177 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 5178 break; 5179 5180 return performSraCombine(N, DCI); 5181 } 5182 case ISD::TRUNCATE: 5183 return performTruncateCombine(N, DCI); 5184 case ISD::MUL: 5185 return performMulCombine(N, DCI); 5186 case AMDGPUISD::MUL_U24: 5187 case AMDGPUISD::MUL_I24: { 5188 if (SDValue Simplified = simplifyMul24(N, DCI)) 5189 return Simplified; 5190 break; 5191 } 5192 case AMDGPUISD::MULHI_I24: 5193 case AMDGPUISD::MULHI_U24: 5194 return simplifyMul24(N, DCI); 5195 case ISD::SMUL_LOHI: 5196 case ISD::UMUL_LOHI: 5197 return performMulLoHiCombine(N, DCI); 5198 case ISD::MULHS: 5199 return performMulhsCombine(N, DCI); 5200 case ISD::MULHU: 5201 return performMulhuCombine(N, DCI); 5202 case ISD::SELECT: 5203 return performSelectCombine(N, DCI); 5204 case ISD::FNEG: 5205 return performFNegCombine(N, DCI); 5206 case ISD::FABS: 5207 return performFAbsCombine(N, DCI); 5208 case AMDGPUISD::BFE_I32: 5209 case AMDGPUISD::BFE_U32: { 5210 assert(!N->getValueType(0).isVector() && 5211 "Vector handling of BFE not implemented"); 5212 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); 5213 if (!Width) 5214 break; 5215 5216 uint32_t WidthVal = Width->getZExtValue() & 0x1f; 5217 if (WidthVal == 0) 5218 return DAG.getConstant(0, DL, MVT::i32); 5219 5220 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); 5221 if (!Offset) 5222 break; 5223 5224 SDValue BitsFrom = N->getOperand(0); 5225 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; 5226 5227 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; 5228 5229 if (OffsetVal == 0) { 5230 // This is already sign / zero extended, so try to fold away extra BFEs. 5231 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); 5232 5233 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); 5234 if (OpSignBits >= SignBits) 5235 return BitsFrom; 5236 5237 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); 5238 if (Signed) { 5239 // This is a sign_extend_inreg. Replace it to take advantage of existing 5240 // DAG Combines. If not eliminated, we will match back to BFE during 5241 // selection. 5242 5243 // TODO: The sext_inreg of extended types ends, although we can could 5244 // handle them in a single BFE. 5245 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, 5246 DAG.getValueType(SmallVT)); 5247 } 5248 5249 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); 5250 } 5251 5252 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) { 5253 if (Signed) { 5254 return constantFoldBFE<int32_t>(DAG, 5255 CVal->getSExtValue(), 5256 OffsetVal, 5257 WidthVal, 5258 DL); 5259 } 5260 5261 return constantFoldBFE<uint32_t>(DAG, 5262 CVal->getZExtValue(), 5263 OffsetVal, 5264 WidthVal, 5265 DL); 5266 } 5267 5268 if ((OffsetVal + WidthVal) >= 32 && 5269 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) { 5270 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); 5271 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, 5272 BitsFrom, ShiftVal); 5273 } 5274 5275 if (BitsFrom.hasOneUse()) { 5276 APInt Demanded = APInt::getBitsSet(32, 5277 OffsetVal, 5278 OffsetVal + WidthVal); 5279 5280 KnownBits Known; 5281 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 5282 !DCI.isBeforeLegalizeOps()); 5283 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5284 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) || 5285 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) { 5286 DCI.CommitTargetLoweringOpt(TLO); 5287 } 5288 } 5289 5290 break; 5291 } 5292 case ISD::LOAD: 5293 return performLoadCombine(N, DCI); 5294 case ISD::STORE: 5295 return performStoreCombine(N, DCI); 5296 case AMDGPUISD::RCP: 5297 case AMDGPUISD::RCP_IFLAG: 5298 return performRcpCombine(N, DCI); 5299 case ISD::AssertZext: 5300 case ISD::AssertSext: 5301 return performAssertSZExtCombine(N, DCI); 5302 case ISD::INTRINSIC_WO_CHAIN: 5303 return performIntrinsicWOChainCombine(N, DCI); 5304 case AMDGPUISD::FMAD_FTZ: { 5305 SDValue N0 = N->getOperand(0); 5306 SDValue N1 = N->getOperand(1); 5307 SDValue N2 = N->getOperand(2); 5308 EVT VT = N->getValueType(0); 5309 5310 // FMAD_FTZ is a FMAD + flush denormals to zero. 5311 // We flush the inputs, the intermediate step, and the output. 5312 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); 5313 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); 5314 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2); 5315 if (N0CFP && N1CFP && N2CFP) { 5316 const auto FTZ = [](const APFloat &V) { 5317 if (V.isDenormal()) { 5318 APFloat Zero(V.getSemantics(), 0); 5319 return V.isNegative() ? -Zero : Zero; 5320 } 5321 return V; 5322 }; 5323 5324 APFloat V0 = FTZ(N0CFP->getValueAPF()); 5325 APFloat V1 = FTZ(N1CFP->getValueAPF()); 5326 APFloat V2 = FTZ(N2CFP->getValueAPF()); 5327 V0.multiply(V1, APFloat::rmNearestTiesToEven); 5328 V0 = FTZ(V0); 5329 V0.add(V2, APFloat::rmNearestTiesToEven); 5330 return DAG.getConstantFP(FTZ(V0), DL, VT); 5331 } 5332 break; 5333 } 5334 } 5335 return SDValue(); 5336 } 5337 5338 //===----------------------------------------------------------------------===// 5339 // Helper functions 5340 //===----------------------------------------------------------------------===// 5341 5342 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 5343 const TargetRegisterClass *RC, 5344 Register Reg, EVT VT, 5345 const SDLoc &SL, 5346 bool RawReg) const { 5347 MachineFunction &MF = DAG.getMachineFunction(); 5348 MachineRegisterInfo &MRI = MF.getRegInfo(); 5349 Register VReg; 5350 5351 if (!MRI.isLiveIn(Reg)) { 5352 VReg = MRI.createVirtualRegister(RC); 5353 MRI.addLiveIn(Reg, VReg); 5354 } else { 5355 VReg = MRI.getLiveInVirtReg(Reg); 5356 } 5357 5358 if (RawReg) 5359 return DAG.getRegister(VReg, VT); 5360 5361 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); 5362 } 5363 5364 // This may be called multiple times, and nothing prevents creating multiple 5365 // objects at the same offset. See if we already defined this object. 5366 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, 5367 int64_t Offset) { 5368 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) { 5369 if (MFI.getObjectOffset(I) == Offset) { 5370 assert(MFI.getObjectSize(I) == Size); 5371 return I; 5372 } 5373 } 5374 5375 return MFI.CreateFixedObject(Size, Offset, true); 5376 } 5377 5378 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG, 5379 EVT VT, 5380 const SDLoc &SL, 5381 int64_t Offset) const { 5382 MachineFunction &MF = DAG.getMachineFunction(); 5383 MachineFrameInfo &MFI = MF.getFrameInfo(); 5384 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset); 5385 5386 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset); 5387 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32); 5388 5389 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4), 5390 MachineMemOperand::MODereferenceable | 5391 MachineMemOperand::MOInvariant); 5392 } 5393 5394 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG, 5395 const SDLoc &SL, 5396 SDValue Chain, 5397 SDValue ArgVal, 5398 int64_t Offset) const { 5399 MachineFunction &MF = DAG.getMachineFunction(); 5400 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset); 5401 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 5402 5403 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32); 5404 // Stores to the argument stack area are relative to the stack pointer. 5405 SDValue SP = 5406 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32); 5407 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr); 5408 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4), 5409 MachineMemOperand::MODereferenceable); 5410 return Store; 5411 } 5412 5413 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, 5414 const TargetRegisterClass *RC, 5415 EVT VT, const SDLoc &SL, 5416 const ArgDescriptor &Arg) const { 5417 assert(Arg && "Attempting to load missing argument"); 5418 5419 SDValue V = Arg.isRegister() ? 5420 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) : 5421 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); 5422 5423 if (!Arg.isMasked()) 5424 return V; 5425 5426 unsigned Mask = Arg.getMask(); 5427 unsigned Shift = llvm::countr_zero<unsigned>(Mask); 5428 V = DAG.getNode(ISD::SRL, SL, VT, V, 5429 DAG.getShiftAmountConstant(Shift, VT, SL)); 5430 return DAG.getNode(ISD::AND, SL, VT, V, 5431 DAG.getConstant(Mask >> Shift, SL, VT)); 5432 } 5433 5434 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( 5435 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const { 5436 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset(); 5437 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr(); 5438 uint64_t ArgOffset = 5439 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset; 5440 switch (Param) { 5441 case FIRST_IMPLICIT: 5442 return ArgOffset; 5443 case PRIVATE_BASE: 5444 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET; 5445 case SHARED_BASE: 5446 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET; 5447 case QUEUE_PTR: 5448 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET; 5449 } 5450 llvm_unreachable("unexpected implicit parameter type"); 5451 } 5452 5453 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( 5454 const MachineFunction &MF, const ImplicitParameter Param) const { 5455 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); 5456 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param); 5457 } 5458 5459 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; 5460 5461 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { 5462 switch ((AMDGPUISD::NodeType)Opcode) { 5463 case AMDGPUISD::FIRST_NUMBER: break; 5464 // AMDIL DAG nodes 5465 NODE_NAME_CASE(BRANCH_COND); 5466 5467 // AMDGPU DAG nodes 5468 NODE_NAME_CASE(IF) 5469 NODE_NAME_CASE(ELSE) 5470 NODE_NAME_CASE(LOOP) 5471 NODE_NAME_CASE(CALL) 5472 NODE_NAME_CASE(TC_RETURN) 5473 NODE_NAME_CASE(TC_RETURN_GFX) 5474 NODE_NAME_CASE(TC_RETURN_CHAIN) 5475 NODE_NAME_CASE(TRAP) 5476 NODE_NAME_CASE(RET_GLUE) 5477 NODE_NAME_CASE(WAVE_ADDRESS) 5478 NODE_NAME_CASE(RETURN_TO_EPILOG) 5479 NODE_NAME_CASE(ENDPGM) 5480 NODE_NAME_CASE(ENDPGM_TRAP) 5481 NODE_NAME_CASE(SIMULATED_TRAP) 5482 NODE_NAME_CASE(DWORDADDR) 5483 NODE_NAME_CASE(FRACT) 5484 NODE_NAME_CASE(SETCC) 5485 NODE_NAME_CASE(DENORM_MODE) 5486 NODE_NAME_CASE(FMA_W_CHAIN) 5487 NODE_NAME_CASE(FMUL_W_CHAIN) 5488 NODE_NAME_CASE(CLAMP) 5489 NODE_NAME_CASE(COS_HW) 5490 NODE_NAME_CASE(SIN_HW) 5491 NODE_NAME_CASE(FMAX_LEGACY) 5492 NODE_NAME_CASE(FMIN_LEGACY) 5493 NODE_NAME_CASE(FMAX3) 5494 NODE_NAME_CASE(SMAX3) 5495 NODE_NAME_CASE(UMAX3) 5496 NODE_NAME_CASE(FMIN3) 5497 NODE_NAME_CASE(SMIN3) 5498 NODE_NAME_CASE(UMIN3) 5499 NODE_NAME_CASE(FMED3) 5500 NODE_NAME_CASE(SMED3) 5501 NODE_NAME_CASE(UMED3) 5502 NODE_NAME_CASE(FMAXIMUM3) 5503 NODE_NAME_CASE(FMINIMUM3) 5504 NODE_NAME_CASE(FDOT2) 5505 NODE_NAME_CASE(URECIP) 5506 NODE_NAME_CASE(DIV_SCALE) 5507 NODE_NAME_CASE(DIV_FMAS) 5508 NODE_NAME_CASE(DIV_FIXUP) 5509 NODE_NAME_CASE(FMAD_FTZ) 5510 NODE_NAME_CASE(RCP) 5511 NODE_NAME_CASE(RSQ) 5512 NODE_NAME_CASE(RCP_LEGACY) 5513 NODE_NAME_CASE(RCP_IFLAG) 5514 NODE_NAME_CASE(LOG) 5515 NODE_NAME_CASE(EXP) 5516 NODE_NAME_CASE(FMUL_LEGACY) 5517 NODE_NAME_CASE(RSQ_CLAMP) 5518 NODE_NAME_CASE(FP_CLASS) 5519 NODE_NAME_CASE(DOT4) 5520 NODE_NAME_CASE(CARRY) 5521 NODE_NAME_CASE(BORROW) 5522 NODE_NAME_CASE(BFE_U32) 5523 NODE_NAME_CASE(BFE_I32) 5524 NODE_NAME_CASE(BFI) 5525 NODE_NAME_CASE(BFM) 5526 NODE_NAME_CASE(FFBH_U32) 5527 NODE_NAME_CASE(FFBH_I32) 5528 NODE_NAME_CASE(FFBL_B32) 5529 NODE_NAME_CASE(MUL_U24) 5530 NODE_NAME_CASE(MUL_I24) 5531 NODE_NAME_CASE(MULHI_U24) 5532 NODE_NAME_CASE(MULHI_I24) 5533 NODE_NAME_CASE(MAD_U24) 5534 NODE_NAME_CASE(MAD_I24) 5535 NODE_NAME_CASE(MAD_I64_I32) 5536 NODE_NAME_CASE(MAD_U64_U32) 5537 NODE_NAME_CASE(PERM) 5538 NODE_NAME_CASE(TEXTURE_FETCH) 5539 NODE_NAME_CASE(R600_EXPORT) 5540 NODE_NAME_CASE(CONST_ADDRESS) 5541 NODE_NAME_CASE(REGISTER_LOAD) 5542 NODE_NAME_CASE(REGISTER_STORE) 5543 NODE_NAME_CASE(CVT_F32_UBYTE0) 5544 NODE_NAME_CASE(CVT_F32_UBYTE1) 5545 NODE_NAME_CASE(CVT_F32_UBYTE2) 5546 NODE_NAME_CASE(CVT_F32_UBYTE3) 5547 NODE_NAME_CASE(CVT_PKRTZ_F16_F32) 5548 NODE_NAME_CASE(CVT_PKNORM_I16_F32) 5549 NODE_NAME_CASE(CVT_PKNORM_U16_F32) 5550 NODE_NAME_CASE(CVT_PK_I16_I32) 5551 NODE_NAME_CASE(CVT_PK_U16_U32) 5552 NODE_NAME_CASE(FP_TO_FP16) 5553 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) 5554 NODE_NAME_CASE(CONST_DATA_PTR) 5555 NODE_NAME_CASE(PC_ADD_REL_OFFSET) 5556 NODE_NAME_CASE(LDS) 5557 NODE_NAME_CASE(DUMMY_CHAIN) 5558 NODE_NAME_CASE(LOAD_D16_HI) 5559 NODE_NAME_CASE(LOAD_D16_LO) 5560 NODE_NAME_CASE(LOAD_D16_HI_I8) 5561 NODE_NAME_CASE(LOAD_D16_HI_U8) 5562 NODE_NAME_CASE(LOAD_D16_LO_I8) 5563 NODE_NAME_CASE(LOAD_D16_LO_U8) 5564 NODE_NAME_CASE(STORE_MSKOR) 5565 NODE_NAME_CASE(TBUFFER_STORE_FORMAT) 5566 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) 5567 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) 5568 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) 5569 NODE_NAME_CASE(DS_ORDERED_COUNT) 5570 NODE_NAME_CASE(ATOMIC_CMP_SWAP) 5571 NODE_NAME_CASE(BUFFER_LOAD) 5572 NODE_NAME_CASE(BUFFER_LOAD_UBYTE) 5573 NODE_NAME_CASE(BUFFER_LOAD_USHORT) 5574 NODE_NAME_CASE(BUFFER_LOAD_BYTE) 5575 NODE_NAME_CASE(BUFFER_LOAD_SHORT) 5576 NODE_NAME_CASE(BUFFER_LOAD_TFE) 5577 NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE) 5578 NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE) 5579 NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE) 5580 NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE) 5581 NODE_NAME_CASE(BUFFER_LOAD_FORMAT) 5582 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE) 5583 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) 5584 NODE_NAME_CASE(SBUFFER_LOAD) 5585 NODE_NAME_CASE(SBUFFER_LOAD_BYTE) 5586 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE) 5587 NODE_NAME_CASE(SBUFFER_LOAD_SHORT) 5588 NODE_NAME_CASE(SBUFFER_LOAD_USHORT) 5589 NODE_NAME_CASE(SBUFFER_PREFETCH_DATA) 5590 NODE_NAME_CASE(BUFFER_STORE) 5591 NODE_NAME_CASE(BUFFER_STORE_BYTE) 5592 NODE_NAME_CASE(BUFFER_STORE_SHORT) 5593 NODE_NAME_CASE(BUFFER_STORE_FORMAT) 5594 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) 5595 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) 5596 NODE_NAME_CASE(BUFFER_ATOMIC_ADD) 5597 NODE_NAME_CASE(BUFFER_ATOMIC_SUB) 5598 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN) 5599 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN) 5600 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX) 5601 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX) 5602 NODE_NAME_CASE(BUFFER_ATOMIC_AND) 5603 NODE_NAME_CASE(BUFFER_ATOMIC_OR) 5604 NODE_NAME_CASE(BUFFER_ATOMIC_XOR) 5605 NODE_NAME_CASE(BUFFER_ATOMIC_INC) 5606 NODE_NAME_CASE(BUFFER_ATOMIC_DEC) 5607 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) 5608 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) 5609 NODE_NAME_CASE(BUFFER_ATOMIC_FADD) 5610 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) 5611 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) 5612 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32) 5613 } 5614 return nullptr; 5615 } 5616 5617 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand, 5618 SelectionDAG &DAG, int Enabled, 5619 int &RefinementSteps, 5620 bool &UseOneConstNR, 5621 bool Reciprocal) const { 5622 EVT VT = Operand.getValueType(); 5623 5624 if (VT == MVT::f32) { 5625 RefinementSteps = 0; 5626 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand); 5627 } 5628 5629 // TODO: There is also f64 rsq instruction, but the documentation is less 5630 // clear on its precision. 5631 5632 return SDValue(); 5633 } 5634 5635 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, 5636 SelectionDAG &DAG, int Enabled, 5637 int &RefinementSteps) const { 5638 EVT VT = Operand.getValueType(); 5639 5640 if (VT == MVT::f32) { 5641 // Reciprocal, < 1 ulp error. 5642 // 5643 // This reciprocal approximation converges to < 0.5 ulp error with one 5644 // newton rhapson performed with two fused multiple adds (FMAs). 5645 5646 RefinementSteps = 0; 5647 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand); 5648 } 5649 5650 // TODO: There is also f64 rcp instruction, but the documentation is less 5651 // clear on its precision. 5652 5653 return SDValue(); 5654 } 5655 5656 static unsigned workitemIntrinsicDim(unsigned ID) { 5657 switch (ID) { 5658 case Intrinsic::amdgcn_workitem_id_x: 5659 return 0; 5660 case Intrinsic::amdgcn_workitem_id_y: 5661 return 1; 5662 case Intrinsic::amdgcn_workitem_id_z: 5663 return 2; 5664 default: 5665 llvm_unreachable("not a workitem intrinsic"); 5666 } 5667 } 5668 5669 void AMDGPUTargetLowering::computeKnownBitsForTargetNode( 5670 const SDValue Op, KnownBits &Known, 5671 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { 5672 5673 Known.resetAll(); // Don't know anything. 5674 5675 unsigned Opc = Op.getOpcode(); 5676 5677 switch (Opc) { 5678 default: 5679 break; 5680 case AMDGPUISD::CARRY: 5681 case AMDGPUISD::BORROW: { 5682 Known.Zero = APInt::getHighBitsSet(32, 31); 5683 break; 5684 } 5685 5686 case AMDGPUISD::BFE_I32: 5687 case AMDGPUISD::BFE_U32: { 5688 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 5689 if (!CWidth) 5690 return; 5691 5692 uint32_t Width = CWidth->getZExtValue() & 0x1f; 5693 5694 if (Opc == AMDGPUISD::BFE_U32) 5695 Known.Zero = APInt::getHighBitsSet(32, 32 - Width); 5696 5697 break; 5698 } 5699 case AMDGPUISD::FP_TO_FP16: { 5700 unsigned BitWidth = Known.getBitWidth(); 5701 5702 // High bits are zero. 5703 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16); 5704 break; 5705 } 5706 case AMDGPUISD::MUL_U24: 5707 case AMDGPUISD::MUL_I24: { 5708 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 5709 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); 5710 unsigned TrailZ = LHSKnown.countMinTrailingZeros() + 5711 RHSKnown.countMinTrailingZeros(); 5712 Known.Zero.setLowBits(std::min(TrailZ, 32u)); 5713 // Skip extra check if all bits are known zeros. 5714 if (TrailZ >= 32) 5715 break; 5716 5717 // Truncate to 24 bits. 5718 LHSKnown = LHSKnown.trunc(24); 5719 RHSKnown = RHSKnown.trunc(24); 5720 5721 if (Opc == AMDGPUISD::MUL_I24) { 5722 unsigned LHSValBits = LHSKnown.countMaxSignificantBits(); 5723 unsigned RHSValBits = RHSKnown.countMaxSignificantBits(); 5724 unsigned MaxValBits = LHSValBits + RHSValBits; 5725 if (MaxValBits > 32) 5726 break; 5727 unsigned SignBits = 32 - MaxValBits + 1; 5728 bool LHSNegative = LHSKnown.isNegative(); 5729 bool LHSNonNegative = LHSKnown.isNonNegative(); 5730 bool LHSPositive = LHSKnown.isStrictlyPositive(); 5731 bool RHSNegative = RHSKnown.isNegative(); 5732 bool RHSNonNegative = RHSKnown.isNonNegative(); 5733 bool RHSPositive = RHSKnown.isStrictlyPositive(); 5734 5735 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative)) 5736 Known.Zero.setHighBits(SignBits); 5737 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative)) 5738 Known.One.setHighBits(SignBits); 5739 } else { 5740 unsigned LHSValBits = LHSKnown.countMaxActiveBits(); 5741 unsigned RHSValBits = RHSKnown.countMaxActiveBits(); 5742 unsigned MaxValBits = LHSValBits + RHSValBits; 5743 if (MaxValBits >= 32) 5744 break; 5745 Known.Zero.setBitsFrom(MaxValBits); 5746 } 5747 break; 5748 } 5749 case AMDGPUISD::PERM: { 5750 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 5751 if (!CMask) 5752 return; 5753 5754 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 5755 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); 5756 unsigned Sel = CMask->getZExtValue(); 5757 5758 for (unsigned I = 0; I < 32; I += 8) { 5759 unsigned SelBits = Sel & 0xff; 5760 if (SelBits < 4) { 5761 SelBits *= 8; 5762 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; 5763 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; 5764 } else if (SelBits < 7) { 5765 SelBits = (SelBits & 3) * 8; 5766 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; 5767 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; 5768 } else if (SelBits == 0x0c) { 5769 Known.Zero |= 0xFFull << I; 5770 } else if (SelBits > 0x0c) { 5771 Known.One |= 0xFFull << I; 5772 } 5773 Sel >>= 8; 5774 } 5775 break; 5776 } 5777 case AMDGPUISD::BUFFER_LOAD_UBYTE: { 5778 Known.Zero.setHighBits(24); 5779 break; 5780 } 5781 case AMDGPUISD::BUFFER_LOAD_USHORT: { 5782 Known.Zero.setHighBits(16); 5783 break; 5784 } 5785 case AMDGPUISD::LDS: { 5786 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode()); 5787 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout()); 5788 5789 Known.Zero.setHighBits(16); 5790 Known.Zero.setLowBits(Log2(Alignment)); 5791 break; 5792 } 5793 case AMDGPUISD::SMIN3: 5794 case AMDGPUISD::SMAX3: 5795 case AMDGPUISD::SMED3: 5796 case AMDGPUISD::UMIN3: 5797 case AMDGPUISD::UMAX3: 5798 case AMDGPUISD::UMED3: { 5799 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1); 5800 if (Known2.isUnknown()) 5801 break; 5802 5803 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); 5804 if (Known1.isUnknown()) 5805 break; 5806 5807 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 5808 if (Known0.isUnknown()) 5809 break; 5810 5811 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling. 5812 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero; 5813 Known.One = Known0.One & Known1.One & Known2.One; 5814 break; 5815 } 5816 case ISD::INTRINSIC_WO_CHAIN: { 5817 unsigned IID = Op.getConstantOperandVal(0); 5818 switch (IID) { 5819 case Intrinsic::amdgcn_workitem_id_x: 5820 case Intrinsic::amdgcn_workitem_id_y: 5821 case Intrinsic::amdgcn_workitem_id_z: { 5822 unsigned MaxValue = Subtarget->getMaxWorkitemID( 5823 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID)); 5824 Known.Zero.setHighBits(llvm::countl_zero(MaxValue)); 5825 break; 5826 } 5827 default: 5828 break; 5829 } 5830 } 5831 } 5832 } 5833 5834 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( 5835 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, 5836 unsigned Depth) const { 5837 switch (Op.getOpcode()) { 5838 case AMDGPUISD::BFE_I32: { 5839 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 5840 if (!Width) 5841 return 1; 5842 5843 unsigned SignBits = 32 - Width->getZExtValue() + 1; 5844 if (!isNullConstant(Op.getOperand(1))) 5845 return SignBits; 5846 5847 // TODO: Could probably figure something out with non-0 offsets. 5848 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); 5849 return std::max(SignBits, Op0SignBits); 5850 } 5851 5852 case AMDGPUISD::BFE_U32: { 5853 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 5854 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1; 5855 } 5856 5857 case AMDGPUISD::CARRY: 5858 case AMDGPUISD::BORROW: 5859 return 31; 5860 case AMDGPUISD::BUFFER_LOAD_BYTE: 5861 return 25; 5862 case AMDGPUISD::BUFFER_LOAD_SHORT: 5863 return 17; 5864 case AMDGPUISD::BUFFER_LOAD_UBYTE: 5865 return 24; 5866 case AMDGPUISD::BUFFER_LOAD_USHORT: 5867 return 16; 5868 case AMDGPUISD::FP_TO_FP16: 5869 return 16; 5870 case AMDGPUISD::SMIN3: 5871 case AMDGPUISD::SMAX3: 5872 case AMDGPUISD::SMED3: 5873 case AMDGPUISD::UMIN3: 5874 case AMDGPUISD::UMAX3: 5875 case AMDGPUISD::UMED3: { 5876 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1); 5877 if (Tmp2 == 1) 5878 return 1; // Early out. 5879 5880 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1); 5881 if (Tmp1 == 1) 5882 return 1; // Early out. 5883 5884 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); 5885 if (Tmp0 == 1) 5886 return 1; // Early out. 5887 5888 return std::min({Tmp0, Tmp1, Tmp2}); 5889 } 5890 default: 5891 return 1; 5892 } 5893 } 5894 5895 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr( 5896 GISelKnownBits &Analysis, Register R, 5897 const APInt &DemandedElts, const MachineRegisterInfo &MRI, 5898 unsigned Depth) const { 5899 const MachineInstr *MI = MRI.getVRegDef(R); 5900 if (!MI) 5901 return 1; 5902 5903 // TODO: Check range metadata on MMO. 5904 switch (MI->getOpcode()) { 5905 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 5906 return 25; 5907 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 5908 return 17; 5909 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 5910 return 24; 5911 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 5912 return 16; 5913 case AMDGPU::G_AMDGPU_SMED3: 5914 case AMDGPU::G_AMDGPU_UMED3: { 5915 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs(); 5916 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1); 5917 if (Tmp2 == 1) 5918 return 1; 5919 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1); 5920 if (Tmp1 == 1) 5921 return 1; 5922 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1); 5923 if (Tmp0 == 1) 5924 return 1; 5925 return std::min({Tmp0, Tmp1, Tmp2}); 5926 } 5927 default: 5928 return 1; 5929 } 5930 } 5931 5932 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, 5933 const SelectionDAG &DAG, 5934 bool SNaN, 5935 unsigned Depth) const { 5936 unsigned Opcode = Op.getOpcode(); 5937 switch (Opcode) { 5938 case AMDGPUISD::FMIN_LEGACY: 5939 case AMDGPUISD::FMAX_LEGACY: { 5940 if (SNaN) 5941 return true; 5942 5943 // TODO: Can check no nans on one of the operands for each one, but which 5944 // one? 5945 return false; 5946 } 5947 case AMDGPUISD::FMUL_LEGACY: 5948 case AMDGPUISD::CVT_PKRTZ_F16_F32: { 5949 if (SNaN) 5950 return true; 5951 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && 5952 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); 5953 } 5954 case AMDGPUISD::FMED3: 5955 case AMDGPUISD::FMIN3: 5956 case AMDGPUISD::FMAX3: 5957 case AMDGPUISD::FMINIMUM3: 5958 case AMDGPUISD::FMAXIMUM3: 5959 case AMDGPUISD::FMAD_FTZ: { 5960 if (SNaN) 5961 return true; 5962 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && 5963 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 5964 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1); 5965 } 5966 case AMDGPUISD::CVT_F32_UBYTE0: 5967 case AMDGPUISD::CVT_F32_UBYTE1: 5968 case AMDGPUISD::CVT_F32_UBYTE2: 5969 case AMDGPUISD::CVT_F32_UBYTE3: 5970 return true; 5971 5972 case AMDGPUISD::RCP: 5973 case AMDGPUISD::RSQ: 5974 case AMDGPUISD::RCP_LEGACY: 5975 case AMDGPUISD::RSQ_CLAMP: { 5976 if (SNaN) 5977 return true; 5978 5979 // TODO: Need is known positive check. 5980 return false; 5981 } 5982 case ISD::FLDEXP: 5983 case AMDGPUISD::FRACT: { 5984 if (SNaN) 5985 return true; 5986 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); 5987 } 5988 case AMDGPUISD::DIV_SCALE: 5989 case AMDGPUISD::DIV_FMAS: 5990 case AMDGPUISD::DIV_FIXUP: 5991 // TODO: Refine on operands. 5992 return SNaN; 5993 case AMDGPUISD::SIN_HW: 5994 case AMDGPUISD::COS_HW: { 5995 // TODO: Need check for infinity 5996 return SNaN; 5997 } 5998 case ISD::INTRINSIC_WO_CHAIN: { 5999 unsigned IntrinsicID = Op.getConstantOperandVal(0); 6000 // TODO: Handle more intrinsics 6001 switch (IntrinsicID) { 6002 case Intrinsic::amdgcn_cubeid: 6003 return true; 6004 6005 case Intrinsic::amdgcn_frexp_mant: { 6006 if (SNaN) 6007 return true; 6008 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); 6009 } 6010 case Intrinsic::amdgcn_cvt_pkrtz: { 6011 if (SNaN) 6012 return true; 6013 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 6014 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1); 6015 } 6016 case Intrinsic::amdgcn_rcp: 6017 case Intrinsic::amdgcn_rsq: 6018 case Intrinsic::amdgcn_rcp_legacy: 6019 case Intrinsic::amdgcn_rsq_legacy: 6020 case Intrinsic::amdgcn_rsq_clamp: { 6021 if (SNaN) 6022 return true; 6023 6024 // TODO: Need is known positive check. 6025 return false; 6026 } 6027 case Intrinsic::amdgcn_trig_preop: 6028 case Intrinsic::amdgcn_fdot2: 6029 // TODO: Refine on operand 6030 return SNaN; 6031 case Intrinsic::amdgcn_fma_legacy: 6032 if (SNaN) 6033 return true; 6034 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && 6035 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) && 6036 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1); 6037 default: 6038 return false; 6039 } 6040 } 6041 default: 6042 return false; 6043 } 6044 } 6045 6046 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, 6047 Register N0, Register N1) const { 6048 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks 6049 } 6050