//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the PTX instructions in TableGen format.
//
//===----------------------------------------------------------------------===//

include "NVPTXInstrFormats.td"

// Immediate operands for the half-precision floating-point types.
let OperandType = "OPERAND_IMMEDIATE" in {
  def f16imm  : Operand<f16>;
  def bf16imm : Operand<bf16>;
}

// List of vector specific properties
def isVecLD      : VecInstTypeEnum<1>;
def isVecST      : VecInstTypeEnum<2>;
def isVecBuild   : VecInstTypeEnum<3>;
def isVecShuffle : VecInstTypeEnum<4>;
def isVecExtract : VecInstTypeEnum<5>;
def isVecInsert  : VecInstTypeEnum<6>;
def isVecDest    : VecInstTypeEnum<7>;
def isVecOther   : VecInstTypeEnum<15>;

//===----------------------------------------------------------------------===//
// NVPTX Operand Definitions.
//===----------------------------------------------------------------------===//

def brtarget : Operand<OtherVT>;

// CVT conversion modes
// These must match the enum in NVPTX.h
def CvtNONE : PatLeaf<(i32 0x0)>;
def CvtRNI  : PatLeaf<(i32 0x1)>;
def CvtRZI  : PatLeaf<(i32 0x2)>;
def CvtRMI  : PatLeaf<(i32 0x3)>;
def CvtRPI  : PatLeaf<(i32 0x4)>;
def CvtRN   : PatLeaf<(i32 0x5)>;
def CvtRZ   : PatLeaf<(i32 0x6)>;
def CvtRM   : PatLeaf<(i32 0x7)>;
def CvtRP   : PatLeaf<(i32 0x8)>;
def CvtRNA  : PatLeaf<(i32 0x9)>;

// FTZ (flush-subnormals-to-zero) variants: base mode value OR'd with 0x10.
def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
def CvtRNI_FTZ  : PatLeaf<(i32 0x11)>;
def CvtRZI_FTZ  : PatLeaf<(i32 0x12)>;
def CvtRMI_FTZ  : PatLeaf<(i32 0x13)>;
def CvtRPI_FTZ  : PatLeaf<(i32 0x14)>;
def CvtRN_FTZ   : PatLeaf<(i32 0x15)>;
def CvtRZ_FTZ   : PatLeaf<(i32 0x16)>;
def CvtRM_FTZ   : PatLeaf<(i32 0x17)>;
def CvtRP_FTZ   : PatLeaf<(i32 0x18)>;

def CvtSAT     : PatLeaf<(i32 0x20)>;
def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;

def CvtNONE_RELU : PatLeaf<(i32 0x40)>;
def CvtRN_RELU   : PatLeaf<(i32 0x45)>;
def CvtRZ_RELU   : PatLeaf<(i32 0x46)>;

def CvtMode : Operand<i32> {
  let PrintMethod = "printCvtMode";
}

// Compare modes
// These must match the enum in NVPTX.h
def CmpEQ  : PatLeaf<(i32 0)>;
def CmpNE  : PatLeaf<(i32 1)>;
def CmpLT  : PatLeaf<(i32 2)>;
def CmpLE  : PatLeaf<(i32 3)>;
def CmpGT  : PatLeaf<(i32 4)>;
def CmpGE  : PatLeaf<(i32 5)>;
def CmpLO  : PatLeaf<(i32 6)>;
def CmpLS  : PatLeaf<(i32 7)>;
def CmpHI  : PatLeaf<(i32 8)>;
def CmpHS  : PatLeaf<(i32 9)>;
def CmpEQU : PatLeaf<(i32 10)>;
def CmpNEU : PatLeaf<(i32 11)>;
def CmpLTU : PatLeaf<(i32 12)>;
def CmpLEU : PatLeaf<(i32 13)>;
def CmpGTU : PatLeaf<(i32 14)>;
def CmpGEU : PatLeaf<(i32 15)>;
def CmpNUM : PatLeaf<(i32 16)>;
def CmpNAN : PatLeaf<(i32 17)>;

// FTZ variants: base mode value OR'd with 0x100.
def CmpEQ_FTZ  : PatLeaf<(i32 0x100)>;
def CmpNE_FTZ  : PatLeaf<(i32 0x101)>;
def CmpLT_FTZ  : PatLeaf<(i32 0x102)>;
def CmpLE_FTZ  : PatLeaf<(i32 0x103)>;
def CmpGT_FTZ  : PatLeaf<(i32 0x104)>;
def CmpGE_FTZ  : PatLeaf<(i32 0x105)>;
def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;

def CmpMode : Operand<i32> {
  let PrintMethod = "printCmpMode";
}
def VecElement : Operand<i32> {
  let PrintMethod = "printVecElement";
}

// PRMT modes
// These must match the enum in NVPTX.h
def PrmtNONE : PatLeaf<(i32 0x0)>;
def PrmtF4E  : PatLeaf<(i32 0x1)>;
def PrmtB4E  : PatLeaf<(i32 0x2)>;
def PrmtRC8  : PatLeaf<(i32 0x3)>;
def PrmtECL  : PatLeaf<(i32 0x4)>;
def PrmtECR  : PatLeaf<(i32 0x5)>;
def PrmtRC16 : PatLeaf<(i32 0x6)>;

def PrmtMode : Operand<i32> {
  let PrintMethod = "printPrmtMode";
}


//===----------------------------------------------------------------------===//
// NVPTX Instruction Predicate Definitions
//===----------------------------------------------------------------------===//


def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote : Predicate<"Subtarget->hasVote()">;
def hasDouble : Predicate<"Subtarget->hasDouble()">;
def hasLDG : Predicate<"Subtarget->hasLDG()">;
def hasLDU : Predicate<"Subtarget->hasLDU()">;
def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">;
def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">;
def hasOptEnabled : Predicate<"TM.getOptLevel() != CodeGenOptLevel::None">;

def doF32FTZ : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;
def doRsqrtOpt : Predicate<"doRsqrtOpt()">;

def doMulWide : Predicate<"doMulWide">;

def allowFMA : Predicate<"allowFMA()">;
def noFMA : Predicate<"!allowFMA()">;
def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
def noUnsafeFPMath : Predicate<"!allowUnsafeFPMath()">;

def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;

def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;

def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;

def True : Predicate<"true">;
def False : Predicate<"false">;

class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;
class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>;

// Explicit records for arch-accelerated SM versions
def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">;
def hasSM100a : Predicate<"Subtarget->getFullSmVersion() == 1001">;
def hasSM101a : Predicate<"Subtarget->getFullSmVersion() == 1011">;
def hasSM120a : Predicate<"Subtarget->getFullSmVersion() == 1201">;

// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
                        "&& Subtarget->getPTXVersion() >= 64)">;

def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
def hasBF16Math: Predicate<"Subtarget->hasBF16Math()">;

// Helper class to aid conversion between ValueType and a matching RegisterClass.

class ValueToRegClass<ValueType T> {
  string name = !cast<string>(T);
  NVPTXRegClass ret = !cond(
    !eq(name, "i1"): Int1Regs,
    !eq(name, "i16"): Int16Regs,
    !eq(name, "v2i16"): Int32Regs,
    !eq(name, "i32"): Int32Regs,
    !eq(name, "i64"): Int64Regs,
    !eq(name, "f16"): Int16Regs,
    !eq(name, "v2f16"): Int32Regs,
    !eq(name, "bf16"): Int16Regs,
    !eq(name, "v2bf16"): Int32Regs,
    !eq(name, "f32"): Float32Regs,
    !eq(name, "f64"): Float64Regs,
    !eq(name, "ai32"): Int32ArgRegs,
    !eq(name, "ai64"): Int64ArgRegs,
    !eq(name, "af32"): Float32ArgRegs,
    // Was "if64" -- renamed to follow the a<type> convention of the other
    // argument-register entries above.
    !eq(name, "af64"): Float64ArgRegs,
  );
}


//===----------------------------------------------------------------------===//
// Some Common Instruction Class Templates
//===----------------------------------------------------------------------===//

// Utility class to wrap up information about a register and DAG type for more
// convenient iteration and parameterization
class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm> {
  ValueType Ty = ty;
  NVPTXRegClass RC = rc;
  Operand Imm = imm;
  int Size = ty.Size;
}

def I16RT : RegTyInfo<i16, Int16Regs, i16imm>;
def I32RT : RegTyInfo<i32, Int32Regs, i32imm>;
def I64RT : RegTyInfo<i64, Int64Regs, i64imm>;

// Template for instructions which take three int64, int32, or int16 args.
// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
multiclass I3<string OpcStr, SDNode OpNode, bit commutative> {
  foreach t = [I16RT, I32RT, I64RT] in {
    defvar asmstr = OpcStr # t.Size # " \t$dst, $a, $b;";

    def t.Ty # rr :
      NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b),
                asmstr,
                [(set t.Ty:$dst, (OpNode t.Ty:$a, t.Ty:$b))]>;
    def t.Ty # ri :
      NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b),
                asmstr,
                [(set t.Ty:$dst, (OpNode t.Ty:$a, imm:$b))]>;
    // For commutative nodes the DAG canonicalizes an immediate LHS to the
    // RHS, so only non-commutative nodes need an explicit "ir" variant.
    if !not(commutative) then
      def t.Ty # ir :
        NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b),
                  asmstr,
                  [(set t.Ty:$dst, (OpNode imm:$a, t.Ty:$b))]>;
  }
}

class I16x2<string OpcStr, SDNode OpNode> :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
            !strconcat(OpcStr, "16x2 \t$dst, $a, $b;"),
            [(set v2i16:$dst, (OpNode v2i16:$a, v2i16:$b))]>,
  Requires<[hasPTX<80>, hasSM<90>]>;

// Template for instructions which take 3 int args. The instructions are
// named "<OpcStr>.s32" (e.g. "addc.cc.s32").
multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
  // These produce/consume the condition-code register, so they must not be
  // reordered or deleted as if they were pure.
  let hasSideEffects = 1 in {
    def i32rr :
      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
                [(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
    def i32ri :
      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
                !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
                [(set i32:$dst, (OpNode i32:$a, imm:$b))]>;
    def i64rr :
      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
                !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
                [(set i64:$dst, (OpNode i64:$a, i64:$b))]>,
      Requires<[hasPTX<43>]>;
    def i64ri :
      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
                !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
                [(set i64:$dst, (OpNode i64:$a, imm:$b))]>,
      Requires<[hasPTX<43>]>;
  }
}

// Template for minimum/maximum instructions.
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for fp32 functions.
multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
  // f64 variants exist only for the non-NaN-propagating flavor.
  if !not(NaN) then {
    def f64rr :
      NVPTXInst<(outs Float64Regs:$dst),
                (ins Float64Regs:$a, Float64Regs:$b),
                !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
                [(set f64:$dst, (OpNode f64:$a, f64:$b))]>;
    def f64ri :
      NVPTXInst<(outs Float64Regs:$dst),
                (ins Float64Regs:$a, f64imm:$b),
                !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
                [(set f64:$dst, (OpNode f64:$a, fpimm:$b))]>;
  }
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
              [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
    Requires<[doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
              [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
    Requires<[doF32FTZ]>;
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
              [(set f32:$dst, (OpNode f32:$a, f32:$b))]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
              [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>;

  // f16 min/max requires sm_80/PTX 7.0 just like the non-ftz variant below;
  // previously the ftz form lacked these predicates and could be selected on
  // targets without the instruction.
  def f16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
              [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
    Requires<[useFP16Math, hasSM<80>, hasPTX<70>, doF32FTZ]>;
  def f16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
              [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
    Requires<[useFP16Math, hasSM<80>, hasPTX<70>]>;

  def f16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
              [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
    Requires<[useFP16Math, hasSM<80>, hasPTX<70>, doF32FTZ]>;
  def f16x2rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
              [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
    Requires<[useFP16Math, hasSM<80>, hasPTX<70>]>;
  def bf16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"),
              [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
    Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
  def bf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"),
              [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
    Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
}

// Template for instructions which take three FP args. The
// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
//
// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for fp32/fp16 functions.
//
// This multiclass should be used for nodes that can be folded to make fma ops.
// In this case, we use the ".rn" variant when FMA is disabled, as this behaves
// just like the non ".rn" op, but prevents ptxas from creating FMAs.
multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
  def f64rr :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, Float64Regs:$b),
              !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
              [(set f64:$dst, (OpNode f64:$a, f64:$b))]>,
    Requires<[allowFMA]>;
  def f64ri :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, f64imm:$b),
              !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
              [(set f64:$dst, (OpNode f64:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;
  def f32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
              [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
              [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
    Requires<[allowFMA, doF32FTZ]>;
  def f32rr :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
              [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
    Requires<[allowFMA]>;
  def f32ri :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
              [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
    Requires<[allowFMA]>;

  def f16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
              [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
              [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
    Requires<[useFP16Math, allowFMA]>;

  def f16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
              [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
    Requires<[useFP16Math, allowFMA, doF32FTZ]>;
  def f16x2rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
              [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
    Requires<[useFP16Math, allowFMA]>;
  def bf16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"),
              [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
    Requires<[hasBF16Math, allowFMA]>;

  def bf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"),
              [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
    Requires<[hasBF16Math, allowFMA]>;
  // These have strange names so we don't perturb existing mir tests.
  def _rnf64rr :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, Float64Regs:$b),
              !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
              [(set f64:$dst, (OpNode f64:$a, f64:$b))]>,
    Requires<[noFMA]>;
  def _rnf64ri :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, f64imm:$b),
              !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
              [(set f64:$dst, (OpNode f64:$a, fpimm:$b))]>,
    Requires<[noFMA]>;
  def _rnf32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
              // Use the value type (f32) for $b, consistent with every other
              // pattern in this multiclass (was a stray Float32Regs:$b).
              [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
              [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
    Requires<[noFMA, doF32FTZ]>;
  def _rnf32rr :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
              [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
    Requires<[noFMA]>;
  def _rnf32ri :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
              [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
    Requires<[noFMA]>;
  def _rnf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"),
              [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
              [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
    Requires<[useFP16Math, noFMA]>;
  def _rnf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
              [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
    Requires<[useFP16Math, noFMA, doF32FTZ]>;
  def _rnf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
              [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
    Requires<[useFP16Math, noFMA]>;
  def _rnbf16rr_ftz :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              !strconcat(OpcStr, ".rn.ftz.bf16 \t$dst, $a, $b;"),
              [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
    Requires<[hasBF16Math, noFMA, doF32FTZ]>;
  def _rnbf16rr :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$a, Int16Regs:$b),
              !strconcat(OpcStr, ".rn.bf16 \t$dst, $a, $b;"),
              [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
    Requires<[hasBF16Math, noFMA]>;
  def _rnbf16x2rr_ftz :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              !strconcat(OpcStr, ".rn.ftz.bf16x2 \t$dst, $a, $b;"),
              [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
    Requires<[hasBF16Math, noFMA, doF32FTZ]>;
  def _rnbf16x2rr :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a, Int32Regs:$b),
              !strconcat(OpcStr, ".rn.bf16x2 \t$dst, $a, $b;"),
              [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
    Requires<[hasBF16Math, noFMA]>;
}

// Template for operations which take two f32 or f64 operands. Provides three
// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
// subnormal inputs and results to zero).
multiclass F2<string OpcStr, SDNode OpNode> {
  def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
                      !strconcat(OpcStr, ".f64 \t$dst, $a;"),
                      [(set f64:$dst, (OpNode f64:$a))]>;
  def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
                          !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
                          [(set f32:$dst, (OpNode f32:$a))]>,
                Requires<[doF32FTZ]>;
  def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
                      !strconcat(OpcStr, ".f32 \t$dst, $a;"),
                      [(set f32:$dst, (OpNode f32:$a))]>;
}

multiclass F2_Support_Half<string OpcStr, SDNode OpNode> {
  def bf16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
                       !strconcat(OpcStr, ".bf16 \t$dst, $a;"),
                       [(set bf16:$dst, (OpNode bf16:$a))]>,
             Requires<[hasSM<80>, hasPTX<70>]>;
  def bf16x2 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
                         !strconcat(OpcStr, ".bf16x2 \t$dst, $a;"),
                         [(set v2bf16:$dst, (OpNode v2bf16:$a))]>,
               Requires<[hasSM<80>, hasPTX<70>]>;
  def f16_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
                          !strconcat(OpcStr, ".ftz.f16 \t$dst, $a;"),
                          [(set f16:$dst, (OpNode f16:$a))]>,
                Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
  def f16x2_ftz : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
                            !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a;"),
                            [(set v2f16:$dst, (OpNode v2f16:$a))]>,
                  Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
  def f16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
                      !strconcat(OpcStr, ".f16 \t$dst, $a;"),
                      [(set f16:$dst, (OpNode f16:$a))]>,
            Requires<[hasSM<53>, hasPTX<65>]>;
  def f16x2 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
                        !strconcat(OpcStr, ".f16x2 \t$dst, $a;"),
                        [(set v2f16:$dst, (OpNode v2f16:$a))]>,
              Requires<[hasSM<53>, hasPTX<65>]>;

}

// Variant where only .ftz.bf16 is supported.
multiclass F2_Support_Half_BF<string OpcStr, SDNode OpNode> {
  def bf16_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
                           OpcStr # ".ftz.bf16 \t$dst, $a;",
                           [(set bf16:$dst, (OpNode bf16:$a))]>,
                 Requires<[hasSM<90>, hasPTX<78>]>;
  def bf16x2_ftz: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
                            OpcStr # ".ftz.bf16x2 \t$dst, $a;",
                            [(set v2bf16:$dst, (OpNode v2bf16:$a))]>,
                  Requires<[hasSM<90>, hasPTX<78>]>;
}

//===----------------------------------------------------------------------===//
// NVPTX Instructions.
//===----------------------------------------------------------------------===//

//-----------------------------------
// Type Conversion
//-----------------------------------

let hasSideEffects = false in {
  // Generate a cvt to the given type from all possible types. Each instance
  // takes a CvtMode immediate that defines the conversion mode to use. It can
  // be CvtNONE to omit a conversion mode.
599 multiclass CVT_FROM_ALL<string ToType, RegisterClass RC, list<Predicate> Preds = []> { 600 def _s8 : 601 NVPTXInst<(outs RC:$dst), 602 (ins Int16Regs:$src, CvtMode:$mode), 603 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 604 ToType, ".s8 \t$dst, $src;"), []>, 605 Requires<Preds>; 606 def _u8 : 607 NVPTXInst<(outs RC:$dst), 608 (ins Int16Regs:$src, CvtMode:$mode), 609 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 610 ToType, ".u8 \t$dst, $src;"), []>, 611 Requires<Preds>; 612 def _s16 : 613 NVPTXInst<(outs RC:$dst), 614 (ins Int16Regs:$src, CvtMode:$mode), 615 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 616 ToType, ".s16 \t$dst, $src;"), []>, 617 Requires<Preds>; 618 def _u16 : 619 NVPTXInst<(outs RC:$dst), 620 (ins Int16Regs:$src, CvtMode:$mode), 621 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 622 ToType, ".u16 \t$dst, $src;"), []>, 623 Requires<Preds>; 624 def _s32 : 625 NVPTXInst<(outs RC:$dst), 626 (ins Int32Regs:$src, CvtMode:$mode), 627 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 628 ToType, ".s32 \t$dst, $src;"), []>, 629 Requires<Preds>; 630 def _u32 : 631 NVPTXInst<(outs RC:$dst), 632 (ins Int32Regs:$src, CvtMode:$mode), 633 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 634 ToType, ".u32 \t$dst, $src;"), []>, 635 Requires<Preds>; 636 def _s64 : 637 NVPTXInst<(outs RC:$dst), 638 (ins Int64Regs:$src, CvtMode:$mode), 639 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 640 ToType, ".s64 \t$dst, $src;"), []>, 641 Requires<Preds>; 642 def _u64 : 643 NVPTXInst<(outs RC:$dst), 644 (ins Int64Regs:$src, CvtMode:$mode), 645 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 646 ToType, ".u64 \t$dst, $src;"), []>, 647 Requires<Preds>; 648 def _f16 : 649 NVPTXInst<(outs RC:$dst), 650 (ins Int16Regs:$src, CvtMode:$mode), 651 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 652 ToType, ".f16 \t$dst, $src;"), []>, 653 Requires<Preds>; 654 def _bf16 : 655 NVPTXInst<(outs RC:$dst), 656 (ins Int16Regs:$src, 
CvtMode:$mode), 657 !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.", 658 ToType, ".bf16 \t$dst, $src;"), []>, 659 Requires<!if(!eq(ToType, "f32"), 660 // bf16->f32 was introduced early. 661 [hasPTX<71>, hasSM<80>], 662 // bf16->everything else needs sm90/ptx78 663 [hasPTX<78>, hasSM<90>])>; 664 def _f32 : 665 NVPTXInst<(outs RC:$dst), 666 (ins Float32Regs:$src, CvtMode:$mode), 667 !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.", 668 ToType, ".f32 \t$dst, $src;"), []>, 669 Requires<!if(!eq(ToType, "bf16"), 670 // f32->bf16 was introduced early. 671 [hasPTX<70>, hasSM<80>], 672 Preds)>; 673 def _f64 : 674 NVPTXInst<(outs RC:$dst), 675 (ins Float64Regs:$src, CvtMode:$mode), 676 !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.", 677 ToType, ".f64 \t$dst, $src;"), []>, 678 Requires<Preds>; 679 } 680 681 // Generate cvts from all types to all types. 682 defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>; 683 defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>; 684 defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>; 685 defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>; 686 defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>; 687 defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>; 688 defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>; 689 defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>; 690 defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>; 691 defm CVT_bf16 : CVT_FROM_ALL<"bf16", Int16Regs, [hasPTX<78>, hasSM<90>]>; 692 defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>; 693 defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>; 694 695 // These cvts are different from those above: The source and dest registers 696 // are of the same type. 
697 def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), 698 "cvt.s16.s8 \t$dst, $src;", []>; 699 def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), 700 "cvt.s32.s8 \t$dst, $src;", []>; 701 def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), 702 "cvt.s32.s16 \t$dst, $src;", []>; 703 def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 704 "cvt.s64.s8 \t$dst, $src;", []>; 705 def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 706 "cvt.s64.s16 \t$dst, $src;", []>; 707 def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), 708 "cvt.s64.s32 \t$dst, $src;", []>; 709 710 multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> { 711 def _f32 : 712 NVPTXInst<(outs RC:$dst), 713 (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode), 714 !strconcat("cvt${mode:base}${mode:relu}.", 715 FromName, ".f32 \t$dst, $src1, $src2;"), []>, 716 Requires<[hasPTX<70>, hasSM<80>]>; 717 } 718 719 defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>; 720 defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>; 721 722 // FP8 conversions. 
723 multiclass CVT_TO_F8X2<string F8Name> { 724 def _f32 : 725 NVPTXInst<(outs Int16Regs:$dst), 726 (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode), 727 !strconcat("cvt${mode:base}.satfinite${mode:relu}.", 728 F8Name, "x2.f32 \t$dst, $src1, $src2;"), []>, 729 Requires<[hasPTX<81>, hasSM<89>]>; 730 def _f16x2 : 731 NVPTXInst<(outs Int16Regs:$dst), 732 (ins Int32Regs:$src, CvtMode:$mode), 733 !strconcat("cvt${mode:base}.satfinite${mode:relu}.", 734 F8Name, "x2.f16x2 \t$dst, $src;"), []>, 735 Requires<[hasPTX<81>, hasSM<89>]>; 736 } 737 738 defm CVT_e4m3x2 : CVT_TO_F8X2<"e4m3">; 739 defm CVT_e5m2x2 : CVT_TO_F8X2<"e5m2">; 740 741 class CVT_f16x2_fp8<string F8Name> : 742 NVPTXInst<(outs Int32Regs:$dst), 743 (ins Int16Regs:$src, CvtMode:$mode), 744 !strconcat("cvt${mode:base}${mode:relu}.f16x2.", 745 F8Name, "x2 \t$dst, $src;"), []>, 746 Requires<[hasPTX<81>, hasSM<89>]>; 747 748 def CVT_f16x2_e4m3x2 : CVT_f16x2_fp8<"e4m3">; 749 def CVT_f16x2_e5m2x2 : CVT_f16x2_fp8<"e5m2">; 750 751 // Float to TF32 conversions 752 multiclass CVT_TO_TF32<string Modifier, list<Predicate> Preds = [hasPTX<78>, hasSM<90>]> { 753 defvar Intr = !cast<Intrinsic>("int_nvvm_f2tf32_" # !subst(".", "_", Modifier)); 754 755 def NAME : NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$src), 756 "cvt." 
# Modifier # ".tf32.f32 \t$dst, $src;", 757 [(set i32:$dst, (Intr f32:$src))]>, 758 Requires<Preds>; 759 } 760 761 defm CVT_to_tf32_rn : CVT_TO_TF32<"rn">; 762 defm CVT_to_tf32_rz : CVT_TO_TF32<"rz">; 763 defm CVT_to_tf32_rn_relu : CVT_TO_TF32<"rn.relu">; 764 defm CVT_to_tf32_rz_relu : CVT_TO_TF32<"rz.relu">; 765 defm CVT_to_tf32_rna : CVT_TO_TF32<"rna", [hasPTX<70>, hasSM<80>]>; 766 defm CVT_to_tf32_rna_satf : CVT_TO_TF32<"rna.satfinite", [hasPTX<81>, hasSM<89>]>; 767 768 defm CVT_to_tf32_rn_satf : CVT_TO_TF32<"rn.satfinite", [hasPTX<86>, hasSM<100>]>; 769 defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, hasSM<100>]>; 770 defm CVT_to_tf32_rn_relu_satf : CVT_TO_TF32<"rn.relu.satfinite", [hasPTX<86>, hasSM<100>]>; 771 defm CVT_to_tf32_rz_relu_satf : CVT_TO_TF32<"rz.relu.satfinite", [hasPTX<86>, hasSM<100>]>; 772} 773 774def fpround_oneuse : PatFrag<(ops node:$a), (fpround node:$a), [{ 775 return N->hasOneUse(); 776}]>; 777 778def : Pat<(v2bf16 (build_vector (bf16 (fpround_oneuse f32:$lo)), 779 (bf16 (fpround_oneuse f32:$hi)))), 780 (CVT_bf16x2_f32 $hi, $lo, CvtRN)>, 781 Requires<[hasPTX<70>, hasSM<80>, hasBF16Math]>; 782 783def : Pat<(v2f16 (build_vector (f16 (fpround_oneuse f32:$lo)), 784 (f16 (fpround_oneuse f32:$hi)))), 785 (CVT_f16x2_f32 $hi, $lo, CvtRN)>, 786 Requires<[hasPTX<70>, hasSM<80>, useFP16Math]>; 787 788//----------------------------------- 789// Selection instructions (selp) 790//----------------------------------- 791 792// TODO: Missing slct 793 794// selp instructions that don't have any pattern matches; we explicitly use 795// them within this file. 
796let hasSideEffects = false in { 797 multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> { 798 def rr : NVPTXInst<(outs RC:$dst), 799 (ins RC:$a, RC:$b, Int1Regs:$p), 800 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 801 def ri : NVPTXInst<(outs RC:$dst), 802 (ins RC:$a, ImmCls:$b, Int1Regs:$p), 803 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 804 def ir : NVPTXInst<(outs RC:$dst), 805 (ins ImmCls:$a, RC:$b, Int1Regs:$p), 806 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 807 def ii : NVPTXInst<(outs RC:$dst), 808 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), 809 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>; 810 } 811 812 multiclass SELP_PATTERN<string TypeStr, ValueType T, RegisterClass RC, 813 Operand ImmCls, SDNode ImmNode> { 814 def rr : 815 NVPTXInst<(outs RC:$dst), 816 (ins RC:$a, RC:$b, Int1Regs:$p), 817 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 818 [(set T:$dst, (select i1:$p, T:$a, T:$b))]>; 819 def ri : 820 NVPTXInst<(outs RC:$dst), 821 (ins RC:$a, ImmCls:$b, Int1Regs:$p), 822 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 823 [(set T:$dst, (select i1:$p, T:$a, (T ImmNode:$b)))]>; 824 def ir : 825 NVPTXInst<(outs RC:$dst), 826 (ins ImmCls:$a, RC:$b, Int1Regs:$p), 827 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 828 [(set T:$dst, (select i1:$p, ImmNode:$a, T:$b))]>; 829 def ii : 830 NVPTXInst<(outs RC:$dst), 831 (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p), 832 !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), 833 [(set T:$dst, (select i1:$p, ImmNode:$a, ImmNode:$b))]>; 834 } 835} 836 837// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as 838// good. 
// Only the bit-width ("b") variants carry select patterns; the signed/unsigned
// variants are instruction-only (see comment above).
defm SELP_b16 : SELP_PATTERN<"b16", i16, Int16Regs, i16imm, imm>;
defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
defm SELP_b32 : SELP_PATTERN<"b32", i32, Int32Regs, i32imm, imm>;
defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
defm SELP_b64 : SELP_PATTERN<"b64", i64, Int64Regs, i64imm, imm>;
defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
// f16/bf16 live in 16-bit integer registers, so they select via selp.b16.
defm SELP_f16 : SELP_PATTERN<"b16", f16, Int16Regs, f16imm, fpimm>;
defm SELP_bf16 : SELP_PATTERN<"b16", bf16, Int16Regs, bf16imm, fpimm>;

defm SELP_f32 : SELP_PATTERN<"f32", f32, Float32Regs, f32imm, fpimm>;
defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;

// This does not work as tablegen fails to infer the type of 'imm'.
// def v2f16imm : Operand<v2f16>;
// defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>;

// All packed 32-bit vector types select as a single selp.b32.
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
def : Pat<(vt (select i1:$p, vt:$a, vt:$b)),
          (SELP_b32rr $a, $b, $p)>;
}

//-----------------------------------
// Test Instructions
//-----------------------------------

// testp.infinite: set predicate $p from a float input $a. Used below by the
// frem lowering as the "is y infinite?" guard (the frem patterns return x
// when y is infinite — see the comment preceding those patterns).
def TESTINF_f32r : NVPTXInst<(outs Int1Regs:$p), (ins Float32Regs:$a),
                             "testp.infinite.f32 \t$p, $a;",
                             []>;
def TESTINF_f32i : NVPTXInst<(outs Int1Regs:$p), (ins f32imm:$a),
                             "testp.infinite.f32 \t$p, $a;",
                             []>;
def TESTINF_f64r : NVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a),
                             "testp.infinite.f64 \t$p, $a;",
                             []>;
def TESTINF_f64i : NVPTXInst<(outs Int1Regs:$p), (ins f64imm:$a),
                             "testp.infinite.f64 \t$p, $a;",
                             []>;

//-----------------------------------
// Integer Arithmetic
//-----------------------------------

// Template for xor masquerading as int1 arithmetic.
// i1 add and i1 sub are both equivalent to xor modulo 2, so a single
// xor.pred instruction implements either OpNode.
multiclass ADD_SUB_i1<SDNode OpNode> {
  def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
                     "xor.pred \t$dst, $a, $b;",
                     [(set i1:$dst, (OpNode i1:$a, i1:$b))]>;
  def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
                     "xor.pred \t$dst, $a, $b;",
                     [(set i1:$dst, (OpNode i1:$a, (imm):$b))]>;
}

// int1 addition and subtraction are both just xor.
defm ADD_i1 : ADD_SUB_i1<add>;
defm SUB_i1 : ADD_SUB_i1<sub>;

// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
// also use these for unsigned arithmetic.
defm ADD : I3<"add.s", add, /*commutative=*/ true>;
defm SUB : I3<"sub.s", sub, /*commutative=*/ false>;

def ADD16x2 : I16x2<"add.s", add>;

// int32 and int64 addition and subtraction with carry-out.
defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;
defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;

// int32 and int64 addition and subtraction with carry-in and carry-out.
defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;

defm MULT : I3<"mul.lo.s", mul, /*commutative=*/ true>;

defm MULTHS : I3<"mul.hi.s", mulhs, /*commutative=*/ true>;
defm MULTHU : I3<"mul.hi.u", mulhu, /*commutative=*/ true>;

defm SDIV : I3<"div.s", sdiv, /*commutative=*/ false>;
defm UDIV : I3<"div.u", udiv, /*commutative=*/ false>;

// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
// will lower it.
defm SREM : I3<"rem.s", srem, /*commutative=*/ false>;
defm UREM : I3<"rem.u", urem, /*commutative=*/ false>;

// Integer absolute value, lowered directly to the PTX abs.s{16,32,64}
// instruction by the ABS multiclass below.
// abs.sNN dst, a — one instruction per integer width.
multiclass ABS<ValueType T, RegisterClass RC, string SizeName> {
  def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
                  !strconcat("abs", SizeName, " \t$dst, $a;"),
                  [(set T:$dst, (abs T:$a))]>;
}
defm ABS_16 : ABS<i16, Int16Regs, ".s16">;
defm ABS_32 : ABS<i32, Int32Regs, ".s32">;
defm ABS_64 : ABS<i64, Int64Regs, ".s64">;

// Integer min/max.
defm SMAX : I3<"max.s", smax, /*commutative=*/ true>;
defm UMAX : I3<"max.u", umax, /*commutative=*/ true>;
defm SMIN : I3<"min.s", smin, /*commutative=*/ true>;
defm UMIN : I3<"min.u", umin, /*commutative=*/ true>;

def SMAX16x2 : I16x2<"max.s", smax>;
def UMAX16x2 : I16x2<"max.u", umax>;
def SMIN16x2 : I16x2<"min.s", smin>;
def UMIN16x2 : I16x2<"min.u", umin>;


//
// Wide multiplication
//
// mul.wide.{s,u}NN multiplies two NN-bit values and produces a 2*NN-bit
// result. The *Imm variants take an immediate of the source width; the
// *ImmNN variants take an immediate already widened to the result width.
def MULWIDES64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
            "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
            "mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
            "mul.wide.s32 \t$dst, $a, $b;", []>;

def MULWIDEU64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
            "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
            "mul.wide.u32 \t$dst, $a, $b;", []>;
def MULWIDEU64Imm64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
            "mul.wide.u32 \t$dst, $a, $b;", []>;

def MULWIDES32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;
def MULWIDES32Imm32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
            "mul.wide.s16 \t$dst, $a, $b;", []>;

def MULWIDEU32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;
def MULWIDEU32Imm32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
            "mul.wide.u16 \t$dst, $a, $b;", []>;

// Result type is wider than (and independent of) the two identical source
// operand types; profile only constrains the sources to match each other.
def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;

// Matchers for signed, unsigned mul.wide ISD nodes.
def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
          (MULWIDES32 $a, $b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)),
          (MULWIDES32Imm $a, imm:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
          (MULWIDEU32 $a, $b)>,
      Requires<[doMulWide]>;
def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)),
          (MULWIDEU32Imm $a, imm:$b)>,
      Requires<[doMulWide]>;

def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
          (MULWIDES64 $a, $b)>,
      Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)),
          (MULWIDES64Imm $a, imm:$b)>,
      Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
          (MULWIDEU64 $a, $b)>,
      Requires<[doMulWide]>;
def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)),
          (MULWIDEU64Imm $a, imm:$b)>,
      Requires<[doMulWide]>;

// Predicates used for converting some patterns to mul.wide.
// Immediate fits in a signed 32-bit value.
def SInt32Const : PatLeaf<(imm), [{
  const APInt &v = N->getAPIntValue();
  return v.isSignedIntN(32);
}]>;

// Immediate fits in an unsigned 32-bit value.
def UInt32Const : PatLeaf<(imm), [{
  const APInt &v = N->getAPIntValue();
  return v.isIntN(32);
}]>;

// Immediate fits in a signed 16-bit value.
def SInt16Const : PatLeaf<(imm), [{
  const APInt &v = N->getAPIntValue();
  return v.isSignedIntN(16);
}]>;

// Immediate fits in an unsigned 16-bit value.
def UInt16Const : PatLeaf<(imm), [{
  const APInt &v = N->getAPIntValue();
  return v.isIntN(16);
}]>;

def IntConst_0_30 : PatLeaf<(imm), [{
  // Check if 0 <= v < 31; only then will the result of (x << v) be an int32.
  const APInt &v = N->getAPIntValue();
  return v.sge(0) && v.slt(31);
}]>;

def IntConst_0_14 : PatLeaf<(imm), [{
  // Check if 0 <= v < 15; only then will the result of (x << v) be an int16.
  const APInt &v = N->getAPIntValue();
  return v.sge(0) && v.slt(15);
}]>;

// Transform a shift amount v into the i32 multiplier constant (1 << v).
def SHL2MUL32 : SDNodeXForm<imm, [{
  const APInt &v = N->getAPIntValue();
  APInt temp(32, 1);
  return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
}]>;

// Transform a shift amount v into the i16 multiplier constant (1 << v).
def SHL2MUL16 : SDNodeXForm<imm, [{
  const APInt &v = N->getAPIntValue();
  APInt temp(16, 1);
  return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
}]>;

// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
def : Pat<(shl (sext i32:$a), (i32 IntConst_0_30:$b)),
          (MULWIDES64Imm $a, (SHL2MUL32 $b))>,
      Requires<[doMulWide]>;
def : Pat<(shl (zext i32:$a), (i32 IntConst_0_30:$b)),
          (MULWIDEU64Imm $a, (SHL2MUL32 $b))>,
      Requires<[doMulWide]>;

def : Pat<(shl (sext i16:$a), (i16 IntConst_0_14:$b)),
          (MULWIDES32Imm $a, (SHL2MUL16 $b))>,
      Requires<[doMulWide]>;
def : Pat<(shl (zext i16:$a), (i16 IntConst_0_14:$b)),
          (MULWIDEU32Imm $a, (SHL2MUL16 $b))>,
      Requires<[doMulWide]>;

// Convert "sign/zero-extend then multiply" to mul.wide.
def : Pat<(mul (sext i32:$a), (sext i32:$b)),
          (MULWIDES64 $a, $b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (sext i32:$a), (i64 SInt32Const:$b)),
          (MULWIDES64Imm64 $a, (i64 SInt32Const:$b))>,
      Requires<[doMulWide]>;

def : Pat<(mul (zext i32:$a), (zext i32:$b)),
          (MULWIDEU64 $a, $b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (zext i32:$a), (i64 UInt32Const:$b)),
          (MULWIDEU64Imm64 $a, (i64 UInt32Const:$b))>,
      Requires<[doMulWide]>;

def : Pat<(mul (sext i16:$a), (sext i16:$b)),
          (MULWIDES32 $a, $b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (sext i16:$a), (i32 SInt16Const:$b)),
          (MULWIDES32Imm32 $a, (i32 SInt16Const:$b))>,
      Requires<[doMulWide]>;

def : Pat<(mul (zext i16:$a), (zext i16:$b)),
          (MULWIDEU32 $a, $b)>,
      Requires<[doMulWide]>;
def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)),
          (MULWIDEU32Imm32 $a, (i32 UInt16Const:$b))>,
      Requires<[doMulWide]>;

//
// Integer multiply-add
//
// Only fold mul into mad when the mul has a single use; otherwise the mul
// would be emitted anyway and mad would do redundant work.
def mul_oneuse : PatFrag<(ops node:$a, node:$b), (mul node:$a, node:$b), [{
  return N->hasOneUse();
}]>;

// mad.lo: dst = a * b + c. Suffix encodes operand kinds (r = reg, i = imm).
multiclass MAD<string Ptx, ValueType VT, NVPTXRegClass Reg, Operand Imm> {
  def rrr:
    NVPTXInst<(outs Reg:$dst),
              (ins Reg:$a, Reg:$b, Reg:$c),
              Ptx # " \t$dst, $a, $b, $c;",
              [(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), VT:$c))]>;

  def rir:
    NVPTXInst<(outs Reg:$dst),
              (ins Reg:$a, Imm:$b, Reg:$c),
              Ptx # " \t$dst, $a, $b, $c;",
              [(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), VT:$c))]>;
  def rri:
    NVPTXInst<(outs Reg:$dst),
              (ins Reg:$a, Reg:$b, Imm:$c),
              Ptx # " \t$dst, $a, $b, $c;",
              [(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), imm:$c))]>;
  def rii:
    NVPTXInst<(outs Reg:$dst),
              (ins Reg:$a, Imm:$b, Imm:$c),
              Ptx # " \t$dst, $a, $b, $c;",
              [(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), imm:$c))]>;
}

let Predicates = [hasOptEnabled] in
{
defm MAD16 : MAD<"mad.lo.s16", i16, Int16Regs, i16imm>;
defm MAD32 : MAD<"mad.lo.s32", i32, Int32Regs, i32imm>;
defm MAD64 : MAD<"mad.lo.s64", i64, Int64Regs, i64imm>;
}

// Integer negation: neg.sNN.
def INEG16 :
  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
            "neg.s16 \t$dst, $src;",
            [(set i16:$dst, (ineg i16:$src))]>;
def INEG32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
            "neg.s32 \t$dst, $src;",
            [(set i32:$dst, (ineg i32:$src))]>;
def INEG64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
            "neg.s64 \t$dst, $src;",
            [(set i64:$dst, (ineg i64:$src))]>;

//-----------------------------------
// Floating Point Arithmetic
//-----------------------------------

// Constant 1.0f
def FloatConst1 : PatLeaf<(fpimm), [{
  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
         N->getValueAPF().convertToFloat() == 1.0f;
}]>;
// Constant 1.0 (double)
def DoubleConst1 : PatLeaf<(fpimm), [{
  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
         N->getValueAPF().convertToDouble() == 1.0;
}]>;
// Constant -1.0 (double)
def DoubleConstNeg1 : PatLeaf<(fpimm), [{
  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
         N->getValueAPF().convertToDouble() == -1.0;
}]>;


// Constant -X -> X (double)
def NegDoubleConst : SDNodeXForm<fpimm, [{
  return CurDAG->getTargetConstantFP(-(N->getValueAPF()),
                                     SDLoc(N), MVT::f64);
}]>;

defm FADD : F3_fma_component<"add", fadd>;
defm FSUB : F3_fma_component<"sub", fsub>;
defm FMUL : F3_fma_component<"mul", fmul>;

// fminnum/fmaxnum ignore NaN inputs; fminimum/fmaximum propagate them
// (the ".NaN" instruction variants).
defm FMIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum>;
defm FMAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
defm FMINNAN : FMINIMUMMAXIMUM<"min.NaN", /* NaN */ true, fminimum>;
defm FMAXNAN : FMINIMUMMAXIMUM<"max.NaN", /* NaN */ true, fmaximum>;

defm FABS : F2<"abs", fabs>;
defm FNEG : F2<"neg", fneg>;
defm FABS_H: F2_Support_Half<"abs", fabs>;
defm FNEG_H: F2_Support_Half<"neg", fneg>;

defm FSQRT : F2<"sqrt.rn", fsqrt>;

defm FEXP2_H: F2_Support_Half_BF<"ex2.approx", fexp2>;

//
// F16 NEG
//
class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
      NVPTXInst<(outs RC:$dst), (ins RC:$src),
                !strconcat(OpcStr, " \t$dst, $src;"),
                [(set T:$dst, (fneg T:$src))]>,
      Requires<[useFP16Math, hasPTX<60>, hasSM<53>, Pred]>;
def FNEG16_ftz   : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>;
def FNEG16       : FNEG_F16_F16X2<"neg.f16", f16, Int16Regs, True>;
def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
def FNEG16x2     : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>;

//
// BF16 NEG
//

class FNEG_BF16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
      NVPTXInst<(outs RC:$dst), (ins RC:$src),
                !strconcat(OpcStr, " \t$dst, $src;"),
                [(set T:$dst, (fneg T:$src))]>,
      Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, Pred]>;
def BFNEG16_ftz   : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
def BFNEG16       : FNEG_BF16_F16X2<"neg.bf16", bf16, Int16Regs, True>;
def BFNEG16x2_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
def BFNEG16x2     : FNEG_BF16_F16X2<"neg.bf16x2", v2bf16, Int32Regs, True>;

//
// F64 division
//
// 1.0 / b is strength-reduced to a single rcp.rn.f64.
def FDIV641r :
  NVPTXInst<(outs Float64Regs:$dst),
            (ins f64imm:$a, Float64Regs:$b),
            "rcp.rn.f64 \t$dst, $b;",
            [(set f64:$dst, (fdiv DoubleConst1:$a, f64:$b))]>;
def FDIV64rr :
  NVPTXInst<(outs Float64Regs:$dst),
            (ins Float64Regs:$a, Float64Regs:$b),
            "div.rn.f64 \t$dst, $a, $b;",
            [(set f64:$dst, (fdiv f64:$a, f64:$b))]>;
def FDIV64ri :
  NVPTXInst<(outs Float64Regs:$dst),
            (ins Float64Regs:$a, f64imm:$b),
            "div.rn.f64 \t$dst, $a, $b;",
            [(set f64:$dst, (fdiv f64:$a, fpimm:$b))]>;

// fdiv will be converted to rcp
// fneg (fdiv 1.0, X) => fneg (rcp.rn X)
def : Pat<(fdiv DoubleConstNeg1:$a, f64:$b),
          (FNEGf64 (FDIV641r (NegDoubleConst node:$a), $b))>;

//
// F32 Approximate reciprocal
//
def FDIV321r_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.ftz.f32 \t$dst, $b;",
            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
  Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV321r :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.f32 \t$dst, $b;",
            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
  Requires<[do_DIVF32_APPROX]>;
//
// F32 Approximate division
//
def FDIV32approxrr_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.approx.ftz.f32 \t$dst, $a, $b;",
            [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
  Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV32approxri_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.approx.ftz.f32 \t$dst, $a, $b;",
            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
  Requires<[do_DIVF32_APPROX, doF32FTZ]>;
def FDIV32approxrr :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.approx.f32 \t$dst, $a, $b;",
            [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
  Requires<[do_DIVF32_APPROX]>;
def FDIV32approxri :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.approx.f32 \t$dst, $a, $b;",
            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
  Requires<[do_DIVF32_APPROX]>;
//
// F32 Semi-accurate reciprocal
//
// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
//
def FDIV321r_approx_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.ftz.f32 \t$dst, $b;",
            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
  Requires<[do_DIVF32_FULL, doF32FTZ]>;
def FDIV321r_approx :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.approx.f32 \t$dst, $b;",
            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
  Requires<[do_DIVF32_FULL]>;
//
// F32 Semi-accurate division
//
def FDIV32rr_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.full.ftz.f32 \t$dst, $a, $b;",
            [(set f32:$dst, (fdiv Float32Regs:$a, f32:$b))]>,
  Requires<[do_DIVF32_FULL, doF32FTZ]>;
def FDIV32ri_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.full.ftz.f32 \t$dst, $a, $b;",
            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
  Requires<[do_DIVF32_FULL, doF32FTZ]>;
def FDIV32rr :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.full.f32 \t$dst, $a, $b;",
            [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
  Requires<[do_DIVF32_FULL]>;
def FDIV32ri :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.full.f32 \t$dst, $a, $b;",
            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
  Requires<[do_DIVF32_FULL]>;
//
// F32 Accurate reciprocal
//
def FDIV321r_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.rn.ftz.f32 \t$dst, $b;",
            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
  Requires<[doF32FTZ]>;
def FDIV321r_prec :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins f32imm:$a, Float32Regs:$b),
            "rcp.rn.f32 \t$dst, $b;",
            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>;
//
// F32 Accurate division
//
def FDIV32rr_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.rn.ftz.f32 \t$dst, $a, $b;",
            [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
  Requires<[doF32FTZ]>;
def FDIV32ri_prec_ftz :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.rn.ftz.f32 \t$dst, $a, $b;",
            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
  Requires<[doF32FTZ]>;
def FDIV32rr_prec :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, Float32Regs:$b),
            "div.rn.f32 \t$dst, $a, $b;",
            [(set f32:$dst, (fdiv f32:$a, f32:$b))]>;
def FDIV32ri_prec :
  NVPTXInst<(outs Float32Regs:$dst),
            (ins Float32Regs:$a, f32imm:$b),
            "div.rn.f32 \t$dst, $a, $b;",
            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>;

//
// FMA
//

// fma dst, a, b, c. Suffix encodes operand kinds (r = reg, i = imm).
multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
   defvar asmstr = OpcStr # " \t$dst, $a, $b, $c;";
   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                       asmstr,
                       [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
                       Requires<[Pred]>;
   def rri : NVPTXInst<(outs RC:$dst),
                       (ins RC:$a, RC:$b, ImmCls:$c),
                       asmstr,
                       [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
                       Requires<[Pred]>;
   def rir : NVPTXInst<(outs RC:$dst),
                       (ins RC:$a, ImmCls:$b, RC:$c),
                       asmstr,
                       [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
                       Requires<[Pred]>;
   def rii : NVPTXInst<(outs RC:$dst),
                       (ins RC:$a, ImmCls:$b, ImmCls:$c),
                       asmstr,
                       [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
                       Requires<[Pred]>;
   def iir : NVPTXInst<(outs RC:$dst),
                       (ins ImmCls:$a, ImmCls:$b, RC:$c),
                       asmstr,
                       [(set RC:$dst, (fma fpimm:$a, fpimm:$b, RC:$c))]>,
                       Requires<[Pred]>;

}

// f16/f16x2 fma — reg-only operands; gated on FP16 math support.
multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
                       [(set T:$dst, (fma T:$a, T:$b, T:$c))]>,
                       Requires<[useFP16Math, Pred]>;
}

// bf16/bf16x2 fma — reg-only operands; gated on BF16 math support.
multiclass FMA_BF16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
                       [(set T:$dst, (fma T:$a, T:$b, T:$c))]>,
                       Requires<[hasBF16Math, Pred]>;
}

defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>;
defm FMA16 : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>;
defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
defm FMA16x2 : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>;
defm BFMA16 : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>;
defm BFMA16x2 : FMA_BF16<"fma.rn.bf16x2", v2bf16, Int32Regs, True>;
defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;

// sin/cos
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                    "sin.approx.f32 \t$dst, $src;",
                    [(set f32:$dst, (fsin f32:$src))]>,
                    Requires<[allowUnsafeFPMath]>;
def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                    "cos.approx.f32 \t$dst, $src;",
                    [(set f32:$dst, (fcos f32:$src))]>,
                    Requires<[allowUnsafeFPMath]>;

// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
// i.e. "poor man's fmod()". When y is infinite, x is returned. This matches the
// semantics of LLVM's frem.

// frem lowering: x - trunc(x/y) * y. Under unsafe FP math the result is used
// directly; otherwise a selp guarded by testp.infinite returns x when y is
// infinite, matching LLVM frem semantics (see comment above).

// frem - f32 FTZ
def : Pat<(frem f32:$x, f32:$y),
          (FSUBf32rr_ftz $x, (FMULf32rr_ftz (CVT_f32_f32
            (FDIV32rr_prec_ftz $x, $y), CvtRZI_FTZ),
            $y))>,
      Requires<[doF32FTZ, allowUnsafeFPMath]>;
def : Pat<(frem f32:$x, fpimm:$y),
          (FSUBf32rr_ftz $x, (FMULf32ri_ftz (CVT_f32_f32
            (FDIV32ri_prec_ftz $x, fpimm:$y), CvtRZI_FTZ),
            fpimm:$y))>,
      Requires<[doF32FTZ, allowUnsafeFPMath]>;

def : Pat<(frem f32:$x, f32:$y),
          (SELP_f32rr $x,
            (FSUBf32rr_ftz $x, (FMULf32rr_ftz (CVT_f32_f32
              (FDIV32rr_prec_ftz $x, $y), CvtRZI_FTZ),
              $y)),
            (TESTINF_f32r $y))>,
      Requires<[doF32FTZ, noUnsafeFPMath]>;
def : Pat<(frem f32:$x, fpimm:$y),
          (SELP_f32rr $x,
            (FSUBf32rr_ftz $x, (FMULf32ri_ftz (CVT_f32_f32
              (FDIV32ri_prec_ftz $x, fpimm:$y), CvtRZI_FTZ),
              fpimm:$y)),
            (TESTINF_f32i fpimm:$y))>,
      Requires<[doF32FTZ, noUnsafeFPMath]>;

// frem - f32
def : Pat<(frem f32:$x, f32:$y),
          (FSUBf32rr $x, (FMULf32rr (CVT_f32_f32
            (FDIV32rr_prec $x, $y), CvtRZI),
            $y))>,
      Requires<[allowUnsafeFPMath]>;
def : Pat<(frem f32:$x, fpimm:$y),
          (FSUBf32rr $x, (FMULf32ri (CVT_f32_f32
            (FDIV32ri_prec $x, fpimm:$y), CvtRZI),
            fpimm:$y))>,
      Requires<[allowUnsafeFPMath]>;

def : Pat<(frem f32:$x, f32:$y),
          (SELP_f32rr $x,
            (FSUBf32rr $x, (FMULf32rr (CVT_f32_f32
              (FDIV32rr_prec $x, $y), CvtRZI),
              $y)),
            (TESTINF_f32r Float32Regs:$y))>,
      Requires<[noUnsafeFPMath]>;
def : Pat<(frem f32:$x, fpimm:$y),
          (SELP_f32rr $x,
            (FSUBf32rr $x, (FMULf32ri (CVT_f32_f32
              (FDIV32ri_prec $x, fpimm:$y), CvtRZI),
              fpimm:$y)),
            (TESTINF_f32i fpimm:$y))>,
      Requires<[noUnsafeFPMath]>;

// frem - f64
def : Pat<(frem f64:$x, f64:$y),
          (FSUBf64rr $x, (FMULf64rr (CVT_f64_f64
            (FDIV64rr $x, $y), CvtRZI),
            $y))>,
      Requires<[allowUnsafeFPMath]>;
def : Pat<(frem f64:$x, fpimm:$y),
          (FSUBf64rr $x, (FMULf64ri (CVT_f64_f64
            (FDIV64ri $x, fpimm:$y), CvtRZI),
            fpimm:$y))>,
      Requires<[allowUnsafeFPMath]>;

def : Pat<(frem f64:$x, f64:$y),
          (SELP_f64rr $x,
            (FSUBf64rr $x, (FMULf64rr (CVT_f64_f64
              (FDIV64rr $x, $y), CvtRZI),
              $y)),
            (TESTINF_f64r Float64Regs:$y))>,
      Requires<[noUnsafeFPMath]>;
// Fix: the immediate-divisor pattern must use the immediate form of the
// infinity test (TESTINF_f64i, taking an f64imm), mirroring the f32 pattern
// above; the register form cannot consume an fpimm operand.
def : Pat<(frem f64:$x, fpimm:$y),
          (SELP_f64rr $x,
            (FSUBf64rr $x, (FMULf64ri (CVT_f64_f64
              (FDIV64ri $x, fpimm:$y), CvtRZI),
              fpimm:$y)),
            (TESTINF_f64i fpimm:$y))>,
      Requires<[noUnsafeFPMath]>;

//-----------------------------------
// Bitwise operations
//-----------------------------------

// Template for three-arg bitwise operations. Takes three args, Creates .b16,
// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
multiclass BITWISE<string OpcStr, SDNode OpNode> {
  def b1rr :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
              !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
              [(set i1:$dst, (OpNode i1:$a, i1:$b))]>;
  def b1ri :
    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
              !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
              [(set i1:$dst, (OpNode i1:$a, imm:$b))]>;
  def b16rr :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
              !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
              [(set i16:$dst, (OpNode i16:$a, i16:$b))]>;
  def b16ri :
    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
              !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
              [(set i16:$dst, (OpNode i16:$a, imm:$b))]>;
  def b32rr :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
              !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
              [(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
  def b32ri :
    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
              !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
              [(set i32:$dst, (OpNode i32:$a, imm:$b))]>;
  def b64rr :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
              !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
              [(set i64:$dst, (OpNode i64:$a, i64:$b))]>;
  def b64ri :
    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
              !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
              [(set i64:$dst, (OpNode i64:$a, imm:$b))]>;
}

defm OR : BITWISE<"or", or>;
defm AND : BITWISE<"and", and>;
defm XOR : BITWISE<"xor", xor>;

// PTX does not support mul on predicates, convert to and instructions
def : Pat<(mul i1:$a, i1:$b), (ANDb1rr $a, $b)>;
def : Pat<(mul i1:$a, imm:$b), (ANDb1ri $a, imm:$b)>;

// These transformations were once reliably performed by instcombine, but thanks
// to poison semantics they are no longer safe for LLVM IR, perform them here
// instead.
def : Pat<(select i1:$a, i1:$b, 0), (ANDb1rr $a, $b)>;
def : Pat<(select i1:$a, 1, i1:$b), (ORb1rr $a, $b)>;

// Lower logical v2i16/v4i8 ops as bitwise ops on b32.
foreach vt = [v2i16, v4i8] in {
  def: Pat<(or vt:$a, vt:$b),
           (ORb32rr $a, $b)>;
  def: Pat<(xor vt:$a, vt:$b),
           (XORb32rr $a, $b)>;
  def: Pat<(and vt:$a, vt:$b),
           (ANDb32rr $a, $b)>;

  // The constants get legalized into a bitcast from i32, so that's what we need
  // to match here.
  def: Pat<(or vt:$a, (vt (bitconvert (i32 imm:$b)))),
           (ORb32ri $a, imm:$b)>;
  def: Pat<(xor vt:$a, (vt (bitconvert (i32 imm:$b)))),
           (XORb32ri $a, imm:$b)>;
  def: Pat<(and vt:$a, (vt (bitconvert (i32 imm:$b)))),
           (ANDb32ri $a, imm:$b)>;
}

// Bitwise NOT: not.pred / not.bNN.
def NOT1  : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
                      "not.pred \t$dst, $src;",
                      [(set i1:$dst, (not i1:$src))]>;
def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                      "not.b16 \t$dst, $src;",
                      [(set i16:$dst, (not i16:$src))]>;
def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
                      "not.b32 \t$dst, $src;",
                      [(set i32:$dst, (not i32:$src))]>;
def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
                      "not.b64 \t$dst, $src;",
                      [(set i64:$dst, (not i64:$src))]>;

// Template for left/right shifts.  Takes three operands,
//   [dest (reg), src (reg), shift (reg or imm)].
// dest and src may be int64, int32, or int16, but shift is always int32.
//
// This template also defines a 32-bit shift (imm, imm) instruction.
// Shift instructions. The shift amount is always an i32 regardless of the
// value width; truncation of out-of-range amounts follows PTX semantics.
multiclass SHIFT<string OpcStr, SDNode OpNode> {
   def i64rr :
     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
               !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
               [(set i64:$dst, (OpNode i64:$a, i32:$b))]>;
   def i64ri :
     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
               !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
               [(set i64:$dst, (OpNode i64:$a, (i32 imm:$b)))]>;
   def i32rr :
     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
               [(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
   def i32ri :
     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
               [(set i32:$dst, (OpNode i32:$a, (i32 imm:$b)))]>;
   def i32ii :
     NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
               [(set i32:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
   def i16rr :
     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
               !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
               [(set i16:$dst, (OpNode i16:$a, i32:$b))]>;
   def i16ri :
     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
               !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
               [(set i16:$dst, (OpNode i16:$a, (i32 imm:$b)))]>;
}

defm SHL : SHIFT<"shl.b", shl>;
defm SRA : SHIFT<"shr.s", sra>;
defm SRL : SHIFT<"shr.u", srl>;

// Bit-reverse
def BREV32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
            "brev.b32 \t$dst, $a;",
            [(set i32:$dst, (bitreverse i32:$a))]>;
def BREV64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
            "brev.b64 \t$dst, $a;",
            [(set i64:$dst, (bitreverse i64:$a))]>;


//
// BFE - bit-field extract
//

// Template for BFE/BFI instructions.
// Args: [dest (reg), src (reg), start (reg or imm), end (reg or imm)].
// Start may be an imm only if end is also an imm.  FIXME: Is this a
// restriction in PTX?
//
// dest and src may be int32 or int64, but start and end are always int32.
def SDTBFE :
  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
                       SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>;

def SDTBFI :
  SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
                       SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>;

def SDTPRMT :
  SDTypeProfile<1, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                       SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>,]>;
def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;

// bfe: extract bit field of $a starting at $b, of width $c.
multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
  def rrr
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
                !strconcat(Instr, " \t$d, $a, $b, $c;"),
                [(set T:$d, (bfe T:$a, i32:$b, i32:$c))]>;
  def rri
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, i32imm:$c),
                !strconcat(Instr, " \t$d, $a, $b, $c;"),
                [(set T:$d, (bfe T:$a, i32:$b, imm:$c))]>;
  def rii
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, i32imm:$b, i32imm:$c),
                !strconcat(Instr, " \t$d, $a, $b, $c;"),
                [(set T:$d, (bfe T:$a, imm:$b, imm:$c))]>;
}

// bfi: insert bit field $a into $b at position $c, of width $d.
multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
  def rrrr
    : NVPTXInst<(outs RC:$f),
                (ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
                [(set T:$f, (bfi T:$a, T:$b, i32:$c, i32:$d))]>;
  def rrri
    : NVPTXInst<(outs RC:$f),
                (ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d),
                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
                [(set T:$f, (bfi T:$a, T:$b, i32:$c, imm:$d))]>;
  def rrii
    : NVPTXInst<(outs RC:$f),
                (ins RC:$a, RC:$b, i32imm:$c, i32imm:$d),
                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
                [(set T:$f, (bfi T:$a, T:$b, imm:$c, imm:$d))]>;
  def irrr
    : NVPTXInst<(outs RC:$f),
                (ins ImmCls:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
                [(set T:$f, (bfi (T imm:$a), T:$b, i32:$c, i32:$d))]>;
  def irri
    : NVPTXInst<(outs RC:$f),
                (ins ImmCls:$a, RC:$b, Int32Regs:$c, i32imm:$d),
                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
                [(set T:$f, (bfi (T imm:$a), T:$b, i32:$c, imm:$d))]>;
  def irii
    : NVPTXInst<(outs RC:$f),
                (ins ImmCls:$a, RC:$b, i32imm:$c, i32imm:$d),
                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
                [(set T:$f, (bfi (T imm:$a), T:$b, imm:$c, imm:$d))]>;
}

// i32 immediate printed in hex (used for prmt selector constants).
def Hexu32imm : Operand<i32> {
  let PrintMethod = "printHexu32imm";
}

// prmt: permute bytes of $a/$b according to selector $c and mode $mode.
multiclass PRMT<ValueType T, RegisterClass RC> {
  def rrr
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode),
                !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
                [(set T:$d, (prmt T:$a, T:$b, i32:$c, imm:$mode))]>;
  def rri
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, Hexu32imm:$c, PrmtMode:$mode),
                !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
                [(set T:$d, (prmt T:$a, T:$b, imm:$c, imm:$mode))]>;
  def rii
    : NVPTXInst<(outs RC:$d),
                (ins RC:$a, i32imm:$b, Hexu32imm:$c, PrmtMode:$mode),
                !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
                [(set T:$d, (prmt T:$a, imm:$b, imm:$c, imm:$mode))]>;
}

let hasSideEffects = false in {
  // order is somewhat important here. signed/unsigned variants match
  // the same patterns, so the first one wins. Having unsigned byte extraction
  // has the benefit of always having zero in unused bits, which makes some
  // optimizations easier (e.g. no need to mask them).
1778 defm BFE_U32 : BFE<"bfe.u32", i32, Int32Regs>; 1779 defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>; 1780 defm BFE_U64 : BFE<"bfe.u64", i64, Int64Regs>; 1781 defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>; 1782 1783 defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>; 1784 defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>; 1785 1786 defm PRMT_B32 : PRMT<i32, Int32Regs>; 1787} 1788 1789 1790// byte extraction + signed/unsigned extension to i32. 1791def : Pat<(i32 (sext_inreg (bfe i32:$s, i32:$o, 8), i8)), 1792 (BFE_S32rri $s, $o, 8)>; 1793def : Pat<(i32 (sext_inreg (bfe i32:$s, imm:$o, 8), i8)), 1794 (BFE_S32rii $s, imm:$o, 8)>; 1795def : Pat<(i32 (and (bfe i32:$s, i32:$o, 8), 255)), 1796 (BFE_U32rri $s, $o, 8)>; 1797def : Pat<(i32 (and (bfe i32:$s, imm:$o, 8), 255)), 1798 (BFE_U32rii $s, imm:$o, 8)>; 1799 1800// byte extraction + signed extension to i16 1801def : Pat<(i16 (sext_inreg (trunc (bfe i32:$s, imm:$o, 8)), i8)), 1802 (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>; 1803 1804 1805// Byte extraction via shift/trunc/sext 1806def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)), 1807 (CVT_s8_s32 $s, CvtNONE)>; 1808def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)), 1809 (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>; 1810def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8), 1811 (BFE_S32rii $s, imm:$o, 8)>; 1812def : Pat<(i16 (sra (i16 (trunc i32:$s)), (i32 8))), 1813 (CVT_s8_s32 (BFE_S32rii $s, 8, 8), CvtNONE)>; 1814def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8), 1815 (BFE_S64rii $s, imm:$o, 8)>; 1816def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)), 1817 (CVT_s8_s64 $s, CvtNONE)>; 1818def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)), 1819 (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>; 1820 1821//----------------------------------- 1822// Comparison instructions (setp, set) 1823//----------------------------------- 1824 1825// FIXME: This doesn't cover versions of set and setp that combine with a 1826// 
// boolean predicate, e.g. setp.eq.and.b16.

let hasSideEffects = false in {
  // setp.<cmp>{.ftz}.<type> d, a, b -- compare a and b, writing i1 predicate
  // d.  The ${cmp:base}/${cmp:ftz} modifiers are printed from the CmpMode
  // operand; the patterns are attached separately (see ISET_FORMAT below).
  multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr :
      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
                           " \t$dst, $a, $b;"), []>;
    def ri :
      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
                           " \t$dst, $a, $b;"), []>;
    def ir :
      NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
                           " \t$dst, $a, $b;"), []>;
  }
}

defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;

// f16/bf16 comparisons are gated on the corresponding math-support
// predicates; the x2 (packed pair) forms produce two predicates, $p|$q.
def SETP_f16rr :
  NVPTXInst<(outs Int1Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
            []>, Requires<[useFP16Math]>;

def SETP_f16x2rr :
  NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
            (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
            []>,
            Requires<[useFP16Math]>;
def SETP_bf16rr :
  NVPTXInst<(outs Int1Regs:$dst),
            (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.bf16 \t$dst, $a, $b;",
            []>, Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;

def SETP_bf16x2rr :
  NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
            (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
            "setp${cmp:base}${cmp:ftz}.bf16x2 \t$p|$q, $a, $b;",
            []>,
            Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;


// FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
// reg, either u32, s32, or f32.  Anyway these aren't used at the moment.

let hasSideEffects = false in {
  // set.<cmp>.<type> d, a, b -- compare a and b, writing an i32 result.
  multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rr : NVPTXInst<(outs Int32Regs:$dst),
                       (ins RC:$a, RC:$b, CmpMode:$cmp),
                       !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
    def ri : NVPTXInst<(outs Int32Regs:$dst),
                       (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                       !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
    def ir : NVPTXInst<(outs Int32Regs:$dst),
                       (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                       !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
  }
}

defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
defm SET_f16 : SET<"f16", Int16Regs, f16imm>;
defm SET_bf16 : SET<"bf16", Int16Regs, bf16imm>, Requires<[hasPTX<78>, hasSM<90>]>;
defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
defm SET_f64 : SET<"f64", Float64Regs, f64imm>;

//-----------------------------------
// Data Movement (Load / Store, Move)
//-----------------------------------

let WantsRoot = true in {
  def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex]>;
  def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64",
                               [frameindex]>;
}
def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;

// reg+offset memory operands, printed by printMemOperand.
def MEMri : Operand<i32> {
  let PrintMethod = "printMemOperand";
  let MIOperandInfo = (ops Int32Regs, i32imm);
}
def MEMri64 : Operand<i64> {
  let PrintMethod = "printMemOperand";
  let MIOperandInfo = (ops Int64Regs, i64imm);
}

def imem : Operand<iPTR> {
  let PrintMethod = "printOperand";
}

def imemAny : Operand<pAny> {
  let PrintMethod = "printOperand";
}

def LdStCode : Operand<i32> {
  let PrintMethod = "printLdStCode";
}

def MmaCode : Operand<i32> {
  let PrintMethod = "printMmaCode";
}

def Offseti32imm : Operand<i32> {
  let PrintMethod = "printOffseti32imm";
}

def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;

// Load a memory address into a u32 or u64 register.
def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
                         "mov.u32 \t$dst, $a;",
                         [(set i32:$dst, (Wrapper tglobaladdr:$a))]>;
def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
                           "mov.u64 \t$dst, $a;",
                           [(set i64:$dst, (Wrapper tglobaladdr:$a))]>;

// Get pointer to local stack.
let hasSideEffects = false in {
  def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
                                 "mov.u32 \t$d, __local_depot$num;", []>;
  def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
                                    "mov.u64 \t$d, __local_depot$num;", []>;
}


// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
let IsSimpleMove=1, hasSideEffects=0, isAsCheapAsAMove=1 in {
  // Register-to-register moves.
  def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
                          "mov.pred \t$dst, $sss;", []>;
  def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
                           "mov.u16 \t$dst, $sss;", []>;
  def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
                           "mov.u32 \t$dst, $sss;", []>;
  def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
                           "mov.u64 \t$dst, $sss;", []>;
  def IMOV128rr : NVPTXInst<(outs Int128Regs:$dst), (ins Int128Regs:$sss),
                            "mov.b128 \t$dst, $sss;", []>;

  def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                           "mov.f32 \t$dst, $src;", []>;
  def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
                           "mov.f64 \t$dst, $src;", []>;

  // Immediate moves.
  def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
                          "mov.pred \t$dst, $src;",
                          [(set i1:$dst, imm:$src)]>;
  def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
                           "mov.b16 \t$dst, $src;",
                           [(set i16:$dst, imm:$src)]>;
  def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
                           "mov.b32 \t$dst, $src;",
                           [(set i32:$dst, imm:$src)]>;
  def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
                           "mov.b64 \t$dst, $src;",
                           [(set i64:$dst, imm:$src)]>;

  // f16/bf16 immediates live in Int16Regs; the move is a plain b16.
  def FMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$src),
                           "mov.b16 \t$dst, $src;",
                           [(set f16:$dst, fpimm:$src)]>;
  def BFMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$src),
                            "mov.b16 \t$dst, $src;",
2009 [(set bf16:$dst, fpimm:$src)]>; 2010 def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src), 2011 "mov.f32 \t$dst, $src;", 2012 [(set f32:$dst, fpimm:$src)]>; 2013 def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src), 2014 "mov.f64 \t$dst, $src;", 2015 [(set f64:$dst, fpimm:$src)]>; 2016} 2017 2018def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>; 2019def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>; 2020 2021//---- Copy Frame Index ---- 2022def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr), 2023 "add.u32 \t$dst, ${addr:add};", 2024 [(set i32:$dst, ADDRri:$addr)]>; 2025def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr), 2026 "add.u64 \t$dst, ${addr:add};", 2027 [(set i64:$dst, ADDRri64:$addr)]>; 2028 2029//----------------------------------- 2030// Comparison and Selection 2031//----------------------------------- 2032 2033multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode, 2034 Instruction setp_16rr, 2035 Instruction setp_16ri, 2036 Instruction setp_16ir, 2037 Instruction setp_32rr, 2038 Instruction setp_32ri, 2039 Instruction setp_32ir, 2040 Instruction setp_64rr, 2041 Instruction setp_64ri, 2042 Instruction setp_64ir, 2043 Instruction set_16rr, 2044 Instruction set_16ri, 2045 Instruction set_16ir, 2046 Instruction set_32rr, 2047 Instruction set_32ri, 2048 Instruction set_32ir, 2049 Instruction set_64rr, 2050 Instruction set_64ri, 2051 Instruction set_64ir> { 2052 // i16 -> pred 2053 def : Pat<(i1 (OpNode i16:$a, i16:$b)), 2054 (setp_16rr $a, $b, Mode)>; 2055 def : Pat<(i1 (OpNode i16:$a, imm:$b)), 2056 (setp_16ri $a, imm:$b, Mode)>; 2057 def : Pat<(i1 (OpNode imm:$a, i16:$b)), 2058 (setp_16ir imm:$a, $b, Mode)>; 2059 // i32 -> pred 2060 def : Pat<(i1 (OpNode i32:$a, i32:$b)), 2061 (setp_32rr $a, $b, Mode)>; 2062 def : Pat<(i1 (OpNode i32:$a, imm:$b)), 2063 (setp_32ri $a, imm:$b, Mode)>; 2064 def : Pat<(i1 (OpNode imm:$a, i32:$b)), 2065 
(setp_32ir imm:$a, $b, Mode)>; 2066 // i64 -> pred 2067 def : Pat<(i1 (OpNode i64:$a, i64:$b)), 2068 (setp_64rr $a, $b, Mode)>; 2069 def : Pat<(i1 (OpNode i64:$a, imm:$b)), 2070 (setp_64ri $a, imm:$b, Mode)>; 2071 def : Pat<(i1 (OpNode imm:$a, i64:$b)), 2072 (setp_64ir imm:$a, $b, Mode)>; 2073 2074 // i16 -> i32 2075 def : Pat<(i32 (OpNode i16:$a, i16:$b)), 2076 (set_16rr $a, $b, Mode)>; 2077 def : Pat<(i32 (OpNode i16:$a, imm:$b)), 2078 (set_16ri $a, imm:$b, Mode)>; 2079 def : Pat<(i32 (OpNode imm:$a, i16:$b)), 2080 (set_16ir imm:$a, $b, Mode)>; 2081 // i32 -> i32 2082 def : Pat<(i32 (OpNode i32:$a, i32:$b)), 2083 (set_32rr $a, $b, Mode)>; 2084 def : Pat<(i32 (OpNode i32:$a, imm:$b)), 2085 (set_32ri $a, imm:$b, Mode)>; 2086 def : Pat<(i32 (OpNode imm:$a, i32:$b)), 2087 (set_32ir imm:$a, $b, Mode)>; 2088 // i64 -> i32 2089 def : Pat<(i32 (OpNode i64:$a, Int64Regs:$b)), 2090 (set_64rr $a, $b, Mode)>; 2091 def : Pat<(i32 (OpNode i64:$a, imm:$b)), 2092 (set_64ri $a, imm:$b, Mode)>; 2093 def : Pat<(i32 (OpNode imm:$a, i64:$b)), 2094 (set_64ir imm:$a, $b, Mode)>; 2095} 2096 2097multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode> 2098 : ISET_FORMAT<OpNode, Mode, 2099 SETP_s16rr, SETP_s16ri, SETP_s16ir, 2100 SETP_s32rr, SETP_s32ri, SETP_s32ir, 2101 SETP_s64rr, SETP_s64ri, SETP_s64ir, 2102 SET_s16rr, SET_s16ri, SET_s16ir, 2103 SET_s32rr, SET_s32ri, SET_s32ir, 2104 SET_s64rr, SET_s64ri, SET_s64ir> { 2105 // TableGen doesn't like empty multiclasses. 2106 def : PatLeaf<(i32 0)>; 2107} 2108 2109multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode> 2110 : ISET_FORMAT<OpNode, Mode, 2111 SETP_u16rr, SETP_u16ri, SETP_u16ir, 2112 SETP_u32rr, SETP_u32ri, SETP_u32ir, 2113 SETP_u64rr, SETP_u64ri, SETP_u64ir, 2114 SET_u16rr, SET_u16ri, SET_u16ir, 2115 SET_u32rr, SET_u32ri, SET_u32ir, 2116 SET_u64rr, SET_u64ri, SET_u64ir> { 2117 // TableGen doesn't like empty multiclasses. 
  def : PatLeaf<(i32 0)>;
}

defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;

// comparisons of i8 extracted with BFE as i32
// It's faster to do comparison directly on i32 extracted by BFE,
// instead of the long conversion and sign extending.
// Signed variants: the sign-extended byte is re-extracted with bfe.s32 and
// compared with the signed predicates.
def: Pat<(setgt (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
                (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
         (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpGT)>;
def: Pat<(setgt (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
                (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
         (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpGT)>;
def: Pat<(setge (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
                (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
         (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpGE)>;
def: Pat<(setge (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
                (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
         (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpGE)>;
def: Pat<(setlt (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
                (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
         (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpLT)>;
def: Pat<(setlt (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
                (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
         (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpLT)>;
def: Pat<(setle (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
                (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
         (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpLE)>;
def: Pat<(setle (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
                (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
         (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpLE)>;

// Unsigned variants: the byte was masked with 255, so re-extract with
// bfe.u32 and compare with the unsigned predicates (HI/HS/LO/LS).
def: Pat<(setugt (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
                 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
         (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpHI)>;
def: Pat<(setugt (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
                 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
         (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpHI)>;
def: Pat<(setuge (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
                 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
         (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpHS)>;
def: Pat<(setuge (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
                 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
         (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpHS)>;
def: Pat<(setult (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
                 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
         (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpLO)>;
def: Pat<(setult (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
                 (i16 (and (trunc (bfe Int32Regs:$b,
imm:$ob, 8)), 255))), 2179 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpLO)>; 2180def: Pat<(setule (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)), 2181 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))), 2182 (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpLS)>; 2183def: Pat<(setule (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), 2184 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))), 2185 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpLS)>; 2186def: Pat<(seteq (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)), 2187 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))), 2188 (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpEQ)>; 2189def: Pat<(seteq (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), 2190 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))), 2191 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpEQ)>; 2192def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)), 2193 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))), 2194 (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpNE)>; 2195def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)), 2196 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))), 2197 (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpNE)>; 2198 2199// i1 compare -> i32 2200def : Pat<(i32 (setne i1:$a, i1:$b)), 2201 (SELP_u32ii -1, 0, (XORb1rr $a, $b))>; 2202def : Pat<(i32 (setne i1:$a, i1:$b)), 2203 (SELP_u32ii 0, -1, (XORb1rr $a, $b))>; 2204 2205 2206 2207multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> { 2208 // f16 -> pred 2209 def : Pat<(i1 (OpNode f16:$a, f16:$b)), 2210 (SETP_f16rr $a, $b, ModeFTZ)>, 2211 Requires<[useFP16Math,doF32FTZ]>; 2212 def : Pat<(i1 (OpNode f16:$a, f16:$b)), 2213 (SETP_f16rr $a, $b, Mode)>, 2214 
Requires<[useFP16Math]>; 2215 2216 // bf16 -> pred 2217 def : Pat<(i1 (OpNode bf16:$a, bf16:$b)), 2218 (SETP_bf16rr $a, $b, ModeFTZ)>, 2219 Requires<[hasBF16Math,doF32FTZ]>; 2220 def : Pat<(i1 (OpNode bf16:$a, bf16:$b)), 2221 (SETP_bf16rr $a, $b, Mode)>, 2222 Requires<[hasBF16Math]>; 2223 2224 // f32 -> pred 2225 def : Pat<(i1 (OpNode f32:$a, f32:$b)), 2226 (SETP_f32rr $a, $b, ModeFTZ)>, 2227 Requires<[doF32FTZ]>; 2228 def : Pat<(i1 (OpNode f32:$a, f32:$b)), 2229 (SETP_f32rr $a, $b, Mode)>; 2230 def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)), 2231 (SETP_f32ri $a, fpimm:$b, ModeFTZ)>, 2232 Requires<[doF32FTZ]>; 2233 def : Pat<(i1 (OpNode f32:$a, fpimm:$b)), 2234 (SETP_f32ri $a, fpimm:$b, Mode)>; 2235 def : Pat<(i1 (OpNode fpimm:$a, f32:$b)), 2236 (SETP_f32ir fpimm:$a, $b, ModeFTZ)>, 2237 Requires<[doF32FTZ]>; 2238 def : Pat<(i1 (OpNode fpimm:$a, f32:$b)), 2239 (SETP_f32ir fpimm:$a, $b, Mode)>; 2240 2241 // f64 -> pred 2242 def : Pat<(i1 (OpNode f64:$a, f64:$b)), 2243 (SETP_f64rr $a, $b, Mode)>; 2244 def : Pat<(i1 (OpNode f64:$a, fpimm:$b)), 2245 (SETP_f64ri $a, fpimm:$b, Mode)>; 2246 def : Pat<(i1 (OpNode fpimm:$a, f64:$b)), 2247 (SETP_f64ir fpimm:$a, $b, Mode)>; 2248 2249 // f16 -> i32 2250 def : Pat<(i32 (OpNode f16:$a, f16:$b)), 2251 (SET_f16rr $a, $b, ModeFTZ)>, 2252 Requires<[useFP16Math, doF32FTZ]>; 2253 def : Pat<(i32 (OpNode f16:$a, f16:$b)), 2254 (SET_f16rr $a, $b, Mode)>, 2255 Requires<[useFP16Math]>; 2256 2257 // bf16 -> i32 2258 def : Pat<(i32 (OpNode bf16:$a, bf16:$b)), 2259 (SET_bf16rr $a, $b, ModeFTZ)>, 2260 Requires<[hasBF16Math, doF32FTZ]>; 2261 def : Pat<(i32 (OpNode bf16:$a, bf16:$b)), 2262 (SET_bf16rr $a, $b, Mode)>, 2263 Requires<[hasBF16Math]>; 2264 2265 // f32 -> i32 2266 def : Pat<(i32 (OpNode f32:$a, f32:$b)), 2267 (SET_f32rr $a, $b, ModeFTZ)>, 2268 Requires<[doF32FTZ]>; 2269 def : Pat<(i32 (OpNode f32:$a, f32:$b)), 2270 (SET_f32rr $a, $b, Mode)>; 2271 def : Pat<(i32 (OpNode f32:$a, fpimm:$b)), 2272 (SET_f32ri $a, fpimm:$b, ModeFTZ)>, 2273 
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode f32:$a, fpimm:$b)),
            (SET_f32ri $a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, f32:$b)),
            (SET_f32ir fpimm:$a, $b, ModeFTZ)>,
        Requires<[doF32FTZ]>;
  def : Pat<(i32 (OpNode fpimm:$a, f32:$b)),
            (SET_f32ir fpimm:$a, $b, Mode)>;

  // f64 -> i32
  def : Pat<(i32 (OpNode f64:$a, f64:$b)),
            (SET_f64rr $a, $b, Mode)>;
  def : Pat<(i32 (OpNode f64:$a, fpimm:$b)),
            (SET_f64ri $a, fpimm:$b, Mode)>;
  def : Pat<(i32 (OpNode fpimm:$a, f64:$b)),
            (SET_f64ir fpimm:$a, $b, Mode)>;
}

// Ordered comparisons.
defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;

// Unordered comparisons.
defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;

// "Don't care about ordering" comparisons map to the ordered modes.
defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;

defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;

// Type profiles for the NVPTX pseudo-nodes used to lower calls and
// parameter passing.
def SDTDeclareParamProfile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTDeclareScalarParamProfile :
  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
def SDTCallValProfile : SDTypeProfile<1, 0, []>;
def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
def SDTProxyRegProfile : SDTypeProfile<1, 1, []>;

def DeclareParam :
  SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareScalarParam :
  SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRetParam :
  SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRet :
  SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def LoadParam :
  SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV2 :
  SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def LoadParamV4 :
  SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
def PrintCall :
  SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCall :
  SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintCallUni :
  SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def PrintConvergentCallUni :
  SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParam :
  SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV2 :
  SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamV4 :
  SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamU32 :
  SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def StoreParamS32 :
  SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArgBegin :
  SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArg :
  SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
         [SDNPHasChain, SDNPOutGlue,
          SDNPInGlue, SDNPSideEffect]>;
def LastCallArg :
  SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallArgEnd :
  SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVoid :
  SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def Prototype :
  SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def CallVal :
  SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def MoveParam :
  SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
def StoreRetval :
  SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
         [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV2 :
  SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
         [SDNPHasChain, SDNPSideEffect]>;
def StoreRetvalV4 :
  SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
         [SDNPHasChain, SDNPSideEffect]>;
def PseudoUseParam :
  SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def RETURNNode :
  SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
         [SDNPHasChain, SDNPSideEffect]>;
def ProxyReg :
  SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

// Templates for instructions that read a call's return value out of the
// retval0 param space ("ld.param[.vN]<opstr> dst..., [retval0+offset];").
let mayLoad = true in {
  class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
        NVPTXInst<(outs regclass:$dst), (ins Offseti32imm:$b),
                  !strconcat("ld.param", opstr, " \t$dst, [retval0$b];"),
                  []>;

  class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
        NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins Offseti32imm:$b),
                  !strconcat("ld.param.v2", opstr,
                             " \t{{$dst, $dst2}}, [retval0$b];"), []>;

  class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
        NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
                        regclass:$dst4),
                  (ins Offseti32imm:$b),
                  !strconcat("ld.param.v4", opstr,
                             " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0$b];"),
                  []>;
}

class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
                !strconcat("mov", opstr, " \t$dst, retval$b;"),
                [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;

// Templates for storing outgoing arguments into the param space.  The
// foreach/!if machinery emits one def per register/immediate operand
// combination, suffixed with an "r"/"i" letter per value operand.
let mayStore = true in {

  multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr, bit support_imm = true> {
    foreach op = [IMMType, regclass] in
      if !or(support_imm, !isa<NVPTXRegClass>(op)) then
        def _ # !if(!isa<NVPTXRegClass>(op), "r", "i")
          : NVPTXInst<(outs),
                      (ins op:$val, i32imm:$a, Offseti32imm:$b),
                      "st.param" # opstr # " \t[param$a$b], $val;",
                      []>;
  }

  multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
    foreach op1 = [IMMType, regclass] in
      foreach op2 = [IMMType, regclass] in
        def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
              # !if(!isa<NVPTXRegClass>(op2), "r", "i")
          : NVPTXInst<(outs),
                      (ins op1:$val1, op2:$val2,
                           i32imm:$a, Offseti32imm:$b),
                      "st.param.v2" # opstr # " \t[param$a$b], {{$val1, $val2}};",
                      []>;
  }

  multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
    foreach op1 = [IMMType, regclass] in
      foreach op2 = [IMMType, regclass] in
        foreach op3 = [IMMType, regclass] in
          foreach op4 = [IMMType, regclass] in
            def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
                  # !if(!isa<NVPTXRegClass>(op2), "r", "i")
                  # !if(!isa<NVPTXRegClass>(op3), "r", "i")
                  # !if(!isa<NVPTXRegClass>(op4), "r", "i")

              : NVPTXInst<(outs),
                          (ins op1:$val1,
op2:$val2, op3:$val3, op4:$val4, 2490 i32imm:$a, Offseti32imm:$b), 2491 "st.param.v4" # opstr # 2492 " \t[param$a$b], {{$val1, $val2, $val3, $val4}};", 2493 []>; 2494 } 2495 2496 class StoreRetvalInst<NVPTXRegClass regclass, string opstr> : 2497 NVPTXInst<(outs), (ins regclass:$val, Offseti32imm:$a), 2498 !strconcat("st.param", opstr, " \t[func_retval0$a], $val;"), 2499 []>; 2500 2501 class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> : 2502 NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, Offseti32imm:$a), 2503 !strconcat("st.param.v2", opstr, 2504 " \t[func_retval0$a], {{$val, $val2}};"), 2505 []>; 2506 2507 class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> : 2508 NVPTXInst<(outs), 2509 (ins regclass:$val, regclass:$val2, regclass:$val3, 2510 regclass:$val4, Offseti32imm:$a), 2511 !strconcat("st.param.v4", opstr, 2512 " \t[func_retval0$a], {{$val, $val2, $val3, $val4}};"), 2513 []>; 2514} 2515 2516let isCall=1 in { 2517 multiclass CALL<string OpcStr, SDNode OpNode> { 2518 def PrintCallNoRetInst : NVPTXInst<(outs), (ins), 2519 !strconcat(OpcStr, " "), [(OpNode (i32 0))]>; 2520 def PrintCallRetInst1 : NVPTXInst<(outs), (ins), 2521 !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>; 2522 def PrintCallRetInst2 : NVPTXInst<(outs), (ins), 2523 !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>; 2524 def PrintCallRetInst3 : NVPTXInst<(outs), (ins), 2525 !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>; 2526 def PrintCallRetInst4 : NVPTXInst<(outs), (ins), 2527 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "), 2528 [(OpNode (i32 4))]>; 2529 def PrintCallRetInst5 : NVPTXInst<(outs), (ins), 2530 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "), 2531 [(OpNode (i32 5))]>; 2532 def PrintCallRetInst6 : NVPTXInst<(outs), (ins), 2533 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " 2534 "retval5), "), 2535 [(OpNode (i32 6))]>; 2536 def PrintCallRetInst7 
: NVPTXInst<(outs), (ins), 2537 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " 2538 "retval5, retval6), "), 2539 [(OpNode (i32 7))]>; 2540 def PrintCallRetInst8 : NVPTXInst<(outs), (ins), 2541 !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, " 2542 "retval5, retval6, retval7), "), 2543 [(OpNode (i32 8))]>; 2544 } 2545} 2546 2547defm Call : CALL<"call", PrintCall>; 2548defm CallUni : CALL<"call.uni", PrintCallUni>; 2549 2550// Convergent call instructions. These are identical to regular calls, except 2551// they have the isConvergent bit set. 2552let isConvergent=1 in { 2553 defm ConvergentCall : CALL<"call", PrintConvergentCall>; 2554 defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>; 2555} 2556 2557def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">; 2558def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">; 2559def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">; 2560def LoadParamMemI8 : LoadParamMemInst<Int16Regs, ".b8">; 2561def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">; 2562def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">; 2563def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">; 2564def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">; 2565def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">; 2566def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">; 2567def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">; 2568def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">; 2569def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">; 2570def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">; 2571def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">; 2572def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">; 2573 2574defm StoreParamI64 : StoreParamInst<Int64Regs, i64imm, ".b64">; 2575defm StoreParamI32 : StoreParamInst<Int32Regs, i32imm, ".b32">; 2576defm StoreParamI16 : 
StoreParamInst<Int16Regs, i16imm, ".b16">; 2577defm StoreParamI8 : StoreParamInst<Int16Regs, i8imm, ".b8">; 2578 2579defm StoreParamI8TruncI32 : StoreParamInst<Int32Regs, i8imm, ".b8", /* support_imm */ false>; 2580defm StoreParamI8TruncI64 : StoreParamInst<Int64Regs, i8imm, ".b8", /* support_imm */ false>; 2581 2582defm StoreParamV2I64 : StoreParamV2Inst<Int64Regs, i64imm, ".b64">; 2583defm StoreParamV2I32 : StoreParamV2Inst<Int32Regs, i32imm, ".b32">; 2584defm StoreParamV2I16 : StoreParamV2Inst<Int16Regs, i16imm, ".b16">; 2585defm StoreParamV2I8 : StoreParamV2Inst<Int16Regs, i8imm, ".b8">; 2586 2587defm StoreParamV4I32 : StoreParamV4Inst<Int32Regs, i32imm, ".b32">; 2588defm StoreParamV4I16 : StoreParamV4Inst<Int16Regs, i16imm, ".b16">; 2589defm StoreParamV4I8 : StoreParamV4Inst<Int16Regs, i8imm, ".b8">; 2590 2591defm StoreParamF32 : StoreParamInst<Float32Regs, f32imm, ".f32">; 2592defm StoreParamF64 : StoreParamInst<Float64Regs, f64imm, ".f64">; 2593 2594defm StoreParamV2F32 : StoreParamV2Inst<Float32Regs, f32imm, ".f32">; 2595defm StoreParamV2F64 : StoreParamV2Inst<Float64Regs, f64imm, ".f64">; 2596 2597defm StoreParamV4F32 : StoreParamV4Inst<Float32Regs, f32imm, ".f32">; 2598 2599def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">; 2600def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">; 2601def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">; 2602def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">; 2603def StoreRetvalI8TruncI32 : StoreRetvalInst<Int32Regs, ".b8">; 2604def StoreRetvalI8TruncI64 : StoreRetvalInst<Int64Regs, ".b8">; 2605def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">; 2606def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">; 2607def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">; 2608def StoreRetvalV2I8 : StoreRetvalV2Inst<Int16Regs, ".b8">; 2609def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">; 2610def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">; 2611def StoreRetvalV4I8 : 
StoreRetvalV4Inst<Int16Regs, ".b8">; 2612 2613def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">; 2614def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">; 2615def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">; 2616def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">; 2617def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">; 2618 2619def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>; 2620def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>; 2621def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>; 2622def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>; 2623 2624class CallArgInst<NVPTXRegClass regclass> : 2625 NVPTXInst<(outs), (ins regclass:$a), "$a, ", 2626 [(CallArg (i32 0), regclass:$a)]>; 2627 2628class CallArgInstVT<NVPTXRegClass regclass, ValueType vt> : 2629 NVPTXInst<(outs), (ins regclass:$a), "$a, ", 2630 [(CallArg (i32 0), vt:$a)]>; 2631 2632class LastCallArgInst<NVPTXRegClass regclass> : 2633 NVPTXInst<(outs), (ins regclass:$a), "$a", 2634 [(LastCallArg (i32 0), regclass:$a)]>; 2635class LastCallArgInstVT<NVPTXRegClass regclass, ValueType vt> : 2636 NVPTXInst<(outs), (ins regclass:$a), "$a", 2637 [(LastCallArg (i32 0), vt:$a)]>; 2638 2639def CallArgI64 : CallArgInst<Int64Regs>; 2640def CallArgI32 : CallArgInstVT<Int32Regs, i32>; 2641def CallArgI16 : CallArgInstVT<Int16Regs, i16>; 2642def CallArgF64 : CallArgInst<Float64Regs>; 2643def CallArgF32 : CallArgInst<Float32Regs>; 2644 2645def LastCallArgI64 : LastCallArgInst<Int64Regs>; 2646def LastCallArgI32 : LastCallArgInstVT<Int32Regs, i32>; 2647def LastCallArgI16 : LastCallArgInstVT<Int16Regs, i16>; 2648def LastCallArgF64 : LastCallArgInst<Float64Regs>; 2649def LastCallArgF32 : LastCallArgInst<Float32Regs>; 2650 2651def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ", 2652 [(CallArg (i32 0), (i32 imm:$a))]>; 2653def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), 
"$a", 2654 [(LastCallArg (i32 0), (i32 imm:$a))]>; 2655 2656def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ", 2657 [(CallArg (i32 1), (i32 imm:$a))]>; 2658def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a", 2659 [(LastCallArg (i32 1), (i32 imm:$a))]>; 2660 2661def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ", 2662 [(CallVoid (Wrapper tglobaladdr:$addr))]>; 2663def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ", 2664 [(CallVoid i32:$addr)]>; 2665def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ", 2666 [(CallVoid i64:$addr)]>; 2667def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;", 2668 [(Prototype (i32 imm:$val))]>; 2669 2670def DeclareRetMemInst : 2671 NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num), 2672 ".param .align $align .b8 retval$num[$size];", 2673 [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>; 2674def DeclareRetScalarInst : 2675 NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), 2676 ".param .b$size retval$num;", 2677 [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>; 2678def DeclareRetRegInst : 2679 NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num), 2680 ".reg .b$size retval$num;", 2681 [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>; 2682 2683def DeclareParamInst : 2684 NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size), 2685 ".param .align $align .b8 param$a[$size];", 2686 [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>; 2687def DeclareScalarParamInst : 2688 NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), 2689 ".param .b$size param$a;", 2690 [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>; 2691def DeclareScalarRegInst : 2692 NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size), 2693 ".reg .b$size param$a;", 2694 [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>; 2695 2696class MoveParamInst<ValueType T, NVPTXRegClass 
regclass, string asmstr> : 2697 NVPTXInst<(outs regclass:$dst), (ins regclass:$src), 2698 !strconcat("mov", asmstr, " \t$dst, $src;"), 2699 [(set T:$dst, (MoveParam T:$src))]>; 2700 2701class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt, 2702 string asmstr> : 2703 NVPTXInst<(outs regclass:$dst), (ins srcty:$src), 2704 !strconcat("mov", asmstr, " \t$dst, $src;"), 2705 [(set vt:$dst, (MoveParam texternalsym:$src))]>; 2706 2707def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">; 2708def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">; 2709 2710def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">; 2711def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">; 2712 2713def MoveParamI16 : 2714 NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), 2715 "cvt.u16.u32 \t$dst, $src;", // ??? Why cvt.u16.u32 ? 2716 [(set i16:$dst, (MoveParam i16:$src))]>; 2717def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">; 2718def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">; 2719 2720class PseudoUseParamInst<NVPTXRegClass regclass, ValueType vt> : 2721 NVPTXInst<(outs), (ins regclass:$src), 2722 "// Pseudo use of $src", 2723 [(PseudoUseParam vt:$src)]>; 2724 2725def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs, i64>; 2726def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs, i32>; 2727def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs, i16>; 2728def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>; 2729def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>; 2730 2731class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass> : 2732 NVPTXInst<(outs regclass:$dst), (ins regclass:$src), 2733 !strconcat("mov.", SzStr, " \t$dst, $src;"), 2734 [(set T:$dst, (ProxyReg T:$src))]>; 2735 2736def ProxyRegI1 : ProxyRegInst<"pred", i1, Int1Regs>; 2737def ProxyRegI16 : ProxyRegInst<"b16", i16, Int16Regs>; 2738def ProxyRegI32 : ProxyRegInst<"b32", i32, Int32Regs>; 2739def 
ProxyRegI64 : ProxyRegInst<"b64", i64, Int64Regs>; 2740def ProxyRegF32 : ProxyRegInst<"f32", f32, Float32Regs>; 2741def ProxyRegF64 : ProxyRegInst<"f64", f64, Float64Regs>; 2742 2743foreach vt = [f16, bf16] in { 2744 def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 $src)>; 2745} 2746 2747foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { 2748 def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 $src)>; 2749} 2750 2751// 2752// Load / Store Handling 2753// 2754multiclass LD<NVPTXRegClass regclass> { 2755 def _avar : NVPTXInst< 2756 (outs regclass:$dst), 2757 (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2758 i32imm:$fromWidth, imem:$addr), 2759 "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2760 "\t$dst, [$addr];", []>; 2761 def _areg : NVPTXInst< 2762 (outs regclass:$dst), 2763 (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2764 i32imm:$fromWidth, Int32Regs:$addr), 2765 "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2766 "\t$dst, [$addr];", []>; 2767 def _areg_64 : NVPTXInst< 2768 (outs regclass:$dst), 2769 (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2770 i32imm:$fromWidth, Int64Regs:$addr), 2771 "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2772 "\t$dst, [$addr];", []>; 2773 def _ari : NVPTXInst< 2774 (outs regclass:$dst), 2775 (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, 2776 i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset), 2777 "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2778 "\t$dst, [$addr$offset];", []>; 2779 def _ari_64 : NVPTXInst< 2780 (outs regclass:$dst), 2781 (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, 2782 LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset), 2783 
"ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2784 "\t$dst, [$addr$offset];", []>; 2785 def _asi : NVPTXInst< 2786 (outs regclass:$dst), 2787 (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, 2788 LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset), 2789 "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " 2790 "\t$dst, [$addr$offset];", []>; 2791} 2792 2793let mayLoad=1, hasSideEffects=0 in { 2794 defm LD_i8 : LD<Int16Regs>; 2795 defm LD_i16 : LD<Int16Regs>; 2796 defm LD_i32 : LD<Int32Regs>; 2797 defm LD_i64 : LD<Int64Regs>; 2798 defm LD_f32 : LD<Float32Regs>; 2799 defm LD_f64 : LD<Float64Regs>; 2800} 2801 2802multiclass ST<NVPTXRegClass regclass> { 2803 def _avar : NVPTXInst< 2804 (outs), 2805 (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, 2806 LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr), 2807 "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2808 " \t[$addr], $src;", []>; 2809 def _areg : NVPTXInst< 2810 (outs), 2811 (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, 2812 LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), 2813 "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2814 " \t[$addr], $src;", []>; 2815 def _areg_64 : NVPTXInst< 2816 (outs), 2817 (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, 2818 LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), 2819 "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2820 " \t[$addr], $src;", []>; 2821 def _ari : NVPTXInst< 2822 (outs), 2823 (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, 2824 LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, 2825 Offseti32imm:$offset), 2826 "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2827 " \t[$addr$offset], $src;", []>; 2828 def _ari_64 : NVPTXInst< 2829 
(outs), 2830 (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, 2831 LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, 2832 Offseti32imm:$offset), 2833 "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2834 " \t[$addr$offset], $src;", []>; 2835 def _asi : NVPTXInst< 2836 (outs), 2837 (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, 2838 LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr, 2839 Offseti32imm:$offset), 2840 "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" 2841 " \t[$addr$offset], $src;", []>; 2842} 2843 2844let mayStore=1, hasSideEffects=0 in { 2845 defm ST_i8 : ST<Int16Regs>; 2846 defm ST_i16 : ST<Int16Regs>; 2847 defm ST_i32 : ST<Int32Regs>; 2848 defm ST_i64 : ST<Int64Regs>; 2849 defm ST_f32 : ST<Float32Regs>; 2850 defm ST_f64 : ST<Float64Regs>; 2851} 2852 2853// The following is used only in and after vector elementizations. Vector 2854// elementization happens at the machine instruction level, so the following 2855// instructions never appear in the DAG. 
// Vector (v2/v4) loads; addressing-mode suffixes match the scalar LD
// multiclass (_avar/_areg/_areg_64/_ari/_ari_64/_asi). Selected only after
// vector elementization at the MI level, never from the DAG.
multiclass LD_VEC<NVPTXRegClass regclass> {
  def _v2_avar : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr];", []>;
  def _v2_areg : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr];", []>;
  def _v2_areg_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr];", []>;
  def _v2_ari : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr$offset];", []>;
  def _v2_ari_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr$offset];", []>;
  def _v2_asi : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2}}, [$addr$offset];", []>;
  def _v4_avar : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
  def _v4_areg : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
  def _v4_areg_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
  def _v4_ari : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr$offset];", []>;
  def _v4_ari_64 : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr$offset];", []>;
  def _v4_asi : NVPTXInst<
    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset),
    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
  defm LDV_i8  : LD_VEC<Int16Regs>;
  defm LDV_i16 : LD_VEC<Int16Regs>;
  defm LDV_i32 : LD_VEC<Int32Regs>;
  defm LDV_i64 : LD_VEC<Int64Regs>;
  defm LDV_f32 : LD_VEC<Float32Regs>;
  defm LDV_f64 : LD_VEC<Float64Regs>;
}

// Vector (v2/v4) stores; mirror of LD_VEC. (The width operand is spelled
// $fromWidth here even though it is a store width, matching LD_VEC.)
multiclass ST_VEC<NVPTXRegClass regclass> {
  def _v2_avar : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
         LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
         imem:$addr),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2}};", []>;
  def _v2_areg : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
         LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
         Int32Regs:$addr),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2}};", []>;
  def _v2_areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
         LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
         Int64Regs:$addr),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2}};", []>;
  def _v2_ari : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
         LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
         Int32Regs:$addr, Offseti32imm:$offset),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr$offset], {{$src1, $src2}};", []>;
  def _v2_ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
         LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
         Int64Regs:$addr, Offseti32imm:$offset),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr$offset], {{$src1, $src2}};", []>;
  def _v2_asi : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
         LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
         imem:$addr, Offseti32imm:$offset),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr$offset], {{$src1, $src2}};", []>;
  def _v4_avar : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
  def _v4_areg : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
  def _v4_areg_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
  def _v4_ari : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr$offset], {{$src1, $src2, $src3, $src4}};", []>;
  def _v4_ari_64 : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
    "\t[$addr$offset], {{$src1, $src2, $src3, $src4}};", []>;
  def _v4_asi : NVPTXInst<
    (outs),
    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
         LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset),
    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}"
    "$fromWidth \t[$addr$offset], {{$src1, $src2, $src3, $src4}};", []>;
}

let mayStore=1, hasSideEffects=0 in {
  defm STV_i8  : ST_VEC<Int16Regs>;
  defm STV_i16 : ST_VEC<Int16Regs>;
  defm STV_i32 : ST_VEC<Int32Regs>;
  defm STV_i64 : ST_VEC<Int64Regs>;
  defm STV_f32 : ST_VEC<Float32Regs>;
  defm STV_f64 : ST_VEC<Float64Regs>;
}

//---- Conversion ----

// Same-width int<->float bitcast via "mov.bN". Register classes default from
// the value types via ValueToRegClass.
class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut,
                   NVPTXRegClass regclassIn = ValueToRegClass<TIn>.ret,
                   NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret> :
           NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
           !strconcat("mov.b", SzStr, " \t$d, $a;"),
           [(set TOut:$d, (bitconvert TIn:$a))]>;

def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>;
def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>;
def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>;
def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;

// 32-bit packed types <-> f32 bitcasts reuse the 32-bit mov instructions.
foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
def: Pat<(vt (bitconvert (f32 Float32Regs:$a))),
         (BITCONVERT_32_F2I $a)>;
def: Pat<(f32 (bitconvert vt:$a)),
         (BITCONVERT_32_I2F $a)>;
}
// 16-bit FP types share Int16Regs, so a bitcast is a no-op re-tag.
foreach vt = [f16, bf16] in {
  def: Pat<(vt (bitconvert i16:$a)),
           (vt Int16Regs:$a)>;
  def: Pat<(i16 (bitconvert vt:$a)),
           (i16 Int16Regs:$a)>;
}

// Likewise all 32-bit types sharing Int32Regs bitcast to each other for free.
foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in {
  foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in {
    if !ne(ta, tb) then {
      def: Pat<(ta (bitconvert tb:$a)),
               (ta Int32Regs:$a)>;
    }
  }
}

// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
// we cannot specify floating-point literals in isel patterns. Therefore, we
// use an integer selp to select either 1 (or -1 in case of signed) or 0
// and then cvt to floating-point.
// int -> fp conversions. i1 sources go through an integer selp (see NOTE
// above) and then a cvt with round-to-nearest (CvtRN).

// sint -> f16
def : Pat<(f16 (sint_to_fp i1:$a)),
          (CVT_f16_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
          (CVT_f16_s16 $a, CvtRN)>;
def : Pat<(f16 (sint_to_fp i32:$a)),
          (CVT_f16_s32 $a, CvtRN)>;
def : Pat<(f16 (sint_to_fp i64:$a)),
          (CVT_f16_s64 $a, CvtRN)>;

// uint -> f16
def : Pat<(f16 (uint_to_fp i1:$a)),
          (CVT_f16_u32 (SELP_u32ii 1, 0, $a), CvtRN)>;
def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
          (CVT_f16_u16 $a, CvtRN)>;
def : Pat<(f16 (uint_to_fp i32:$a)),
          (CVT_f16_u32 $a, CvtRN)>;
def : Pat<(f16 (uint_to_fp i64:$a)),
          (CVT_f16_u64 $a, CvtRN)>;

// sint -> bf16
// NOTE(review): unlike every other signed-i1 pattern in this section (which
// selects -1 via SELP_s32ii), this selects 1 via SELP_u32ii, so
// sint_to_fp(i1 true) yields +1.0 for bf16 — confirm whether this is
// intentional or should match the f16/f32/f64 patterns.
def : Pat<(bf16 (sint_to_fp i1:$a)),
          (CVT_bf16_s32 (SELP_u32ii 1, 0, $a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
def : Pat<(bf16 (sint_to_fp i16:$a)),
          (CVT_bf16_s16 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
def : Pat<(bf16 (sint_to_fp i32:$a)),
          (CVT_bf16_s32 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
def : Pat<(bf16 (sint_to_fp i64:$a)),
          (CVT_bf16_s64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;

// uint -> bf16
def : Pat<(bf16 (uint_to_fp i1:$a)),
          (CVT_bf16_u32 (SELP_u32ii 1, 0, $a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
def : Pat<(bf16 (uint_to_fp i16:$a)),
          (CVT_bf16_u16 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
def : Pat<(bf16 (uint_to_fp i32:$a)),
          (CVT_bf16_u32 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
def : Pat<(bf16 (uint_to_fp i64:$a)),
          (CVT_bf16_u64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;

// sint -> f32
def : Pat<(f32 (sint_to_fp i1:$a)),
          (CVT_f32_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
def : Pat<(f32 (sint_to_fp i16:$a)),
          (CVT_f32_s16 $a, CvtRN)>;
def : Pat<(f32 (sint_to_fp i32:$a)),
          (CVT_f32_s32 $a, CvtRN)>;
def : Pat<(f32 (sint_to_fp i64:$a)),
          (CVT_f32_s64 $a, CvtRN)>;

// uint -> f32
def : Pat<(f32 (uint_to_fp i1:$a)),
          (CVT_f32_u32 (SELP_u32ii 1, 0, $a), CvtRN)>;
def : Pat<(f32 (uint_to_fp i16:$a)),
          (CVT_f32_u16 $a, CvtRN)>;
def : Pat<(f32 (uint_to_fp i32:$a)),
          (CVT_f32_u32 $a, CvtRN)>;
def : Pat<(f32 (uint_to_fp i64:$a)),
          (CVT_f32_u64 $a, CvtRN)>;

// sint -> f64
def : Pat<(f64 (sint_to_fp i1:$a)),
          (CVT_f64_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
def : Pat<(f64 (sint_to_fp i16:$a)),
          (CVT_f64_s16 $a, CvtRN)>;
def : Pat<(f64 (sint_to_fp i32:$a)),
          (CVT_f64_s32 $a, CvtRN)>;
def : Pat<(f64 (sint_to_fp i64:$a)),
          (CVT_f64_s64 $a, CvtRN)>;

// uint -> f64
def : Pat<(f64 (uint_to_fp i1:$a)),
          (CVT_f64_u32 (SELP_u32ii 1, 0, $a), CvtRN)>;
def : Pat<(f64 (uint_to_fp i16:$a)),
          (CVT_f64_u16 $a, CvtRN)>;
def : Pat<(f64 (uint_to_fp i32:$a)),
          (CVT_f64_u32 $a, CvtRN)>;
def : Pat<(f64 (uint_to_fp i64:$a)),
          (CVT_f64_u64 $a, CvtRN)>;


// fp -> int conversions truncate toward zero (CvtRZI); fp -> i1 is lowered
// as a comparison against zero. The FTZ (flush-denormals) variants are
// preferred when doF32FTZ is in effect; the non-FTZ pattern follows as the
// fallback.

// f16 -> sint
def : Pat<(i1 (fp_to_sint f16:$a)),
          (SETP_b16ri $a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint f16:$a)),
          (CVT_s16_f16 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint f16:$a)),
          (CVT_s32_f16 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint f16:$a)),
          (CVT_s64_f16 $a, CvtRZI)>;

// f16 -> uint
def : Pat<(i1 (fp_to_uint f16:$a)),
          (SETP_b16ri $a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint f16:$a)),
          (CVT_u16_f16 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint f16:$a)),
          (CVT_u32_f16 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint f16:$a)),
          (CVT_u64_f16 $a, CvtRZI)>;

// bf16 -> sint
def : Pat<(i1 (fp_to_sint bf16:$a)),
          (SETP_b16ri $a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint bf16:$a)),
          (CVT_s16_bf16 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint bf16:$a)),
          (CVT_s32_bf16 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint bf16:$a)),
          (CVT_s64_bf16 $a, CvtRZI)>;

// bf16 -> uint
def : Pat<(i1 (fp_to_uint bf16:$a)),
          (SETP_b16ri $a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint bf16:$a)),
          (CVT_u16_bf16 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint bf16:$a)),
          (CVT_u32_bf16 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint bf16:$a)),
          (CVT_u64_bf16 $a, CvtRZI)>;
// f32 -> sint
def : Pat<(i1 (fp_to_sint f32:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint f32:$a)),
          (CVT_s16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_sint f32:$a)),
          (CVT_s16_f32 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint f32:$a)),
          (CVT_s32_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_sint f32:$a)),
          (CVT_s32_f32 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint f32:$a)),
          (CVT_s64_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_sint f32:$a)),
          (CVT_s64_f32 $a, CvtRZI)>;

// f32 -> uint
def : Pat<(i1 (fp_to_uint f32:$a)),
          (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint f32:$a)),
          (CVT_u16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_uint f32:$a)),
          (CVT_u16_f32 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint f32:$a)),
          (CVT_u32_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i32 (fp_to_uint f32:$a)),
          (CVT_u32_f32 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint f32:$a)),
          (CVT_u64_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i64 (fp_to_uint f32:$a)),
          (CVT_u64_f32 $a, CvtRZI)>;

// f64 -> sint
def : Pat<(i1 (fp_to_sint f64:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint f64:$a)),
          (CVT_s16_f64 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint f64:$a)),
          (CVT_s32_f64 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_sint f64:$a)),
          (CVT_s64_f64 $a, CvtRZI)>;

// f64 -> uint
def : Pat<(i1 (fp_to_uint f64:$a)),
          (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint f64:$a)),
          (CVT_u16_f64 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint f64:$a)),
          (CVT_u32_f64 $a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint f64:$a)),
          (CVT_u64_f64 $a, CvtRZI)>;

// Integer extensions from i1 use selp (predicate regs can't be cvt sources);
// anyext picks -1 since any value is acceptable.

// sext i1
def : Pat<(i16 (sext i1:$a)),
          (SELP_s16ii -1, 0, $a)>;
def : Pat<(i32 (sext i1:$a)),
          (SELP_s32ii -1, 0, $a)>;
def : Pat<(i64 (sext i1:$a)),
          (SELP_s64ii -1, 0, $a)>;

// zext i1
def : Pat<(i16 (zext i1:$a)),
          (SELP_u16ii 1, 0, $a)>;
def : Pat<(i32 (zext i1:$a)),
          (SELP_u32ii 1, 0, $a)>;
def : Pat<(i64 (zext i1:$a)),
          (SELP_u64ii 1, 0, $a)>;

// anyext i1
def : Pat<(i16 (anyext i1:$a)),
          (SELP_u16ii -1, 0, $a)>;
def : Pat<(i32 (anyext i1:$a)),
          (SELP_u32ii -1, 0, $a)>;
def : Pat<(i64 (anyext i1:$a)),
          (SELP_u64ii -1, 0, $a)>;

// sext i16
def : Pat<(i32 (sext i16:$a)),
          (CVT_s32_s16 $a, CvtNONE)>;
def : Pat<(i64 (sext i16:$a)),
          (CVT_s64_s16 $a, CvtNONE)>;

// zext i16
def : Pat<(i32 (zext i16:$a)),
          (CVT_u32_u16 $a, CvtNONE)>;
def : Pat<(i64 (zext i16:$a)),
          (CVT_u64_u16 $a, CvtNONE)>;

// anyext i16
def : Pat<(i32 (anyext i16:$a)),
          (CVT_u32_u16 $a, CvtNONE)>;
def : Pat<(i64 (anyext i16:$a)),
          (CVT_u64_u16 $a, CvtNONE)>;

// sext i32
def : Pat<(i64 (sext i32:$a)),
          (CVT_s64_s32 $a, CvtNONE)>;

// zext i32
def : Pat<(i64 (zext i32:$a)),
          (CVT_u64_u32 $a, CvtNONE)>;

// anyext i32
def : Pat<(i64 (anyext i32:$a)),
          (CVT_u64_u32 $a, CvtNONE)>;


// Truncations to i1 mask off the low bit and compare it to 1.

// truncate i64
def : Pat<(i32 (trunc i64:$a)),
          (CVT_u32_u64 $a, CvtNONE)>;
def : Pat<(i16 (trunc i64:$a)),
          (CVT_u16_u64 $a, CvtNONE)>;
def : Pat<(i1 (trunc i64:$a)),
          (SETP_b64ri (ANDb64ri $a, 1), 1, CmpEQ)>;

// truncate i32
def : Pat<(i16 (trunc i32:$a)),
          (CVT_u16_u32 $a, CvtNONE)>;
def : Pat<(i1 (trunc i32:$a)),
          (SETP_b32ri (ANDb32ri $a, 1), 1,
CmpEQ)>; 3316 3317// truncate i16 3318def : Pat<(i1 (trunc i16:$a)), 3319 (SETP_b16ri (ANDb16ri $a, 1), 1, CmpEQ)>; 3320 3321// sext_inreg 3322def : Pat<(sext_inreg i16:$a, i8), (CVT_INREG_s16_s8 $a)>; 3323def : Pat<(sext_inreg i32:$a, i8), (CVT_INREG_s32_s8 $a)>; 3324def : Pat<(sext_inreg i32:$a, i16), (CVT_INREG_s32_s16 $a)>; 3325def : Pat<(sext_inreg i64:$a, i8), (CVT_INREG_s64_s8 $a)>; 3326def : Pat<(sext_inreg i64:$a, i16), (CVT_INREG_s64_s16 $a)>; 3327def : Pat<(sext_inreg i64:$a, i32), (CVT_INREG_s64_s32 $a)>; 3328 3329 3330// Select instructions with 32-bit predicates 3331def : Pat<(select i32:$pred, i16:$a, i16:$b), 3332 (SELP_b16rr $a, $b, 3333 (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>; 3334def : Pat<(select i32:$pred, i32:$a, i32:$b), 3335 (SELP_b32rr $a, $b, 3336 (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>; 3337def : Pat<(select i32:$pred, i64:$a, i64:$b), 3338 (SELP_b64rr $a, $b, 3339 (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>; 3340def : Pat<(select i32:$pred, f16:$a, f16:$b), 3341 (SELP_f16rr $a, $b, 3342 (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>; 3343def : Pat<(select i32:$pred, bf16:$a, bf16:$b), 3344 (SELP_bf16rr $a, $b, 3345 (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>; 3346def : Pat<(select i32:$pred, f32:$a, f32:$b), 3347 (SELP_f32rr $a, $b, 3348 (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>; 3349def : Pat<(select i32:$pred, f64:$a, f64:$b), 3350 (SELP_f64rr $a, $b, 3351 (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>; 3352 3353 3354let hasSideEffects = false in { 3355 // pack a set of smaller int registers to a larger int register 3356 def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d), 3357 (ins Int16Regs:$s1, Int16Regs:$s2, 3358 Int16Regs:$s3, Int16Regs:$s4), 3359 "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>; 3360 def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d), 3361 (ins Int16Regs:$s1, Int16Regs:$s2), 3362 "mov.b32 \t$d, {{$s1, $s2}};", []>; 3363 def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d), 3364 (ins Int32Regs:$s1, Int32Regs:$s2), 3365 
"mov.b64 \t$d, {{$s1, $s2}};", []>; 3366 def V2I64toI128 : NVPTXInst<(outs Int128Regs:$d), 3367 (ins Int64Regs:$s1, Int64Regs:$s2), 3368 "mov.b128 \t$d, {{$s1, $s2}};", []>; 3369 def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d), 3370 (ins Float32Regs:$s1, Float32Regs:$s2), 3371 "mov.b64 \t$d, {{$s1, $s2}};", []>; 3372 3373 // unpack a larger int register to a set of smaller int registers 3374 def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2, 3375 Int16Regs:$d3, Int16Regs:$d4), 3376 (ins Int64Regs:$s), 3377 "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>; 3378 def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2), 3379 (ins Int32Regs:$s), 3380 "mov.b32 \t{{$d1, $d2}}, $s;", []>; 3381 def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2), 3382 (ins Int64Regs:$s), 3383 "mov.b64 \t{{$d1, $d2}}, $s;", []>; 3384 def I128toV2I64: NVPTXInst<(outs Int64Regs:$d1, Int64Regs:$d2), 3385 (ins Int128Regs:$s), 3386 "mov.b128 \t{{$d1, $d2}}, $s;", []>; 3387 def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2), 3388 (ins Float64Regs:$s), 3389 "mov.b64 \t{{$d1, $d2}}, $s;", []>; 3390 3391 def I32toI16H : NVPTXInst<(outs Int16Regs:$high), 3392 (ins Int32Regs:$s), 3393 "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}", 3394 []>; 3395 def I32toI16L : NVPTXInst<(outs Int16Regs:$low), 3396 (ins Int32Regs:$s), 3397 "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}", 3398 []>; 3399 def I64toI32H : NVPTXInst<(outs Int32Regs:$high), 3400 (ins Int64Regs:$s), 3401 "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}", 3402 []>; 3403 def I64toI32L : NVPTXInst<(outs Int32Regs:$low), 3404 (ins Int64Regs:$s), 3405 "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}", 3406 []>; 3407 3408} 3409 3410// Using partial vectorized move produces better SASS code for extraction of 3411// upper/lower parts of an integer. 
// Extract the high half of an i32/i64 via a partial vectorized move instead
// of shift+truncate.
def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))),
          (I32toI16H $s)>;
def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))),
          (I32toI16H $s)>;
def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))),
          (I64toI32H $s)>;
def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))),
          (I64toI32H $s)>;

// Element 0 of a v2i16 lives in the low 16 bits of the i32 register, so a
// sign-extending in-register conversion extracts and extends it in one step.
def: Pat<(i32 (sext (extractelt v2i16:$src, 0))),
         (CVT_INREG_s32_s16 $src)>;

// All three 2x16 vector types are stored in a single 32-bit register.
foreach vt = [v2f16, v2bf16, v2i16] in {
def : Pat<(extractelt vt:$src, 0),
          (I32toI16L $src)>;
def : Pat<(extractelt vt:$src, 1),
          (I32toI16H $src)>;
}
def : Pat<(v2f16 (build_vector f16:$a, f16:$b)),
          (V2I16toI32 $a, $b)>;
def : Pat<(v2bf16 (build_vector bf16:$a, bf16:$b)),
          (V2I16toI32 $a, $b)>;
def : Pat<(v2i16 (build_vector i16:$a, i16:$b)),
          (V2I16toI32 $a, $b)>;

// scalar_to_vector leaves the upper lane undefined; a zero-extending move
// into the 32-bit register is sufficient.
def: Pat<(v2i16 (scalar_to_vector i16:$a)),
         (CVT_u32_u16 $a, CvtNONE)>;

//
// Funnel-Shift
//

// Create SDNodes so they can be used in the DAG code, e.g.
// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
def fshl_clamp : SDNode<"NVPTXISD::FSHL_CLAMP", SDTIntShiftDOp, []>;
def fshr_clamp : SDNode<"NVPTXISD::FSHR_CLAMP", SDTIntShiftDOp, []>;

// Funnel shift, requires >= sm_32.  Does not trap if amt is out of range, so
// no side effects.
let hasSideEffects = false in {
  multiclass ShfInst<string mode, SDNode op> {
    // immediate shift amount
    def _i
      : NVPTXInst<(outs Int32Regs:$dst),
                  (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
                  "shf." # mode # ".b32 \t$dst, $lo, $hi, $amt;",
                  [(set i32:$dst,
                    (op i32:$hi, i32:$lo, (i32 imm:$amt)))]>,
        Requires<[hasHWROT32]>;

    // register shift amount
    def _r
      : NVPTXInst<(outs Int32Regs:$dst),
                  (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
                  "shf." # mode # ".b32 \t$dst, $lo, $hi, $amt;",
                  [(set i32:$dst,
                    (op i32:$hi, i32:$lo, i32:$amt))]>,
        Requires<[hasHWROT32]>;
  }

  defm SHF_L_CLAMP : ShfInst<"l.clamp", fshl_clamp>;
  defm SHF_R_CLAMP : ShfInst<"r.clamp", fshr_clamp>;
  defm SHF_L_WRAP  : ShfInst<"l.wrap", fshl>;
  defm SHF_R_WRAP  : ShfInst<"r.wrap", fshr>;
}

// Clamp-mode funnel-shift intrinsics map directly onto the instructions.
def : Pat<(i32 (int_nvvm_fshl_clamp i32:$hi, i32:$lo, i32:$amt)),
          (SHF_L_CLAMP_r $lo, $hi, $amt)>;
def : Pat<(i32 (int_nvvm_fshl_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
          (SHF_L_CLAMP_i $lo, $hi, imm:$amt)>;
def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, i32:$amt)),
          (SHF_R_CLAMP_r $lo, $hi, $amt)>;
def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
          (SHF_R_CLAMP_i $lo, $hi, imm:$amt)>;

// Count leading zeros
let hasSideEffects = false in {
  def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
                         "clz.b32 \t$d, $a;", []>;
  def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
                         "clz.b64 \t$d, $a;", []>;
}

// 32-bit has a direct PTX instruction
def : Pat<(i32 (ctlz i32:$a)), (CLZr32 $a)>;

// The return type of the ctlz ISD node is the same as its input, but the PTX
// ctz instruction always returns a 32-bit value.  For ctlz.i64, convert the
// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
// truncating back down to 32 bits.
def : Pat<(i64 (ctlz i64:$a)), (CVT_u64_u32 (CLZr64 $a), CvtNONE)>;
def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 $a)>;

// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
// result back to 16-bits if necessary.  We also need to subtract 16 because
// the high-order 16 zeros were counted.
3506// 3507// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could 3508// use to save one SASS instruction (on sm_35 anyway): 3509// 3510// mov.b32 $tmp, {0xffff, $a} 3511// ctlz.b32 $result, $tmp 3512// 3513// That is, instead of zero-extending the input to 32 bits, we'd "one-extend" 3514// and then ctlz that value. This way we don't have to subtract 16 from the 3515// result. Unfortunately today we don't have a way to generate 3516// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization. 3517def : Pat<(i16 (ctlz i16:$a)), 3518 (SUBi16ri (CVT_u16_u32 3519 (CLZr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE), 16)>; 3520def : Pat<(i32 (zext (i16 (ctlz i16:$a)))), 3521 (SUBi32ri (CLZr32 (CVT_u32_u16 $a, CvtNONE)), 16)>; 3522 3523// Population count 3524let hasSideEffects = false in { 3525 def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), 3526 "popc.b32 \t$d, $a;", []>; 3527 def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), 3528 "popc.b64 \t$d, $a;", []>; 3529} 3530 3531// 32-bit has a direct PTX instruction 3532def : Pat<(i32 (ctpop i32:$a)), (POPCr32 $a)>; 3533 3534// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit 3535// to match the LLVM semantics. Just as with ctlz.i64, we provide a second 3536// pattern that avoids the type conversion if we're truncating the result to 3537// i32 anyway. 3538def : Pat<(ctpop i64:$a), (CVT_u64_u32 (POPCr64 $a), CvtNONE)>; 3539def : Pat<(i32 (trunc (i64 (ctpop i64:$a)))), (POPCr64 $a)>; 3540 3541// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits. 3542// If we know that we're storing into an i32, we can avoid the final trunc. 
3543def : Pat<(ctpop i16:$a), 3544 (CVT_u16_u32 (POPCr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE)>; 3545def : Pat<(i32 (zext (i16 (ctpop i16:$a)))), 3546 (POPCr32 (CVT_u32_u16 $a, CvtNONE))>; 3547 3548// fpround f32 -> f16 3549def : Pat<(f16 (fpround f32:$a)), 3550 (CVT_f16_f32 $a, CvtRN)>; 3551 3552// fpround f32 -> bf16 3553def : Pat<(bf16 (fpround f32:$a)), 3554 (CVT_bf16_f32 $a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>; 3555 3556// fpround f64 -> f16 3557def : Pat<(f16 (fpround f64:$a)), 3558 (CVT_f16_f64 $a, CvtRN)>; 3559 3560// fpround f64 -> bf16 3561def : Pat<(bf16 (fpround f64:$a)), 3562 (CVT_bf16_f64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>; 3563// fpround f64 -> f32 3564def : Pat<(f32 (fpround f64:$a)), 3565 (CVT_f32_f64 $a, CvtRN_FTZ)>, Requires<[doF32FTZ]>; 3566def : Pat<(f32 (fpround f64:$a)), 3567 (CVT_f32_f64 $a, CvtRN)>; 3568 3569// fpextend f16 -> f32 3570def : Pat<(f32 (fpextend f16:$a)), 3571 (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; 3572def : Pat<(f32 (fpextend f16:$a)), 3573 (CVT_f32_f16 $a, CvtNONE)>; 3574// fpextend bf16 -> f32 3575def : Pat<(f32 (fpextend bf16:$a)), 3576 (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; 3577def : Pat<(f32 (fpextend bf16:$a)), 3578 (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>; 3579 3580// fpextend f16 -> f64 3581def : Pat<(f64 (fpextend f16:$a)), 3582 (CVT_f64_f16 $a, CvtNONE)>; 3583 3584// fpextend bf16 -> f64 3585def : Pat<(f64 (fpextend bf16:$a)), 3586 (CVT_f64_bf16 $a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>; 3587 3588// fpextend f32 -> f64 3589def : Pat<(f64 (fpextend f32:$a)), 3590 (CVT_f64_f32 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; 3591def : Pat<(f64 (fpextend f32:$a)), 3592 (CVT_f64_f32 $a, CvtNONE)>; 3593 3594def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone, 3595 [SDNPHasChain, SDNPOptInGlue]>; 3596 3597// fceil, ffloor, froundeven, ftrunc. 
// Rounding patterns for all FP types; the f32 form honors the FTZ setting.
multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
  def : Pat<(OpNode f16:$a),
            (CVT_f16_f16 $a, Mode)>;
  def : Pat<(OpNode bf16:$a),
            (CVT_bf16_bf16 $a, Mode)>;
  def : Pat<(OpNode f32:$a),
            (CVT_f32_f32 $a, ModeFTZ)>, Requires<[doF32FTZ]>;
  def : Pat<(OpNode f32:$a),
            (CVT_f32_f32 $a, Mode)>, Requires<[doNoF32FTZ]>;
  def : Pat<(OpNode f64:$a),
            (CVT_f64_f64 $a, Mode)>;
}

defm : CVT_ROUND<fceil, CvtRPI, CvtRPI_FTZ>;
defm : CVT_ROUND<ffloor, CvtRMI, CvtRMI_FTZ>;
defm : CVT_ROUND<froundeven, CvtRNI, CvtRNI_FTZ>;
defm : CVT_ROUND<ftrunc, CvtRZI, CvtRZI_FTZ>;

// nearbyint and rint are implemented as rounding to nearest even.  This isn't
// strictly correct, because it causes us to ignore the rounding mode.  But it
// matches what CUDA's "libm" does.

defm : CVT_ROUND<fnearbyint, CvtRNI, CvtRNI_FTZ>;
defm : CVT_ROUND<frint, CvtRNI, CvtRNI_FTZ>;

//-----------------------------------
// Control-flow
//-----------------------------------

let isTerminator=1 in {
  let isReturn=1, isBarrier=1 in
    def Return : NVPTXInst<(outs), (ins), "ret;", [(retglue)]>;

  let isBranch=1 in
    def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                            "@$a bra \t$target;",
                            [(brcond i1:$a, bb:$target)]>;
  let isBranch=1 in
    def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                                 "@!$a bra \t$target;", []>;

  let isBranch=1, isBarrier=1 in
    def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
                         "bra.uni \t$target;", [(br bb:$target)]>;
}

// brcond on an i32 condition: compare against zero to get the predicate.
def : Pat<(brcond i32:$a, bb:$target),
          (CBranch (SETP_u32ri $a, 0, CmpNE), bb:$target)>;

// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
// conditional branch if the target block is the next block so that the code
// can fall through to the target block.  The inversion is done by 'xor
// condition, 1', which will be translated to (setne condition, -1).  Since ptx
// supports '@!pred bra target', we should use it.
def : Pat<(brcond (i1 (setne i1:$a, -1)), bb:$target),
          (CBranchOther $a, bb:$target)>;

// Call
def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
                                            SDTCisVT<1, i32>]>;
def SDT_NVPTXCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;

def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
                           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                            SDNPSideEffect]>;

def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
                  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def calltarget : Operand<i32>;
let isCall=1 in {
  def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
}

def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;

// Pseudo instructions.
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
  : NVPTXInst<outs, ins, asmstr, pattern>;

def Callseq_Start :
  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
            "\\{ // callseq $amt1, $amt2",
            [(callseq_start timm:$amt1, timm:$amt2)]>;
def Callseq_End :
  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
            "\\} // callseq $amt1",
            [(callseq_end timm:$amt1, timm:$amt2)]>;

// trap instruction
def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>, Requires<[noPTXASUnreachableBug]>;
// Emit an `exit` as well to convey to ptxas that `trap` exits the CFG.
// This won't be necessary in a future version of ptxas.
def trapexitinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>, Requires<[hasPTXASUnreachableBug]>;
// brkpt instruction
def debugtrapinst : NVPTXInst<(outs), (ins), "brkpt;", [(debugtrap)]>;

// Call prototype wrapper
def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def CallPrototype :
  SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def ProtoIdent : Operand<i32> {
  let PrintMethod = "printProtoIdent";
}
def CALL_PROTOTYPE :
  NVPTXInst<(outs), (ins ProtoIdent:$ident),
            "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;

def SDTDynAllocaOp :
  SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisInt<2>]>;

def dyn_alloca :
  SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp,
         [SDNPHasChain, SDNPSideEffect]>;

// Dynamic stack allocation: alloca into the local space, then convert the
// local pointer to a generic one.
def DYNAMIC_STACKALLOC32 :
  NVPTXInst<(outs Int32Regs:$ptr),
            (ins Int32Regs:$size, i32imm:$align),
            "alloca.u32 \t$ptr, $size, $align;\n\t"
            "cvta.local.u32 \t$ptr, $ptr;",
            [(set i32:$ptr, (dyn_alloca i32:$size, (i32 timm:$align)))]>,
  Requires<[hasPTX<73>, hasSM<52>]>;

def DYNAMIC_STACKALLOC64 :
  NVPTXInst<(outs Int64Regs:$ptr),
            (ins Int64Regs:$size, i32imm:$align),
            "alloca.u64 \t$ptr, $size, $align;\n\t"
            "cvta.local.u64 \t$ptr, $ptr;",
            [(set i64:$ptr, (dyn_alloca i64:$size, (i32 timm:$align)))]>,
  Requires<[hasPTX<73>, hasSM<52>]>;


//
// BRX
//

def SDTBrxStartProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDTBrxItemProfile : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
def SDTBrxEndProfile : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisInt<1>, SDTCisInt<2>]>;

def brx_start :
  SDNode<"NVPTXISD::BrxStart", SDTBrxStartProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
def brx_item :
  SDNode<"NVPTXISD::BrxItem", SDTBrxItemProfile,
         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def brx_end :
  SDNode<"NVPTXISD::BrxEnd", SDTBrxEndProfile,
         [SDNPHasChain, SDNPInGlue, SDNPSideEffect]>;

let isTerminator = 1, isBranch = 1, isIndirectBranch = 1, isNotDuplicable = 1 in {

  def BRX_START :
    NVPTXInst<(outs), (ins i32imm:$id),
              "$$L_brx_$id: .branchtargets",
              [(brx_start (i32 imm:$id))]>;

  def BRX_ITEM :
    NVPTXInst<(outs), (ins brtarget:$target),
              "\t$target,",
              [(brx_item bb:$target)]>;

  def BRX_END :
    NVPTXInst<(outs), (ins brtarget:$target, Int32Regs:$val, i32imm:$id),
              "\t$target;\n\tbrx.idx \t$val, $$L_brx_$id;",
              [(brx_end bb:$target, i32:$val, (i32 imm:$id))]> {
    let isBarrier = 1;
  }
}


// Integer dot-product instructions (dp4a / dp2a) for every combination of
// signed/unsigned operands.
foreach a_type = ["s", "u"] in {
  foreach b_type = ["s", "u"] in {

    def DOT4_ # a_type # b_type :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
                "dp4a." # a_type # "32." # b_type # "32 \t$dst, $a, $b, $c;",
                [(set i32:$dst,
                  (!cast<Intrinsic>("int_nvvm_idp4a_" # a_type # "_" # b_type)
                    i32:$a, i32:$b, i32:$c))]>,
      Requires<[hasDotInstructions]>;

    foreach is_hi = [0, -1] in {
      defvar lohi_suffix = !if(is_hi, "hi", "lo");

      def DOT2_ # lohi_suffix # _ # a_type # b_type :
        NVPTXInst<(outs Int32Regs:$dst),
                  (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
                  "dp2a." # lohi_suffix # "." # a_type # "32."
                    # b_type # "32 \t$dst, $a, $b, $c;",
                  [(set i32:$dst,
                    (!cast<Intrinsic>("int_nvvm_idp2a_" # a_type # "_" # b_type)
                      i32:$a, i32:$b, is_hi, i32:$c))]>,
        Requires<[hasDotInstructions]>;
    }
  }
}

//
// Stack Manipulation
//

def SDTStackRestore : SDTypeProfile<0, 1, [SDTCisInt<0>]>;

def stackrestore :
  SDNode<"NVPTXISD::STACKRESTORE", SDTStackRestore,
         [SDNPHasChain, SDNPSideEffect]>;

def stacksave :
  SDNode<"NVPTXISD::STACKSAVE", SDTIntLeaf,
         [SDNPHasChain, SDNPSideEffect]>;

def STACKRESTORE_32 :
  NVPTXInst<(outs), (ins Int32Regs:$ptr),
            "stackrestore.u32 \t$ptr;",
            [(stackrestore i32:$ptr)]>,
  Requires<[hasPTX<73>, hasSM<52>]>;

def STACKSAVE_32 :
  NVPTXInst<(outs Int32Regs:$dst), (ins),
            "stacksave.u32 \t$dst;",
            [(set i32:$dst, (i32 stacksave))]>,
  Requires<[hasPTX<73>, hasSM<52>]>;

def STACKRESTORE_64 :
  NVPTXInst<(outs), (ins Int64Regs:$ptr),
            "stackrestore.u64 \t$ptr;",
            [(stackrestore i64:$ptr)]>,
  Requires<[hasPTX<73>, hasSM<52>]>;

def STACKSAVE_64 :
  NVPTXInst<(outs Int64Regs:$dst), (ins),
            "stacksave.u64 \t$dst;",
            [(set i64:$dst, (i64 stacksave))]>,
  Requires<[hasPTX<73>, hasSM<52>]>;

include "NVPTXIntrinsics.td"

//-----------------------------------
// Notes
//-----------------------------------
// BSWAP is currently expanded. The following is a more efficient
// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit
//   registers
// - for sm_20, use prmt (use vector scalar mov to get the pack and
//   unpack). sm_20 supports native 32-bit register, but not native 16-bit
//   register.
// bswap via the prmt (byte-permute) intrinsic: selector 0x0123 reverses the
// four bytes of a 32-bit value; 0x2301 swaps bytes within each 16-bit lane.
def : Pat <
  (i32 (bswap i32:$a)),
  (INT_NVVM_PRMT $a, (i32 0), (i32 0x0123))>;

def : Pat <
  (v2i16 (bswap v2i16:$a)),
  (INT_NVVM_PRMT $a, (i32 0), (i32 0x2301))>;

// 64-bit bswap: byte-reverse each half and swap the halves while repacking.
def : Pat <
  (i64 (bswap i64:$a)),
  (V2I32toI64
    (INT_NVVM_PRMT (I64toI32H $a), (i32 0), (i32 0x0123)),
    (INT_NVVM_PRMT (I64toI32L $a), (i32 0), (i32 0x0123)))>;


////////////////////////////////////////////////////////////////////////////////
// PTX Fence instructions
////////////////////////////////////////////////////////////////////////////////

def atomic_thread_fence_seq_cst_sys :
  NVPTXInst<(outs), (ins), "fence.sc.sys;", []>,
  Requires<[hasPTX<60>, hasSM<70>]>;
def atomic_thread_fence_acq_rel_sys :
  NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>,
  Requires<[hasPTX<60>, hasSM<70>]>;

def atomic_thread_fence_seq_cst_gpu :
  NVPTXInst<(outs), (ins), "fence.sc.gpu;", []>,
  Requires<[hasPTX<60>, hasSM<70>]>;
def atomic_thread_fence_acq_rel_gpu :
  NVPTXInst<(outs), (ins), "fence.acq_rel.gpu;", []>,
  Requires<[hasPTX<60>, hasSM<70>]>;

// Cluster-scope fences need sm_90 / PTX 7.8.
def atomic_thread_fence_seq_cst_cluster :
  NVPTXInst<(outs), (ins), "fence.sc.cluster;", []>,
  Requires<[hasPTX<78>, hasSM<90>]>;
def atomic_thread_fence_acq_rel_cluster :
  NVPTXInst<(outs), (ins), "fence.acq_rel.cluster;", []>,
  Requires<[hasPTX<78>, hasSM<90>]>;

def atomic_thread_fence_seq_cst_cta :
  NVPTXInst<(outs), (ins), "fence.sc.cta;", []>,
  Requires<[hasPTX<60>, hasSM<70>]>;
def atomic_thread_fence_acq_rel_cta :
  NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
  Requires<[hasPTX<60>, hasSM<70>]>;

// Matches +0.0 or -0.0 of any floating-point type.
def fpimm_any_zero : FPImmLeaf<fAny, [{
  return Imm.isZero();
}]>;

def fpimm_positive_zero_v2f16 : PatFrag<(ops), (v2f16 (bitconvert (i32 0)))>;
def fpimm_positive_zero_v2bf16 : PatFrag<(ops), (v2bf16 (bitconvert (i32 0)))>;

// Perform substitution if fma only has one use, and also if instruction has
// nnan instruction flag or if the TM has NoNaNsFPMath
def NVPTX_fma_oneuse_and_nnan : PatFrag<(ops node:$a, node:$b, node:$c),
                                        (fma node:$a, node:$b, node:$c), [{
  return N->hasOneUse() &&
         (N->getFlags().hasNoNaNs() || TM.Options.NoNaNsFPMath);
}]>;

// fmaxnum will differentiate between signed and unsigned zeros soon, so this
// PatFrag is for a fmaxnum node with nsz
def NVPTX_fmaxnum_nsz : PatFrag<(ops node:$a, node:$b),
                                (fmaxnum node:$a, node:$b), [{
  return N->getFlags().hasNoSignedZeros() || TM.Options.NoSignedZerosFPMath;
}]>;

// Three-register-operand instruction with a fixed mnemonic and predicates.
class NVPTXInst_rrr<RegisterClass RC, string Instruction, list<Predicate> Preds>
  : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
              !strconcat(Instruction, "\t$dst, $a, $b, $c;"), []>,
    Requires<Preds>;

def FMARELU_F16       : NVPTXInst_rrr<Int16Regs, "fma.rn.relu.f16", [useFP16Math, hasPTX<70>, hasSM<80>]>;
def FMARELU_F16_FTZ   : NVPTXInst_rrr<Int16Regs, "fma.rn.ftz.relu.f16", [useFP16Math, hasPTX<70>, hasSM<80>]>;
def FMARELU_BF16      : NVPTXInst_rrr<Int16Regs, "fma.rn.relu.bf16", [hasBF16Math, hasPTX<70>, hasSM<80>]>;
def FMARELU_F16X2     : NVPTXInst_rrr<Int32Regs, "fma.rn.relu.f16x2", [useFP16Math, hasPTX<70>, hasSM<80>]>;
def FMARELU_F16X2_FTZ : NVPTXInst_rrr<Int32Regs, "fma.rn.ftz.relu.f16x2", [useFP16Math, hasPTX<70>, hasSM<80>]>;
def FMARELU_BF16X2    : NVPTXInst_rrr<Int32Regs, "fma.rn.relu.bf16x2", [hasBF16Math, hasPTX<70>, hasSM<80>]>;

// max(fma(a, b, c), 0) --> fma.relu: FTZ variants first so they win when
// doF32FTZ holds.
// FTZ
def : Pat<(f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan f16:$a, f16:$b, f16:$c), fpimm_any_zero)),
          (FMARELU_F16_FTZ $a, $b, $c)>,
      Requires<[doF32FTZ]>;
def : Pat<(v2f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2f16:$a, v2f16:$b, v2f16:$c), fpimm_positive_zero_v2f16)),
          (FMARELU_F16X2_FTZ $a, $b, $c)>,
      Requires<[doF32FTZ]>;

// NO FTZ
def : Pat<(f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan f16:$a, f16:$b, f16:$c), fpimm_any_zero)),
          (FMARELU_F16 $a, $b, $c)>;
def : Pat<(bf16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan bf16:$a, bf16:$b, bf16:$c), fpimm_any_zero)),
          (FMARELU_BF16 $a, $b, $c)>;
def : Pat<(v2f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2f16:$a, v2f16:$b, v2f16:$c), fpimm_positive_zero_v2f16)),
          (FMARELU_F16X2 $a, $b, $c)>;
def : Pat<(v2bf16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2bf16:$a, v2bf16:$b, v2bf16:$c), fpimm_positive_zero_v2bf16)),
          (FMARELU_BF16X2 $a, $b, $c)>;