xref: /llvm-project/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (revision 3b5e9eed2f67c1fb6dcf7033e92509ba2b0381e9)
1//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file describes the PTX instructions in TableGen format.
10//
11//===----------------------------------------------------------------------===//
12
13include "NVPTXInstrFormats.td"
14
15let OperandType = "OPERAND_IMMEDIATE" in {
16  def f16imm : Operand<f16>;
17  def bf16imm : Operand<bf16>;
18
19}
20
21// List of vector-specific properties
22def isVecLD      : VecInstTypeEnum<1>;
23def isVecST      : VecInstTypeEnum<2>;
24def isVecBuild   : VecInstTypeEnum<3>;
25def isVecShuffle : VecInstTypeEnum<4>;
26def isVecExtract : VecInstTypeEnum<5>;
27def isVecInsert  : VecInstTypeEnum<6>;
28def isVecDest    : VecInstTypeEnum<7>;
29def isVecOther   : VecInstTypeEnum<15>;
30
31//===----------------------------------------------------------------------===//
32// NVPTX Operand Definitions.
33//===----------------------------------------------------------------------===//
34
35def brtarget    : Operand<OtherVT>;
36
37// CVT conversion modes
38// These must match the enum in NVPTX.h
39def CvtNONE : PatLeaf<(i32 0x0)>;
40def CvtRNI  : PatLeaf<(i32 0x1)>;
41def CvtRZI  : PatLeaf<(i32 0x2)>;
42def CvtRMI  : PatLeaf<(i32 0x3)>;
43def CvtRPI  : PatLeaf<(i32 0x4)>;
44def CvtRN   : PatLeaf<(i32 0x5)>;
45def CvtRZ   : PatLeaf<(i32 0x6)>;
46def CvtRM   : PatLeaf<(i32 0x7)>;
47def CvtRP   : PatLeaf<(i32 0x8)>;
48def CvtRNA   : PatLeaf<(i32 0x9)>;
49
50def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
51def CvtRNI_FTZ  : PatLeaf<(i32 0x11)>;
52def CvtRZI_FTZ  : PatLeaf<(i32 0x12)>;
53def CvtRMI_FTZ  : PatLeaf<(i32 0x13)>;
54def CvtRPI_FTZ  : PatLeaf<(i32 0x14)>;
55def CvtRN_FTZ   : PatLeaf<(i32 0x15)>;
56def CvtRZ_FTZ   : PatLeaf<(i32 0x16)>;
57def CvtRM_FTZ   : PatLeaf<(i32 0x17)>;
58def CvtRP_FTZ   : PatLeaf<(i32 0x18)>;
59
60def CvtSAT      : PatLeaf<(i32 0x20)>;
61def CvtSAT_FTZ  : PatLeaf<(i32 0x30)>;
62
63def CvtNONE_RELU   : PatLeaf<(i32 0x40)>;
64def CvtRN_RELU   : PatLeaf<(i32 0x45)>;
65def CvtRZ_RELU   : PatLeaf<(i32 0x46)>;
66
67def CvtMode : Operand<i32> {
68  let PrintMethod = "printCvtMode";
69}
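
// Illustrative note: the cvt instructions defined later in this file take a
// CvtMode immediate and print its sub-fields, e.g.
//   "cvt${mode:base}${mode:ftz}${mode:sat}.s32.f32 \t$dst, $src;"
// Assuming printCvtMode decodes the encodings above, a $mode of CvtRZI prints
// as ".rzi", CvtRZI_FTZ adds ".ftz", and CvtNONE prints nothing, so a typical
// result is "cvt.rzi.ftz.s32.f32 %r1, %f1;".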
70
71// Compare modes
72// These must match the enum in NVPTX.h
73def CmpEQ   : PatLeaf<(i32 0)>;
74def CmpNE   : PatLeaf<(i32 1)>;
75def CmpLT   : PatLeaf<(i32 2)>;
76def CmpLE   : PatLeaf<(i32 3)>;
77def CmpGT   : PatLeaf<(i32 4)>;
78def CmpGE   : PatLeaf<(i32 5)>;
79def CmpLO   : PatLeaf<(i32 6)>;
80def CmpLS   : PatLeaf<(i32 7)>;
81def CmpHI   : PatLeaf<(i32 8)>;
82def CmpHS   : PatLeaf<(i32 9)>;
83def CmpEQU  : PatLeaf<(i32 10)>;
84def CmpNEU  : PatLeaf<(i32 11)>;
85def CmpLTU  : PatLeaf<(i32 12)>;
86def CmpLEU  : PatLeaf<(i32 13)>;
87def CmpGTU  : PatLeaf<(i32 14)>;
88def CmpGEU  : PatLeaf<(i32 15)>;
89def CmpNUM  : PatLeaf<(i32 16)>;
90def CmpNAN  : PatLeaf<(i32 17)>;
91
92def CmpEQ_FTZ   : PatLeaf<(i32 0x100)>;
93def CmpNE_FTZ   : PatLeaf<(i32 0x101)>;
94def CmpLT_FTZ   : PatLeaf<(i32 0x102)>;
95def CmpLE_FTZ   : PatLeaf<(i32 0x103)>;
96def CmpGT_FTZ   : PatLeaf<(i32 0x104)>;
97def CmpGE_FTZ   : PatLeaf<(i32 0x105)>;
98def CmpEQU_FTZ  : PatLeaf<(i32 0x10A)>;
99def CmpNEU_FTZ  : PatLeaf<(i32 0x10B)>;
100def CmpLTU_FTZ  : PatLeaf<(i32 0x10C)>;
101def CmpLEU_FTZ  : PatLeaf<(i32 0x10D)>;
102def CmpGTU_FTZ  : PatLeaf<(i32 0x10E)>;
103def CmpGEU_FTZ  : PatLeaf<(i32 0x10F)>;
104def CmpNUM_FTZ  : PatLeaf<(i32 0x110)>;
105def CmpNAN_FTZ  : PatLeaf<(i32 0x111)>;
106
107def CmpMode : Operand<i32> {
108  let PrintMethod = "printCmpMode";
109}
110def VecElement : Operand<i32> {
111  let PrintMethod = "printVecElement";
112}
113
114// PRMT modes
115// These must match the enum in NVPTX.h
116def PrmtNONE : PatLeaf<(i32 0x0)>;
117def PrmtF4E  : PatLeaf<(i32 0x1)>;
118def PrmtB4E  : PatLeaf<(i32 0x2)>;
119def PrmtRC8  : PatLeaf<(i32 0x3)>;
120def PrmtECL  : PatLeaf<(i32 0x4)>;
121def PrmtECR  : PatLeaf<(i32 0x5)>;
122def PrmtRC16 : PatLeaf<(i32 0x6)>;
123
124def PrmtMode : Operand<i32> {
125  let PrintMethod = "printPrmtMode";
126}
127
128
129//===----------------------------------------------------------------------===//
130// NVPTX Instruction Predicate Definitions
131//===----------------------------------------------------------------------===//
132
133
134def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
135def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
136def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
137def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
138def hasVote : Predicate<"Subtarget->hasVote()">;
139def hasDouble : Predicate<"Subtarget->hasDouble()">;
140def hasLDG : Predicate<"Subtarget->hasLDG()">;
141def hasLDU : Predicate<"Subtarget->hasLDU()">;
142def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">;
143def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">;
144def hasOptEnabled : Predicate<"TM.getOptLevel() != CodeGenOptLevel::None">;
145
146def doF32FTZ : Predicate<"useF32FTZ()">;
147def doNoF32FTZ : Predicate<"!useF32FTZ()">;
148def doRsqrtOpt : Predicate<"doRsqrtOpt()">;
149
150def doMulWide      : Predicate<"doMulWide">;
151
152def allowFMA : Predicate<"allowFMA()">;
153def noFMA : Predicate<"!allowFMA()">;
154def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
155def noUnsafeFPMath : Predicate<"!allowUnsafeFPMath()">;
156
157def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
158def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
159
160def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
161def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
162
163def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
164def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
165def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
166
167def True : Predicate<"true">;
168def False : Predicate<"false">;
169
170class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;
171class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>;
172
173// Explicit records for arch-accelerated SM versions
174def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">;
175def hasSM100a : Predicate<"Subtarget->getFullSmVersion() == 1001">;
176def hasSM101a : Predicate<"Subtarget->getFullSmVersion() == 1011">;
177def hasSM120a : Predicate<"Subtarget->getFullSmVersion() == 1201">;
178
179// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
180def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
181                          "&& Subtarget->getPTXVersion() >= 64)">;
182
183def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
184def hasBF16Math: Predicate<"Subtarget->hasBF16Math()">;
185
186// Helper class to aid conversion between ValueType and a matching RegisterClass.
187
188class ValueToRegClass<ValueType T> {
189   string name = !cast<string>(T);
190   NVPTXRegClass ret = !cond(
191     !eq(name, "i1"): Int1Regs,
192     !eq(name, "i16"): Int16Regs,
193     !eq(name, "v2i16"): Int32Regs,
194     !eq(name, "i32"): Int32Regs,
195     !eq(name, "i64"): Int64Regs,
196     !eq(name, "f16"): Int16Regs,
197     !eq(name, "v2f16"): Int32Regs,
198     !eq(name, "bf16"): Int16Regs,
199     !eq(name, "v2bf16"): Int32Regs,
200     !eq(name, "f32"): Float32Regs,
201     !eq(name, "f64"): Float64Regs,
202     !eq(name, "ai32"): Int32ArgRegs,
203     !eq(name, "ai64"): Int64ArgRegs,
204     !eq(name, "af32"): Float32ArgRegs,
205     !eq(name, "af64"): Float64ArgRegs,
206    );
207}
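
// For illustration only (not a definition used here): the helper is intended
// to be consumed via field access, e.g.
//   defvar RC16 = ValueToRegClass<f16>.ret;   // resolves to Int16Regs
//   defvar RC32 = ValueToRegClass<v2f16>.ret; // resolves to Int32Regs
// i.e. scalar and packed fp16/bf16 values live in the integer register files
// rather than in dedicated FP register classes.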
208
209
210//===----------------------------------------------------------------------===//
211// Some Common Instruction Class Templates
212//===----------------------------------------------------------------------===//
213
214// Utility class to wrap up information about a register and DAG type for more
215// convenient iteration and parameterization
216class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm> {
217  ValueType Ty = ty;
218  NVPTXRegClass RC = rc;
219  Operand Imm = imm;
220  int Size = ty.Size;
221}
222
223def I16RT : RegTyInfo<i16, Int16Regs, i16imm>;
224def I32RT : RegTyInfo<i32, Int32Regs, i32imm>;
225def I64RT : RegTyInfo<i64, Int64Regs, i64imm>;
226
227// Template for instructions which take three int64, int32, or int16 args.
228// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
229multiclass I3<string OpcStr, SDNode OpNode, bit commutative> {
230  foreach t = [I16RT, I32RT, I64RT] in {
231    defvar asmstr = OpcStr # t.Size # " \t$dst, $a, $b;";
232
233    def t.Ty # rr :
234      NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b),
235                asmstr,
236                [(set t.Ty:$dst, (OpNode t.Ty:$a, t.Ty:$b))]>;
237    def t.Ty # ri :
238      NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b),
239                asmstr,
240                [(set t.Ty:$dst, (OpNode t.RC:$a, imm:$b))]>;
241    if !not(commutative) then
242      def t.Ty # ir :
243        NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b),
244                  asmstr,
245                  [(set t.Ty:$dst, (OpNode imm:$a, t.RC:$b))]>;
246  }
247}
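
// Example expansion (illustrative): "defm ADD : I3<"add.s", add, true>;" later
// in this file produces, among others,
//   def ADDi32rr : NVPTXInst<(outs Int32Regs:$dst),
//                            (ins Int32Regs:$a, Int32Regs:$b),
//                            "add.s32 \t$dst, $a, $b;", ...>;
//   def ADDi32ri : ... (ins Int32Regs:$a, i32imm:$b) ... "add.s32 ..." ...;
// Because add is commutative, no "ir" (immediate-first) variants are emitted.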
248
249class I16x2<string OpcStr, SDNode OpNode> :
250 NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
251              !strconcat(OpcStr, "16x2 \t$dst, $a, $b;"),
252              [(set v2i16:$dst, (OpNode v2i16:$a, v2i16:$b))]>,
253              Requires<[hasPTX<80>, hasSM<90>]>;
254
255// Template for add/sub-with-carry instructions which take three i32 or i64
256// args.  The instructions are named "<OpcStr>.s<Width>" (e.g. "addc.cc.s32").
257multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
258  let hasSideEffects = 1 in {
259    def i32rr :
260      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
261                !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
262                [(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
263    def i32ri :
264      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
265                !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
266                [(set i32:$dst, (OpNode i32:$a, imm:$b))]>;
267    def i64rr :
268      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
269                !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
270                [(set i64:$dst, (OpNode i64:$a, i64:$b))]>,
271      Requires<[hasPTX<43>]>;
272    def i64ri :
273      NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
274                !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
275                [(set i64:$dst, (OpNode i64:$a, imm:$b))]>,
276      Requires<[hasPTX<43>]>;
277  }
278}
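
// Example expansion (illustrative): "defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;"
// below yields ADDCCi32rr/ADDCCi32ri emitting "add.cc.s32 \t$dst, $a, $b;" and,
// gated on hasPTX<43>, ADDCCi64rr/ADDCCi64ri emitting "add.cc.s64 ...".
// hasSideEffects = 1 presumably keeps the carry-producing and carry-consuming
// halves from being reordered or dropped, since the CC flag is not modeled.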
279
280// Template for minimum/maximum instructions.
281//
282// Also defines ftz (flush subnormal inputs and results to sign-preserving
283// zero) variants for fp32 functions.
284multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
285  if !not(NaN) then {
286   def f64rr :
287     NVPTXInst<(outs Float64Regs:$dst),
288               (ins Float64Regs:$a, Float64Regs:$b),
289               !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
290               [(set f64:$dst, (OpNode f64:$a, f64:$b))]>;
291   def f64ri :
292     NVPTXInst<(outs Float64Regs:$dst),
293               (ins Float64Regs:$a, f64imm:$b),
294               !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
295               [(set f64:$dst, (OpNode f64:$a, fpimm:$b))]>;
296  }
297   def f32rr_ftz :
298     NVPTXInst<(outs Float32Regs:$dst),
299               (ins Float32Regs:$a, Float32Regs:$b),
300               !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
301               [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
302               Requires<[doF32FTZ]>;
303   def f32ri_ftz :
304     NVPTXInst<(outs Float32Regs:$dst),
305               (ins Float32Regs:$a, f32imm:$b),
306               !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
307               [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
308               Requires<[doF32FTZ]>;
309   def f32rr :
310     NVPTXInst<(outs Float32Regs:$dst),
311               (ins Float32Regs:$a, Float32Regs:$b),
312               !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
313               [(set f32:$dst, (OpNode f32:$a, f32:$b))]>;
314   def f32ri :
315     NVPTXInst<(outs Float32Regs:$dst),
316               (ins Float32Regs:$a, f32imm:$b),
317               !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
318               [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>;
319
320   def f16rr_ftz :
321     NVPTXInst<(outs Int16Regs:$dst),
322               (ins Int16Regs:$a, Int16Regs:$b),
323               !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
324               [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
325               Requires<[useFP16Math, doF32FTZ]>;
326   def f16rr :
327     NVPTXInst<(outs Int16Regs:$dst),
328               (ins Int16Regs:$a, Int16Regs:$b),
329               !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
330               [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
331               Requires<[useFP16Math, hasSM<80>, hasPTX<70>]>;
332
333   def f16x2rr_ftz :
334     NVPTXInst<(outs Int32Regs:$dst),
335               (ins Int32Regs:$a, Int32Regs:$b),
336               !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
337               [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
338               Requires<[useFP16Math, hasSM<80>, hasPTX<70>, doF32FTZ]>;
339   def f16x2rr :
340     NVPTXInst<(outs Int32Regs:$dst),
341               (ins Int32Regs:$a, Int32Regs:$b),
342               !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
343               [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
344               Requires<[useFP16Math, hasSM<80>, hasPTX<70>]>;
345   def bf16rr :
346     NVPTXInst<(outs Int16Regs:$dst),
347               (ins Int16Regs:$a, Int16Regs:$b),
348               !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"),
349               [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
350               Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
351   def bf16x2rr :
352     NVPTXInst<(outs Int32Regs:$dst),
353               (ins Int32Regs:$a, Int32Regs:$b),
354               !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"),
355               [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
356               Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
357}
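
// Example expansion (illustrative): "defm FMIN : FMINIMUMMAXIMUM<"min", false, fminnum>;"
// below produces FMINf32rr ("min.f32"), FMINf32rr_ftz ("min.ftz.f32", selected
// only under doF32FTZ), plus f64/f16/f16x2/bf16/bf16x2 forms.  The NaN = true
// instantiations (e.g. "min.NaN") skip the f64 variants, presumably because
// PTX has no min.NaN.f64/max.NaN.f64.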
358
359// Template for instructions which take three FP args.  The
360// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
361//
362// Also defines ftz (flush subnormal inputs and results to sign-preserving
363// zero) variants for fp32/fp16 functions.
364//
365// This multiclass should be used for nodes that can be folded to make fma ops.
366// In this case, we use the ".rn" variant when FMA is disabled, as this behaves
367// just like the non ".rn" op, but prevents ptxas from creating FMAs.
368multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
369   def f64rr :
370     NVPTXInst<(outs Float64Regs:$dst),
371               (ins Float64Regs:$a, Float64Regs:$b),
372               !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
373               [(set f64:$dst, (OpNode f64:$a, f64:$b))]>,
374               Requires<[allowFMA]>;
375   def f64ri :
376     NVPTXInst<(outs Float64Regs:$dst),
377               (ins Float64Regs:$a, f64imm:$b),
378               !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
379               [(set f64:$dst, (OpNode f64:$a, fpimm:$b))]>,
380               Requires<[allowFMA]>;
381   def f32rr_ftz :
382     NVPTXInst<(outs Float32Regs:$dst),
383               (ins Float32Regs:$a, Float32Regs:$b),
384               !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
385               [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
386               Requires<[allowFMA, doF32FTZ]>;
387   def f32ri_ftz :
388     NVPTXInst<(outs Float32Regs:$dst),
389               (ins Float32Regs:$a, f32imm:$b),
390               !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
391               [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
392               Requires<[allowFMA, doF32FTZ]>;
393   def f32rr :
394     NVPTXInst<(outs Float32Regs:$dst),
395               (ins Float32Regs:$a, Float32Regs:$b),
396               !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
397               [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
398               Requires<[allowFMA]>;
399   def f32ri :
400     NVPTXInst<(outs Float32Regs:$dst),
401               (ins Float32Regs:$a, f32imm:$b),
402               !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
403               [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
404               Requires<[allowFMA]>;
405
406   def f16rr_ftz :
407     NVPTXInst<(outs Int16Regs:$dst),
408               (ins Int16Regs:$a, Int16Regs:$b),
409               !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
410               [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
411               Requires<[useFP16Math, allowFMA, doF32FTZ]>;
412   def f16rr :
413     NVPTXInst<(outs Int16Regs:$dst),
414               (ins Int16Regs:$a, Int16Regs:$b),
415               !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
416               [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
417               Requires<[useFP16Math, allowFMA]>;
418
419   def f16x2rr_ftz :
420     NVPTXInst<(outs Int32Regs:$dst),
421               (ins Int32Regs:$a, Int32Regs:$b),
422               !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
423               [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
424               Requires<[useFP16Math, allowFMA, doF32FTZ]>;
425   def f16x2rr :
426     NVPTXInst<(outs Int32Regs:$dst),
427               (ins Int32Regs:$a, Int32Regs:$b),
428               !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
429               [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
430               Requires<[useFP16Math, allowFMA]>;
431   def bf16rr :
432     NVPTXInst<(outs Int16Regs:$dst),
433               (ins Int16Regs:$a, Int16Regs:$b),
434               !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"),
435               [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
436               Requires<[hasBF16Math, allowFMA]>;
437
438   def bf16x2rr :
439     NVPTXInst<(outs Int32Regs:$dst),
440               (ins Int32Regs:$a, Int32Regs:$b),
441               !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"),
442               [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
443               Requires<[hasBF16Math, allowFMA]>;
444   // These have strange names so we don't perturb existing mir tests.
445   def _rnf64rr :
446     NVPTXInst<(outs Float64Regs:$dst),
447               (ins Float64Regs:$a, Float64Regs:$b),
448               !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
449               [(set f64:$dst, (OpNode f64:$a, f64:$b))]>,
450               Requires<[noFMA]>;
451   def _rnf64ri :
452     NVPTXInst<(outs Float64Regs:$dst),
453               (ins Float64Regs:$a, f64imm:$b),
454               !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
455               [(set f64:$dst, (OpNode f64:$a, fpimm:$b))]>,
456               Requires<[noFMA]>;
457   def _rnf32rr_ftz :
458     NVPTXInst<(outs Float32Regs:$dst),
459               (ins Float32Regs:$a, Float32Regs:$b),
460               !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
461               [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
462               Requires<[noFMA, doF32FTZ]>;
463   def _rnf32ri_ftz :
464     NVPTXInst<(outs Float32Regs:$dst),
465               (ins Float32Regs:$a, f32imm:$b),
466               !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
467               [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
468               Requires<[noFMA, doF32FTZ]>;
469   def _rnf32rr :
470     NVPTXInst<(outs Float32Regs:$dst),
471               (ins Float32Regs:$a, Float32Regs:$b),
472               !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
473               [(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
474               Requires<[noFMA]>;
475   def _rnf32ri :
476     NVPTXInst<(outs Float32Regs:$dst),
477               (ins Float32Regs:$a, f32imm:$b),
478               !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
479               [(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
480               Requires<[noFMA]>;
481   def _rnf16rr_ftz :
482     NVPTXInst<(outs Int16Regs:$dst),
483               (ins Int16Regs:$a, Int16Regs:$b),
484               !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"),
485               [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
486               Requires<[useFP16Math, noFMA, doF32FTZ]>;
487   def _rnf16rr :
488     NVPTXInst<(outs Int16Regs:$dst),
489               (ins Int16Regs:$a, Int16Regs:$b),
490               !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
491               [(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
492               Requires<[useFP16Math, noFMA]>;
493   def _rnf16x2rr_ftz :
494     NVPTXInst<(outs Int32Regs:$dst),
495               (ins Int32Regs:$a, Int32Regs:$b),
496               !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
497               [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
498               Requires<[useFP16Math, noFMA, doF32FTZ]>;
499   def _rnf16x2rr :
500     NVPTXInst<(outs Int32Regs:$dst),
501               (ins Int32Regs:$a, Int32Regs:$b),
502               !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
503               [(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
504               Requires<[useFP16Math, noFMA]>;
505  def _rnbf16rr_ftz :
506     NVPTXInst<(outs Int16Regs:$dst),
507               (ins Int16Regs:$a, Int16Regs:$b),
508               !strconcat(OpcStr, ".rn.ftz.bf16 \t$dst, $a, $b;"),
509               [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
510               Requires<[hasBF16Math, noFMA, doF32FTZ]>;
511   def _rnbf16rr :
512     NVPTXInst<(outs Int16Regs:$dst),
513               (ins Int16Regs:$a, Int16Regs:$b),
514               !strconcat(OpcStr, ".rn.bf16 \t$dst, $a, $b;"),
515               [(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
516               Requires<[hasBF16Math, noFMA]>;
517   def _rnbf16x2rr_ftz :
518     NVPTXInst<(outs Int32Regs:$dst),
519               (ins Int32Regs:$a, Int32Regs:$b),
520               !strconcat(OpcStr, ".rn.ftz.bf16x2 \t$dst, $a, $b;"),
521               [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
522               Requires<[hasBF16Math, noFMA, doF32FTZ]>;
523   def _rnbf16x2rr :
524     NVPTXInst<(outs Int32Regs:$dst),
525               (ins Int32Regs:$a, Int32Regs:$b),
526               !strconcat(OpcStr, ".rn.bf16x2 \t$dst, $a, $b;"),
527               [(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
528               Requires<[hasBF16Math, noFMA]>;
529}
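
// Example expansion (illustrative): with "defm FADD : F3_fma_component<"add", fadd>;"
// below, an f32 fadd selects FADDf32rr ("add.f32") when FMA contraction is
// allowed and FADD_rnf32rr ("add.rn.f32") under noFMA; the explicit ".rn"
// rounding has the same semantics but stops ptxas from fusing the add into an
// FMA, as described in the comment above.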
530
531// Template for operations which take two f32 or f64 operands.  Provides three
532// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
533// subnormal inputs and results to zero).
534multiclass F2<string OpcStr, SDNode OpNode> {
535   def f64 :     NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
536                           !strconcat(OpcStr, ".f64 \t$dst, $a;"),
537                           [(set f64:$dst, (OpNode f64:$a))]>;
538   def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
539                           !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
540                           [(set f32:$dst, (OpNode f32:$a))]>,
541                           Requires<[doF32FTZ]>;
542   def f32 :     NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
543                           !strconcat(OpcStr, ".f32 \t$dst, $a;"),
544                           [(set f32:$dst, (OpNode f32:$a))]>;
545}
546
547multiclass F2_Support_Half<string OpcStr, SDNode OpNode> {
548   def bf16 :      NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
549                           !strconcat(OpcStr, ".bf16 \t$dst, $a;"),
550                           [(set bf16:$dst, (OpNode bf16:$a))]>,
551                           Requires<[hasSM<80>, hasPTX<70>]>;
552   def bf16x2 :    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
553                           !strconcat(OpcStr, ".bf16x2 \t$dst, $a;"),
554                           [(set v2bf16:$dst, (OpNode v2bf16:$a))]>,
555                           Requires<[hasSM<80>, hasPTX<70>]>;
556   def f16_ftz :   NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
557                           !strconcat(OpcStr, ".ftz.f16 \t$dst, $a;"),
558                           [(set f16:$dst, (OpNode f16:$a))]>,
559                           Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
560   def f16x2_ftz : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
561                           !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a;"),
562                           [(set v2f16:$dst, (OpNode v2f16:$a))]>,
563                           Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
564   def f16 :       NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
565                           !strconcat(OpcStr, ".f16 \t$dst, $a;"),
566                           [(set f16:$dst, (OpNode f16:$a))]>,
567                           Requires<[hasSM<53>, hasPTX<65>]>;
568   def f16x2 :     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
569                           !strconcat(OpcStr, ".f16x2 \t$dst, $a;"),
570                           [(set v2f16:$dst, (OpNode v2f16:$a))]>,
571                           Requires<[hasSM<53>, hasPTX<65>]>;
572
573}
574
575// Variant where only .ftz.bf16 is supported.
576multiclass F2_Support_Half_BF<string OpcStr, SDNode OpNode> {
577   def bf16_ftz :  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
578                           OpcStr # ".ftz.bf16 \t$dst, $a;",
579                           [(set bf16:$dst, (OpNode bf16:$a))]>,
580                           Requires<[hasSM<90>, hasPTX<78>]>;
581   def bf16x2_ftz: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
582                           OpcStr # ".ftz.bf16x2 \t$dst, $a;",
583                           [(set v2bf16:$dst, (OpNode v2bf16:$a))]>,
584                           Requires<[hasSM<90>, hasPTX<78>]>;
585}
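
// Example expansion (illustrative): "defm FSQRT : F2<"sqrt.rn", fsqrt>;" below
// yields FSQRTf64 ("sqrt.rn.f64"), FSQRTf32 ("sqrt.rn.f32"), and FSQRTf32_ftz
// ("sqrt.rn.ftz.f32", selected only under doF32FTZ), while
// "defm FABS_H : F2_Support_Half<"abs", fabs>;" adds the f16/f16x2/bf16/bf16x2
// forms gated on the SM/PTX versions listed in the multiclass.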
586
587//===----------------------------------------------------------------------===//
588// NVPTX Instructions.
589//===----------------------------------------------------------------------===//
590
591//-----------------------------------
592// Type Conversion
593//-----------------------------------
594
595let hasSideEffects = false in {
596  // Generate a cvt to the given type from all possible types.  Each instance
597  // takes a CvtMode immediate that defines the conversion mode to use.  It can
598  // be CvtNONE to omit a conversion mode.
599  multiclass CVT_FROM_ALL<string ToType, RegisterClass RC, list<Predicate> Preds = []> {
600    def _s8 :
601      NVPTXInst<(outs RC:$dst),
602                (ins Int16Regs:$src, CvtMode:$mode),
603                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
604                ToType, ".s8 \t$dst, $src;"), []>,
605      Requires<Preds>;
606    def _u8 :
607      NVPTXInst<(outs RC:$dst),
608                (ins Int16Regs:$src, CvtMode:$mode),
609                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
610                ToType, ".u8 \t$dst, $src;"), []>,
611      Requires<Preds>;
612    def _s16 :
613      NVPTXInst<(outs RC:$dst),
614                (ins Int16Regs:$src, CvtMode:$mode),
615                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
616                ToType, ".s16 \t$dst, $src;"), []>,
617      Requires<Preds>;
618    def _u16 :
619      NVPTXInst<(outs RC:$dst),
620                (ins Int16Regs:$src, CvtMode:$mode),
621                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
622                ToType, ".u16 \t$dst, $src;"), []>,
623      Requires<Preds>;
624    def _s32 :
625      NVPTXInst<(outs RC:$dst),
626                (ins Int32Regs:$src, CvtMode:$mode),
627                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
628                ToType, ".s32 \t$dst, $src;"), []>,
629      Requires<Preds>;
630    def _u32 :
631      NVPTXInst<(outs RC:$dst),
632                (ins Int32Regs:$src, CvtMode:$mode),
633                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
634                ToType, ".u32 \t$dst, $src;"), []>,
635      Requires<Preds>;
636    def _s64 :
637      NVPTXInst<(outs RC:$dst),
638                (ins Int64Regs:$src, CvtMode:$mode),
639                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
640                ToType, ".s64 \t$dst, $src;"), []>,
641      Requires<Preds>;
642    def _u64 :
643      NVPTXInst<(outs RC:$dst),
644                (ins Int64Regs:$src, CvtMode:$mode),
645                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
646                ToType, ".u64 \t$dst, $src;"), []>,
647      Requires<Preds>;
648    def _f16 :
649      NVPTXInst<(outs RC:$dst),
650                (ins Int16Regs:$src, CvtMode:$mode),
651                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
652                ToType, ".f16 \t$dst, $src;"), []>,
653      Requires<Preds>;
654    def _bf16 :
655      NVPTXInst<(outs RC:$dst),
656                (ins Int16Regs:$src, CvtMode:$mode),
657                !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
658                ToType, ".bf16 \t$dst, $src;"), []>,
659      Requires<!if(!eq(ToType, "f32"),
660                   // bf16->f32 was introduced early.
661                   [hasPTX<71>, hasSM<80>],
662                   // bf16->everything else needs sm90/ptx78
663                   [hasPTX<78>, hasSM<90>])>;
664    def _f32 :
665      NVPTXInst<(outs RC:$dst),
666                (ins Float32Regs:$src, CvtMode:$mode),
667                !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
668                ToType, ".f32 \t$dst, $src;"), []>,
669      Requires<!if(!eq(ToType, "bf16"),
670                   // f32->bf16 was introduced early.
671                   [hasPTX<70>, hasSM<80>],
672                   Preds)>;
673    def _f64 :
674      NVPTXInst<(outs RC:$dst),
675                (ins Float64Regs:$src, CvtMode:$mode),
676                !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
677                ToType, ".f64 \t$dst, $src;"), []>,
678      Requires<Preds>;
679  }
680
681  // Generate cvts from all types to all types.
682  defm CVT_s8  : CVT_FROM_ALL<"s8",  Int16Regs>;
683  defm CVT_u8  : CVT_FROM_ALL<"u8",  Int16Regs>;
684  defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
685  defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
686  defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
687  defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
688  defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
689  defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
690  defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
691  defm CVT_bf16 : CVT_FROM_ALL<"bf16", Int16Regs, [hasPTX<78>, hasSM<90>]>;
692  defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
693  defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
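
  // Naming sketch (illustrative): each defm above crosses the destination type
  // with every source suffix, so e.g. CVT_s32_f32 is an Int32Regs <- Float32Regs
  // conversion whose asm string is
  //   "cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.s32.f32 \t$dst, $src;"
  // Instruction selection is expected to supply a CvtMode constant such as
  // CvtRZI (giving "cvt.rzi.s32.f32") or CvtNONE when no modifier is wanted.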
694
695  // These cvts are different from those above: The source and dest registers
696  // are of the same type.
697  def CVT_INREG_s16_s8 :  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
698                                    "cvt.s16.s8 \t$dst, $src;", []>;
699  def CVT_INREG_s32_s8 :  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
700                                    "cvt.s32.s8 \t$dst, $src;", []>;
701  def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
702                                    "cvt.s32.s16 \t$dst, $src;", []>;
703  def CVT_INREG_s64_s8 :  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
704                                    "cvt.s64.s8 \t$dst, $src;", []>;
705  def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
706                                    "cvt.s64.s16 \t$dst, $src;", []>;
707  def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
708                                    "cvt.s64.s32 \t$dst, $src;", []>;
709
710  multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> {
711    def _f32 :
712      NVPTXInst<(outs RC:$dst),
713                (ins Float32Regs:$src1, Float32Regs:$src2,  CvtMode:$mode),
714                !strconcat("cvt${mode:base}${mode:relu}.",
715                FromName, ".f32 \t$dst, $src1, $src2;"), []>,
716    Requires<[hasPTX<70>, hasSM<80>]>;
717  }
718
719  defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Int32Regs>;
720  defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
721
722  // FP8 conversions.
723  multiclass CVT_TO_F8X2<string F8Name> {
724    def _f32 :
725      NVPTXInst<(outs Int16Regs:$dst),
726                (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
727                !strconcat("cvt${mode:base}.satfinite${mode:relu}.",
728                F8Name, "x2.f32 \t$dst, $src1, $src2;"), []>,
729      Requires<[hasPTX<81>, hasSM<89>]>;
730    def _f16x2 :
731      NVPTXInst<(outs Int16Regs:$dst),
732                (ins Int32Regs:$src, CvtMode:$mode),
733                !strconcat("cvt${mode:base}.satfinite${mode:relu}.",
734                F8Name, "x2.f16x2 \t$dst, $src;"), []>,
735      Requires<[hasPTX<81>, hasSM<89>]>;
736  }
737
738  defm CVT_e4m3x2 : CVT_TO_F8X2<"e4m3">;
739  defm CVT_e5m2x2 : CVT_TO_F8X2<"e5m2">;
740
741  class CVT_f16x2_fp8<string F8Name> :
742    NVPTXInst<(outs Int32Regs:$dst),
743              (ins Int16Regs:$src, CvtMode:$mode),
744              !strconcat("cvt${mode:base}${mode:relu}.f16x2.",
745              F8Name, "x2 \t$dst, $src;"), []>,
746    Requires<[hasPTX<81>, hasSM<89>]>;
747
748  def CVT_f16x2_e4m3x2 : CVT_f16x2_fp8<"e4m3">;
749  def CVT_f16x2_e5m2x2 : CVT_f16x2_fp8<"e5m2">;
750
751  // Float to TF32 conversions
752  multiclass CVT_TO_TF32<string Modifier, list<Predicate> Preds = [hasPTX<78>, hasSM<90>]> {
753    defvar Intr = !cast<Intrinsic>("int_nvvm_f2tf32_" # !subst(".", "_", Modifier));
754
755    def NAME : NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$src),
756               "cvt." # Modifier # ".tf32.f32 \t$dst, $src;",
757               [(set i32:$dst, (Intr f32:$src))]>,
758               Requires<Preds>;
759  }
760
761  defm CVT_to_tf32_rn : CVT_TO_TF32<"rn">;
762  defm CVT_to_tf32_rz : CVT_TO_TF32<"rz">;
763  defm CVT_to_tf32_rn_relu  : CVT_TO_TF32<"rn.relu">;
764  defm CVT_to_tf32_rz_relu  : CVT_TO_TF32<"rz.relu">;
765  defm CVT_to_tf32_rna      : CVT_TO_TF32<"rna", [hasPTX<70>, hasSM<80>]>;
766  defm CVT_to_tf32_rna_satf : CVT_TO_TF32<"rna.satfinite", [hasPTX<81>, hasSM<89>]>;
767
768  defm CVT_to_tf32_rn_satf : CVT_TO_TF32<"rn.satfinite", [hasPTX<86>, hasSM<100>]>;
769  defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, hasSM<100>]>;
770  defm CVT_to_tf32_rn_relu_satf  : CVT_TO_TF32<"rn.relu.satfinite", [hasPTX<86>, hasSM<100>]>;
771  defm CVT_to_tf32_rz_relu_satf  : CVT_TO_TF32<"rz.relu.satfinite", [hasPTX<86>, hasSM<100>]>;
772}
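
// Example (illustrative): CVT_to_tf32_rn above matches the int_nvvm_f2tf32_rn
// intrinsic directly (the defvar derives the intrinsic name from the modifier
// string) and emits "cvt.rn.tf32.f32 \t$dst, $src;"; the Preds argument picks
// the minimum PTX/SM version required by each modifier combination.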
773
774def fpround_oneuse : PatFrag<(ops node:$a), (fpround node:$a), [{
775  return N->hasOneUse();
776}]>;
777
778def : Pat<(v2bf16 (build_vector (bf16 (fpround_oneuse f32:$lo)),
779                                (bf16 (fpround_oneuse f32:$hi)))),
780          (CVT_bf16x2_f32 $hi, $lo, CvtRN)>,
781      Requires<[hasPTX<70>, hasSM<80>, hasBF16Math]>;
782
783def : Pat<(v2f16 (build_vector (f16 (fpround_oneuse f32:$lo)),
784                               (f16 (fpround_oneuse f32:$hi)))),
785          (CVT_f16x2_f32 $hi, $lo, CvtRN)>,
786      Requires<[hasPTX<70>, hasSM<80>, useFP16Math]>;
787
788//-----------------------------------
789// Selection instructions (selp)
790//-----------------------------------
791
792// TODO: Missing slct
793
794// selp instructions that don't have any pattern matches; we explicitly use
795// them within this file.
796let hasSideEffects = false in {
797  multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
798    def rr : NVPTXInst<(outs RC:$dst),
799                       (ins RC:$a, RC:$b, Int1Regs:$p),
800                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
801    def ri : NVPTXInst<(outs RC:$dst),
802                       (ins RC:$a, ImmCls:$b, Int1Regs:$p),
803                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
804    def ir : NVPTXInst<(outs RC:$dst),
805                       (ins ImmCls:$a, RC:$b, Int1Regs:$p),
806                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
807    def ii : NVPTXInst<(outs RC:$dst),
808                       (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
809                       !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
810  }
811
812  multiclass SELP_PATTERN<string TypeStr, ValueType T, RegisterClass RC,
813                          Operand ImmCls, SDNode ImmNode> {
814    def rr :
815      NVPTXInst<(outs RC:$dst),
816                (ins RC:$a, RC:$b, Int1Regs:$p),
817                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
818                [(set T:$dst, (select i1:$p, T:$a, T:$b))]>;
819    def ri :
820      NVPTXInst<(outs RC:$dst),
821                (ins RC:$a, ImmCls:$b, Int1Regs:$p),
822                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
823                [(set T:$dst, (select i1:$p, T:$a, (T ImmNode:$b)))]>;
824    def ir :
825      NVPTXInst<(outs RC:$dst),
826                (ins ImmCls:$a, RC:$b, Int1Regs:$p),
827                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
828                [(set T:$dst, (select i1:$p, ImmNode:$a, T:$b))]>;
829    def ii :
830      NVPTXInst<(outs RC:$dst),
831                (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
832                !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
833                [(set T:$dst, (select i1:$p, ImmNode:$a, ImmNode:$b))]>;
834  }
835}
836
837// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
838// good.
839defm SELP_b16 : SELP_PATTERN<"b16", i16, Int16Regs, i16imm, imm>;
840defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
841defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
842defm SELP_b32 : SELP_PATTERN<"b32", i32, Int32Regs, i32imm, imm>;
843defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
844defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
845defm SELP_b64 : SELP_PATTERN<"b64", i64, Int64Regs, i64imm, imm>;
846defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
847defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
848defm SELP_f16 : SELP_PATTERN<"b16", f16, Int16Regs, f16imm, fpimm>;
849defm SELP_bf16 : SELP_PATTERN<"b16", bf16, Int16Regs, bf16imm, fpimm>;
850
851defm SELP_f32 : SELP_PATTERN<"f32", f32, Float32Regs, f32imm, fpimm>;
852defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;
853
854// This does not work as tablegen fails to infer the type of 'imm'.
855// def v2f16imm : Operand<v2f16>;
856// defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>;
857
858foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
859def : Pat<(vt (select i1:$p, vt:$a, vt:$b)),
860          (SELP_b32rr $a, $b, $p)>;
861}
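
// Selection sketch (illustrative): a generic "select i1 %p, i32 %a, i32 %b"
// maps to SELP_b32rr and prints "selp.b32 \t$dst, $a, $b, $p;".  The foreach
// above reuses the same SELP_b32rr for the packed v2f16/v2bf16/v2i16/v4i8
// types, since selp.b32 simply moves 32 bits regardless of how they are
// interpreted.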
862
863//-----------------------------------
864// Test Instructions
865//-----------------------------------
866
867def TESTINF_f32r : NVPTXInst<(outs Int1Regs:$p), (ins Float32Regs:$a),
868                             "testp.infinite.f32 \t$p, $a;",
869                             []>;
870def TESTINF_f32i : NVPTXInst<(outs Int1Regs:$p), (ins f32imm:$a),
871                             "testp.infinite.f32 \t$p, $a;",
872                             []>;
873def TESTINF_f64r : NVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a),
874                             "testp.infinite.f64 \t$p, $a;",
875                             []>;
876def TESTINF_f64i : NVPTXInst<(outs Int1Regs:$p), (ins f64imm:$a),
877                             "testp.infinite.f64 \t$p, $a;",
878                             []>;
879
880//-----------------------------------
881// Integer Arithmetic
882//-----------------------------------
883
884// Template for xor masquerading as int1 arithmetic.
885multiclass ADD_SUB_i1<SDNode OpNode> {
886   def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
887                      "xor.pred \t$dst, $a, $b;",
888                      [(set i1:$dst, (OpNode i1:$a, i1:$b))]>;
889   def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
890                      "xor.pred \t$dst, $a, $b;",
891                      [(set i1:$dst, (OpNode i1:$a, (imm):$b))]>;
892}
893
894// int1 addition and subtraction are both just xor.
895defm ADD_i1 : ADD_SUB_i1<add>;
896defm SUB_i1 : ADD_SUB_i1<sub>;
897
898// int16, int32, and int64 signed addition.  Since nvptx is 2's complement, we
899// also use these for unsigned arithmetic.
900defm ADD : I3<"add.s", add, /*commutative=*/ true>;
901defm SUB : I3<"sub.s", sub, /*commutative=*/ false>;
902
903def ADD16x2 : I16x2<"add.s", add>;
904
905// int32 and int64 addition and subtraction with carry-out.
906defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;
907defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;
908
909// int32 and int64 addition and subtraction with carry-in and carry-out.
910defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
911defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;
912
913defm MULT : I3<"mul.lo.s", mul, /*commutative=*/ true>;
914
915defm MULTHS : I3<"mul.hi.s", mulhs, /*commutative=*/ true>;
916defm MULTHU : I3<"mul.hi.u", mulhu, /*commutative=*/ true>;
917
918defm SDIV : I3<"div.s", sdiv, /*commutative=*/ false>;
919defm UDIV : I3<"div.u", udiv, /*commutative=*/ false>;
920
921// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
922// will lower it.
923defm SREM : I3<"rem.s", srem, /*commutative=*/ false>;
924defm UREM : I3<"rem.u", urem, /*commutative=*/ false>;
925
926// Integer absolute value.  Each instantiation below maps the generic 'abs'
927// node on i16/i32/i64 directly to the corresponding PTX abs.s16/abs.s32/
928// abs.s64 instruction.
929multiclass ABS<ValueType T, RegisterClass RC, string SizeName> {
930  def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
931                  !strconcat("abs", SizeName, " \t$dst, $a;"),
932                  [(set T:$dst, (abs T:$a))]>;
933}
934defm ABS_16 : ABS<i16, Int16Regs, ".s16">;
935defm ABS_32 : ABS<i32, Int32Regs, ".s32">;
936defm ABS_64 : ABS<i64, Int64Regs, ".s64">;
937
938// Integer min/max.
939defm SMAX : I3<"max.s", smax, /*commutative=*/ true>;
940defm UMAX : I3<"max.u", umax, /*commutative=*/ true>;
941defm SMIN : I3<"min.s", smin, /*commutative=*/ true>;
942defm UMIN : I3<"min.u", umin, /*commutative=*/ true>;
943
944def SMAX16x2 : I16x2<"max.s", smax>;
945def UMAX16x2 : I16x2<"max.u", umax>;
946def SMIN16x2 : I16x2<"min.s", smin>;
947def UMIN16x2 : I16x2<"min.u", umin>;
948
949
950//
951// Wide multiplication
952//
953def MULWIDES64 :
954  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
955            "mul.wide.s32 \t$dst, $a, $b;", []>;
956def MULWIDES64Imm :
957  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
958            "mul.wide.s32 \t$dst, $a, $b;", []>;
959def MULWIDES64Imm64 :
960  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
961            "mul.wide.s32 \t$dst, $a, $b;", []>;
962
963def MULWIDEU64 :
964  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
965            "mul.wide.u32 \t$dst, $a, $b;", []>;
966def MULWIDEU64Imm :
967  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
968            "mul.wide.u32 \t$dst, $a, $b;", []>;
969def MULWIDEU64Imm64 :
970  NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
971            "mul.wide.u32 \t$dst, $a, $b;", []>;
972
973def MULWIDES32 :
974  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
975            "mul.wide.s16 \t$dst, $a, $b;", []>;
976def MULWIDES32Imm :
977  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
978            "mul.wide.s16 \t$dst, $a, $b;", []>;
979def MULWIDES32Imm32 :
980  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
981            "mul.wide.s16 \t$dst, $a, $b;", []>;
982
983def MULWIDEU32 :
984  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
985            "mul.wide.u16 \t$dst, $a, $b;", []>;
986def MULWIDEU32Imm :
987  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
988            "mul.wide.u16 \t$dst, $a, $b;", []>;
989def MULWIDEU32Imm32 :
990  NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
991            "mul.wide.u16 \t$dst, $a, $b;", []>;
992
993def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
994def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
995def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
996
997// Matchers for signed, unsigned mul.wide ISD nodes.
998def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
999          (MULWIDES32 $a, $b)>,
1000      Requires<[doMulWide]>;
1001def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)),
1002          (MULWIDES32Imm $a, imm:$b)>,
1003      Requires<[doMulWide]>;
1004def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
1005          (MULWIDEU32 $a, $b)>,
1006      Requires<[doMulWide]>;
1007def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)),
1008          (MULWIDEU32Imm $a, imm:$b)>,
1009      Requires<[doMulWide]>;
1010
1011def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
1012          (MULWIDES64 $a, $b)>,
1013      Requires<[doMulWide]>;
1014def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)),
1015          (MULWIDES64Imm $a, imm:$b)>,
1016      Requires<[doMulWide]>;
1017def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
1018          (MULWIDEU64 $a, $b)>,
1019      Requires<[doMulWide]>;
1020def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)),
1021          (MULWIDEU64Imm $a, imm:$b)>,
1022      Requires<[doMulWide]>;
1023
1024// Predicates used for converting some patterns to mul.wide.
1025def SInt32Const : PatLeaf<(imm), [{
1026  const APInt &v = N->getAPIntValue();
1027  return v.isSignedIntN(32);
1028}]>;
1029
1030def UInt32Const : PatLeaf<(imm), [{
1031  const APInt &v = N->getAPIntValue();
1032  return v.isIntN(32);
1033}]>;
1034
1035def SInt16Const : PatLeaf<(imm), [{
1036  const APInt &v = N->getAPIntValue();
1037  return v.isSignedIntN(16);
1038}]>;
1039
1040def UInt16Const : PatLeaf<(imm), [{
1041  const APInt &v = N->getAPIntValue();
1042  return v.isIntN(16);
1043}]>;
1044
1045def IntConst_0_30 : PatLeaf<(imm), [{
1046  // Check if 0 <= v < 31; only then will the result of (x << v) be an int32.
1047  const APInt &v = N->getAPIntValue();
1048  return v.sge(0) && v.slt(31);
1049}]>;
1050
1051def IntConst_0_14 : PatLeaf<(imm), [{
1052  // Check if 0 <= v < 15; only then will the result of (x << v) be an int16.
1053  const APInt &v = N->getAPIntValue();
1054  return v.sge(0) && v.slt(15);
1055}]>;
1056
1057def SHL2MUL32 : SDNodeXForm<imm, [{
1058  const APInt &v = N->getAPIntValue();
1059  APInt temp(32, 1);
1060  return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
1061}]>;
1062
1063def SHL2MUL16 : SDNodeXForm<imm, [{
1064  const APInt &v = N->getAPIntValue();
1065  APInt temp(16, 1);
1066  return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
1067}]>;
1068
1069// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
1070def : Pat<(shl (sext i32:$a), (i32 IntConst_0_30:$b)),
1071          (MULWIDES64Imm $a, (SHL2MUL32 $b))>,
1072      Requires<[doMulWide]>;
1073def : Pat<(shl (zext i32:$a), (i32 IntConst_0_30:$b)),
1074          (MULWIDEU64Imm $a, (SHL2MUL32 $b))>,
1075      Requires<[doMulWide]>;
1076
1077def : Pat<(shl (sext i16:$a), (i16 IntConst_0_14:$b)),
1078          (MULWIDES32Imm $a, (SHL2MUL16 $b))>,
1079      Requires<[doMulWide]>;
1080def : Pat<(shl (zext i16:$a), (i16 IntConst_0_14:$b)),
1081          (MULWIDEU32Imm $a, (SHL2MUL16 $b))>,
1082      Requires<[doMulWide]>;
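
// Worked example (illustrative): under doMulWide, "shl (sext i32 %a), 4" is
// rewritten by the patterns above as (MULWIDES64Imm %a, 16), where SHL2MUL32
// folds the shift amount into the constant 1 << 4, emitting
//   mul.wide.s32 %rd, %r, 16;
// which produces the full 64-bit product directly.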
1083
1084// Convert "sign/zero-extend then multiply" to mul.wide.
1085def : Pat<(mul (sext i32:$a), (sext i32:$b)),
1086          (MULWIDES64 $a, $b)>,
1087      Requires<[doMulWide]>;
1088def : Pat<(mul (sext i32:$a), (i64 SInt32Const:$b)),
1089          (MULWIDES64Imm64 $a, (i64 SInt32Const:$b))>,
1090      Requires<[doMulWide]>;
1091
1092def : Pat<(mul (zext i32:$a), (zext i32:$b)),
1093          (MULWIDEU64 $a, $b)>,
1094      Requires<[doMulWide]>;
1095def : Pat<(mul (zext i32:$a), (i64 UInt32Const:$b)),
1096          (MULWIDEU64Imm64 $a, (i64 UInt32Const:$b))>,
1097      Requires<[doMulWide]>;
1098
1099def : Pat<(mul (sext i16:$a), (sext i16:$b)),
1100          (MULWIDES32 $a, $b)>,
1101      Requires<[doMulWide]>;
1102def : Pat<(mul (sext i16:$a), (i32 SInt16Const:$b)),
1103          (MULWIDES32Imm32 $a, (i32 SInt16Const:$b))>,
1104      Requires<[doMulWide]>;
1105
1106def : Pat<(mul (zext i16:$a), (zext i16:$b)),
1107          (MULWIDEU32 $a, $b)>,
1108      Requires<[doMulWide]>;
1109def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)),
1110          (MULWIDEU32Imm32 $a, (i32 UInt16Const:$b))>,
1111      Requires<[doMulWide]>;
1112
1113//
1114// Integer multiply-add
1115//
1116def mul_oneuse : PatFrag<(ops node:$a, node:$b), (mul node:$a, node:$b), [{
1117  return N->hasOneUse();
1118}]>;
1119
1120multiclass MAD<string Ptx, ValueType VT, NVPTXRegClass Reg, Operand Imm> {
1121  def rrr:
1122    NVPTXInst<(outs Reg:$dst),
1123              (ins Reg:$a, Reg:$b, Reg:$c),
1124              Ptx # " \t$dst, $a, $b, $c;",
1125              [(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), VT:$c))]>;
1126
1127  def rir:
1128    NVPTXInst<(outs Reg:$dst),
1129              (ins Reg:$a, Imm:$b, Reg:$c),
1130              Ptx # " \t$dst, $a, $b, $c;",
1131              [(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), VT:$c))]>;
1132  def rri:
1133    NVPTXInst<(outs Reg:$dst),
1134              (ins Reg:$a, Reg:$b, Imm:$c),
1135              Ptx # " \t$dst, $a, $b, $c;",
1136              [(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), imm:$c))]>;
1137  def rii:
1138    NVPTXInst<(outs Reg:$dst),
1139              (ins Reg:$a, Imm:$b, Imm:$c),
1140              Ptx # " \t$dst, $a, $b, $c;",
1141              [(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), imm:$c))]>;
1142}
1143
1144let Predicates = [hasOptEnabled] in {
1145defm MAD16 : MAD<"mad.lo.s16", i16, Int16Regs, i16imm>;
1146defm MAD32 : MAD<"mad.lo.s32", i32, Int32Regs, i32imm>;
1147defm MAD64 : MAD<"mad.lo.s64", i64, Int64Regs, i64imm>;
1148}
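
// Example (illustrative): when optimization is enabled (hasOptEnabled),
// "add (mul i32 %a, %b), %c" with a single-use multiply selects MAD32rrr and
// emits
//   mad.lo.s32 %r, %a, %b, %c;
// The mul_oneuse guard keeps the multiply from being folded into a mad when
// its result is still needed elsewhere.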
1149
1150def INEG16 :
1151  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
1152            "neg.s16 \t$dst, $src;",
1153            [(set i16:$dst, (ineg i16:$src))]>;
1154def INEG32 :
1155  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
1156            "neg.s32 \t$dst, $src;",
1157            [(set i32:$dst, (ineg i32:$src))]>;
1158def INEG64 :
1159  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
1160            "neg.s64 \t$dst, $src;",
1161            [(set i64:$dst, (ineg i64:$src))]>;
1162
1163//-----------------------------------
1164// Floating Point Arithmetic
1165//-----------------------------------
1166
1167// Constant 1.0f
1168def FloatConst1 : PatLeaf<(fpimm), [{
1169  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
1170         N->getValueAPF().convertToFloat() == 1.0f;
1171}]>;
1172// Constant 1.0 (double)
1173def DoubleConst1 : PatLeaf<(fpimm), [{
1174  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
1175         N->getValueAPF().convertToDouble() == 1.0;
1176}]>;
1177// Constant -1.0 (double)
1178def DoubleConstNeg1 : PatLeaf<(fpimm), [{
1179  return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
1180         N->getValueAPF().convertToDouble() == -1.0;
1181}]>;
1182
1183
1184// Constant -X -> X (double)
1185def NegDoubleConst : SDNodeXForm<fpimm, [{
1186  return CurDAG->getTargetConstantFP(-(N->getValueAPF()),
1187                                     SDLoc(N), MVT::f64);
1188}]>;
1189
1190defm FADD : F3_fma_component<"add", fadd>;
1191defm FSUB : F3_fma_component<"sub", fsub>;
1192defm FMUL : F3_fma_component<"mul", fmul>;
1193
1194defm FMIN : FMINIMUMMAXIMUM<"min", /* NaN */ false, fminnum>;
1195defm FMAX : FMINIMUMMAXIMUM<"max", /* NaN */ false, fmaxnum>;
1196defm FMINNAN : FMINIMUMMAXIMUM<"min.NaN", /* NaN */ true, fminimum>;
1197defm FMAXNAN : FMINIMUMMAXIMUM<"max.NaN", /* NaN */ true, fmaximum>;
1198
1199defm FABS  : F2<"abs", fabs>;
1200defm FNEG  : F2<"neg", fneg>;
1201defm FABS_H: F2_Support_Half<"abs", fabs>;
1202defm FNEG_H: F2_Support_Half<"neg", fneg>;
1203
1204defm FSQRT : F2<"sqrt.rn", fsqrt>;
1205
1206defm FEXP2_H: F2_Support_Half_BF<"ex2.approx", fexp2>;
1207
1208//
1209// F16 NEG
1210//
1211class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
1212      NVPTXInst<(outs RC:$dst), (ins RC:$src),
1213                !strconcat(OpcStr, " \t$dst, $src;"),
1214                [(set T:$dst, (fneg T:$src))]>,
1215                Requires<[useFP16Math, hasPTX<60>, hasSM<53>, Pred]>;
1216def FNEG16_ftz   : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>;
1217def FNEG16       : FNEG_F16_F16X2<"neg.f16", f16, Int16Regs, True>;
1218def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
1219def FNEG16x2     : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>;
1220
1221//
1222// BF16 NEG
1223//
1224
1225class FNEG_BF16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
1226      NVPTXInst<(outs RC:$dst), (ins RC:$src),
1227                !strconcat(OpcStr, " \t$dst, $src;"),
1228                [(set T:$dst, (fneg T:$src))]>,
1229                Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, Pred]>;
1230def BFNEG16_ftz   : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
1231def BFNEG16       : FNEG_BF16_F16X2<"neg.bf16", bf16, Int16Regs, True>;
1232def BFNEG16x2_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16x2", v2bf16, Int32Regs, doF32FTZ>;
1233def BFNEG16x2     : FNEG_BF16_F16X2<"neg.bf16x2", v2bf16, Int32Regs, True>;
1234
1235//
1236// F64 division
1237//
1238def FDIV641r :
1239  NVPTXInst<(outs Float64Regs:$dst),
1240            (ins f64imm:$a, Float64Regs:$b),
1241            "rcp.rn.f64 \t$dst, $b;",
1242            [(set f64:$dst, (fdiv DoubleConst1:$a, f64:$b))]>;
1243def FDIV64rr :
1244  NVPTXInst<(outs Float64Regs:$dst),
1245            (ins Float64Regs:$a, Float64Regs:$b),
1246            "div.rn.f64 \t$dst, $a, $b;",
1247            [(set f64:$dst, (fdiv f64:$a, f64:$b))]>;
1248def FDIV64ri :
1249  NVPTXInst<(outs Float64Regs:$dst),
1250            (ins Float64Regs:$a, f64imm:$b),
1251            "div.rn.f64 \t$dst, $a, $b;",
1252            [(set f64:$dst, (fdiv f64:$a, fpimm:$b))]>;
1253
1254// An fdiv with a constant -1.0 numerator (e.g. the folded form of
1255// fneg (fdiv 1.0, X)) is selected as a negated reciprocal: fneg (rcp.rn X).
1256def : Pat<(fdiv DoubleConstNeg1:$a, f64:$b),
1257          (FNEGf64 (FDIV641r (NegDoubleConst node:$a), $b))>;
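
// Worked example (illustrative): "fdiv double -1.0, %x" therefore lowers to a
// reciprocal followed by a negate, roughly
//   rcp.rn.f64 %fd1, %fd0;
//   neg.f64    %fd2, %fd1;
// FDIV641r prints the rcp and FNEGf64 (from the F2 "neg" expansion above)
// prints the neg.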
1258
1259//
1260// F32 Approximate reciprocal
1261//
1262def FDIV321r_ftz :
1263  NVPTXInst<(outs Float32Regs:$dst),
1264            (ins f32imm:$a, Float32Regs:$b),
1265            "rcp.approx.ftz.f32 \t$dst, $b;",
1266            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
1267            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
1268def FDIV321r :
1269  NVPTXInst<(outs Float32Regs:$dst),
1270            (ins f32imm:$a, Float32Regs:$b),
1271            "rcp.approx.f32 \t$dst, $b;",
1272            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
1273            Requires<[do_DIVF32_APPROX]>;
1274//
1275// F32 Approximate division
1276//
1277def FDIV32approxrr_ftz :
1278  NVPTXInst<(outs Float32Regs:$dst),
1279            (ins Float32Regs:$a, Float32Regs:$b),
1280            "div.approx.ftz.f32 \t$dst, $a, $b;",
1281            [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
1282            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
1283def FDIV32approxri_ftz :
1284  NVPTXInst<(outs Float32Regs:$dst),
1285            (ins Float32Regs:$a, f32imm:$b),
1286            "div.approx.ftz.f32 \t$dst, $a, $b;",
1287            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
1288            Requires<[do_DIVF32_APPROX, doF32FTZ]>;
1289def FDIV32approxrr :
1290  NVPTXInst<(outs Float32Regs:$dst),
1291            (ins Float32Regs:$a, Float32Regs:$b),
1292            "div.approx.f32 \t$dst, $a, $b;",
1293            [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
1294            Requires<[do_DIVF32_APPROX]>;
1295def FDIV32approxri :
1296  NVPTXInst<(outs Float32Regs:$dst),
1297            (ins Float32Regs:$a, f32imm:$b),
1298            "div.approx.f32 \t$dst, $a, $b;",
1299            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
1300            Requires<[do_DIVF32_APPROX]>;
1301//
1302// F32 Semi-accurate reciprocal
1303//
1304// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
1305//
1306def FDIV321r_approx_ftz :
1307  NVPTXInst<(outs Float32Regs:$dst),
1308            (ins f32imm:$a, Float32Regs:$b),
1309            "rcp.approx.ftz.f32 \t$dst, $b;",
1310            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
1311            Requires<[do_DIVF32_FULL, doF32FTZ]>;
1312def FDIV321r_approx :
1313  NVPTXInst<(outs Float32Regs:$dst),
1314            (ins f32imm:$a, Float32Regs:$b),
1315            "rcp.approx.f32 \t$dst, $b;",
1316            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
1317            Requires<[do_DIVF32_FULL]>;
1318//
1319// F32 Semi-accurate division
1320//
1321def FDIV32rr_ftz :
1322  NVPTXInst<(outs Float32Regs:$dst),
1323            (ins Float32Regs:$a, Float32Regs:$b),
1324            "div.full.ftz.f32 \t$dst, $a, $b;",
1325            [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
1326            Requires<[do_DIVF32_FULL, doF32FTZ]>;
1327def FDIV32ri_ftz :
1328  NVPTXInst<(outs Float32Regs:$dst),
1329            (ins Float32Regs:$a, f32imm:$b),
1330            "div.full.ftz.f32 \t$dst, $a, $b;",
1331            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
1332            Requires<[do_DIVF32_FULL, doF32FTZ]>;
1333def FDIV32rr :
1334  NVPTXInst<(outs Float32Regs:$dst),
1335            (ins Float32Regs:$a, Float32Regs:$b),
1336            "div.full.f32 \t$dst, $a, $b;",
1337            [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
1338            Requires<[do_DIVF32_FULL]>;
1339def FDIV32ri :
1340  NVPTXInst<(outs Float32Regs:$dst),
1341            (ins Float32Regs:$a, f32imm:$b),
1342            "div.full.f32 \t$dst, $a, $b;",
1343            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
1344            Requires<[do_DIVF32_FULL]>;
1345//
1346// F32 Accurate reciprocal
1347//
1348def FDIV321r_prec_ftz :
1349  NVPTXInst<(outs Float32Regs:$dst),
1350            (ins f32imm:$a, Float32Regs:$b),
1351            "rcp.rn.ftz.f32 \t$dst, $b;",
1352            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>,
1353            Requires<[doF32FTZ]>;
1354def FDIV321r_prec :
1355  NVPTXInst<(outs Float32Regs:$dst),
1356            (ins f32imm:$a, Float32Regs:$b),
1357            "rcp.rn.f32 \t$dst, $b;",
1358            [(set f32:$dst, (fdiv FloatConst1:$a, f32:$b))]>;
1359//
1360// F32 Accurate division
1361//
1362def FDIV32rr_prec_ftz :
1363  NVPTXInst<(outs Float32Regs:$dst),
1364            (ins Float32Regs:$a, Float32Regs:$b),
1365            "div.rn.ftz.f32 \t$dst, $a, $b;",
1366            [(set f32:$dst, (fdiv f32:$a, f32:$b))]>,
1367            Requires<[doF32FTZ]>;
1368def FDIV32ri_prec_ftz :
1369  NVPTXInst<(outs Float32Regs:$dst),
1370            (ins Float32Regs:$a, f32imm:$b),
1371            "div.rn.ftz.f32 \t$dst, $a, $b;",
1372            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>,
1373            Requires<[doF32FTZ]>;
1374def FDIV32rr_prec :
1375  NVPTXInst<(outs Float32Regs:$dst),
1376            (ins Float32Regs:$a, Float32Regs:$b),
1377            "div.rn.f32 \t$dst, $a, $b;",
1378            [(set f32:$dst, (fdiv f32:$a, f32:$b))]>;
1379def FDIV32ri_prec :
1380  NVPTXInst<(outs Float32Regs:$dst),
1381            (ins Float32Regs:$a, f32imm:$b),
1382            "div.rn.f32 \t$dst, $a, $b;",
1383            [(set f32:$dst, (fdiv f32:$a, fpimm:$b))]>;
1384
1385//
1386// FMA
1387//
1388
1389multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
1390  defvar asmstr = OpcStr # " \t$dst, $a, $b, $c;";
1391  def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
1392                      asmstr,
1393                      [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
1394                      Requires<[Pred]>;
1395  def rri : NVPTXInst<(outs RC:$dst),
1396                      (ins RC:$a, RC:$b, ImmCls:$c),
1397                      asmstr,
1398                      [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
1399                      Requires<[Pred]>;
1400  def rir : NVPTXInst<(outs RC:$dst),
1401                      (ins RC:$a, ImmCls:$b, RC:$c),
1402                      asmstr,
1403                      [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
1404                      Requires<[Pred]>;
1405  def rii : NVPTXInst<(outs RC:$dst),
1406                      (ins RC:$a, ImmCls:$b, ImmCls:$c),
1407                      asmstr,
1408                      [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
1409                      Requires<[Pred]>;
1410  def iir : NVPTXInst<(outs RC:$dst),
1411                      (ins ImmCls:$a, ImmCls:$b, RC:$c),
1412                      asmstr,
1413                      [(set RC:$dst, (fma fpimm:$a, fpimm:$b, RC:$c))]>,
1414                      Requires<[Pred]>;
1415
1416}
1417
1418multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
1419   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
1420                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1421                       [(set T:$dst, (fma T:$a, T:$b, T:$c))]>,
1422                       Requires<[useFP16Math, Pred]>;
1423}
1424
1425multiclass FMA_BF16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
1426   def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
1427                       !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
1428                       [(set T:$dst, (fma T:$a, T:$b, T:$c))]>,
1429                       Requires<[hasBF16Math, Pred]>;
1430}
1431
1432defm FMA16_ftz    : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>;
1433defm FMA16        : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>;
1434defm FMA16x2_ftz  : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
1435defm FMA16x2      : FMA_F16<"fma.rn.f16x2", v2f16, Int32Regs, True>;
1436defm BFMA16       : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>;
1437defm BFMA16x2     : FMA_BF16<"fma.rn.bf16x2", v2bf16, Int32Regs, True>;
1438defm FMA32_ftz    : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
1439defm FMA32        : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
1440defm FMA64        : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;
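// For reference (a sketch, grounded in the asm strings above): FMA32rrr, for
// example, emits "fma.rn.f32 \t$dst, $a, $b, $c;", computing a*b+c as a fused
// operation with a single round-to-nearest step.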
1441
1442// sin/cos
1443def SINF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
1444                      "sin.approx.f32 \t$dst, $src;",
1445                      [(set f32:$dst, (fsin f32:$src))]>,
1446                      Requires<[allowUnsafeFPMath]>;
1447def COSF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
1448                      "cos.approx.f32 \t$dst, $src;",
1449                      [(set f32:$dst, (fcos f32:$src))]>,
1450                      Requires<[allowUnsafeFPMath]>;
1451
1452// Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
1453// i.e. "poor man's fmod()". When y is infinite, x is returned. This matches the
1454// semantics of LLVM's frem.
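// As an illustrative PTX sketch (exact rounding/ftz modifiers depend on which
// pattern below is selected; register names are made up):
//   div.rn.f32      %t, %x, %y;     // x / y
//   cvt.rzi.f32.f32 %t, %t;         // truncate toward zero
//   mul.f32         %t, %t, %y;     // trunc(x/y) * y
//   sub.f32         %r, %x, %t;     // x - trunc(x/y) * y
// The noUnsafeFPMath variants below additionally test y for infinity (TESTINF)
// and select the original x in that case.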
1455
1456// frem - f32 FTZ
1457def : Pat<(frem f32:$x, f32:$y),
1458          (FSUBf32rr_ftz $x, (FMULf32rr_ftz (CVT_f32_f32
1459            (FDIV32rr_prec_ftz $x, $y), CvtRZI_FTZ),
1460             $y))>,
1461          Requires<[doF32FTZ, allowUnsafeFPMath]>;
1462def : Pat<(frem f32:$x, fpimm:$y),
1463          (FSUBf32rr_ftz $x, (FMULf32ri_ftz (CVT_f32_f32
1464            (FDIV32ri_prec_ftz $x, fpimm:$y), CvtRZI_FTZ),
1465             fpimm:$y))>,
1466          Requires<[doF32FTZ, allowUnsafeFPMath]>;
1467
1468def : Pat<(frem f32:$x, f32:$y),
1469          (SELP_f32rr $x,
1470            (FSUBf32rr_ftz $x, (FMULf32rr_ftz (CVT_f32_f32
1471              (FDIV32rr_prec_ftz $x, $y), CvtRZI_FTZ),
1472              $y)),
1473            (TESTINF_f32r $y))>,
1474          Requires<[doF32FTZ, noUnsafeFPMath]>;
1475def : Pat<(frem f32:$x, fpimm:$y),
1476          (SELP_f32rr $x,
1477            (FSUBf32rr_ftz $x, (FMULf32ri_ftz (CVT_f32_f32
1478              (FDIV32ri_prec_ftz $x, fpimm:$y), CvtRZI_FTZ),
1479              fpimm:$y)),
1480            (TESTINF_f32i fpimm:$y))>,
1481          Requires<[doF32FTZ, noUnsafeFPMath]>;
1482
1483// frem - f32
1484def : Pat<(frem f32:$x, f32:$y),
1485          (FSUBf32rr $x, (FMULf32rr (CVT_f32_f32
1486            (FDIV32rr_prec $x, $y), CvtRZI),
1487             $y))>,
1488          Requires<[allowUnsafeFPMath]>;
1489def : Pat<(frem f32:$x, fpimm:$y),
1490          (FSUBf32rr $x, (FMULf32ri (CVT_f32_f32
1491            (FDIV32ri_prec $x, fpimm:$y), CvtRZI),
1492             fpimm:$y))>,
1493          Requires<[allowUnsafeFPMath]>;
1494
1495def : Pat<(frem f32:$x, f32:$y),
1496          (SELP_f32rr $x,
1497            (FSUBf32rr $x, (FMULf32rr (CVT_f32_f32
1498              (FDIV32rr_prec $x, $y), CvtRZI),
1499              $y)),
1500            (TESTINF_f32r Float32Regs:$y))>,
1501          Requires<[noUnsafeFPMath]>;
1502def : Pat<(frem f32:$x, fpimm:$y),
1503          (SELP_f32rr $x,
1504            (FSUBf32rr $x, (FMULf32ri (CVT_f32_f32
1505              (FDIV32ri_prec $x, fpimm:$y), CvtRZI),
1506              fpimm:$y)),
1507            (TESTINF_f32i fpimm:$y))>,
1508          Requires<[noUnsafeFPMath]>;
1509
1510// frem - f64
1511def : Pat<(frem f64:$x, f64:$y),
1512          (FSUBf64rr $x, (FMULf64rr (CVT_f64_f64
1513            (FDIV64rr $x, $y), CvtRZI),
1514             $y))>,
1515          Requires<[allowUnsafeFPMath]>;
1516def : Pat<(frem f64:$x, fpimm:$y),
1517          (FSUBf64rr $x, (FMULf64ri (CVT_f64_f64
1518            (FDIV64ri $x, fpimm:$y), CvtRZI),
1519             fpimm:$y))>,
1520          Requires<[allowUnsafeFPMath]>;
1521
1522def : Pat<(frem f64:$x, f64:$y),
1523          (SELP_f64rr $x,
1524            (FSUBf64rr $x, (FMULf64rr (CVT_f64_f64
1525              (FDIV64rr $x, $y), CvtRZI),
1526               $y)),
1527            (TESTINF_f64r Float64Regs:$y))>,
1528          Requires<[noUnsafeFPMath]>;
1529def : Pat<(frem f64:$x, fpimm:$y),
1530          (SELP_f64rr $x,
1531            (FSUBf64rr $x, (FMULf64ri (CVT_f64_f64
1532              (FDIV64ri $x, fpimm:$y), CvtRZI),
1533              fpimm:$y)),
1534            (TESTINF_f64r $y))>,
1535          Requires<[noUnsafeFPMath]>;
1536
1537//-----------------------------------
1538// Bitwise operations
1539//-----------------------------------
1540
1541// Template for three-operand bitwise operations (dst, a, b).  Creates .b16,
1542// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
1543multiclass BITWISE<string OpcStr, SDNode OpNode> {
1544  def b1rr :
1545    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
1546              !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
1547              [(set i1:$dst, (OpNode i1:$a, i1:$b))]>;
1548  def b1ri :
1549    NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
1550              !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
1551              [(set i1:$dst, (OpNode i1:$a, imm:$b))]>;
1552  def b16rr :
1553    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
1554              !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
1555              [(set i16:$dst, (OpNode i16:$a, i16:$b))]>;
1556  def b16ri :
1557    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
1558              !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
1559              [(set i16:$dst, (OpNode i16:$a, imm:$b))]>;
1560  def b32rr :
1561    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
1562              !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
1563              [(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
1564  def b32ri :
1565    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
1566              !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
1567              [(set i32:$dst, (OpNode i32:$a, imm:$b))]>;
1568  def b64rr :
1569    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
1570              !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
1571              [(set i64:$dst, (OpNode i64:$a, i64:$b))]>;
1572  def b64ri :
1573    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
1574              !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
1575              [(set i64:$dst, (OpNode i64:$a, imm:$b))]>;
1576}
1577
1578defm OR  : BITWISE<"or", or>;
1579defm AND : BITWISE<"and", and>;
1580defm XOR : BITWISE<"xor", xor>;
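// For example, these expand to instructions such as ORb32rr
// ("or.b32  \t$dst, $a, $b;") and ANDb1rr ("and.pred  \t$dst, $a, $b;");
// the .pred forms operate on i1 predicate registers.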
1581
1582// PTX does not support mul on predicates, so lower it to an and instruction.
1583def : Pat<(mul i1:$a, i1:$b), (ANDb1rr $a, $b)>;
1584def : Pat<(mul i1:$a, imm:$b), (ANDb1ri $a, imm:$b)>;
1585
1586// These transformations were once reliably performed by instcombine, but
1587// thanks to poison semantics they are no longer safe for LLVM IR, so we
1588// perform them here instead.
1589def : Pat<(select i1:$a, i1:$b, 0), (ANDb1rr $a, $b)>;
1590def : Pat<(select i1:$a, 1, i1:$b), (ORb1rr $a, $b)>;
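// For i1 values these are plain boolean identities: (select $a, $b, 0) equals
// (and $a, $b) and (select $a, 1, $b) equals (or $a, $b).  In IR the and/or
// forms would additionally propagate poison from the not-selected operand,
// which is why instcombine can no longer perform the rewrite there.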
1591
1592// Lower logical v2i16/v4i8 ops as bitwise ops on b32.
1593foreach vt = [v2i16, v4i8] in {
1594  def: Pat<(or vt:$a, vt:$b),
1595           (ORb32rr $a, $b)>;
1596  def: Pat<(xor vt:$a, vt:$b),
1597           (XORb32rr $a, $b)>;
1598  def: Pat<(and vt:$a, vt:$b),
1599           (ANDb32rr $a, $b)>;
1600
1601  // The constants get legalized into a bitcast from i32, so that's what we need
1602  // to match here.
1603  def: Pat<(or vt:$a, (vt (bitconvert (i32 imm:$b)))),
1604           (ORb32ri $a, imm:$b)>;
1605  def: Pat<(xor vt:$a, (vt (bitconvert (i32 imm:$b)))),
1606           (XORb32ri $a, imm:$b)>;
1607  def: Pat<(and vt:$a, (vt (bitconvert (i32 imm:$b)))),
1608           (ANDb32ri $a, imm:$b)>;
1609}
1610
1611def NOT1  : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
1612                      "not.pred \t$dst, $src;",
1613                      [(set i1:$dst, (not i1:$src))]>;
1614def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
1615                      "not.b16 \t$dst, $src;",
1616                      [(set i16:$dst, (not i16:$src))]>;
1617def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
1618                      "not.b32 \t$dst, $src;",
1619                      [(set i32:$dst, (not i32:$src))]>;
1620def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
1621                       "not.b64 \t$dst, $src;",
1622                       [(set i64:$dst, (not i64:$src))]>;
1623
1624// Template for left/right shifts.  Takes three operands,
1625//   [dest (reg), src (reg), shift (reg or imm)].
1626// dest and src may be int64, int32, or int16, but shift is always int32.
1627//
1628// This template also defines a 32-bit shift (imm, imm) instruction.
1629multiclass SHIFT<string OpcStr, SDNode OpNode> {
1630   def i64rr :
1631     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
1632               !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
1633               [(set i64:$dst, (OpNode i64:$a, i32:$b))]>;
1634   def i64ri :
1635     NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
1636               !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
1637               [(set i64:$dst, (OpNode i64:$a, (i32 imm:$b)))]>;
1638   def i32rr :
1639     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
1640               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1641               [(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
1642   def i32ri :
1643     NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
1644               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1645               [(set i32:$dst, (OpNode i32:$a, (i32 imm:$b)))]>;
1646   def i32ii :
1647     NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
1648               !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
1649               [(set i32:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
1650   def i16rr :
1651     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
1652               !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
1653               [(set i16:$dst, (OpNode i16:$a, i32:$b))]>;
1654   def i16ri :
1655     NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
1656               !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
1657               [(set i16:$dst, (OpNode i16:$a, (i32 imm:$b)))]>;
1658}
1659
1660defm SHL : SHIFT<"shl.b", shl>;
1661defm SRA : SHIFT<"shr.s", sra>;
1662defm SRL : SHIFT<"shr.u", srl>;
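// For example, SHLi64rr emits "shl.b64 \t$dst, $a, $b;" where the shift amount
// $b is a 32-bit register even for the 64-bit forms, and SRAi32ri folds an
// immediate shift amount into "shr.s32".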
1663
1664// Bit-reverse
1665def BREV32 :
1666  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
1667             "brev.b32 \t$dst, $a;",
1668             [(set i32:$dst, (bitreverse i32:$a))]>;
1669def BREV64 :
1670  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
1671             "brev.b64 \t$dst, $a;",
1672             [(set i64:$dst, (bitreverse i64:$a))]>;
1673
1674
1675//
1676// BFE - bit-field extract
1677//
1678
1679// Template for BFE/BFI instructions.
1680// BFE args: [dest (reg), src (reg), start (reg or imm), length (reg or imm)];
1681// BFI additionally takes a second source to insert into.  Start may be an
1682// imm only if length is also an imm.  FIXME: Is this a restriction in PTX?
1683//
1684// dest and src may be int32 or int64, but start and length are always int32.
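//
// Illustrative example (values made up): "bfe.u32 %d, %a, 8, 8" with
// %a = 0x12345678 extracts bits [15:8] and yields %d = 0x00000056, whereas the
// bfe.s32 form would sign-extend the extracted field instead of zero-filling.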
1685def SDTBFE :
1686  SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>,
1687                       SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
1688def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>;
1689
1690def SDTBFI :
1691  SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
1692                       SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
1693def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>;
1694
1695def SDTPRMT :
1696  SDTypeProfile<1, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
1697                       SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>,]>;
1698def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;
1699
1700multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
1701  def rrr
1702    : NVPTXInst<(outs RC:$d),
1703                (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
1704                !strconcat(Instr, " \t$d, $a, $b, $c;"),
1705                [(set T:$d, (bfe T:$a, i32:$b, i32:$c))]>;
1706  def rri
1707    : NVPTXInst<(outs RC:$d),
1708                (ins RC:$a, Int32Regs:$b, i32imm:$c),
1709                !strconcat(Instr, " \t$d, $a, $b, $c;"),
1710                [(set T:$d, (bfe T:$a, i32:$b, imm:$c))]>;
1711  def rii
1712    : NVPTXInst<(outs RC:$d),
1713                (ins RC:$a, i32imm:$b, i32imm:$c),
1714                !strconcat(Instr, " \t$d, $a, $b, $c;"),
1715                [(set T:$d, (bfe T:$a, imm:$b, imm:$c))]>;
1716}
1717
1718multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
1719  def rrrr
1720    : NVPTXInst<(outs RC:$f),
1721                (ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
1722                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1723                [(set T:$f, (bfi T:$a, T:$b, i32:$c, i32:$d))]>;
1724  def rrri
1725    : NVPTXInst<(outs RC:$f),
1726                (ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d),
1727                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1728                [(set T:$f, (bfi T:$a, T:$b, i32:$c, imm:$d))]>;
1729  def rrii
1730    : NVPTXInst<(outs RC:$f),
1731                (ins RC:$a, RC:$b, i32imm:$c, i32imm:$d),
1732                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1733                [(set T:$f, (bfi T:$a, T:$b, imm:$c, imm:$d))]>;
1734  def irrr
1735    : NVPTXInst<(outs RC:$f),
1736                (ins ImmCls:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
1737                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1738                [(set T:$f, (bfi (T imm:$a), T:$b, i32:$c, i32:$d))]>;
1739  def irri
1740    : NVPTXInst<(outs RC:$f),
1741                (ins ImmCls:$a, RC:$b, Int32Regs:$c, i32imm:$d),
1742                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1743                [(set T:$f, (bfi (T imm:$a), T:$b, i32:$c, imm:$d))]>;
1744  def irii
1745    : NVPTXInst<(outs RC:$f),
1746                (ins ImmCls:$a, RC:$b, i32imm:$c, i32imm:$d),
1747                !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
1748                [(set T:$f, (bfi (T imm:$a), T:$b, imm:$c, imm:$d))]>;
1749}
1750
1751def Hexu32imm : Operand<i32> {
1752  let PrintMethod = "printHexu32imm";
1753}
1754
1755multiclass PRMT<ValueType T, RegisterClass RC> {
1756  def rrr
1757    : NVPTXInst<(outs RC:$d),
1758                (ins RC:$a, Int32Regs:$b, Int32Regs:$c, PrmtMode:$mode),
1759                !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
1760                [(set T:$d, (prmt T:$a, T:$b, i32:$c, imm:$mode))]>;
1761  def rri
1762    : NVPTXInst<(outs RC:$d),
1763                (ins RC:$a, Int32Regs:$b, Hexu32imm:$c, PrmtMode:$mode),
1764                !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
1765                [(set T:$d, (prmt T:$a, T:$b, imm:$c, imm:$mode))]>;
1766  def rii
1767    : NVPTXInst<(outs RC:$d),
1768                (ins RC:$a, i32imm:$b, Hexu32imm:$c, PrmtMode:$mode),
1769                !strconcat("prmt.b32${mode}", " \t$d, $a, $b, $c;"),
1770                [(set T:$d, (prmt T:$a, imm:$b, imm:$c, imm:$mode))]>;
1771}
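// For reference, these map to the PTX prmt.b32 instruction: in the default
// mode each nibble of the selector $c picks one byte out of the 64-bit value
// {b, a}.  As an illustrative example, with a = 0x33221100 and b = 0x77665544
// a selector of 0x3210 reproduces a, 0x7654 reproduces b, and 0x5140 picks an
// interleaving of bytes from both sources.  The ${mode} suffix selects the
// alternative permute modes.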
1772
1773let hasSideEffects = false in {
1774  // Order is somewhat important here: the signed and unsigned variants match
1775  // the same patterns, so the first one listed wins.  Listing unsigned byte
1776  // extraction first has the benefit of always leaving zero in the unused
1777  // bits, which makes some optimizations easier (e.g. no need to mask them).
1778  defm BFE_U32 : BFE<"bfe.u32", i32, Int32Regs>;
1779  defm BFE_S32 : BFE<"bfe.s32", i32, Int32Regs>;
1780  defm BFE_U64 : BFE<"bfe.u64", i64, Int64Regs>;
1781  defm BFE_S64 : BFE<"bfe.s64", i64, Int64Regs>;
1782
1783  defm BFI_B32 : BFI<"bfi.b32", i32, Int32Regs, i32imm>;
1784  defm BFI_B64 : BFI<"bfi.b64", i64, Int64Regs, i64imm>;
1785
1786  defm PRMT_B32 : PRMT<i32, Int32Regs>;
1787}
1788
1789
1790// byte extraction + signed/unsigned extension to i32.
1791def : Pat<(i32 (sext_inreg (bfe i32:$s, i32:$o, 8), i8)),
1792          (BFE_S32rri $s, $o, 8)>;
1793def : Pat<(i32 (sext_inreg (bfe i32:$s, imm:$o, 8), i8)),
1794          (BFE_S32rii $s, imm:$o, 8)>;
1795def : Pat<(i32 (and (bfe i32:$s, i32:$o, 8), 255)),
1796          (BFE_U32rri $s, $o, 8)>;
1797def : Pat<(i32 (and (bfe i32:$s, imm:$o, 8), 255)),
1798          (BFE_U32rii $s, imm:$o, 8)>;
1799
1800// byte extraction + signed extension to i16
1801def : Pat<(i16 (sext_inreg (trunc (bfe i32:$s, imm:$o, 8)), i8)),
1802          (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>;
1803
1804
1805// Byte extraction via shift/trunc/sext
1806def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)),
1807          (CVT_s8_s32 $s, CvtNONE)>;
1808def : Pat<(i16 (sext_inreg (trunc (srl i32:$s,  (i32 imm:$o))), i8)),
1809          (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>;
1810def : Pat<(sext_inreg (srl i32:$s,  (i32 imm:$o)), i8),
1811          (BFE_S32rii $s, imm:$o, 8)>;
1812def : Pat<(i16 (sra (i16 (trunc i32:$s)), (i32 8))),
1813          (CVT_s8_s32 (BFE_S32rii $s, 8, 8), CvtNONE)>;
1814def : Pat<(sext_inreg (srl i64:$s,  (i32 imm:$o)), i8),
1815          (BFE_S64rii $s, imm:$o, 8)>;
1816def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)),
1817          (CVT_s8_s64 $s, CvtNONE)>;
1818def : Pat<(i16 (sext_inreg (trunc (srl i64:$s,  (i32 imm:$o))), i8)),
1819          (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>;
1820
1821//-----------------------------------
1822// Comparison instructions (setp, set)
1823//-----------------------------------
1824
1825// FIXME: This doesn't cover versions of set and setp that combine with a
1826// boolean predicate, e.g. setp.eq.and.b16.
1827
1828let hasSideEffects = false in {
1829  multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
1830    def rr :
1831      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
1832                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1833                           " \t$dst, $a, $b;"), []>;
1834    def ri :
1835      NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
1836                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1837                           " \t$dst, $a, $b;"), []>;
1838    def ir :
1839      NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
1840                !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
1841                           " \t$dst, $a, $b;"), []>;
1842  }
1843}
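// For example (illustrative), SETP_s32rr with a CmpLT mode operand prints as
// "setp.lt.s32 \t$dst, $a, $b;"; the ${cmp:ftz} modifier appends ".ftz" only
// for the flush-to-zero floating-point compare modes.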
1844
1845defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
1846defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
1847defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
1848defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
1849defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
1850defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
1851defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
1852defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
1853defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
1854defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
1855defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
1856def SETP_f16rr :
1857      NVPTXInst<(outs Int1Regs:$dst),
1858                (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
1859                "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
1860                []>, Requires<[useFP16Math]>;
1861
1862def SETP_f16x2rr :
1863      NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
1864                (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
1865                "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
1866                []>,
1867                Requires<[useFP16Math]>;
1868def SETP_bf16rr :
1869      NVPTXInst<(outs Int1Regs:$dst),
1870                (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
1871                "setp${cmp:base}${cmp:ftz}.bf16 \t$dst, $a, $b;",
1872                []>, Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;
1873
1874def SETP_bf16x2rr :
1875      NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
1876                (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
1877                "setp${cmp:base}${cmp:ftz}.bf16x2 \t$p|$q, $a, $b;",
1878                []>,
1879                Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;
1880
1881
1882// FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
1883// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
1884// reg, either u32, s32, or f32.  Anyway these aren't used at the moment.
1885
1886let hasSideEffects = false in {
1887  multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
1888    def rr : NVPTXInst<(outs Int32Regs:$dst),
1889                       (ins RC:$a, RC:$b, CmpMode:$cmp),
1890                       !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
1891    def ri : NVPTXInst<(outs Int32Regs:$dst),
1892                       (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
1893                       !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
1894    def ir : NVPTXInst<(outs Int32Regs:$dst),
1895                       (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
1896                       !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
1897  }
1898}
1899
1900defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
1901defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
1902defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
1903defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
1904defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
1905defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
1906defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
1907defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
1908defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
1909defm SET_f16 : SET<"f16", Int16Regs, f16imm>;
1910defm SET_bf16 : SET<"bf16", Int16Regs, bf16imm>, Requires<[hasPTX<78>, hasSM<90>]>;
1911defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
1912defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
1913
1914//-----------------------------------
1915// Data Movement (Load / Store, Move)
1916//-----------------------------------
1917
1918let WantsRoot = true in {
1919  def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex]>;
1920  def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex]>;
1921}
1922def ADDRvar : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;
1923
1924def MEMri : Operand<i32> {
1925  let PrintMethod = "printMemOperand";
1926  let MIOperandInfo = (ops Int32Regs, i32imm);
1927}
1928def MEMri64 : Operand<i64> {
1929  let PrintMethod = "printMemOperand";
1930  let MIOperandInfo = (ops Int64Regs, i64imm);
1931}
1932
1933def imem : Operand<iPTR> {
1934  let PrintMethod = "printOperand";
1935}
1936
1937def imemAny : Operand<pAny> {
1938  let PrintMethod = "printOperand";
1939}
1940
1941def LdStCode : Operand<i32> {
1942  let PrintMethod = "printLdStCode";
1943}
1944
1945def MmaCode : Operand<i32> {
1946  let PrintMethod = "printMmaCode";
1947}
1948
1949def Offseti32imm : Operand<i32> {
1950  let PrintMethod = "printOffseti32imm";
1951}
1952
1953def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
1954def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
1955
1956// Load a memory address into a u32 or u64 register.
1957def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
1958                         "mov.u32 \t$dst, $a;",
1959                         [(set i32:$dst, (Wrapper tglobaladdr:$a))]>;
1960def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
1961                           "mov.u64 \t$dst, $a;",
1962                           [(set i64:$dst, (Wrapper tglobaladdr:$a))]>;
1963
1964// Get pointer to local stack.
1965let hasSideEffects = false in {
1966  def MOV_DEPOT_ADDR :    NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
1967                                     "mov.u32 \t$d, __local_depot$num;", []>;
1968  def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
1969                                    "mov.u64 \t$d, __local_depot$num;", []>;
1970}
1971
1972
1973// copyPhysReg is hard-coded in NVPTXInstrInfo.cpp
1974let IsSimpleMove=1, hasSideEffects=0, isAsCheapAsAMove=1 in {
1975  def IMOV1rr :  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
1976                           "mov.pred \t$dst, $sss;", []>;
1977  def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
1978                           "mov.u16 \t$dst, $sss;", []>;
1979  def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
1980                           "mov.u32 \t$dst, $sss;", []>;
1981  def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
1982                           "mov.u64 \t$dst, $sss;", []>;
1983  def IMOV128rr : NVPTXInst<(outs Int128Regs:$dst), (ins Int128Regs:$sss),
1984                           "mov.b128 \t$dst, $sss;", []>;
1985
1986  def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
1987                           "mov.f32 \t$dst, $src;", []>;
1988  def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
1989                           "mov.f64 \t$dst, $src;", []>;
1990
1991  def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
1992                          "mov.pred \t$dst, $src;",
1993                          [(set i1:$dst, imm:$src)]>;
1994  def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
1995                          "mov.b16 \t$dst, $src;",
1996                          [(set i16:$dst, imm:$src)]>;
1997  def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
1998                          "mov.b32 \t$dst, $src;",
1999                          [(set i32:$dst, imm:$src)]>;
2000  def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
2001                          "mov.b64 \t$dst, $src;",
2002                          [(set i64:$dst, imm:$src)]>;
2003
2004  def FMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$src),
2005                          "mov.b16 \t$dst, $src;",
2006                          [(set f16:$dst, fpimm:$src)]>;
2007  def BFMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$src),
2008                          "mov.b16 \t$dst, $src;",
2009                          [(set bf16:$dst, fpimm:$src)]>;
2010  def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
2011                          "mov.f32 \t$dst, $src;",
2012                          [(set f32:$dst, fpimm:$src)]>;
2013  def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
2014                          "mov.f64 \t$dst, $src;",
2015                          [(set f64:$dst, fpimm:$src)]>;
2016}
2017
2018def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
2019def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>;
2020
2021//---- Copy Frame Index ----
2022def LEA_ADDRi :   NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
2023                            "add.u32 \t$dst, ${addr:add};",
2024                            [(set i32:$dst, ADDRri:$addr)]>;
2025def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
2026                            "add.u64 \t$dst, ${addr:add};",
2027                            [(set i64:$dst, ADDRri64:$addr)]>;
2028
2029//-----------------------------------
2030// Comparison and Selection
2031//-----------------------------------
2032
2033multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
2034                       Instruction setp_16rr,
2035                       Instruction setp_16ri,
2036                       Instruction setp_16ir,
2037                       Instruction setp_32rr,
2038                       Instruction setp_32ri,
2039                       Instruction setp_32ir,
2040                       Instruction setp_64rr,
2041                       Instruction setp_64ri,
2042                       Instruction setp_64ir,
2043                       Instruction set_16rr,
2044                       Instruction set_16ri,
2045                       Instruction set_16ir,
2046                       Instruction set_32rr,
2047                       Instruction set_32ri,
2048                       Instruction set_32ir,
2049                       Instruction set_64rr,
2050                       Instruction set_64ri,
2051                       Instruction set_64ir> {
2052  // i16 -> pred
2053  def : Pat<(i1 (OpNode i16:$a, i16:$b)),
2054            (setp_16rr $a, $b, Mode)>;
2055  def : Pat<(i1 (OpNode i16:$a, imm:$b)),
2056            (setp_16ri $a, imm:$b, Mode)>;
2057  def : Pat<(i1 (OpNode imm:$a, i16:$b)),
2058            (setp_16ir imm:$a, $b, Mode)>;
2059  // i32 -> pred
2060  def : Pat<(i1 (OpNode i32:$a, i32:$b)),
2061            (setp_32rr $a, $b, Mode)>;
2062  def : Pat<(i1 (OpNode i32:$a, imm:$b)),
2063            (setp_32ri $a, imm:$b, Mode)>;
2064  def : Pat<(i1 (OpNode imm:$a, i32:$b)),
2065            (setp_32ir imm:$a, $b, Mode)>;
2066  // i64 -> pred
2067  def : Pat<(i1 (OpNode i64:$a, i64:$b)),
2068            (setp_64rr $a, $b, Mode)>;
2069  def : Pat<(i1 (OpNode i64:$a, imm:$b)),
2070            (setp_64ri $a, imm:$b, Mode)>;
2071  def : Pat<(i1 (OpNode imm:$a, i64:$b)),
2072            (setp_64ir imm:$a, $b, Mode)>;
2073
2074  // i16 -> i32
2075  def : Pat<(i32 (OpNode i16:$a, i16:$b)),
2076            (set_16rr $a, $b, Mode)>;
2077  def : Pat<(i32 (OpNode i16:$a, imm:$b)),
2078            (set_16ri $a, imm:$b, Mode)>;
2079  def : Pat<(i32 (OpNode imm:$a, i16:$b)),
2080            (set_16ir imm:$a, $b, Mode)>;
2081  // i32 -> i32
2082  def : Pat<(i32 (OpNode i32:$a, i32:$b)),
2083            (set_32rr $a, $b, Mode)>;
2084  def : Pat<(i32 (OpNode i32:$a, imm:$b)),
2085            (set_32ri $a, imm:$b, Mode)>;
2086  def : Pat<(i32 (OpNode imm:$a, i32:$b)),
2087            (set_32ir imm:$a, $b, Mode)>;
2088  // i64 -> i32
2089  def : Pat<(i32 (OpNode i64:$a, i64:$b)),
2090            (set_64rr $a, $b, Mode)>;
2091  def : Pat<(i32 (OpNode i64:$a, imm:$b)),
2092            (set_64ri $a, imm:$b, Mode)>;
2093  def : Pat<(i32 (OpNode imm:$a, i64:$b)),
2094            (set_64ir imm:$a, $b, Mode)>;
2095}
2096
2097multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
2098  : ISET_FORMAT<OpNode, Mode,
2099                SETP_s16rr, SETP_s16ri, SETP_s16ir,
2100                SETP_s32rr, SETP_s32ri, SETP_s32ir,
2101                SETP_s64rr, SETP_s64ri, SETP_s64ir,
2102                SET_s16rr, SET_s16ri, SET_s16ir,
2103                SET_s32rr, SET_s32ri, SET_s32ir,
2104                SET_s64rr, SET_s64ri, SET_s64ir> {
2105  // TableGen doesn't like empty multiclasses.
2106  def : PatLeaf<(i32 0)>;
2107}
2108
2109multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
2110  : ISET_FORMAT<OpNode, Mode,
2111                SETP_u16rr, SETP_u16ri, SETP_u16ir,
2112                SETP_u32rr, SETP_u32ri, SETP_u32ir,
2113                SETP_u64rr, SETP_u64ri, SETP_u64ir,
2114                SET_u16rr, SET_u16ri, SET_u16ir,
2115                SET_u32rr, SET_u32ri, SET_u32ir,
2116                SET_u64rr, SET_u64ri, SET_u64ir> {
2117  // TableGen doesn't like empty multiclasses.
2118  def : PatLeaf<(i32 0)>;
2119}
2120
2121defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
2122defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
2123defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
2124defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
2125defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
2126defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
2127defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
2128defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
2129defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
2130defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
2131defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
2132defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
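// Note that signedness is carried by the instruction's type suffix rather than
// the compare mode: e.g. (setult i32:$a, i32:$b) selects SETP_u32rr with
// CmpLT (roughly "setp.lt.u32"), while (setlt ...) selects the .s32 form with
// the same mode.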
2133
2134// Comparisons of i8 values extracted with BFE as i32.
2135// It is faster to compare the i32 values extracted by BFE directly than to
2136// go through the long truncate and sign-extend sequence.
2137def: Pat<(setgt (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
2138                (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
2139         (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpGT)>;
2140def: Pat<(setgt (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
2141                (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
2142         (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpGT)>;
2143def: Pat<(setge (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
2144                (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
2145         (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpGE)>;
2146def: Pat<(setge (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
2147                (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
2148         (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpGE)>;
2149def: Pat<(setlt (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
2150                (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
2151         (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpLT)>;
2152def: Pat<(setlt (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
2153                (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
2154         (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpLT)>;
2155def: Pat<(setle (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8))), i8)),
2156                (i16 (sext_inreg (i16 (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8))), i8))),
2157         (SETP_s32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), CmpLE)>;
2158def: Pat<(setle (i16 (sext_inreg (trunc (bfe Int32Regs:$a, imm:$oa, 8)), i8)),
2159                (i16 (sext_inreg (trunc (bfe Int32Regs:$b, imm:$ob, 8)), i8))),
2160         (SETP_s32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), CmpLE)>;
2161
2162def: Pat<(setugt (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
2163                 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
2164         (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpHI)>;
2165def: Pat<(setugt (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
2166                 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
2167         (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpHI)>;
2168def: Pat<(setuge (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
2169                 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
2170         (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpHS)>;
2171def: Pat<(setuge (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
2172                 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
2173         (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpHS)>;
2174def: Pat<(setult (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
2175                 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
2176         (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpLO)>;
2177def: Pat<(setult (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
2178                 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
2179         (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpLO)>;
2180def: Pat<(setule (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
2181                 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
2182         (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpLS)>;
2183def: Pat<(setule (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
2184                 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
2185         (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpLS)>;
2186def: Pat<(seteq (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
2187                 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
2188         (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpEQ)>;
2189def: Pat<(seteq (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
2190                 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
2191         (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpEQ)>;
2192def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, Int32Regs:$oa, 8)), 255)),
2193                 (i16 (and (trunc (bfe Int32Regs:$b, Int32Regs:$ob, 8)), 255))),
2194         (SETP_u32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), CmpNE)>;
2195def: Pat<(setne (i16 (and (trunc (bfe Int32Regs:$a, imm:$oa, 8)), 255)),
2196                 (i16 (and (trunc (bfe Int32Regs:$b, imm:$ob, 8)), 255))),
2197         (SETP_u32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), CmpNE)>;
2198
2199// i1 compare -> i32
2200def : Pat<(i32 (setne i1:$a, i1:$b)),
2201          (SELP_u32ii -1, 0, (XORb1rr $a, $b))>;
2202def : Pat<(i32 (seteq i1:$a, i1:$b)),
2203          (SELP_u32ii 0, -1, (XORb1rr $a, $b))>;
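// Illustrative lowering (register names are made up): the i32-valued setne
// becomes roughly "xor.pred %r, %p, %q;" followed by
// "selp.u32 %d, -1, 0, %r;", widening the boolean result to 0 / 0xffffffff;
// the companion pattern selects the inverted constants.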
2204
2205
2206
2207multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
2208  // f16 -> pred
2209  def : Pat<(i1 (OpNode f16:$a, f16:$b)),
2210            (SETP_f16rr $a, $b, ModeFTZ)>,
2211        Requires<[useFP16Math,doF32FTZ]>;
2212  def : Pat<(i1 (OpNode f16:$a, f16:$b)),
2213            (SETP_f16rr $a, $b, Mode)>,
2214        Requires<[useFP16Math]>;
2215
2216  // bf16 -> pred
2217  def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
2218            (SETP_bf16rr $a, $b, ModeFTZ)>,
2219        Requires<[hasBF16Math,doF32FTZ]>;
2220  def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
2221            (SETP_bf16rr $a, $b, Mode)>,
2222        Requires<[hasBF16Math]>;
2223
2224  // f32 -> pred
2225  def : Pat<(i1 (OpNode f32:$a, f32:$b)),
2226            (SETP_f32rr $a, $b, ModeFTZ)>,
2227        Requires<[doF32FTZ]>;
2228  def : Pat<(i1 (OpNode f32:$a, f32:$b)),
2229            (SETP_f32rr $a, $b, Mode)>;
2230  def : Pat<(i1 (OpNode f32:$a, fpimm:$b)),
2231            (SETP_f32ri $a, fpimm:$b, ModeFTZ)>,
2232        Requires<[doF32FTZ]>;
2233  def : Pat<(i1 (OpNode f32:$a, fpimm:$b)),
2234            (SETP_f32ri $a, fpimm:$b, Mode)>;
2235  def : Pat<(i1 (OpNode fpimm:$a, f32:$b)),
2236            (SETP_f32ir fpimm:$a, $b, ModeFTZ)>,
2237        Requires<[doF32FTZ]>;
2238  def : Pat<(i1 (OpNode fpimm:$a, f32:$b)),
2239            (SETP_f32ir fpimm:$a, $b, Mode)>;
2240
2241  // f64 -> pred
2242  def : Pat<(i1 (OpNode f64:$a, f64:$b)),
2243            (SETP_f64rr $a, $b, Mode)>;
2244  def : Pat<(i1 (OpNode f64:$a, fpimm:$b)),
2245            (SETP_f64ri $a, fpimm:$b, Mode)>;
2246  def : Pat<(i1 (OpNode fpimm:$a, f64:$b)),
2247            (SETP_f64ir fpimm:$a, $b, Mode)>;
2248
2249  // f16 -> i32
2250  def : Pat<(i32 (OpNode f16:$a, f16:$b)),
2251            (SET_f16rr $a, $b, ModeFTZ)>,
2252        Requires<[useFP16Math, doF32FTZ]>;
2253  def : Pat<(i32 (OpNode f16:$a, f16:$b)),
2254            (SET_f16rr $a, $b, Mode)>,
2255        Requires<[useFP16Math]>;
2256
2257  // bf16 -> i32
2258  def : Pat<(i32 (OpNode bf16:$a, bf16:$b)),
2259            (SET_bf16rr $a, $b, ModeFTZ)>,
2260        Requires<[hasBF16Math, doF32FTZ]>;
2261  def : Pat<(i32 (OpNode bf16:$a, bf16:$b)),
2262            (SET_bf16rr $a, $b, Mode)>,
2263        Requires<[hasBF16Math]>;
2264
2265  // f32 -> i32
2266  def : Pat<(i32 (OpNode f32:$a, f32:$b)),
2267            (SET_f32rr $a, $b, ModeFTZ)>,
2268        Requires<[doF32FTZ]>;
2269  def : Pat<(i32 (OpNode f32:$a, f32:$b)),
2270            (SET_f32rr $a, $b, Mode)>;
2271  def : Pat<(i32 (OpNode f32:$a, fpimm:$b)),
2272            (SET_f32ri $a, fpimm:$b, ModeFTZ)>,
2273        Requires<[doF32FTZ]>;
2274  def : Pat<(i32 (OpNode f32:$a, fpimm:$b)),
2275            (SET_f32ri $a, fpimm:$b, Mode)>;
2276  def : Pat<(i32 (OpNode fpimm:$a, f32:$b)),
2277            (SET_f32ir fpimm:$a, $b, ModeFTZ)>,
2278        Requires<[doF32FTZ]>;
2279  def : Pat<(i32 (OpNode fpimm:$a, f32:$b)),
2280            (SET_f32ir fpimm:$a, $b, Mode)>;
2281
2282  // f64 -> i32
2283  def : Pat<(i32 (OpNode f64:$a, f64:$b)),
2284            (SET_f64rr $a, $b, Mode)>;
2285  def : Pat<(i32 (OpNode f64:$a, fpimm:$b)),
2286            (SET_f64ri $a, fpimm:$b, Mode)>;
2287  def : Pat<(i32 (OpNode fpimm:$a, f64:$b)),
2288            (SET_f64ir fpimm:$a, $b, Mode)>;
2289}
2290
2291defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
2292defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
2293defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
2294defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
2295defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
2296defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
2297
2298defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
2299defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
2300defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
2301defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
2302defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
2303defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
2304
2305defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
2306defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
2307defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
2308defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
2309defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
2310defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
2311
2312defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
2313defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
2314
2315def SDTDeclareParamProfile :
2316  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
2317def SDTDeclareScalarParamProfile :
2318  SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
2319def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
2320def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
2321def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
2322def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
2323def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
2324def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
2325def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
2326def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
2327def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
2328def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
2329def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
2330def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
2331def SDTCallValProfile : SDTypeProfile<1, 0, []>;
2332def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
2333def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
2334def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
2335def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
2336def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
2337def SDTProxyRegProfile : SDTypeProfile<1, 1, []>;
2338
2339def DeclareParam :
2340  SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
2341         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2342def DeclareScalarParam :
2343  SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
2344         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2345def DeclareRetParam :
2346  SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
2347         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2348def DeclareRet :
2349  SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
2350         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2351def LoadParam :
2352  SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
2353         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
2354def LoadParamV2 :
2355  SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
2356         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
2357def LoadParamV4 :
2358  SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
2359         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
2360def PrintCall :
2361  SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
2362         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2363def PrintConvergentCall :
2364  SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
2365         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2366def PrintCallUni :
2367  SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
2368         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2369def PrintConvergentCallUni :
2370  SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
2371         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2372def StoreParam :
2373  SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
2374         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2375def StoreParamV2 :
2376  SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
2377         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2378def StoreParamV4 :
2379  SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
2380         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2381def StoreParamU32 :
2382  SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
2383         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2384def StoreParamS32 :
2385  SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
2386         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2387def CallArgBegin :
2388  SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
2389         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2390def CallArg :
2391  SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
2392         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2393def LastCallArg :
2394  SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
2395         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2396def CallArgEnd :
2397  SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
2398         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2399def CallVoid :
2400  SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
2401         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2402def Prototype :
2403  SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
2404         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2405def CallVal :
2406  SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
2407         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2408def MoveParam :
2409  SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
2410def StoreRetval :
2411  SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
2412         [SDNPHasChain, SDNPSideEffect]>;
2413def StoreRetvalV2 :
2414  SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
2415         [SDNPHasChain, SDNPSideEffect]>;
2416def StoreRetvalV4 :
2417  SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
2418         [SDNPHasChain, SDNPSideEffect]>;
2419def PseudoUseParam :
2420  SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
2421         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2422def RETURNNode :
2423  SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
2424         [SDNPHasChain, SDNPSideEffect]>;
2425def ProxyReg :
2426  SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
2427         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
2428
2429let mayLoad = true in {
2430  class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
2431        NVPTXInst<(outs regclass:$dst), (ins Offseti32imm:$b),
2432                  !strconcat("ld.param", opstr, " \t$dst, [retval0$b];"),
2433                  []>;
2434
2435  class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
2436        NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins Offseti32imm:$b),
2437                  !strconcat("ld.param.v2", opstr,
2438                             " \t{{$dst, $dst2}}, [retval0$b];"), []>;
2439
2440  class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
2441        NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
2442                        regclass:$dst4),
2443                  (ins Offseti32imm:$b),
2444                  !strconcat("ld.param.v4", opstr,
2445                             " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0$b];"),
2446                  []>;
2447}
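// A sketch of the emitted PTX (offsets and register names are made up):
// LoadParamMemI32 prints "ld.param.b32 \t%r, [retval0+4];" and
// LoadParamMemV2I32 prints "ld.param.v2.b32 \t{%r0, %r1}, [retval0+8];".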
2448
2449class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
2450      NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
2451                !strconcat("mov", opstr, " \t$dst, retval$b;"),
2452                [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
2453
2454let mayStore = true in {
2455
2456  multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr, bit support_imm = true> {
2457    foreach op = [IMMType, regclass] in
2458      if !or(support_imm, !isa<NVPTXRegClass>(op)) then
2459        def _ # !if(!isa<NVPTXRegClass>(op), "r", "i")
2460          : NVPTXInst<(outs),
2461                      (ins op:$val, i32imm:$a, Offseti32imm:$b),
2462                      "st.param" # opstr # " \t[param$a$b], $val;",
2463                      []>;
2464  }
2465
2466  multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
2467    foreach op1 = [IMMType, regclass] in
2468      foreach op2 = [IMMType, regclass] in
2469        def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
2470              # !if(!isa<NVPTXRegClass>(op2), "r", "i")
2471          : NVPTXInst<(outs),
2472                      (ins op1:$val1, op2:$val2,
2473                           i32imm:$a, Offseti32imm:$b),
2474                      "st.param.v2" # opstr # " \t[param$a$b], {{$val1, $val2}};",
2475                      []>;
2476  }
2477
2478  multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> {
2479    foreach op1 = [IMMType, regclass] in
2480      foreach op2 = [IMMType, regclass] in
2481        foreach op3 = [IMMType, regclass] in
2482          foreach op4 = [IMMType, regclass] in
2483            def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i")
2484                  # !if(!isa<NVPTXRegClass>(op2), "r", "i")
2485                  # !if(!isa<NVPTXRegClass>(op3), "r", "i")
2486                  # !if(!isa<NVPTXRegClass>(op4), "r", "i")
2487
2488              : NVPTXInst<(outs),
2489                          (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4,
2490                               i32imm:$a, Offseti32imm:$b),
2491                          "st.param.v4" # opstr #
2492                          " \t[param$a$b], {{$val1, $val2, $val3, $val4}};",
2493                          []>;
2494  }
2495
2496  class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
2497        NVPTXInst<(outs), (ins regclass:$val, Offseti32imm:$a),
2498                  !strconcat("st.param", opstr, " \t[func_retval0$a], $val;"),
2499                  []>;
2500
2501  class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
2502        NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, Offseti32imm:$a),
2503                  !strconcat("st.param.v2", opstr,
2504                             " \t[func_retval0$a], {{$val, $val2}};"),
2505                  []>;
2506
2507  class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
2508        NVPTXInst<(outs),
2509                  (ins regclass:$val, regclass:$val2, regclass:$val3,
2510                       regclass:$val4, Offseti32imm:$a),
2511                  !strconcat("st.param.v4", opstr,
2512                             " \t[func_retval0$a], {{$val, $val2, $val3, $val4}};"),
2513                  []>;
2514}
2515
2516let isCall=1 in {
2517  multiclass CALL<string OpcStr, SDNode OpNode> {
2518     def PrintCallNoRetInst : NVPTXInst<(outs), (ins),
2519       !strconcat(OpcStr, " "), [(OpNode (i32 0))]>;
2520     def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
2521       !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>;
2522     def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
2523       !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>;
2524     def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
2525       !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>;
2526     def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
2527       !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
2528       [(OpNode (i32 4))]>;
2529     def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
2530       !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "),
2531       [(OpNode (i32 5))]>;
2532     def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
2533       !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
2534                            "retval5), "),
2535       [(OpNode (i32 6))]>;
2536     def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
2537       !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
2538                            "retval5, retval6), "),
2539       [(OpNode (i32 7))]>;
2540     def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
2541       !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
2542                            "retval5, retval6, retval7), "),
2543       [(OpNode (i32 8))]>;
2544  }
2545}
2546
2547defm Call : CALL<"call", PrintCall>;
2548defm CallUni : CALL<"call.uni", PrintCallUni>;
2549
2550// Convergent call instructions.  These are identical to regular calls, except
2551// they have the isConvergent bit set.
2552let isConvergent=1 in {
2553  defm ConvergentCall : CALL<"call", PrintConvergentCall>;
2554  defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
2555}
2556
2557def LoadParamMemI64    : LoadParamMemInst<Int64Regs, ".b64">;
2558def LoadParamMemI32    : LoadParamMemInst<Int32Regs, ".b32">;
2559def LoadParamMemI16    : LoadParamMemInst<Int16Regs, ".b16">;
2560def LoadParamMemI8     : LoadParamMemInst<Int16Regs, ".b8">;
2561def LoadParamMemV2I64  : LoadParamV2MemInst<Int64Regs, ".b64">;
2562def LoadParamMemV2I32  : LoadParamV2MemInst<Int32Regs, ".b32">;
2563def LoadParamMemV2I16  : LoadParamV2MemInst<Int16Regs, ".b16">;
2564def LoadParamMemV2I8   : LoadParamV2MemInst<Int16Regs, ".b8">;
2565def LoadParamMemV4I32  : LoadParamV4MemInst<Int32Regs, ".b32">;
2566def LoadParamMemV4I16  : LoadParamV4MemInst<Int16Regs, ".b16">;
2567def LoadParamMemV4I8   : LoadParamV4MemInst<Int16Regs, ".b8">;
2568def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
2569def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
2570def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
2571def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
2572def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;
2573
2574defm StoreParamI64    : StoreParamInst<Int64Regs, i64imm, ".b64">;
2575defm StoreParamI32    : StoreParamInst<Int32Regs, i32imm, ".b32">;
2576defm StoreParamI16    : StoreParamInst<Int16Regs, i16imm, ".b16">;
2577defm StoreParamI8     : StoreParamInst<Int16Regs, i8imm,  ".b8">;
2578
2579defm StoreParamI8TruncI32 : StoreParamInst<Int32Regs, i8imm, ".b8", /* support_imm */ false>;
2580defm StoreParamI8TruncI64 : StoreParamInst<Int64Regs, i8imm, ".b8", /* support_imm */ false>;
2581
2582defm StoreParamV2I64  : StoreParamV2Inst<Int64Regs, i64imm, ".b64">;
2583defm StoreParamV2I32  : StoreParamV2Inst<Int32Regs, i32imm, ".b32">;
2584defm StoreParamV2I16  : StoreParamV2Inst<Int16Regs, i16imm, ".b16">;
2585defm StoreParamV2I8   : StoreParamV2Inst<Int16Regs, i8imm,  ".b8">;
2586
2587defm StoreParamV4I32  : StoreParamV4Inst<Int32Regs, i32imm, ".b32">;
2588defm StoreParamV4I16  : StoreParamV4Inst<Int16Regs, i16imm, ".b16">;
2589defm StoreParamV4I8   : StoreParamV4Inst<Int16Regs, i8imm,  ".b8">;
2590
2591defm StoreParamF32    : StoreParamInst<Float32Regs, f32imm, ".f32">;
2592defm StoreParamF64    : StoreParamInst<Float64Regs, f64imm, ".f64">;
2593
2594defm StoreParamV2F32  : StoreParamV2Inst<Float32Regs, f32imm, ".f32">;
2595defm StoreParamV2F64  : StoreParamV2Inst<Float64Regs, f64imm, ".f64">;
2596
2597defm StoreParamV4F32  : StoreParamV4Inst<Float32Regs, f32imm, ".f32">;
2598
2599def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
2600def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
2601def StoreRetvalI16    : StoreRetvalInst<Int16Regs, ".b16">;
2602def StoreRetvalI8     : StoreRetvalInst<Int16Regs, ".b8">;
2603def StoreRetvalI8TruncI32 : StoreRetvalInst<Int32Regs, ".b8">;
2604def StoreRetvalI8TruncI64 : StoreRetvalInst<Int64Regs, ".b8">;
2605def StoreRetvalV2I64  : StoreRetvalV2Inst<Int64Regs, ".b64">;
2606def StoreRetvalV2I32  : StoreRetvalV2Inst<Int32Regs, ".b32">;
2607def StoreRetvalV2I16  : StoreRetvalV2Inst<Int16Regs, ".b16">;
2608def StoreRetvalV2I8   : StoreRetvalV2Inst<Int16Regs, ".b8">;
2609def StoreRetvalV4I32  : StoreRetvalV4Inst<Int32Regs, ".b32">;
2610def StoreRetvalV4I16  : StoreRetvalV4Inst<Int16Regs, ".b16">;
2611def StoreRetvalV4I8   : StoreRetvalV4Inst<Int16Regs, ".b8">;
2612
2613def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
2614def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
2615def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".f64">;
2616def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".f32">;
2617def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".f32">;
2618
2619def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
2620def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
2621def CallArgEndInst0  : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
2622def RETURNInst       : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
2623
2624class CallArgInst<NVPTXRegClass regclass> :
2625  NVPTXInst<(outs), (ins regclass:$a), "$a, ",
2626            [(CallArg (i32 0), regclass:$a)]>;
2627
2628class CallArgInstVT<NVPTXRegClass regclass, ValueType vt> :
2629  NVPTXInst<(outs), (ins regclass:$a), "$a, ",
2630            [(CallArg (i32 0), vt:$a)]>;
2631
2632class LastCallArgInst<NVPTXRegClass regclass> :
2633  NVPTXInst<(outs), (ins regclass:$a), "$a",
2634            [(LastCallArg (i32 0), regclass:$a)]>;
2635class LastCallArgInstVT<NVPTXRegClass regclass, ValueType vt> :
2636  NVPTXInst<(outs), (ins regclass:$a), "$a",
2637            [(LastCallArg (i32 0), vt:$a)]>;
2638
2639def CallArgI64     : CallArgInst<Int64Regs>;
2640def CallArgI32     : CallArgInstVT<Int32Regs, i32>;
2641def CallArgI16     : CallArgInstVT<Int16Regs, i16>;
2642def CallArgF64     : CallArgInst<Float64Regs>;
2643def CallArgF32     : CallArgInst<Float32Regs>;
2644
2645def LastCallArgI64 : LastCallArgInst<Int64Regs>;
2646def LastCallArgI32 : LastCallArgInstVT<Int32Regs, i32>;
2647def LastCallArgI16 : LastCallArgInstVT<Int16Regs, i16>;
2648def LastCallArgF64 : LastCallArgInst<Float64Regs>;
2649def LastCallArgF32 : LastCallArgInst<Float32Regs>;
2650
2651def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
2652                              [(CallArg (i32 0), (i32 imm:$a))]>;
2653def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
2654                                  [(LastCallArg (i32 0), (i32 imm:$a))]>;
2655
2656def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
2657                             [(CallArg (i32 1), (i32 imm:$a))]>;
2658def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
2659                                 [(LastCallArg (i32 1), (i32 imm:$a))]>;
2660
2661def CallVoidInst :      NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
2662                                  [(CallVoid (Wrapper tglobaladdr:$addr))]>;
2663def CallVoidInstReg :   NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
2664                                  [(CallVoid i32:$addr)]>;
2665def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
2666                                  [(CallVoid i64:$addr)]>;
2667def PrototypeInst :     NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
2668                                  [(Prototype (i32 imm:$val))]>;
2669
2670def DeclareRetMemInst :
2671  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
2672            ".param .align $align .b8 retval$num[$size];",
2673            [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
2674def DeclareRetScalarInst :
2675  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
2676            ".param .b$size retval$num;",
2677            [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
2678def DeclareRetRegInst :
2679  NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
2680            ".reg .b$size retval$num;",
2681            [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
2682
2683def DeclareParamInst :
2684  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
2685            ".param .align $align .b8 param$a[$size];",
2686            [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
2687def DeclareScalarParamInst :
2688  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
2689            ".param .b$size param$a;",
2690            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
2691def DeclareScalarRegInst :
2692  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
2693            ".reg .b$size param$a;",
2694            [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
2695
2696class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr> :
2697  NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
2698            !strconcat("mov", asmstr, " \t$dst, $src;"),
2699            [(set T:$dst, (MoveParam T:$src))]>;
2700
2701class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt,
2702                          string asmstr> :
2703  NVPTXInst<(outs regclass:$dst), (ins srcty:$src),
2704            !strconcat("mov", asmstr, " \t$dst, $src;"),
2705            [(set vt:$dst, (MoveParam texternalsym:$src))]>;
2706
2707def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">;
2708def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">;
2709
2710def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">;
2711def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">;
2712
2713def MoveParamI16 :
2714  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
2715            "cvt.u16.u32 \t$dst, $src;", // ??? Why cvt.u16.u32 ?
2716            [(set i16:$dst, (MoveParam i16:$src))]>;
2717def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">;
2718def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">;
2719
2720class PseudoUseParamInst<NVPTXRegClass regclass, ValueType vt> :
2721  NVPTXInst<(outs), (ins regclass:$src),
2722            "// Pseudo use of $src",
2723            [(PseudoUseParam vt:$src)]>;
2724
2725def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs, i64>;
2726def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs, i32>;
2727def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs, i16>;
2728def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>;
2729def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>;
2730
2731class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass> :
2732  NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
2733            !strconcat("mov.", SzStr, " \t$dst, $src;"),
2734            [(set T:$dst, (ProxyReg T:$src))]>;
2735
2736def ProxyRegI1    : ProxyRegInst<"pred", i1, Int1Regs>;
2737def ProxyRegI16   : ProxyRegInst<"b16",  i16, Int16Regs>;
2738def ProxyRegI32   : ProxyRegInst<"b32",  i32, Int32Regs>;
2739def ProxyRegI64   : ProxyRegInst<"b64",  i64, Int64Regs>;
2740def ProxyRegF32   : ProxyRegInst<"f32",  f32, Float32Regs>;
2741def ProxyRegF64   : ProxyRegInst<"f64",  f64, Float64Regs>;
2742
2743foreach vt = [f16, bf16] in {
2744  def: Pat<(vt (ProxyReg  vt:$src)), (ProxyRegI16 $src)>;
2745}
2746
2747foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
2748  def: Pat<(vt (ProxyReg  vt:$src)), (ProxyRegI32 $src)>;
2749}
2750
2751//
2752// Load / Store Handling
2753//
2754multiclass LD<NVPTXRegClass regclass> {
2755  def _avar : NVPTXInst<
2756    (outs regclass:$dst),
2757    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2758         i32imm:$fromWidth, imem:$addr),
2759    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2760    "\t$dst, [$addr];", []>;
2761  def _areg : NVPTXInst<
2762    (outs regclass:$dst),
2763    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2764         i32imm:$fromWidth, Int32Regs:$addr),
2765    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2766    "\t$dst, [$addr];", []>;
2767  def _areg_64 : NVPTXInst<
2768    (outs regclass:$dst),
2769    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2770         i32imm:$fromWidth, Int64Regs:$addr),
2771    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2772    "\t$dst, [$addr];", []>;
2773  def _ari : NVPTXInst<
2774    (outs regclass:$dst),
2775    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
2776         i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset),
2777    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2778    "\t$dst, [$addr$offset];", []>;
2779  def _ari_64 : NVPTXInst<
2780    (outs regclass:$dst),
2781    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2782         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset),
2783    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2784    "\t$dst, [$addr$offset];", []>;
2785  def _asi : NVPTXInst<
2786    (outs regclass:$dst),
2787    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2788         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset),
2789    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2790    "\t$dst, [$addr$offset];", []>;
2791}
2792
2793let mayLoad=1, hasSideEffects=0 in {
2794  defm LD_i8  : LD<Int16Regs>;
2795  defm LD_i16 : LD<Int16Regs>;
2796  defm LD_i32 : LD<Int32Regs>;
2797  defm LD_i64 : LD<Int64Regs>;
2798  defm LD_f32 : LD<Float32Regs>;
2799  defm LD_f64 : LD<Float64Regs>;
2800}
2801
2802multiclass ST<NVPTXRegClass regclass> {
2803  def _avar : NVPTXInst<
2804    (outs),
2805    (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
2806         LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
2807    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2808    " \t[$addr], $src;", []>;
2809  def _areg : NVPTXInst<
2810    (outs),
2811    (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
2812         LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
2813    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2814    " \t[$addr], $src;", []>;
2815  def _areg_64 : NVPTXInst<
2816    (outs),
2817    (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
2818         LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
2819    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2820    " \t[$addr], $src;", []>;
2821  def _ari : NVPTXInst<
2822    (outs),
2823    (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
2824         LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr,
2825         Offseti32imm:$offset),
2826    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2827    " \t[$addr$offset], $src;", []>;
2828  def _ari_64 : NVPTXInst<
2829    (outs),
2830    (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
2831         LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr,
2832         Offseti32imm:$offset),
2833    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2834    " \t[$addr$offset], $src;", []>;
2835  def _asi : NVPTXInst<
2836    (outs),
2837    (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
2838         LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr,
2839         Offseti32imm:$offset),
2840    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
2841    " \t[$addr$offset], $src;", []>;
2842}
2843
2844let mayStore=1, hasSideEffects=0 in {
2845  defm ST_i8  : ST<Int16Regs>;
2846  defm ST_i16 : ST<Int16Regs>;
2847  defm ST_i32 : ST<Int32Regs>;
2848  defm ST_i64 : ST<Int64Regs>;
2849  defm ST_f32 : ST<Float32Regs>;
2850  defm ST_f64 : ST<Float64Regs>;
2851}
2852
2853// The following instructions are used only in and after vector elementization.
2854// Vector elementization happens at the machine-instruction level, so these
2855// instructions never appear in the DAG.
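//
// Illustrative sketch (not actual emitted code; the address space, sign and
// register names below are placeholders): a load selected as a _v2 variant of
// LDV_i32 would print PTX along the lines of
//
//   ld.global.v2.u32 {%r0, %r1}, [gbuf];
//
// with the vector suffix and the extra destination registers supplied by the
// definitions below.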
2856multiclass LD_VEC<NVPTXRegClass regclass> {
2857  def _v2_avar : NVPTXInst<
2858    (outs regclass:$dst1, regclass:$dst2),
2859    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2860         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
2861    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2862    "\t{{$dst1, $dst2}}, [$addr];", []>;
2863  def _v2_areg : NVPTXInst<
2864    (outs regclass:$dst1, regclass:$dst2),
2865    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2866         LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
2867    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2868    "\t{{$dst1, $dst2}}, [$addr];", []>;
2869  def _v2_areg_64 : NVPTXInst<
2870    (outs regclass:$dst1, regclass:$dst2),
2871    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2872         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
2873    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2874    "\t{{$dst1, $dst2}}, [$addr];", []>;
2875  def _v2_ari : NVPTXInst<
2876    (outs regclass:$dst1, regclass:$dst2),
2877    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2878         LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset),
2879    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2880    "\t{{$dst1, $dst2}}, [$addr$offset];", []>;
2881  def _v2_ari_64 : NVPTXInst<
2882    (outs regclass:$dst1, regclass:$dst2),
2883    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2884         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset),
2885    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2886    "\t{{$dst1, $dst2}}, [$addr$offset];", []>;
2887  def _v2_asi : NVPTXInst<
2888    (outs regclass:$dst1, regclass:$dst2),
2889    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2890         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset),
2891    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2892    "\t{{$dst1, $dst2}}, [$addr$offset];", []>;
2893  def _v4_avar : NVPTXInst<
2894    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2895    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2896         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
2897    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2898    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
2899  def _v4_areg : NVPTXInst<
2900    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2901    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2902         LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
2903    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2904    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
2905  def _v4_areg_64 : NVPTXInst<
2906    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2907    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2908         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
2909    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2910    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
2911  def _v4_ari : NVPTXInst<
2912    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2913    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2914         LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset),
2915    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2916    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr$offset];", []>;
2917  def _v4_ari_64 : NVPTXInst<
2918    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2919    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2920         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset),
2921    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2922    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr$offset];", []>;
2923  def _v4_asi : NVPTXInst<
2924    (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
2925    (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2926         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset),
2927    "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2928    "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr$offset];", []>;
2929}
2930let mayLoad=1, hasSideEffects=0 in {
2931  defm LDV_i8  : LD_VEC<Int16Regs>;
2932  defm LDV_i16 : LD_VEC<Int16Regs>;
2933  defm LDV_i32 : LD_VEC<Int32Regs>;
2934  defm LDV_i64 : LD_VEC<Int64Regs>;
2935  defm LDV_f32 : LD_VEC<Float32Regs>;
2936  defm LDV_f64 : LD_VEC<Float64Regs>;
2937}
2938
2939multiclass ST_VEC<NVPTXRegClass regclass> {
2940  def _v2_avar : NVPTXInst<
2941    (outs),
2942    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
2943         LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
2944         imem:$addr),
2945    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2946    "\t[$addr], {{$src1, $src2}};", []>;
2947  def _v2_areg : NVPTXInst<
2948    (outs),
2949    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
2950         LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
2951         Int32Regs:$addr),
2952    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2953    "\t[$addr], {{$src1, $src2}};", []>;
2954  def _v2_areg_64 : NVPTXInst<
2955    (outs),
2956    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
2957         LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
2958         Int64Regs:$addr),
2959    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2960    "\t[$addr], {{$src1, $src2}};", []>;
2961  def _v2_ari : NVPTXInst<
2962    (outs),
2963    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
2964         LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
2965         Int32Regs:$addr, Offseti32imm:$offset),
2966    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2967    "\t[$addr$offset], {{$src1, $src2}};", []>;
2968  def _v2_ari_64 : NVPTXInst<
2969    (outs),
2970    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
2971         LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
2972         Int64Regs:$addr, Offseti32imm:$offset),
2973    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2974    "\t[$addr$offset], {{$src1, $src2}};", []>;
2975  def _v2_asi : NVPTXInst<
2976    (outs),
2977    (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
2978         LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
2979         imem:$addr, Offseti32imm:$offset),
2980    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2981    "\t[$addr$offset], {{$src1, $src2}};", []>;
2982  def _v4_avar : NVPTXInst<
2983    (outs),
2984    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
2985         LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2986         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
2987    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2988    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
2989  def _v4_areg : NVPTXInst<
2990    (outs),
2991    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
2992         LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
2993         LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
2994    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
2995    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
2996  def _v4_areg_64 : NVPTXInst<
2997    (outs),
2998    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
2999         LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
3000         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
3001    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3002    "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
3003  def _v4_ari : NVPTXInst<
3004    (outs),
3005    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
3006         LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
3007         LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, Offseti32imm:$offset),
3008    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3009    "\t[$addr$offset], {{$src1, $src2, $src3, $src4}};", []>;
3010  def _v4_ari_64 : NVPTXInst<
3011    (outs),
3012    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
3013         LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
3014         LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, Offseti32imm:$offset),
3015    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
3016    "\t[$addr$offset], {{$src1, $src2, $src3, $src4}};", []>;
3017  def _v4_asi : NVPTXInst<
3018    (outs),
3019    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
3020         LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
3021         LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, Offseti32imm:$offset),
3022    "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}"
3023    "$fromWidth \t[$addr$offset], {{$src1, $src2, $src3, $src4}};", []>;
3024}
3025
3026let mayStore=1, hasSideEffects=0 in {
3027  defm STV_i8  : ST_VEC<Int16Regs>;
3028  defm STV_i16 : ST_VEC<Int16Regs>;
3029  defm STV_i32 : ST_VEC<Int32Regs>;
3030  defm STV_i64 : ST_VEC<Int64Regs>;
3031  defm STV_f32 : ST_VEC<Float32Regs>;
3032  defm STV_f64 : ST_VEC<Float64Regs>;
3033}
3034
3035//---- Conversion ----
3036
3037class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut,
3038  NVPTXRegClass regclassIn = ValueToRegClass<TIn>.ret,
3039  NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret> :
3040           NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
3041           !strconcat("mov.b", SzStr, " \t$d, $a;"),
3042     [(set TOut:$d, (bitconvert TIn:$a))]>;
3043
3044def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>;
3045def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>;
3046def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>;
3047def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;
3048
3049foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
3050def: Pat<(vt (bitconvert (f32 Float32Regs:$a))),
3051         (BITCONVERT_32_F2I $a)>;
3052def: Pat<(f32 (bitconvert vt:$a)),
3053         (BITCONVERT_32_I2F $a)>;
3054}
3055foreach vt = [f16, bf16] in {
3056  def: Pat<(vt (bitconvert i16:$a)),
3057           (vt Int16Regs:$a)>;
3058  def: Pat<(i16 (bitconvert vt:$a)),
3059           (i16 Int16Regs:$a)>;
3060}
3061
3062foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in {
3063  foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in {
3064    if !ne(ta, tb) then {
3065      def: Pat<(ta (bitconvert tb:$a)),
3066               (ta Int32Regs:$a)>;
3067    }
3068  }
3069}
3070
3071// NOTE: pred->fp conversions are currently sub-optimal due to an issue in
3072// TableGen where we cannot specify floating-point literals in isel patterns.
3073// Therefore, we use an integer selp to select either 1 (or -1 for signed) or 0,
3074// and then cvt the result to floating point.
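//
// For example (an illustrative sketch with placeholder register names), a
// signed i1 -> f32 conversion selected through the patterns below prints
// roughly:
//
//   selp.s32       %r1, -1, 0, %p1;   // materialize -1 or 0 as an integer
//   cvt.rn.f32.s32 %f1, %r1;          // then convert to floating point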
3075
3076// sint -> f16
3077def : Pat<(f16 (sint_to_fp i1:$a)),
3078          (CVT_f16_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
3079def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
3080          (CVT_f16_s16 $a, CvtRN)>;
3081def : Pat<(f16 (sint_to_fp i32:$a)),
3082          (CVT_f16_s32 $a, CvtRN)>;
3083def : Pat<(f16 (sint_to_fp i64:$a)),
3084          (CVT_f16_s64 $a, CvtRN)>;
3085
3086// uint -> f16
3087def : Pat<(f16 (uint_to_fp i1:$a)),
3088          (CVT_f16_u32 (SELP_u32ii 1, 0, $a), CvtRN)>;
3089def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
3090          (CVT_f16_u16 $a, CvtRN)>;
3091def : Pat<(f16 (uint_to_fp i32:$a)),
3092          (CVT_f16_u32 $a, CvtRN)>;
3093def : Pat<(f16 (uint_to_fp i64:$a)),
3094          (CVT_f16_u64 $a, CvtRN)>;
3095
3096// sint -> bf16
3097def : Pat<(bf16 (sint_to_fp i1:$a)),
3098          (CVT_bf16_s32 (SELP_s32ii -1, 0, $a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3099def : Pat<(bf16 (sint_to_fp i16:$a)),
3100          (CVT_bf16_s16 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3101def : Pat<(bf16 (sint_to_fp i32:$a)),
3102          (CVT_bf16_s32 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3103def : Pat<(bf16 (sint_to_fp i64:$a)),
3104          (CVT_bf16_s64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3105
3106// uint -> bf16
3107def : Pat<(bf16 (uint_to_fp i1:$a)),
3108          (CVT_bf16_u32 (SELP_u32ii 1, 0, $a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3109def : Pat<(bf16 (uint_to_fp i16:$a)),
3110          (CVT_bf16_u16 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3111def : Pat<(bf16 (uint_to_fp i32:$a)),
3112          (CVT_bf16_u32 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3113def : Pat<(bf16 (uint_to_fp i64:$a)),
3114          (CVT_bf16_u64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3115
3116// sint -> f32
3117def : Pat<(f32 (sint_to_fp i1:$a)),
3118          (CVT_f32_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
3119def : Pat<(f32 (sint_to_fp i16:$a)),
3120          (CVT_f32_s16 $a, CvtRN)>;
3121def : Pat<(f32 (sint_to_fp i32:$a)),
3122          (CVT_f32_s32 $a, CvtRN)>;
3123def : Pat<(f32 (sint_to_fp i64:$a)),
3124          (CVT_f32_s64 $a, CvtRN)>;
3125
3126// uint -> f32
3127def : Pat<(f32 (uint_to_fp i1:$a)),
3128          (CVT_f32_u32 (SELP_u32ii 1, 0, $a), CvtRN)>;
3129def : Pat<(f32 (uint_to_fp i16:$a)),
3130          (CVT_f32_u16 $a, CvtRN)>;
3131def : Pat<(f32 (uint_to_fp i32:$a)),
3132          (CVT_f32_u32 $a, CvtRN)>;
3133def : Pat<(f32 (uint_to_fp i64:$a)),
3134          (CVT_f32_u64 $a, CvtRN)>;
3135
3136// sint -> f64
3137def : Pat<(f64 (sint_to_fp i1:$a)),
3138          (CVT_f64_s32 (SELP_s32ii -1, 0, $a), CvtRN)>;
3139def : Pat<(f64 (sint_to_fp i16:$a)),
3140          (CVT_f64_s16 $a, CvtRN)>;
3141def : Pat<(f64 (sint_to_fp i32:$a)),
3142          (CVT_f64_s32 $a, CvtRN)>;
3143def : Pat<(f64 (sint_to_fp i64:$a)),
3144          (CVT_f64_s64 $a, CvtRN)>;
3145
3146// uint -> f64
3147def : Pat<(f64 (uint_to_fp i1:$a)),
3148          (CVT_f64_u32 (SELP_u32ii 1, 0, $a), CvtRN)>;
3149def : Pat<(f64 (uint_to_fp i16:$a)),
3150          (CVT_f64_u16 $a, CvtRN)>;
3151def : Pat<(f64 (uint_to_fp i32:$a)),
3152          (CVT_f64_u32 $a, CvtRN)>;
3153def : Pat<(f64 (uint_to_fp i64:$a)),
3154          (CVT_f64_u64 $a, CvtRN)>;
3155
3156
3157// f16 -> sint
3158def : Pat<(i1 (fp_to_sint f16:$a)),
3159          (SETP_b16ri $a, 0, CmpEQ)>;
3160def : Pat<(i16 (fp_to_sint f16:$a)),
3161          (CVT_s16_f16 $a, CvtRZI)>;
3162def : Pat<(i32 (fp_to_sint f16:$a)),
3163          (CVT_s32_f16 $a, CvtRZI)>;
3164def : Pat<(i64 (fp_to_sint f16:$a)),
3165          (CVT_s64_f16 $a, CvtRZI)>;
3166
3167// f16 -> uint
3168def : Pat<(i1 (fp_to_uint f16:$a)),
3169          (SETP_b16ri $a, 0, CmpEQ)>;
3170def : Pat<(i16 (fp_to_uint f16:$a)),
3171          (CVT_u16_f16 $a, CvtRZI)>;
3172def : Pat<(i32 (fp_to_uint f16:$a)),
3173          (CVT_u32_f16 $a, CvtRZI)>;
3174def : Pat<(i64 (fp_to_uint f16:$a)),
3175          (CVT_u64_f16 $a, CvtRZI)>;
3176
3177// bf16 -> sint
3178def : Pat<(i1 (fp_to_sint bf16:$a)),
3179          (SETP_b16ri $a, 0, CmpEQ)>;
3180def : Pat<(i16 (fp_to_sint bf16:$a)),
3181          (CVT_s16_bf16 $a, CvtRZI)>;
3182def : Pat<(i32 (fp_to_sint bf16:$a)),
3183          (CVT_s32_bf16 $a, CvtRZI)>;
3184def : Pat<(i64 (fp_to_sint bf16:$a)),
3185          (CVT_s64_bf16 $a, CvtRZI)>;
3186
3187// bf16 -> uint
3188def : Pat<(i1 (fp_to_uint bf16:$a)),
3189          (SETP_b16ri $a, 0, CmpEQ)>;
3190def : Pat<(i16 (fp_to_uint bf16:$a)),
3191          (CVT_u16_bf16 $a, CvtRZI)>;
3192def : Pat<(i32 (fp_to_uint bf16:$a)),
3193          (CVT_u32_bf16 $a, CvtRZI)>;
3194def : Pat<(i64 (fp_to_uint bf16:$a)),
3195          (CVT_u64_bf16 $a, CvtRZI)>;
3196// f32 -> sint
3197def : Pat<(i1 (fp_to_sint f32:$a)),
3198          (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>;
3199def : Pat<(i16 (fp_to_sint f32:$a)),
3200          (CVT_s16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3201def : Pat<(i16 (fp_to_sint f32:$a)),
3202          (CVT_s16_f32 $a, CvtRZI)>;
3203def : Pat<(i32 (fp_to_sint f32:$a)),
3204          (CVT_s32_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3205def : Pat<(i32 (fp_to_sint f32:$a)),
3206          (CVT_s32_f32 $a, CvtRZI)>;
3207def : Pat<(i64 (fp_to_sint f32:$a)),
3208          (CVT_s64_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3209def : Pat<(i64 (fp_to_sint f32:$a)),
3210          (CVT_s64_f32 $a, CvtRZI)>;
3211
3212// f32 -> uint
3213def : Pat<(i1 (fp_to_uint f32:$a)),
3214          (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>;
3215def : Pat<(i16 (fp_to_uint f32:$a)),
3216          (CVT_u16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3217def : Pat<(i16 (fp_to_uint f32:$a)),
3218          (CVT_u16_f32 $a, CvtRZI)>;
3219def : Pat<(i32 (fp_to_uint f32:$a)),
3220          (CVT_u32_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3221def : Pat<(i32 (fp_to_uint f32:$a)),
3222          (CVT_u32_f32 $a, CvtRZI)>;
3223def : Pat<(i64 (fp_to_uint f32:$a)),
3224          (CVT_u64_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
3225def : Pat<(i64 (fp_to_uint f32:$a)),
3226          (CVT_u64_f32 $a, CvtRZI)>;
3227
3228// f64 -> sint
3229def : Pat<(i1 (fp_to_sint f64:$a)),
3230          (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>;
3231def : Pat<(i16 (fp_to_sint f64:$a)),
3232          (CVT_s16_f64 $a, CvtRZI)>;
3233def : Pat<(i32 (fp_to_sint f64:$a)),
3234          (CVT_s32_f64 $a, CvtRZI)>;
3235def : Pat<(i64 (fp_to_sint f64:$a)),
3236          (CVT_s64_f64 $a, CvtRZI)>;
3237
3238// f64 -> uint
3239def : Pat<(i1 (fp_to_uint f64:$a)),
3240          (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>;
3241def : Pat<(i16 (fp_to_uint f64:$a)),
3242          (CVT_u16_f64 $a, CvtRZI)>;
3243def : Pat<(i32 (fp_to_uint f64:$a)),
3244          (CVT_u32_f64 $a, CvtRZI)>;
3245def : Pat<(i64 (fp_to_uint f64:$a)),
3246          (CVT_u64_f64 $a, CvtRZI)>;
3247
3248// sext i1
3249def : Pat<(i16 (sext i1:$a)),
3250          (SELP_s16ii -1, 0, $a)>;
3251def : Pat<(i32 (sext i1:$a)),
3252          (SELP_s32ii -1, 0, $a)>;
3253def : Pat<(i64 (sext i1:$a)),
3254          (SELP_s64ii -1, 0, $a)>;
3255
3256// zext i1
3257def : Pat<(i16 (zext i1:$a)),
3258          (SELP_u16ii 1, 0, $a)>;
3259def : Pat<(i32 (zext i1:$a)),
3260          (SELP_u32ii 1, 0, $a)>;
3261def : Pat<(i64 (zext i1:$a)),
3262          (SELP_u64ii 1, 0, $a)>;
3263
3264// anyext i1
3265def : Pat<(i16 (anyext i1:$a)),
3266          (SELP_u16ii -1, 0, $a)>;
3267def : Pat<(i32 (anyext i1:$a)),
3268          (SELP_u32ii -1, 0, $a)>;
3269def : Pat<(i64 (anyext i1:$a)),
3270          (SELP_u64ii -1, 0, $a)>;
3271
3272// sext i16
3273def : Pat<(i32 (sext i16:$a)),
3274          (CVT_s32_s16 $a, CvtNONE)>;
3275def : Pat<(i64 (sext i16:$a)),
3276          (CVT_s64_s16 $a, CvtNONE)>;
3277
3278// zext i16
3279def : Pat<(i32 (zext i16:$a)),
3280          (CVT_u32_u16 $a, CvtNONE)>;
3281def : Pat<(i64 (zext i16:$a)),
3282          (CVT_u64_u16 $a, CvtNONE)>;
3283
3284// anyext i16
3285def : Pat<(i32 (anyext i16:$a)),
3286          (CVT_u32_u16 $a, CvtNONE)>;
3287def : Pat<(i64 (anyext i16:$a)),
3288          (CVT_u64_u16 $a, CvtNONE)>;
3289
3290// sext i32
3291def : Pat<(i64 (sext i32:$a)),
3292          (CVT_s64_s32 $a, CvtNONE)>;
3293
3294// zext i32
3295def : Pat<(i64 (zext i32:$a)),
3296          (CVT_u64_u32 $a, CvtNONE)>;
3297
3298// anyext i32
3299def : Pat<(i64 (anyext i32:$a)),
3300          (CVT_u64_u32 $a, CvtNONE)>;
3301
3302
3303// truncate i64
3304def : Pat<(i32 (trunc i64:$a)),
3305          (CVT_u32_u64 $a, CvtNONE)>;
3306def : Pat<(i16 (trunc i64:$a)),
3307          (CVT_u16_u64 $a, CvtNONE)>;
3308def : Pat<(i1 (trunc i64:$a)),
3309          (SETP_b64ri (ANDb64ri $a, 1), 1, CmpEQ)>;
3310
3311// truncate i32
3312def : Pat<(i16 (trunc i32:$a)),
3313          (CVT_u16_u32 $a, CvtNONE)>;
3314def : Pat<(i1 (trunc i32:$a)),
3315          (SETP_b32ri (ANDb32ri $a, 1), 1, CmpEQ)>;
3316
3317// truncate i16
3318def : Pat<(i1 (trunc i16:$a)),
3319          (SETP_b16ri (ANDb16ri $a, 1), 1, CmpEQ)>;
3320
3321// sext_inreg
3322def : Pat<(sext_inreg i16:$a, i8), (CVT_INREG_s16_s8 $a)>;
3323def : Pat<(sext_inreg i32:$a, i8), (CVT_INREG_s32_s8 $a)>;
3324def : Pat<(sext_inreg i32:$a, i16), (CVT_INREG_s32_s16 $a)>;
3325def : Pat<(sext_inreg i64:$a, i8), (CVT_INREG_s64_s8 $a)>;
3326def : Pat<(sext_inreg i64:$a, i16), (CVT_INREG_s64_s16 $a)>;
3327def : Pat<(sext_inreg i64:$a, i32), (CVT_INREG_s64_s32 $a)>;
3328
3329
3330// Select instructions with 32-bit predicates
3331def : Pat<(select i32:$pred, i16:$a, i16:$b),
3332          (SELP_b16rr $a, $b,
3333          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
3334def : Pat<(select i32:$pred, i32:$a, i32:$b),
3335          (SELP_b32rr $a, $b,
3336          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
3337def : Pat<(select i32:$pred, i64:$a, i64:$b),
3338          (SELP_b64rr $a, $b,
3339          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
3340def : Pat<(select i32:$pred, f16:$a, f16:$b),
3341          (SELP_f16rr $a, $b,
3342          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
3343def : Pat<(select i32:$pred, bf16:$a, bf16:$b),
3344          (SELP_bf16rr $a, $b,
3345          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
3346def : Pat<(select i32:$pred, f32:$a, f32:$b),
3347          (SELP_f32rr $a, $b,
3348          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
3349def : Pat<(select i32:$pred, f64:$a, f64:$b),
3350          (SELP_f64rr $a, $b,
3351          (SETP_b32ri (ANDb32ri $pred, 1), 1, CmpEQ))>;
3352
3353
3354let hasSideEffects = false in {
3355  // pack a set of smaller int registers to a larger int register
3356  def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
3357                             (ins Int16Regs:$s1, Int16Regs:$s2,
3358                                  Int16Regs:$s3, Int16Regs:$s4),
3359                             "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
3360  def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
3361                             (ins Int16Regs:$s1, Int16Regs:$s2),
3362                             "mov.b32 \t$d, {{$s1, $s2}};", []>;
3363  def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
3364                             (ins Int32Regs:$s1, Int32Regs:$s2),
3365                             "mov.b64 \t$d, {{$s1, $s2}};", []>;
3366  def V2I64toI128 : NVPTXInst<(outs Int128Regs:$d),
3367                              (ins Int64Regs:$s1, Int64Regs:$s2),
3368                              "mov.b128 \t$d, {{$s1, $s2}};", []>;
3369  def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
3370                             (ins Float32Regs:$s1, Float32Regs:$s2),
3371                             "mov.b64 \t$d, {{$s1, $s2}};", []>;
3372
3373  // unpack a larger int register to a set of smaller int registers
3374  def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
3375                                   Int16Regs:$d3, Int16Regs:$d4),
3376                             (ins Int64Regs:$s),
3377                             "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
3378  def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
3379                             (ins Int32Regs:$s),
3380                             "mov.b32 \t{{$d1, $d2}}, $s;", []>;
3381  def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
3382                             (ins Int64Regs:$s),
3383                             "mov.b64 \t{{$d1, $d2}}, $s;", []>;
3384  def I128toV2I64: NVPTXInst<(outs Int64Regs:$d1, Int64Regs:$d2),
3385                              (ins Int128Regs:$s),
3386                              "mov.b128 \t{{$d1, $d2}}, $s;", []>;
3387  def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
3388                             (ins Float64Regs:$s),
3389                             "mov.b64 \t{{$d1, $d2}}, $s;", []>;
3390
3391  def I32toI16H  : NVPTXInst<(outs Int16Regs:$high),
3392                             (ins Int32Regs:$s),
3393                             "{{ .reg .b16 tmp; mov.b32 {tmp, $high}, $s; }}",
3394                             []>;
3395  def I32toI16L  : NVPTXInst<(outs Int16Regs:$low),
3396                             (ins Int32Regs:$s),
3397                             "{{ .reg .b16 tmp; mov.b32 {$low, tmp}, $s; }}",
3398                             []>;
3399  def I64toI32H  : NVPTXInst<(outs Int32Regs:$high),
3400                             (ins Int64Regs:$s),
3401                             "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}",
3402                             []>;
3403  def I64toI32L  : NVPTXInst<(outs Int32Regs:$low),
3404                             (ins Int64Regs:$s),
3405                             "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
3406                             []>;
3407
3408}
3409
3410// Using a partially vectorized move produces better SASS code for extracting
3411// the upper/lower parts of an integer.
3412def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))),
3413          (I32toI16H $s)>;
3414def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))),
3415          (I32toI16H $s)>;
3416def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))),
3417          (I64toI32H $s)>;
3418def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))),
3419          (I64toI32H $s)>;
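//
// Illustrative expansion of the first pattern above (placeholder registers):
//
//   { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
//
// i.e. the high half is taken directly by the partially vectorized mov instead
// of a shift followed by a convert.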
3420
3421def: Pat<(i32 (sext (extractelt v2i16:$src, 0))),
3422         (CVT_INREG_s32_s16 $src)>;
3423
3424foreach vt = [v2f16, v2bf16, v2i16] in {
3425def : Pat<(extractelt vt:$src, 0),
3426          (I32toI16L $src)>;
3427def : Pat<(extractelt vt:$src, 1),
3428          (I32toI16H $src)>;
3429}
3430def : Pat<(v2f16 (build_vector f16:$a, f16:$b)),
3431          (V2I16toI32 $a, $b)>;
3432def : Pat<(v2bf16 (build_vector bf16:$a, bf16:$b)),
3433          (V2I16toI32 $a, $b)>;
3434def : Pat<(v2i16 (build_vector i16:$a, i16:$b)),
3435          (V2I16toI32 $a, $b)>;
3436
3437def: Pat<(v2i16 (scalar_to_vector i16:$a)),
3438         (CVT_u32_u16 $a, CvtNONE)>;
3439
3440//
3441// Funnel-Shift
3442//
3443
3444// Create SDNodes so they can be used in the DAG code, e.g.
3445// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
3446def fshl_clamp : SDNode<"NVPTXISD::FSHL_CLAMP", SDTIntShiftDOp, []>;
3447def fshr_clamp : SDNode<"NVPTXISD::FSHR_CLAMP", SDTIntShiftDOp, []>;
3448
3449// Funnel shift, requires >= sm_32.  Does not trap if amt is out of range, so
3450// no side effects.
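// As a worked example of the semantics (not emitted code): for 32-bit values,
// fshl(hi, lo, amt) returns the top 32 bits of the 64-bit concatenation
// {hi, lo} shifted left by amt, e.g.
//
//   fshl(0xAABBCCDD, 0x11223344, 8) == 0xBBCCDD11
//
// The shf.{l,r}.{clamp,wrap}.b32 instructions below provide this directly;
// clamp saturates the shift amount at 32, while wrap reduces it modulo 32.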
3451let hasSideEffects = false in {
3452  multiclass ShfInst<string mode, SDNode op> {
3453    def _i
3454      : NVPTXInst<(outs Int32Regs:$dst),
3455                  (ins  Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
3456                  "shf." # mode # ".b32 \t$dst, $lo, $hi, $amt;",
3457                  [(set i32:$dst,
3458                      (op i32:$hi, i32:$lo, (i32 imm:$amt)))]>,
3459        Requires<[hasHWROT32]>;
3460
3461    def _r
3462      : NVPTXInst<(outs Int32Regs:$dst),
3463                  (ins  Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
3464                  "shf." # mode # ".b32 \t$dst, $lo, $hi, $amt;",
3465                  [(set i32:$dst,
3466                      (op i32:$hi, i32:$lo, i32:$amt))]>,
3467        Requires<[hasHWROT32]>;
3468  }
3469
3470  defm SHF_L_CLAMP : ShfInst<"l.clamp", fshl_clamp>;
3471  defm SHF_R_CLAMP : ShfInst<"r.clamp", fshr_clamp>;
3472  defm SHF_L_WRAP  : ShfInst<"l.wrap", fshl>;
3473  defm SHF_R_WRAP  : ShfInst<"r.wrap", fshr>;
3474}
3475
3476def : Pat<(i32 (int_nvvm_fshl_clamp i32:$hi, i32:$lo, i32:$amt)),
3477          (SHF_L_CLAMP_r $lo, $hi, $amt)>;
3478def : Pat<(i32 (int_nvvm_fshl_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
3479          (SHF_L_CLAMP_i $lo, $hi, imm:$amt)>;
3480def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, i32:$amt)),
3481          (SHF_R_CLAMP_r $lo, $hi, $amt)>;
3482def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
3483          (SHF_R_CLAMP_i $lo, $hi, imm:$amt)>;
3484
3485// Count leading zeros
3486let hasSideEffects = false in {
3487  def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
3488                         "clz.b32 \t$d, $a;", []>;
3489  def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
3490                         "clz.b64 \t$d, $a;", []>;
3491}
3492
3493// 32-bit has a direct PTX instruction
3494def : Pat<(i32 (ctlz i32:$a)), (CLZr32 $a)>;
3495
3496// The return type of the ctlz ISD node is the same as its input, but the PTX
3497// clz instruction always returns a 32-bit value.  For ctlz.i64, convert the
3498// PTX value to 64 bits to match the ISD node's semantics, unless we know we're
3499// truncating back down to 32 bits.
3500def : Pat<(i64 (ctlz i64:$a)), (CVT_u64_u32 (CLZr64 $a), CvtNONE)>;
3501def : Pat<(i32 (trunc (i64 (ctlz i64:$a)))), (CLZr64 $a)>;
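//
// Illustrative expansion (placeholder registers): a plain 64-bit ctlz selects
// to roughly
//
//   clz.b64     %r1, %rd1;
//   cvt.u64.u32 %rd2, %r1;
//
// while a ctlz whose result is immediately truncated to i32 keeps only the
// clz.b64.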
3502
3503// For 16-bit ctlz, we zero-extend to 32 bits, perform the count, then truncate
3504// the result back to 16 bits if necessary.  We also need to subtract 16 because
3505// the high-order 16 zeros introduced by the extension were counted.
3506//
3507// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
3508// use to save one SASS instruction (on sm_35 anyway):
3509//
3510//   mov.b32 $tmp, {0xffff, $a}
3511//   ctlz.b32 $result, $tmp
3512//
3513// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
3514// and then ctlz that value.  This way we don't have to subtract 16 from the
3515// result.  Unfortunately today we don't have a way to generate
3516// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
3517def : Pat<(i16 (ctlz i16:$a)),
3518          (SUBi16ri (CVT_u16_u32
3519           (CLZr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE), 16)>;
3520def : Pat<(i32 (zext (i16 (ctlz i16:$a)))),
3521          (SUBi32ri (CLZr32 (CVT_u32_u16 $a, CvtNONE)), 16)>;
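//
// Illustrative expansion of the first pattern above (placeholder registers,
// mnemonics approximate): the 16 leading zeros introduced by the zero-extension
// are removed by the trailing subtract.
//
//   cvt.u32.u16 %r1, %rs1;
//   clz.b32     %r2, %r1;
//   cvt.u16.u32 %rs2, %r2;
//   sub.s16     %rs3, %rs2, 16;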
3522
3523// Population count
3524let hasSideEffects = false in {
3525  def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
3526                          "popc.b32 \t$d, $a;", []>;
3527  def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
3528                          "popc.b64 \t$d, $a;", []>;
3529}
3530
3531// 32-bit has a direct PTX instruction
3532def : Pat<(i32 (ctpop i32:$a)), (POPCr32 $a)>;
3533
3534// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit
3535// to match the LLVM semantics.  Just as with ctlz.i64, we provide a second
3536// pattern that avoids the type conversion if we're truncating the result to
3537// i32 anyway.
3538def : Pat<(ctpop i64:$a), (CVT_u64_u32 (POPCr64 $a), CvtNONE)>;
3539def : Pat<(i32 (trunc (i64 (ctpop i64:$a)))), (POPCr64 $a)>;
3540
3541// For 16-bit, we zero-extend to 32 bits, then truncate the result back to 16 bits.
3542// If we know that we're storing into an i32, we can avoid the final trunc.
3543def : Pat<(ctpop i16:$a),
3544          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 $a, CvtNONE)), CvtNONE)>;
3545def : Pat<(i32 (zext (i16 (ctpop i16:$a)))),
3546          (POPCr32 (CVT_u32_u16 $a, CvtNONE))>;
3547
3548// fpround f32 -> f16
3549def : Pat<(f16 (fpround f32:$a)),
3550          (CVT_f16_f32 $a, CvtRN)>;
3551
3552// fpround f32 -> bf16
3553def : Pat<(bf16 (fpround f32:$a)),
3554          (CVT_bf16_f32 $a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>;
3555
3556// fpround f64 -> f16
3557def : Pat<(f16 (fpround f64:$a)),
3558          (CVT_f16_f64 $a, CvtRN)>;
3559
3560// fpround f64 -> bf16
3561def : Pat<(bf16 (fpround f64:$a)),
3562          (CVT_bf16_f64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
3563// fpround f64 -> f32
3564def : Pat<(f32 (fpround f64:$a)),
3565          (CVT_f32_f64 $a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
3566def : Pat<(f32 (fpround f64:$a)),
3567          (CVT_f32_f64 $a, CvtRN)>;
3568
3569// fpextend f16 -> f32
3570def : Pat<(f32 (fpextend f16:$a)),
3571          (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
3572def : Pat<(f32 (fpextend f16:$a)),
3573          (CVT_f32_f16 $a, CvtNONE)>;
3574// fpextend bf16 -> f32
3575def : Pat<(f32 (fpextend bf16:$a)),
3576          (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
3577def : Pat<(f32 (fpextend bf16:$a)),
3578          (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
3579
3580// fpextend f16 -> f64
3581def : Pat<(f64 (fpextend f16:$a)),
3582          (CVT_f64_f16 $a, CvtNONE)>;
3583
3584// fpextend bf16 -> f64
3585def : Pat<(f64 (fpextend bf16:$a)),
3586          (CVT_f64_bf16 $a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>;
3587
3588// fpextend f32 -> f64
3589def : Pat<(f64 (fpextend f32:$a)),
3590          (CVT_f64_f32 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
3591def : Pat<(f64 (fpextend f32:$a)),
3592          (CVT_f64_f32 $a, CvtNONE)>;
3593
3594def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone,
3595                     [SDNPHasChain, SDNPOptInGlue]>;
3596
3597// fceil, ffloor, froundeven, ftrunc.
3598
3599multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
3600  def : Pat<(OpNode f16:$a),
3601            (CVT_f16_f16 $a, Mode)>;
3602  def : Pat<(OpNode bf16:$a),
3603            (CVT_bf16_bf16 $a, Mode)>;
3604  def : Pat<(OpNode f32:$a),
3605            (CVT_f32_f32 $a, ModeFTZ)>, Requires<[doF32FTZ]>;
3606  def : Pat<(OpNode f32:$a),
3607            (CVT_f32_f32 $a, Mode)>, Requires<[doNoF32FTZ]>;
3608  def : Pat<(OpNode f64:$a),
3609            (CVT_f64_f64 $a, Mode)>;
3610}
3611
3612defm : CVT_ROUND<fceil, CvtRPI, CvtRPI_FTZ>;
3613defm : CVT_ROUND<ffloor, CvtRMI, CvtRMI_FTZ>;
3614defm : CVT_ROUND<froundeven, CvtRNI, CvtRNI_FTZ>;
3615defm : CVT_ROUND<ftrunc, CvtRZI, CvtRZI_FTZ>;
3616
3617// nearbyint and rint are implemented as rounding to nearest even.  This isn't
3618// strictly correct, because it causes us to ignore the rounding mode.  But it
3619// matches what CUDA's "libm" does.
3620
3621defm : CVT_ROUND<fnearbyint, CvtRNI, CvtRNI_FTZ>;
3622defm : CVT_ROUND<frint, CvtRNI, CvtRNI_FTZ>;
3623
3624//-----------------------------------
3625// Control-flow
3626//-----------------------------------
3627
3628let isTerminator=1 in {
3629   let isReturn=1, isBarrier=1 in
3630      def Return : NVPTXInst<(outs), (ins), "ret;", [(retglue)]>;
3631
3632   let isBranch=1 in
3633      def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
3634                              "@$a bra \t$target;",
3635                              [(brcond i1:$a, bb:$target)]>;
3636   let isBranch=1 in
3637      def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
3638                                   "@!$a bra \t$target;", []>;
3639
3640   let isBranch=1, isBarrier=1 in
3641      def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
3642                           "bra.uni \t$target;", [(br bb:$target)]>;
3643}
3644
3645def : Pat<(brcond i32:$a, bb:$target),
3646          (CBranch (SETP_u32ri $a, 0, CmpNE), bb:$target)>;
3647
3648// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
3649// conditional branch if the target block is the next block so that the code
3650// can fall through to the target block.  The inversion is done by 'xor
3651// condition, 1', which will be translated to (setne condition, -1).  Since PTX
3652// supports '@!pred bra target', we should use it.
3653def : Pat<(brcond (i1 (setne i1:$a, -1)), bb:$target),
3654          (CBranchOther $a, bb:$target)>;
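//
// For example (illustrative), a DAG node of the form
//   (brcond (setne i1:%p, -1), bb)
// selects to CBranchOther and prints "@!%p bra <target>;" rather than
// materializing the xor and branching on a positive predicate.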
3655
3656// Call
3657def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
3658                                            SDTCisVT<1, i32>]>;
3659def SDT_NVPTXCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
3660
3661def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
3662                           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
3663def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
3664                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
3665                            SDNPSideEffect]>;
3666
3667def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
3668def call          : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
3669                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
3670def calltarget : Operand<i32>;
3671let isCall=1 in {
3672   def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
3673}
3674
3675def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
3676def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
3677
3678// Pseudo instructions.
3679class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
3680   : NVPTXInst<outs, ins, asmstr, pattern>;
3681
3682def Callseq_Start :
3683  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
3684            "\\{ // callseq $amt1, $amt2",
3685            [(callseq_start timm:$amt1, timm:$amt2)]>;
3686def Callseq_End :
3687  NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
3688            "\\} // callseq $amt1",
3689            [(callseq_end timm:$amt1, timm:$amt2)]>;
3690
3691// trap instruction
3692def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>, Requires<[noPTXASUnreachableBug]>;
3693// Emit an `exit` as well to convey to ptxas that `trap` exits the CFG.
3694// This won't be necessary in a future version of ptxas.
3695def trapexitinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>, Requires<[hasPTXASUnreachableBug]>;
3696// brkpt instruction
3697def debugtrapinst : NVPTXInst<(outs), (ins), "brkpt;", [(debugtrap)]>;
3698
3699// Call prototype wrapper
3700def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
3701def CallPrototype :
3702  SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
3703         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
3704def ProtoIdent : Operand<i32> {
3705  let PrintMethod = "printProtoIdent";
3706}
3707def CALL_PROTOTYPE :
3708  NVPTXInst<(outs), (ins ProtoIdent:$ident),
3709            "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
3710
3711def SDTDynAllocaOp :
3712  SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisInt<2>]>;
3713
3714def dyn_alloca :
3715  SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp,
3716         [SDNPHasChain, SDNPSideEffect]>;
3717
3718def DYNAMIC_STACKALLOC32 :
3719  NVPTXInst<(outs Int32Regs:$ptr),
3720            (ins Int32Regs:$size, i32imm:$align),
3721            "alloca.u32 \t$ptr, $size, $align;\n\t"
3722            "cvta.local.u32 \t$ptr, $ptr;",
3723            [(set i32:$ptr, (dyn_alloca i32:$size, (i32 timm:$align)))]>,
3724            Requires<[hasPTX<73>, hasSM<52>]>;
3725
3726def DYNAMIC_STACKALLOC64 :
3727  NVPTXInst<(outs Int64Regs:$ptr),
3728            (ins Int64Regs:$size, i32imm:$align),
3729            "alloca.u64 \t$ptr, $size, $align;\n\t"
3730            "cvta.local.u64 \t$ptr, $ptr;",
3731            [(set i64:$ptr, (dyn_alloca i64:$size, (i32 timm:$align)))]>,
3732            Requires<[hasPTX<73>, hasSM<52>]>;
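// Illustrative expansion of a 64-bit dynamic alloca (hypothetical operands):
//   alloca.u64       %rd2, %rd1, 16;
//   cvta.local.u64   %rd2, %rd2;
// alloca yields a local-space address, so cvta.local converts it to a generic
// pointer before it is handed back to the program.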
3733
3734
3735//
3736// BRX
3737//
3738
3739def SDTBrxStartProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
3740def SDTBrxItemProfile : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
3741def SDTBrxEndProfile : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisInt<1>, SDTCisInt<2>]>;
3742
3743def brx_start :
3744  SDNode<"NVPTXISD::BrxStart", SDTBrxStartProfile,
3745         [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
3746def brx_item :
3747  SDNode<"NVPTXISD::BrxItem", SDTBrxItemProfile,
3748         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
3749def brx_end :
3750  SDNode<"NVPTXISD::BrxEnd", SDTBrxEndProfile,
3751         [SDNPHasChain, SDNPInGlue, SDNPSideEffect]>;
3752
3753let isTerminator = 1, isBranch = 1, isIndirectBranch = 1, isNotDuplicable = 1 in {
3754
3755  def BRX_START :
3756    NVPTXInst<(outs), (ins i32imm:$id),
3757              "$$L_brx_$id: .branchtargets",
3758              [(brx_start (i32 imm:$id))]>;
3759
3760  def BRX_ITEM :
3761    NVPTXInst<(outs), (ins brtarget:$target),
3762              "\t$target,",
3763              [(brx_item bb:$target)]>;
3764
3765  def BRX_END :
3766    NVPTXInst<(outs), (ins brtarget:$target, Int32Regs:$val, i32imm:$id),
3767              "\t$target;\n\tbrx.idx \t$val, $$L_brx_$id;",
3768              [(brx_end bb:$target, i32:$val, (i32 imm:$id))]> {
3769      let isBarrier = 1;
3770    }
3771}
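// Taken together, these print a branch-target table followed by the indexed
// branch, roughly as (illustrative labels and register):
//   $L_brx_0: .branchtargets
//           $L__BB0_2,
//           $L__BB0_3,
//           $L__BB0_4;
//           brx.idx  %r1, $L_brx_0;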
3772
3773
3774foreach a_type = ["s", "u"] in {
3775  foreach b_type = ["s", "u"] in {
3776
3777    def DOT4_ # a_type # b_type :
3778      NVPTXInst<(outs Int32Regs:$dst),
3779                (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
3780                "dp4a." # a_type # "32." # b_type # "32 \t$dst, $a, $b, $c;",
3781                [(set i32:$dst,
3782                    (!cast<Intrinsic>("int_nvvm_idp4a_" # a_type # "_" # b_type)
3783                     i32:$a, i32:$b, i32:$c))]>,
3784                Requires<[hasDotInstructions]>;
3785
3786    foreach is_hi = [0, -1] in {
3787      defvar lohi_suffix = !if(is_hi, "hi", "lo");
3788
3789      def DOT2_ # lohi_suffix # _ # a_type # b_type :
3790        NVPTXInst<(outs Int32Regs:$dst),
3791                  (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
3792                  "dp2a." # lohi_suffix # "." # a_type # "32." # b_type # "32 \t$dst, $a, $b, $c;",
3793                  [(set i32:$dst,
3794                      (!cast<Intrinsic>("int_nvvm_idp2a_" # a_type # "_" # b_type)
3795                       i32:$a, i32:$b, is_hi, i32:$c))]>,
3796                  Requires<[hasDotInstructions]>;
3797    }
3798  }
3799}
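// The two nested loops above expand to the four dp4a.{s,u}32.{s,u}32 variants
// and the eight dp2a.{lo,hi}.{s,u}32.{s,u}32 variants, e.g. (hypothetical
// registers):
//   dp4a.u32.s32     %r4, %r1, %r2, %r3;
//   dp2a.lo.s32.u32  %r4, %r1, %r2, %r3;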
3800
3801//
3802// Stack Manipulation
3803//
3804
3805def SDTStackRestore : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
3806
3807def stackrestore :
3808  SDNode<"NVPTXISD::STACKRESTORE", SDTStackRestore,
3809         [SDNPHasChain, SDNPSideEffect]>;
3810
3811def stacksave :
3812  SDNode<"NVPTXISD::STACKSAVE", SDTIntLeaf,
3813         [SDNPHasChain, SDNPSideEffect]>;
3814
3815def STACKRESTORE_32 :
3816  NVPTXInst<(outs), (ins Int32Regs:$ptr),
3817            "stackrestore.u32 \t$ptr;",
3818            [(stackrestore i32:$ptr)]>,
3819            Requires<[hasPTX<73>, hasSM<52>]>;
3820
3821def STACKSAVE_32 :
3822  NVPTXInst<(outs Int32Regs:$dst), (ins),
3823            "stacksave.u32 \t$dst;",
3824            [(set i32:$dst, (i32 stacksave))]>,
3825            Requires<[hasPTX<73>, hasSM<52>]>;
3826
3827def STACKRESTORE_64 :
3828  NVPTXInst<(outs), (ins Int64Regs:$ptr),
3829            "stackrestore.u64 \t$ptr;",
3830            [(stackrestore i64:$ptr)]>,
3831            Requires<[hasPTX<73>, hasSM<52>]>;
3832
3833def STACKSAVE_64 :
3834  NVPTXInst<(outs Int64Regs:$dst), (ins),
3835            "stacksave.u64 \t$dst;",
3836            [(set i64:$dst, (i64 stacksave))]>,
3837            Requires<[hasPTX<73>, hasSM<52>]>;
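// Illustrative use (hypothetical register): @llvm.stacksave followed by
// @llvm.stackrestore on a 64-bit target is expected to print as
//   stacksave.u64     %rd1;
//   ...
//   stackrestore.u64  %rd1;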
3838
3839include "NVPTXIntrinsics.td"
3840
3841//-----------------------------------
3842// Notes
3843//-----------------------------------
3844// BSWAP was originally expanded; the prmt-based patterns below now cover the
3845// common cases. Older notes on more efficient alternatives:
3846// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit registers
3847// - for sm_20, use prmt (use vector scalar mov to get the pack and
3848//   unpack); sm_20 supports native 32-bit registers, but not native 16-bit registers.
3849
3850def : Pat <
3851  (i32 (bswap i32:$a)),
3852  (INT_NVVM_PRMT $a, (i32 0), (i32 0x0123))>;
3853
3854def : Pat <
3855  (v2i16 (bswap v2i16:$a)),
3856  (INT_NVVM_PRMT $a, (i32 0), (i32 0x2301))>;
3857
3858def : Pat <
3859  (i64 (bswap i64:$a)),
3860  (V2I32toI64
3861    (INT_NVVM_PRMT (I64toI32H $a), (i32 0), (i32 0x0123)),
3862    (INT_NVVM_PRMT (I64toI32L $a), (i32 0), (i32 0x0123)))>;
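// prmt.b32 builds each result byte from a selector nibble that indexes the
// concatenated source bytes (operand a supplies bytes 0-3, operand b bytes
// 4-7; the least-significant nibble picks result byte 0).  With b = 0,
// selector 0x0123 therefore reverses all four bytes of $a (a 32-bit bswap),
// and 0x2301 swaps the bytes within each 16-bit half (bswap of each v2i16
// element).  The i64 pattern byte-swaps each 32-bit half and exchanges the
// halves when repacking the result.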
3863
3864
3865////////////////////////////////////////////////////////////////////////////////
3866// PTX Fence instructions
3867////////////////////////////////////////////////////////////////////////////////
3868
3869def atomic_thread_fence_seq_cst_sys :
3870  NVPTXInst<(outs), (ins), "fence.sc.sys;", []>,
3871  Requires<[hasPTX<60>, hasSM<70>]>;
3872def atomic_thread_fence_acq_rel_sys :
3873  NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>,
3874  Requires<[hasPTX<60>, hasSM<70>]>;
3875
3876def atomic_thread_fence_seq_cst_gpu :
3877  NVPTXInst<(outs), (ins), "fence.sc.gpu;", []>,
3878  Requires<[hasPTX<60>, hasSM<70>]>;
3879def atomic_thread_fence_acq_rel_gpu :
3880  NVPTXInst<(outs), (ins), "fence.acq_rel.gpu;", []>,
3881  Requires<[hasPTX<60>, hasSM<70>]>;
3882
3883def atomic_thread_fence_seq_cst_cluster :
3884  NVPTXInst<(outs), (ins), "fence.sc.cluster;", []>,
3885  Requires<[hasPTX<78>, hasSM<90>]>;
3886def atomic_thread_fence_acq_rel_cluster :
3887  NVPTXInst<(outs), (ins), "fence.acq_rel.cluster;", []>,
3888  Requires<[hasPTX<78>, hasSM<90>]>;
3889
3890def atomic_thread_fence_seq_cst_cta :
3891  NVPTXInst<(outs), (ins), "fence.sc.cta;", []>,
3892  Requires<[hasPTX<60>, hasSM<70>]>;
3893def atomic_thread_fence_acq_rel_cta :
3894  NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
3895  Requires<[hasPTX<60>, hasSM<70>]>;
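// These fence instructions carry no selection patterns; they are expected to
// be referenced from the C++ lowering of fences and ordered atomics.  The
// names encode the mapping: seq_cst orderings use fence.sc and weaker
// orderings use fence.acq_rel, each at .sys, .gpu, .cluster, or .cta scope.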
3896
3897def fpimm_any_zero : FPImmLeaf<fAny, [{
3898  return Imm.isZero();
3899}]>;
3900
3901def fpimm_positive_zero_v2f16 : PatFrag<(ops), (v2f16 (bitconvert (i32 0)))>;
3902def fpimm_positive_zero_v2bf16 : PatFrag<(ops), (v2bf16 (bitconvert (i32 0)))>;
3903
3904// Perform the substitution only if the fma has a single use and either the
3905// node carries the nnan flag or the TargetMachine has NoNaNsFPMath enabled.
3906def NVPTX_fma_oneuse_and_nnan : PatFrag<(ops node:$a, node:$b, node:$c),
3907                                  (fma node:$a, node:$b, node:$c), [{
3908  return N->hasOneUse() &&
3909    (N->getFlags().hasNoNaNs() || TM.Options.NoNaNsFPMath);
3910}]>;
3911// fmaxnum will soon distinguish +0.0 from -0.0, so this PatFrag only matches
3912// a fmaxnum node when no-signed-zeros (nsz) behavior is in effect.
3913def NVPTX_fmaxnum_nsz : PatFrag<(ops node:$a, node:$b),
3914                                  (fmaxnum node:$a, node:$b), [{
3915  return N->getFlags().hasNoSignedZeros() || TM.Options.NoSignedZerosFPMath;
3916}]>;
3917
3918class NVPTXInst_rrr<RegisterClass RC, string Instruction, list<Predicate> Preds>
3919  : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
3920      !strconcat(Instruction, "\t$dst, $a, $b, $c;"), []>,
3921      Requires<Preds>;
3922
3923def FMARELU_F16 : NVPTXInst_rrr<Int16Regs, "fma.rn.relu.f16", [useFP16Math, hasPTX<70>, hasSM<80>]>;
3924def FMARELU_F16_FTZ : NVPTXInst_rrr<Int16Regs, "fma.rn.ftz.relu.f16", [useFP16Math, hasPTX<70>, hasSM<80>]>;
3925def FMARELU_BF16 : NVPTXInst_rrr<Int16Regs, "fma.rn.relu.bf16", [hasBF16Math, hasPTX<70>, hasSM<80>]>;
3926def FMARELU_F16X2 : NVPTXInst_rrr<Int32Regs, "fma.rn.relu.f16x2", [useFP16Math, hasPTX<70>, hasSM<80>]>;
3927def FMARELU_F16X2_FTZ : NVPTXInst_rrr<Int32Regs, "fma.rn.ftz.relu.f16x2", [useFP16Math, hasPTX<70>, hasSM<80>]>;
3928def FMARELU_BF16X2 : NVPTXInst_rrr<Int32Regs, "fma.rn.relu.bf16x2", [hasBF16Math, hasPTX<70>, hasSM<80>]>;
3929
3930// FTZ
3931def : Pat<(f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan f16:$a, f16:$b, f16:$c), fpimm_any_zero)),
3932  (FMARELU_F16_FTZ $a, $b, $c)>,
3933  Requires<[doF32FTZ]>;
3934def : Pat<(v2f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2f16:$a, v2f16:$b, v2f16:$c), fpimm_positive_zero_v2f16)),
3935  (FMARELU_F16X2_FTZ $a, $b, $c)>,
3936  Requires<[doF32FTZ]>;
3937
3938// NO FTZ
3939def : Pat<(f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan f16:$a, f16:$b, f16:$c), fpimm_any_zero)),
3940  (FMARELU_F16 $a, $b, $c)>;
3941def : Pat<(bf16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan bf16:$a, bf16:$b, bf16:$c), fpimm_any_zero)),
3942  (FMARELU_BF16 $a, $b, $c)>;
3943def : Pat<(v2f16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2f16:$a, v2f16:$b, v2f16:$c), fpimm_positive_zero_v2f16)),
3944  (FMARELU_F16X2 $a, $b, $c)>;
3945def : Pat<(v2bf16 (NVPTX_fmaxnum_nsz (NVPTX_fma_oneuse_and_nnan v2bf16:$a, v2bf16:$b, v2bf16:$c), fpimm_positive_zero_v2bf16)),
3946  (FMARELU_BF16X2 $a, $b, $c)>;
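// Illustrative folding (hypothetical registers): with nnan on the fma and nsz
// on the fmaxnum, max(fma(a, b, c), 0.0) collapses into a single instruction:
//   fma.rn.relu.f16  %rs4, %rs1, %rs2, %rs3;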
3947