// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_70 -target-feature +ptx63 \
// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX63_SM70 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 -target-feature +ptx62 \
// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -target-feature +ptx62 \
// RUN:   -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \
// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
// ###  The last RUN line checks with the highest SM and PTX versions available
// ###  to make sure target builtins are still accepted.
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 \
// RUN:            -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
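// A note on the prefix scheme (summarizing the RUN lines above): CHECK is
// common to every IR-generating run; LP32 and LP64 select the expected width
// of 'long' on the 32-bit (nvptx) and 64-bit (nvptx64) triples; and the
// CHECK_PTX*_SM* prefixes gate checks on the PTX/SM feature level of a run.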

#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))

__device__ int read_tid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.w()

  int x = __nvvm_read_ptx_sreg_tid_x();
  int y = __nvvm_read_ptx_sreg_tid_y();
  int z = __nvvm_read_ptx_sreg_tid_z();
  int w = __nvvm_read_ptx_sreg_tid_w();

  return x + y + z + w;

}

__device__ bool reflect() {

// CHECK: call i32 @llvm.nvvm.reflect(ptr {{.*}})

  unsigned x = __nvvm_reflect("__CUDA_ARCH");
  return x >= 700;
}

__device__ int read_ntid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.w()

  int x = __nvvm_read_ptx_sreg_ntid_x();
  int y = __nvvm_read_ptx_sreg_ntid_y();
  int z = __nvvm_read_ptx_sreg_ntid_z();
  int w = __nvvm_read_ptx_sreg_ntid_w();

  return x + y + z + w;

}

__device__ int read_ctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()

  int x = __nvvm_read_ptx_sreg_ctaid_x();
  int y = __nvvm_read_ptx_sreg_ctaid_y();
  int z = __nvvm_read_ptx_sreg_ctaid_z();
  int w = __nvvm_read_ptx_sreg_ctaid_w();

  return x + y + z + w;

}

__device__ int read_nctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()

  int x = __nvvm_read_ptx_sreg_nctaid_x();
  int y = __nvvm_read_ptx_sreg_nctaid_y();
  int z = __nvvm_read_ptx_sreg_nctaid_z();
  int w = __nvvm_read_ptx_sreg_nctaid_w();

  return x + y + z + w;

}

__device__ int read_ids() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.laneid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpsize()

  int a = __nvvm_read_ptx_sreg_laneid();
  int b = __nvvm_read_ptx_sreg_warpid();
  int c = __nvvm_read_ptx_sreg_nwarpid();
  int d = __nvvm_read_ptx_sreg_smid();
  int e = __nvvm_read_ptx_sreg_nsmid();
  int f = __nvvm_read_ptx_sreg_gridid();
  int g = __nvvm_read_ptx_sreg_warpsize();

  return a + b + c + d + e + f + g;

}

__device__ int read_lanemasks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt()

  int a = __nvvm_read_ptx_sreg_lanemask_eq();
  int b = __nvvm_read_ptx_sreg_lanemask_le();
  int c = __nvvm_read_ptx_sreg_lanemask_lt();
  int d = __nvvm_read_ptx_sreg_lanemask_ge();
  int e = __nvvm_read_ptx_sreg_lanemask_gt();

  return a + b + c + d + e;

}

__device__ long long read_clocks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clock()
// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64()
// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.globaltimer()

  int a = __nvvm_read_ptx_sreg_clock();
  long long b = __nvvm_read_ptx_sreg_clock64();
  long long c = __nvvm_read_ptx_sreg_globaltimer();

  return a + b + c;
}

__device__ int read_pms() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm0()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm1()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm2()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm3()

  int a = __nvvm_read_ptx_sreg_pm0();
  int b = __nvvm_read_ptx_sreg_pm1();
  int c = __nvvm_read_ptx_sreg_pm2();
  int d = __nvvm_read_ptx_sreg_pm3();

  return a + b + c + d;

}

__device__ void sync() {

// CHECK: call void @llvm.nvvm.bar.sync(i32 0)

  __nvvm_bar_sync(0);

}

__device__ void activemask() {

// CHECK: call i32 @llvm.nvvm.activemask()

  __nvvm_activemask();

}

__device__ void exit() {

// CHECK: call void @llvm.nvvm.exit()

  __nvvm_exit();

}

// NVVM intrinsics

// The idea is not to test all intrinsics, just to check that Clang recognizes
// the builtins defined in BuiltinsNVPTX.td.
__device__ void nvvm_math(float f1, float f2, double d1, double d2) {
// CHECK: call float @llvm.nvvm.fmax.f
  float t1 = __nvvm_fmax_f(f1, f2);
// CHECK: call float @llvm.nvvm.fmin.f
  float t2 = __nvvm_fmin_f(f1, f2);
// CHECK: call float @llvm.nvvm.sqrt.rn.f
  float t3 = __nvvm_sqrt_rn_f(f1);
// CHECK: call float @llvm.nvvm.rcp.rn.f
  float t4 = __nvvm_rcp_rn_f(f2);
// CHECK: call float @llvm.nvvm.add.rn.f
  float t5 = __nvvm_add_rn_f(f1, f2);

// CHECK: call double @llvm.nvvm.fmax.d
  double td1 = __nvvm_fmax_d(d1, d2);
// CHECK: call double @llvm.nvvm.fmin.d
  double td2 = __nvvm_fmin_d(d1, d2);
// CHECK: call double @llvm.nvvm.sqrt.rn.d
  double td3 = __nvvm_sqrt_rn_d(d1);
// CHECK: call double @llvm.nvvm.rcp.rn.d
  double td4 = __nvvm_rcp_rn_d(d2);

// CHECK: call void @llvm.nvvm.membar.cta()
  __nvvm_membar_cta();
// CHECK: call void @llvm.nvvm.membar.gl()
  __nvvm_membar_gl();
// CHECK: call void @llvm.nvvm.membar.sys()
  __nvvm_membar_sys();
// CHECK: call void @llvm.nvvm.barrier0()
  __syncthreads();
}

__device__ int di;
__shared__ int si;
__device__ long dl;
__shared__ long sl;
__device__ long long dll;
__shared__ long long sll;

// Check for atomic intrinsics
// CHECK-LABEL: nvvm_atom
__device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
                          unsigned short *usp, unsigned short us, int *ip,
                          int i, unsigned int *uip, unsigned ui, long *lp,
                          long l, long long *llp, long long ll) {
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_i(ip, i);
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_add_gen_l(&dl, l);
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 8
  __nvvm_atom_add_gen_ll(&sll, ll);

  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 4
  __nvvm_atom_sub_gen_i(ip, i);
  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_sub_gen_l(&dl, l);
  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 8
  __nvvm_atom_sub_gen_ll(&sll, ll);

  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 4
  __nvvm_atom_and_gen_i(ip, i);
  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_and_gen_l(&dl, l);
  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 8
  __nvvm_atom_and_gen_ll(&sll, ll);

  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 4
  __nvvm_atom_or_gen_i(ip, i);
  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_or_gen_l(&dl, l);
  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 8
  __nvvm_atom_or_gen_ll(&sll, ll);

  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 4
  __nvvm_atom_xor_gen_i(ip, i);
  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xor_gen_l(&dl, l);
  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 8
  __nvvm_atom_xor_gen_ll(&sll, ll);

  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 4
  __nvvm_atom_xchg_gen_i(ip, i);
  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xchg_gen_l(&dl, l);
  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 8
  __nvvm_atom_xchg_gen_ll(&sll, ll);

  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_i(ip, i);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_l(&dl, l);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ll(&sll, ll);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_i(ip, i);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_l(&dl, l);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ll(&sll, ll);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 4
  // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_i(ip, 0, i);
  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align {{4|8}}
  // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_l(&dl, 0, l);
  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 8
  // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_ll(&sll, 0, ll);

  // CHECK: atomicrmw fadd ptr {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_f(fp, f);

  // CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0
  __nvvm_atom_inc_gen_ui(uip, ui);

  // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0
  __nvvm_atom_dec_gen_ui(uip, ui);


  //////////////////////////////////////////////////////////////////
  // Atomics with scope (only supported on sm_60+).
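  // The sm_53 -DERROR_CHECK -verify RUN line above also compiles this block
  // (via the ERROR_CHECK branch of the #if below) and checks the
  // expected-error diagnostics instead of the generated IR.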

#if ERROR_CHECK || __CUDA_ARCH__ >= 600

  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_ll(&sll, ll);

  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_d(dfp, df);
  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.sys.f32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.sys.f64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_d(dfp, df);

  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_inc_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_inc_gen_ui((unsigned int *)ip, i);

  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_dec_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_dec_gen_ui((unsigned int *)ip, i);

  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_ll(&sll, ll, 0);

  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_ll(&sll, ll, 0);
#endif

#if __CUDA_ARCH__ >= 700
  // CHECK_PTX63_SM70: cmpxchg ptr {{.*}} seq_cst seq_cst, align 2
  // CHECK_PTX63_SM70-NEXT: extractvalue { i16, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_us(usp, 0, us);
  // CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.cta.i16.p0
  __nvvm_atom_cta_cas_gen_us(usp, 0, us);
  // CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.sys.i16.p0
  __nvvm_atom_sys_cas_gen_us(usp, 0, us);
#endif

  // CHECK: ret
}

// CHECK-LABEL: nvvm_ldg
__device__ void nvvm_ldg(const void *p) {
  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
  __nvvm_ldg_c((const char *)p);
  __nvvm_ldg_uc((const unsigned char *)p);
  __nvvm_ldg_sc((const signed char *)p);

  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  __nvvm_ldg_s((const short *)p);
  __nvvm_ldg_us((const unsigned short *)p);

  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  __nvvm_ldg_i((const int *)p);
  __nvvm_ldg_ui((const unsigned int *)p);

  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  __nvvm_ldg_l((const long *)p);
  __nvvm_ldg_ul((const unsigned long *)p);

  // CHECK: load float, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  __nvvm_ldg_f((const float *)p);
  // CHECK: load double, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  __nvvm_ldg_d((const double *)p);

  // In practice, the pointers we pass to __ldg will be aligned as appropriate
  // for the CUDA <type>N vector types (e.g. short4), which are not the same as
  // the LLVM vector types.  However, each LLVM vector type has an alignment
  // less than or equal to its corresponding CUDA type, so we're OK.
  //
  // PTX Interoperability section 2.2: "For a vector with an even number of
  // elements, its alignment is set to number of elements times the alignment of
  // its member: n*alignof(t)."
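  //
  // As a worked example of that rule (illustrative arithmetic only): short4
  // is a 4-element vector of 2-byte members, so its CUDA-side alignment is
  // 4 * alignof(short) = 8 bytes, matching the "align 8" expected by the
  // short4/ushort4 loads below.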

  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  typedef signed char schar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_c2((const char2 *)p);
  __nvvm_ldg_uc2((const uchar2 *)p);
  __nvvm_ldg_sc2((const schar2 *)p);

  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  typedef signed char schar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_c4((const char4 *)p);
  __nvvm_ldg_uc4((const uchar4 *)p);
  __nvvm_ldg_sc4((const schar4 *)p);

  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_s2((const short2 *)p);
  __nvvm_ldg_us2((const ushort2 *)p);

  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_s4((const short4 *)p);
  __nvvm_ldg_us4((const ushort4 *)p);

  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_i2((const int2 *)p);
  __nvvm_ldg_ui2((const uint2 *)p);

  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_i4((const int4 *)p);
  __nvvm_ldg_ui4((const uint4 *)p);

  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef long long2 __attribute__((ext_vector_type(2)));
  typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_l2((const long2 *)p);
  __nvvm_ldg_ul2((const ulong2 *)p);

  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_ll2((const longlong2 *)p);
  __nvvm_ldg_ull2((const ulonglong2 *)p);

  // CHECK: load <2 x float>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_f2((const float2 *)p);

  // CHECK: load <4 x float>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_f4((const float4 *)p);

  // CHECK: load <2 x double>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_d2((const double2 *)p);
}

// CHECK-LABEL: nvvm_ldu
__device__ void nvvm_ldu(const void *p) {
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  __nvvm_ldu_c((const char *)p);
  __nvvm_ldu_uc((const unsigned char *)p);
  __nvvm_ldu_sc((const signed char *)p);

  // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  __nvvm_ldu_s((const short *)p);
  __nvvm_ldu_us((const unsigned short *)p);

  // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldu_i((const int *)p);
  __nvvm_ldu_ui((const unsigned int *)p);

  // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldu_l((const long *)p);
  __nvvm_ldu_ul((const unsigned long *)p);

  // CHECK: call float @llvm.nvvm.ldu.global.f.f32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldu_f((const float *)p);
  // CHECK: call double @llvm.nvvm.ldu.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldu_d((const double *)p);

  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  typedef signed char schar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_c2((const char2 *)p);
  __nvvm_ldu_uc2((const uchar2 *)p);
  __nvvm_ldu_sc2((const schar2 *)p);

  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  typedef signed char schar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_c4((const char4 *)p);
  __nvvm_ldu_uc4((const uchar4 *)p);
  __nvvm_ldu_sc4((const schar4 *)p);

  // CHECK: call <2 x i16> @llvm.nvvm.ldu.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <2 x i16> @llvm.nvvm.ldu.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_s2((const short2 *)p);
  __nvvm_ldu_us2((const ushort2 *)p);

  // CHECK: call <4 x i16> @llvm.nvvm.ldu.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <4 x i16> @llvm.nvvm.ldu.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_s4((const short4 *)p);
  __nvvm_ldu_us4((const ushort4 *)p);

  // CHECK: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_i2((const int2 *)p);
  __nvvm_ldu_ui2((const uint2 *)p);

  // CHECK: call <4 x i32> @llvm.nvvm.ldu.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <4 x i32> @llvm.nvvm.ldu.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_i4((const int4 *)p);
  __nvvm_ldu_ui4((const uint4 *)p);

  // LP32: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP32: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // LP64: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long2 __attribute__((ext_vector_type(2)));
  typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_l2((const long2 *)p);
  __nvvm_ldu_ul2((const ulong2 *)p);

  // CHECK: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_ll2((const longlong2 *)p);
  __nvvm_ldu_ull2((const ulonglong2 *)p);

  // CHECK: call <2 x float> @llvm.nvvm.ldu.global.f.v2f32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_f2((const float2 *)p);

  // CHECK: call <4 x float> @llvm.nvvm.ldu.global.f.v4f32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_f4((const float4 *)p);

  // CHECK: call <2 x double> @llvm.nvvm.ldu.global.f.v2f64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_d2((const double2 *)p);
}

// CHECK-LABEL: nvvm_shfl
__device__ void nvvm_shfl(int i, float f, int a, int b) {
  // CHECK: call i32 @llvm.nvvm.shfl.down.i32(i32
  __nvvm_shfl_down_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.down.f32(float
  __nvvm_shfl_down_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.up.i32(i32
  __nvvm_shfl_up_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.up.f32(float
  __nvvm_shfl_up_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.bfly.i32(i32
  __nvvm_shfl_bfly_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.bfly.f32(float
  __nvvm_shfl_bfly_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.idx.i32(i32
  __nvvm_shfl_idx_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.idx.f32(float
  __nvvm_shfl_idx_f32(f, a, b);
  // CHECK: ret void
}

__device__ void nvvm_vote(int pred) {
  // CHECK: call i1 @llvm.nvvm.vote.all(i1
  __nvvm_vote_all(pred);
  // CHECK: call i1 @llvm.nvvm.vote.any(i1
  __nvvm_vote_any(pred);
  // CHECK: call i1 @llvm.nvvm.vote.uni(i1
  __nvvm_vote_uni(pred);
  // CHECK: call i32 @llvm.nvvm.vote.ballot(i1
  __nvvm_vote_ballot(pred);
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_nanosleep
__device__ void nvvm_nanosleep(int d) {
#if __CUDA_ARCH__ >= 700
  // CHECK_PTX70_SM80: call void @llvm.nvvm.nanosleep
  __nvvm_nanosleep(d);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.nanosleep
  __nvvm_nanosleep(1);
#endif
}

// CHECK-LABEL: nvvm_mbarrier
__device__ void nvvm_mbarrier(long long* addr, __attribute__((address_space(3))) long long* sharedAddr, int count, long long state) {
  #if __CUDA_ARCH__ >= 800
  __nvvm_mbarrier_init(addr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init
  __nvvm_mbarrier_init_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init.shared

  __nvvm_mbarrier_inval(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval
  __nvvm_mbarrier_inval_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval.shared

  __nvvm_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive
  __nvvm_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.shared
  __nvvm_mbarrier_arrive_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete
  __nvvm_mbarrier_arrive_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared

  __nvvm_mbarrier_arrive_drop(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop
  __nvvm_mbarrier_arrive_drop_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.shared
  __nvvm_mbarrier_arrive_drop_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete
  __nvvm_mbarrier_arrive_drop_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared

  __nvvm_mbarrier_test_wait(addr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait
  __nvvm_mbarrier_test_wait_shared(sharedAddr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait.shared

  __nvvm_mbarrier_pending_count(state);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.mbarrier.pending.count
  #endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_async_copy
__device__ void nvvm_async_copy(__attribute__((address_space(3))) void* dst, __attribute__((address_space(1))) const void* src, long long* addr, __attribute__((address_space(3))) long long* sharedAddr) {
  #if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive
  __nvvm_cp_async_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared
  __nvvm_cp_async_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc
  __nvvm_cp_async_mbarrier_arrive_noinc(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared
  __nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4(
  __nvvm_cp_async_ca_shared_global_4(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8(
  __nvvm_cp_async_ca_shared_global_8(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16(
  __nvvm_cp_async_ca_shared_global_16(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16(
  __nvvm_cp_async_cg_shared_global_16(dst, src);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_4(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_8(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_16(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16.s({{.*}}, i32 2)
  __nvvm_cp_async_cg_shared_global_16(dst, src, 2);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
  __nvvm_cp_async_commit_group();
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
  __nvvm_cp_async_wait_group(0);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 8)
  __nvvm_cp_async_wait_group(8);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 16)
  __nvvm_cp_async_wait_group(16);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.all
  __nvvm_cp_async_wait_all();
  #endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_cvt_sm80
__device__ void nvvm_cvt_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn(float 1.000000e+00)
  __nvvm_f2bf16_rn(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn.relu(float 1.000000e+00)
  __nvvm_f2bf16_rn_relu(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz(float 1.000000e+00)
  __nvvm_f2bf16_rz(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz.relu(float 1.000000e+00)
  __nvvm_f2bf16_rz_relu(1);

  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.f2tf32.rna(float 1.000000e+00)
  __nvvm_f2tf32_rna(1);
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_cvt_sm89
__device__ void nvvm_cvt_sm89() {
#if __CUDA_ARCH__ >= 890
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f);
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e4m3x2_rn_relu(1.0f, 1.0f);
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e5m2x2_rn(1.0f, 1.0f);
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e5m2x2_rn_relu(1.0f, 1.0f);

  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e4m3x2_rn({1.0f16, 1.0f16});
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e4m3x2_rn_relu({1.0f16, 1.0f16});
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e5m2x2_rn({1.0f16, 1.0f16});
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e5m2x2_rn_relu({1.0f16, 1.0f16});

  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn(i16 18504)
  __nvvm_e4m3x2_to_f16x2_rn(0x4848);
  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn.relu(i16 18504)
  __nvvm_e4m3x2_to_f16x2_rn_relu(0x4848);
  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn(i16 19532)
  __nvvm_e5m2x2_to_f16x2_rn(0x4c4c);
  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532)
  __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c);
#endif
  // CHECK: ret void
}

#define NAN32 0x7FBFFFFF
#define NAN16 (__bf16)0x7FBF
#define BF16 (__bf16)0.1f
#define BF16_2 (__bf16)0.2f
#define NANBF16 (__bf16)0xFFC1
#define BF16X2 {(__bf16)0.1f, (__bf16)0.1f}
#define BF16X2_2 {(__bf16)0.2f, (__bf16)0.2f}
#define NANBF16X2 {NANBF16, NANBF16}

// CHECK-LABEL: nvvm_abs_neg_bf16_bf16x2_sm80
__device__ void nvvm_abs_neg_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.abs.bf16(bfloat 0xR3DCD)
  __nvvm_abs_bf16(BF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.abs.bf16x2(<2 x bfloat> splat (bfloat 0xR3DCD))
  __nvvm_abs_bf16x2(BF16X2);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.neg.bf16(bfloat 0xR3DCD)
  __nvvm_neg_bf16(BF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.neg.bf16x2(<2 x bfloat> splat (bfloat 0xR3DCD))
  __nvvm_neg_bf16x2(BF16X2);
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_min_max_sm80
__device__ void nvvm_min_max_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.nan.f
  __nvvm_fmin_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.ftz.nan.f
  __nvvm_fmin_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.bf16
  __nvvm_fmin_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.ftz.bf16
  __nvvm_fmin_ftz_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.nan.bf16
  __nvvm_fmin_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.ftz.nan.bf16
  __nvvm_fmin_ftz_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.bf16x2
  __nvvm_fmin_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.ftz.bf16x2
  __nvvm_fmin_ftz_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.nan.bf16x2
  __nvvm_fmin_nan_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.ftz.nan.bf16x2
  __nvvm_fmin_ftz_nan_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, 0.11f);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.bf16
  __nvvm_fmax_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.ftz.bf16
  __nvvm_fmax_ftz_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.nan.bf16
  __nvvm_fmax_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.ftz.nan.bf16
  __nvvm_fmax_ftz_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.bf16x2
  __nvvm_fmax_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.ftz.bf16x2
  __nvvm_fmax_ftz_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.nan.bf16x2
  __nvvm_fmax_nan_bf16x2(NANBF16X2, BF16X2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.ftz.nan.bf16x2
  __nvvm_fmax_ftz_nan_bf16x2(NANBF16X2, BF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);

#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_fma_bf16_bf16x2_sm80
__device__ void nvvm_fma_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fma.rn.bf16
  __nvvm_fma_rn_bf16(BF16, BF16_2, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fma.rn.relu.bf16
  __nvvm_fma_rn_relu_bf16(BF16, BF16_2, BF16_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fma.rn.bf16x2
  __nvvm_fma_rn_bf16x2(BF16X2, BF16X2_2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fma.rn.relu.bf16x2
  __nvvm_fma_rn_relu_bf16x2(BF16X2, BF16X2_2, BF16X2_2);
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_min_max_sm86
__device__ void nvvm_min_max_sm86() {
#if __CUDA_ARCH__ >= 860

  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmin.xorsign.abs.bf16
  __nvvm_fmin_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmin.nan.xorsign.abs.bf16
  __nvvm_fmin_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmin.xorsign.abs.bf16x2
  __nvvm_fmin_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2
  __nvvm_fmin_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.xorsign.abs.f
  __nvvm_fmin_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.xorsign.abs.f
  __nvvm_fmin_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.nan.xorsign.abs.f
  __nvvm_fmin_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f
  __nvvm_fmin_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);

  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmax.xorsign.abs.bf16
  __nvvm_fmax_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmax.nan.xorsign.abs.bf16
  __nvvm_fmax_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmax.xorsign.abs.bf16x2
  __nvvm_fmax_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmax.nan.xorsign.abs.bf16x2
  __nvvm_fmax_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.xorsign.abs.f
  __nvvm_fmax_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.xorsign.abs.f
  __nvvm_fmax_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.nan.xorsign.abs.f
  __nvvm_fmax_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f
  __nvvm_fmax_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);
#endif
  // CHECK: ret void
}