// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_70 -target-feature +ptx63 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX63_SM70 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 -target-feature +ptx62 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -target-feature +ptx62 \
// RUN:   -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s
// ### The last run checks with the highest SM and PTX versions available
// ### to make sure target builtins are still accepted.
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s

#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))

__device__ int read_tid() {

  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.w()

  int x = __nvvm_read_ptx_sreg_tid_x();
  int y = __nvvm_read_ptx_sreg_tid_y();
  int z = __nvvm_read_ptx_sreg_tid_z();
  int w = __nvvm_read_ptx_sreg_tid_w();

  return x + y + z + w;

}

__device__ bool reflect() {

  // CHECK: call i32 @llvm.nvvm.reflect(ptr {{.*}})

  unsigned x = __nvvm_reflect("__CUDA_ARCH");
  return x >= 700;
}

__device__ int read_ntid() {

  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.w()

  int x = __nvvm_read_ptx_sreg_ntid_x();
  int y = __nvvm_read_ptx_sreg_ntid_y();
  int z = __nvvm_read_ptx_sreg_ntid_z();
  int w = __nvvm_read_ptx_sreg_ntid_w();

  return x + y + z + w;

}

__device__ int read_ctaid() {

  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()

  int x = __nvvm_read_ptx_sreg_ctaid_x();
  int y = __nvvm_read_ptx_sreg_ctaid_y();
  int z = __nvvm_read_ptx_sreg_ctaid_z();
  int w = __nvvm_read_ptx_sreg_ctaid_w();

  return x + y + z + w;

}

__device__ int read_nctaid() {

  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()

  int x = __nvvm_read_ptx_sreg_nctaid_x();
  int y = __nvvm_read_ptx_sreg_nctaid_y();
  int z = __nvvm_read_ptx_sreg_nctaid_z();
  int w = __nvvm_read_ptx_sreg_nctaid_w();

  return x + y + z + w;

}

__device__ int read_ids() {

  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.laneid()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpsize()

  int a = __nvvm_read_ptx_sreg_laneid();
  int b = __nvvm_read_ptx_sreg_warpid();
  int c = __nvvm_read_ptx_sreg_nwarpid();
  int d = __nvvm_read_ptx_sreg_smid();
  int e = __nvvm_read_ptx_sreg_nsmid();
  int f = __nvvm_read_ptx_sreg_gridid();
  int g = __nvvm_read_ptx_sreg_warpsize();

  return a + b + c + d + e + f + g;

}

__device__ int read_lanemasks() {

  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt()

  int a = __nvvm_read_ptx_sreg_lanemask_eq();
  int b = __nvvm_read_ptx_sreg_lanemask_le();
  int c = __nvvm_read_ptx_sreg_lanemask_lt();
  int d = __nvvm_read_ptx_sreg_lanemask_ge();
  int e = __nvvm_read_ptx_sreg_lanemask_gt();

  return a + b + c + d + e;

}

__device__ long long read_clocks() {

  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clock()
  // CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64()
  // CHECK: call i64 @llvm.nvvm.read.ptx.sreg.globaltimer()

  int a = __nvvm_read_ptx_sreg_clock();
  long long b = __nvvm_read_ptx_sreg_clock64();
  long long c = __nvvm_read_ptx_sreg_globaltimer();

  return a + b + c;
}

__device__ int read_pms() {

  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm0()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm1()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm2()
  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm3()

  int a = __nvvm_read_ptx_sreg_pm0();
  int b = __nvvm_read_ptx_sreg_pm1();
  int c = __nvvm_read_ptx_sreg_pm2();
  int d = __nvvm_read_ptx_sreg_pm3();

  return a + b + c + d;

}

__device__ void sync() {

  // CHECK: call void @llvm.nvvm.bar.sync(i32 0)

  __nvvm_bar_sync(0);

}

__device__ void activemask() {

  // CHECK: call i32 @llvm.nvvm.activemask()

  __nvvm_activemask();

}

__device__ void exit() {

  // CHECK: call void @llvm.nvvm.exit()

  __nvvm_exit();

}

// NVVM intrinsics

// The idea is not to test all intrinsics, just that Clang is recognizing the
// builtins defined in BuiltinsNVPTX.td
__device__ void nvvm_math(float f1, float f2, double d1, double d2) {
  // CHECK: call float @llvm.nvvm.fmax.f
  float t1 = __nvvm_fmax_f(f1, f2);
  // CHECK: call float @llvm.nvvm.fmin.f
  float t2 = __nvvm_fmin_f(f1, f2);
  // CHECK: call float @llvm.nvvm.sqrt.rn.f
  float t3 = __nvvm_sqrt_rn_f(f1);
  // CHECK: call float @llvm.nvvm.rcp.rn.f
  float t4 = __nvvm_rcp_rn_f(f2);
  // CHECK: call float @llvm.nvvm.add.rn.f
  float t5 = __nvvm_add_rn_f(f1, f2);

  // CHECK: call double @llvm.nvvm.fmax.d
  double td1 = __nvvm_fmax_d(d1, d2);
  // CHECK: call double @llvm.nvvm.fmin.d
  double td2 = __nvvm_fmin_d(d1, d2);
  // CHECK: call double @llvm.nvvm.sqrt.rn.d
  double td3 = __nvvm_sqrt_rn_d(d1);
  // CHECK: call double @llvm.nvvm.rcp.rn.d
  double td4 = __nvvm_rcp_rn_d(d2);

  // CHECK: call void @llvm.nvvm.membar.cta()
  __nvvm_membar_cta();
  // CHECK: call void @llvm.nvvm.membar.gl()
  __nvvm_membar_gl();
  // CHECK: call void @llvm.nvvm.membar.sys()
  __nvvm_membar_sys();
  // CHECK: call void @llvm.nvvm.barrier0()
  __syncthreads();
}

__device__ int di;
__shared__ int si;
__device__ long dl;
__shared__ long sl;
__device__ long long dll;
__shared__ long long sll;

// Check for atomic intrinsics
// CHECK-LABEL: nvvm_atom
__device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
                          unsigned short *usp, unsigned short us, int *ip,
                          int i, unsigned int *uip, unsigned ui, long *lp,
                          long l, long long *llp, long long ll) {
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_i(ip, i);
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_add_gen_l(&dl, l);
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 8
  __nvvm_atom_add_gen_ll(&sll, ll);

  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 4
  __nvvm_atom_sub_gen_i(ip, i);
  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_sub_gen_l(&dl, l);
  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 8
  __nvvm_atom_sub_gen_ll(&sll, ll);

  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 4
  __nvvm_atom_and_gen_i(ip, i);
  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_and_gen_l(&dl, l);
  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 8
  __nvvm_atom_and_gen_ll(&sll, ll);

  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 4
  __nvvm_atom_or_gen_i(ip, i);
  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_or_gen_l(&dl, l);
  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 8
  __nvvm_atom_or_gen_ll(&sll, ll);

  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 4
  __nvvm_atom_xor_gen_i(ip, i);
  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xor_gen_l(&dl, l);
  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 8
  __nvvm_atom_xor_gen_ll(&sll, ll);

  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 4
  __nvvm_atom_xchg_gen_i(ip, i);
  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xchg_gen_l(&dl, l);
  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 8
  __nvvm_atom_xchg_gen_ll(&sll, ll);

  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_i(ip, i);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_l(&dl, l);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ll(&sll, ll);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_i(ip, i);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_l(&dl, l);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ll(&sll, ll);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 4
  // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_i(ip, 0, i);
  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align {{4|8}}
  // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_l(&dl, 0, l);
  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 8
  // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_ll(&sll, 0, ll);

  // CHECK: atomicrmw fadd ptr {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_f(fp, f);

  // CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0
  __nvvm_atom_inc_gen_ui(uip, ui);

  // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0
  __nvvm_atom_dec_gen_ui(uip, ui);

  //////////////////////////////////////////////////////////////////
  // Atomics with scope (only supported on sm_60+).

#if ERROR_CHECK || __CUDA_ARCH__ >= 600

  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_ll(&sll, ll);

  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_d(dfp, df);
  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.sys.f32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.sys.f64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_d(dfp, df);

  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_inc_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_inc_gen_ui((unsigned int *)ip, i);

  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_dec_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_dec_gen_ui((unsigned int *)ip, i);

  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_ll(&sll, ll, 0);

  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_ll(&sll, ll, 0);
#endif

#if __CUDA_ARCH__ >= 700
  // CHECK_PTX63_SM70: cmpxchg ptr {{.*}} seq_cst seq_cst, align 2
  // CHECK_PTX63_SM70-NEXT: extractvalue { i16, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_us(usp, 0, us);
  // CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.cta.i16.p0
  __nvvm_atom_cta_cas_gen_us(usp, 0, us);
  // CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.sys.i16.p0
  __nvvm_atom_sys_cas_gen_us(usp, 0, us);
#endif

  // CHECK: ret
}

// CHECK-LABEL: nvvm_ldg
__device__ void nvvm_ldg(const void *p) {
  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
  __nvvm_ldg_c((const char *)p);
  __nvvm_ldg_uc((const unsigned char *)p);
  __nvvm_ldg_sc((const signed char *)p);

  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  __nvvm_ldg_s((const short *)p);
  __nvvm_ldg_us((const unsigned short *)p);

  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  __nvvm_ldg_i((const int *)p);
  __nvvm_ldg_ui((const unsigned int *)p);

  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  __nvvm_ldg_l((const long *)p);
  __nvvm_ldg_ul((const unsigned long *)p);

  // CHECK: load float, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  __nvvm_ldg_f((const float *)p);
  // CHECK: load double, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  __nvvm_ldg_d((const double *)p);

  // In practice, the pointers we pass to __ldg will be aligned as appropriate
  // for the CUDA <type>N vector types (e.g. short4), which are not the same as
  // the LLVM vector types. However, each LLVM vector type has an alignment
  // less than or equal to its corresponding CUDA type, so we're OK.
  //
  // PTX Interoperability section 2.2: "For a vector with an even number of
  // elements, its alignment is set to number of elements times the alignment
  // of its member: n*alignof(t)."
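  //
  // As a concrete instance of that formula: short4 has n = 4 elements with
  // alignof(short) == 2, so its CUDA-side alignment is 4 * 2 = 8, which is
  // exactly the `align 8` expected on the <4 x i16> loads below; int4
  // likewise gets 4 * 4 = 16, matching the <4 x i32> loads.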

  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  typedef signed char schar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_c2((const char2 *)p);
  __nvvm_ldg_uc2((const uchar2 *)p);
  __nvvm_ldg_sc2((const schar2 *)p);

  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  typedef signed char schar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_c4((const char4 *)p);
  __nvvm_ldg_uc4((const uchar4 *)p);
  __nvvm_ldg_sc4((const schar4 *)p);

  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_s2((const short2 *)p);
  __nvvm_ldg_us2((const ushort2 *)p);

  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_s4((const short4 *)p);
  __nvvm_ldg_us4((const ushort4 *)p);

  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_i2((const int2 *)p);
  __nvvm_ldg_ui2((const uint2 *)p);

  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_i4((const int4 *)p);
  __nvvm_ldg_ui4((const uint4 *)p);

  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef long long2 __attribute__((ext_vector_type(2)));
  typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_l2((const long2 *)p);
  __nvvm_ldg_ul2((const ulong2 *)p);

  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_ll2((const longlong2 *)p);
  __nvvm_ldg_ull2((const ulonglong2 *)p);

  // CHECK: load <2 x float>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_f2((const float2 *)p);

  // CHECK: load <4 x float>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_f4((const float4 *)p);

  // CHECK: load <2 x double>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_d2((const double2 *)p);
}

// CHECK-LABEL: nvvm_ldu
__device__ void nvvm_ldu(const void *p) {
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  __nvvm_ldu_c((const char *)p);
  __nvvm_ldu_uc((const unsigned char *)p);
  __nvvm_ldu_sc((const signed char *)p);

  // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  __nvvm_ldu_s((const short *)p);
  __nvvm_ldu_us((const unsigned short *)p);

  // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldu_i((const int *)p);
  __nvvm_ldu_ui((const unsigned int *)p);

  // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldu_l((const long *)p);
  __nvvm_ldu_ul((const unsigned long *)p);

  // CHECK: call float @llvm.nvvm.ldu.global.f.f32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldu_f((const float *)p);
  // CHECK: call double @llvm.nvvm.ldu.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldu_d((const double *)p);

  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  typedef signed char schar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_c2((const char2 *)p);
  __nvvm_ldu_uc2((const uchar2 *)p);
  __nvvm_ldu_sc2((const schar2 *)p);

  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  typedef signed char schar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_c4((const char4 *)p);
  __nvvm_ldu_uc4((const uchar4 *)p);
  __nvvm_ldu_sc4((const schar4 *)p);

  // CHECK: call <2 x i16> @llvm.nvvm.ldu.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <2 x i16> @llvm.nvvm.ldu.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_s2((const short2 *)p);
  __nvvm_ldu_us2((const ushort2 *)p);

  // CHECK: call <4 x i16> @llvm.nvvm.ldu.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <4 x i16> @llvm.nvvm.ldu.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_s4((const short4 *)p);
  __nvvm_ldu_us4((const ushort4 *)p);

  // CHECK: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_i2((const int2 *)p);
  __nvvm_ldu_ui2((const uint2 *)p);

  // CHECK: call <4 x i32> @llvm.nvvm.ldu.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <4 x i32> @llvm.nvvm.ldu.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_i4((const int4 *)p);
  __nvvm_ldu_ui4((const uint4 *)p);

  // LP32: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP32: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // LP64: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long2 __attribute__((ext_vector_type(2)));
  typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_l2((const long2 *)p);
  __nvvm_ldu_ul2((const ulong2 *)p);

  // CHECK: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_ll2((const longlong2 *)p);
  __nvvm_ldu_ull2((const ulonglong2 *)p);

  // CHECK: call <2 x float> @llvm.nvvm.ldu.global.f.v2f32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_f2((const float2 *)p);

  // CHECK: call <4 x float> @llvm.nvvm.ldu.global.f.v4f32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_f4((const float4 *)p);

  // CHECK: call <2 x double> @llvm.nvvm.ldu.global.f.v2f64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_d2((const double2 *)p);
}

// CHECK-LABEL: nvvm_shfl
__device__ void nvvm_shfl(int i, float f, int a, int b) {
  // CHECK: call i32 @llvm.nvvm.shfl.down.i32(i32
  __nvvm_shfl_down_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.down.f32(float
  __nvvm_shfl_down_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.up.i32(i32
  __nvvm_shfl_up_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.up.f32(float
  __nvvm_shfl_up_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.bfly.i32(i32
  __nvvm_shfl_bfly_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.bfly.f32(float
  __nvvm_shfl_bfly_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.idx.i32(i32
  __nvvm_shfl_idx_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.idx.f32(float
  __nvvm_shfl_idx_f32(f, a, b);
  // CHECK: ret void
}

__device__ void nvvm_vote(int pred) {
  // CHECK: call i1 @llvm.nvvm.vote.all(i1
  __nvvm_vote_all(pred);
  // CHECK: call i1 @llvm.nvvm.vote.any(i1
  __nvvm_vote_any(pred);
  // CHECK: call i1 @llvm.nvvm.vote.uni(i1
  __nvvm_vote_uni(pred);
  // CHECK: call i32 @llvm.nvvm.vote.ballot(i1
  __nvvm_vote_ballot(pred);
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_nanosleep
__device__ void nvvm_nanosleep(int d) {
#if __CUDA_ARCH__ >= 700
  // CHECK_PTX70_SM80: call void @llvm.nvvm.nanosleep
  __nvvm_nanosleep(d);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.nanosleep
  __nvvm_nanosleep(1);
#endif
}

// CHECK-LABEL: nvvm_mbarrier
__device__ void nvvm_mbarrier(long long* addr, __attribute__((address_space(3))) long long* sharedAddr, int count, long long state) {
#if __CUDA_ARCH__ >= 800
  __nvvm_mbarrier_init(addr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init
  __nvvm_mbarrier_init_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init.shared

  __nvvm_mbarrier_inval(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval
  __nvvm_mbarrier_inval_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval.shared

  __nvvm_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive
  __nvvm_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.shared
  __nvvm_mbarrier_arrive_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete
  __nvvm_mbarrier_arrive_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared

  __nvvm_mbarrier_arrive_drop(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop
  __nvvm_mbarrier_arrive_drop_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.shared
  __nvvm_mbarrier_arrive_drop_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete
  __nvvm_mbarrier_arrive_drop_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared

  __nvvm_mbarrier_test_wait(addr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait
  __nvvm_mbarrier_test_wait_shared(sharedAddr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait.shared

  __nvvm_mbarrier_pending_count(state);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.mbarrier.pending.count
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_async_copy
__device__ void nvvm_async_copy(__attribute__((address_space(3))) void* dst, __attribute__((address_space(1))) const void* src, long long* addr, __attribute__((address_space(3))) long long* sharedAddr) {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive
  __nvvm_cp_async_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared
  __nvvm_cp_async_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc
  __nvvm_cp_async_mbarrier_arrive_noinc(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared
  __nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4(
  __nvvm_cp_async_ca_shared_global_4(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8(
  __nvvm_cp_async_ca_shared_global_8(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16(
  __nvvm_cp_async_ca_shared_global_16(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16(
  __nvvm_cp_async_cg_shared_global_16(dst, src);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_4(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_8(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_16(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16.s({{.*}}, i32 2)
  __nvvm_cp_async_cg_shared_global_16(dst, src, 2);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
  __nvvm_cp_async_commit_group();
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
  __nvvm_cp_async_wait_group(0);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 8)
  __nvvm_cp_async_wait_group(8);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 16)
  __nvvm_cp_async_wait_group(16);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.all
  __nvvm_cp_async_wait_all();
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_cvt_sm80
__device__ void nvvm_cvt_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn(float 1.000000e+00)
  __nvvm_f2bf16_rn(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn.relu(float 1.000000e+00)
  __nvvm_f2bf16_rn_relu(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz(float 1.000000e+00)
  __nvvm_f2bf16_rz(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz.relu(float 1.000000e+00)
  __nvvm_f2bf16_rz_relu(1);

  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.f2tf32.rna(float 1.000000e+00)
  __nvvm_f2tf32_rna(1);
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_cvt_sm89
__device__ void nvvm_cvt_sm89() {
#if __CUDA_ARCH__ >= 890
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f);
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e4m3x2_rn_relu(1.0f, 1.0f);
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e5m2x2_rn(1.0f, 1.0f);
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e5m2x2_rn_relu(1.0f, 1.0f);

  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e4m3x2_rn({1.0f16, 1.0f16});
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e4m3x2_rn_relu({1.0f16, 1.0f16});
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e5m2x2_rn({1.0f16, 1.0f16});
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e5m2x2_rn_relu({1.0f16, 1.0f16});

  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn(i16 18504)
  __nvvm_e4m3x2_to_f16x2_rn(0x4848);
  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn.relu(i16 18504)
  __nvvm_e4m3x2_to_f16x2_rn_relu(0x4848);
  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn(i16 19532)
  __nvvm_e5m2x2_to_f16x2_rn(0x4c4c);
  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532)
  __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c);
#endif
  // CHECK: ret void
}

#define NAN32 0x7FBFFFFF
#define NAN16 (__bf16)0x7FBF
#define BF16 (__bf16)0.1f
#define BF16_2 (__bf16)0.2f
#define NANBF16 (__bf16)0xFFC1
#define BF16X2 {(__bf16)0.1f, (__bf16)0.1f}
#define BF16X2_2 {(__bf16)0.2f, (__bf16)0.2f}
#define NANBF16X2 {NANBF16, NANBF16}

// CHECK-LABEL: nvvm_abs_neg_bf16_bf16x2_sm80
__device__ void nvvm_abs_neg_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.abs.bf16(bfloat 0xR3DCD)
  __nvvm_abs_bf16(BF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.abs.bf16x2(<2 x bfloat> splat (bfloat 0xR3DCD))
  __nvvm_abs_bf16x2(BF16X2);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.neg.bf16(bfloat 0xR3DCD)
  __nvvm_neg_bf16(BF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.neg.bf16x2(<2 x bfloat> splat (bfloat 0xR3DCD))
  __nvvm_neg_bf16x2(BF16X2);
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_min_max_sm80
__device__ void nvvm_min_max_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.nan.f
  __nvvm_fmin_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.ftz.nan.f
  __nvvm_fmin_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.bf16
  __nvvm_fmin_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.ftz.bf16
  __nvvm_fmin_ftz_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.nan.bf16
  __nvvm_fmin_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.ftz.nan.bf16
  __nvvm_fmin_ftz_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.bf16x2
  __nvvm_fmin_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.ftz.bf16x2
  __nvvm_fmin_ftz_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.nan.bf16x2
  __nvvm_fmin_nan_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.ftz.nan.bf16x2
  __nvvm_fmin_ftz_nan_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, 0.11f);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.bf16
  __nvvm_fmax_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.ftz.bf16
  __nvvm_fmax_ftz_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.nan.bf16
  __nvvm_fmax_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.ftz.nan.bf16
  __nvvm_fmax_ftz_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.bf16x2
  __nvvm_fmax_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.ftz.bf16x2
  __nvvm_fmax_ftz_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.nan.bf16x2
  __nvvm_fmax_nan_bf16x2(NANBF16X2, BF16X2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.ftz.nan.bf16x2
  __nvvm_fmax_ftz_nan_bf16x2(NANBF16X2, BF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);

#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_fma_bf16_bf16x2_sm80
__device__ void nvvm_fma_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fma.rn.bf16
  __nvvm_fma_rn_bf16(BF16, BF16_2, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fma.rn.relu.bf16
  __nvvm_fma_rn_relu_bf16(BF16, BF16_2, BF16_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fma.rn.bf16x2
  __nvvm_fma_rn_bf16x2(BF16X2, BF16X2_2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fma.rn.relu.bf16x2
  __nvvm_fma_rn_relu_bf16x2(BF16X2, BF16X2_2, BF16X2_2);
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_min_max_sm86
__device__ void nvvm_min_max_sm86() {
#if __CUDA_ARCH__ >= 860

  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmin.xorsign.abs.bf16
  __nvvm_fmin_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmin.nan.xorsign.abs.bf16
  __nvvm_fmin_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmin.xorsign.abs.bf16x2
  __nvvm_fmin_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2
  __nvvm_fmin_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.xorsign.abs.f
  __nvvm_fmin_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.xorsign.abs.f
  __nvvm_fmin_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.nan.xorsign.abs.f
  __nvvm_fmin_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f
  __nvvm_fmin_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);

  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmax.xorsign.abs.bf16
  __nvvm_fmax_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmax.nan.xorsign.abs.bf16
  __nvvm_fmax_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmax.xorsign.abs.bf16x2
  __nvvm_fmax_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmax.nan.xorsign.abs.bf16x2
  __nvvm_fmax_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.xorsign.abs.f
  __nvvm_fmax_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.xorsign.abs.f
  __nvvm_fmax_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.nan.xorsign.abs.f
  __nvvm_fmax_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f
  __nvvm_fmax_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);
#endif
  // CHECK: ret void
}