; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes COMMON,IR,IRC
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes COMMON,IR,IRO
; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes COMMON,PTX,PTXC
; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes COMMON,PTX,PTXO
; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}

target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

%class.outer = type <{ %class.inner, i32, [4 x i8] }>
%class.inner = type { ptr, ptr }
%class.padded = type { i8, i32 }

; Check that nvptx-lower-args preserves arg alignment
; COMMON-LABEL: load_alignment
define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) {
entry:
; IR: call void @llvm.memcpy.p0.p101.i64(ptr align 8
; PTX: ld.param.u64
; PTX-NOT: ld.param.u8
  %arg.idx.val = load ptr, ptr %arg, align 8
  %arg.idx1 = getelementptr %class.outer, ptr %arg, i64 0, i32 0, i32 1
  %arg.idx1.val = load ptr, ptr %arg.idx1, align 8
  %arg.idx2 = getelementptr %class.outer, ptr %arg, i64 0, i32 1
  %arg.idx2.val = load i32, ptr %arg.idx2, align 8
  %arg.idx.val.val = load i32, ptr %arg.idx.val, align 4
  %add.i = add nsw i32 %arg.idx.val.val, %arg.idx2.val
  store i32 %add.i, ptr %arg.idx1.val, align 4

  ; let the pointer escape so we still create a local copy this test uses to
  ; check the load alignment.
  %tmp = call ptr @escape(ptr nonnull %arg.idx2)
  ret void
}

; Check that nvptx-lower-args copies padding as the struct may have been a union
; COMMON-LABEL: load_padding
define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
; PTX: {
; PTX-NEXT: .local .align 8 .b8 __local_depot1[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.u64 %SPL, __local_depot1;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: ld.param.u64 %rd1, [load_padding_param_0];
; PTX-NEXT: st.u64 [%SP], %rd1;
; PTX-NEXT: add.u64 %rd2, %SP, 0;
; PTX-NEXT: { // callseq 1, 0
; PTX-NEXT: .param .b64 param0;
; PTX-NEXT: st.param.b64 [param0], %rd2;
; PTX-NEXT: .param .b64 retval0;
; PTX-NEXT: call.uni (retval0),
; PTX-NEXT: escape,
; PTX-NEXT: (
; PTX-NEXT: param0
; PTX-NEXT: );
; PTX-NEXT: ld.param.b64 %rd3, [retval0];
; PTX-NEXT: } // callseq 1
; PTX-NEXT: ret;
  %tmp = call ptr @escape(ptr nonnull align 16 %arg)
  ret void
}

; COMMON-LABEL: ptr_generic
define ptx_kernel void @ptr_generic(ptr %out, ptr %in) {
; IRC: %in3 = addrspacecast ptr %in to ptr addrspace(1)
; IRC: %in4 = addrspacecast ptr addrspace(1) %in3 to ptr
; IRC: %out1 = addrspacecast ptr %out to ptr addrspace(1)
; IRC: %out2 = addrspacecast ptr addrspace(1) %out1 to ptr
; PTXC: cvta.to.global.u64
; PTXC: cvta.to.global.u64
; PTXC: ld.global.u32
; PTXC: st.global.u32

; OpenCL can't make assumptions about incoming pointer, so we should generate
; generic pointers load/store.
; IRO-NOT: addrspacecast
; PTXO-NOT: cvta.to.global
; PTXO: ld.u32
; PTXO: st.u32
  %v = load i32, ptr %in, align 4
  store i32 %v, ptr %out, align 4
  ret void
}

; COMMON-LABEL: ptr_nongeneric
define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
; IR-NOT: addrspacecast
; PTX-NOT: cvta.to.global
; PTX: ld.const.u32
; PTX: st.global.u32
  %v = load i32, ptr addrspace(4) %in, align 4
  store i32 %v, ptr addrspace(1) %out, align 4
  ret void
}

; COMMON-LABEL: ptr_as_int
define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
; IR: [[P:%.*]] = inttoptr i64 %i to ptr
; IRC: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
; IRC: addrspacecast ptr addrspace(1) [[P1]] to ptr
; IRO-NOT: addrspacecast

; PTXC-DAG: ld.param.u64 [[I:%rd.*]], [ptr_as_int_param_0];
; PTXC-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_param_1];
; PTXC: cvta.to.global.u64 %[[P:rd.*]], [[I]];
; PTXC: st.global.u32 [%[[P]]], [[V]];

; PTXO-DAG: ld.param.u64 %[[P:rd.*]], [ptr_as_int_param_0];
; PTXO-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_param_1];
; PTXO: st.u32 [%[[P]]], [[V]];

  %p = inttoptr i64 %i to ptr
  store i32 %v, ptr %p, align 4
  ret void
}

%struct.S = type { i64 }

; COMMON-LABEL: ptr_as_int_aggr
define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
; IR: [[S:%.*]] = addrspacecast ptr %s to ptr addrspace(101)
; IR: [[I:%.*]] = load i64, ptr addrspace(101) [[S]], align 8
; IR: [[P0:%.*]] = inttoptr i64 [[I]] to ptr
; IRC: [[P1:%.*]] = addrspacecast ptr [[P0]] to ptr addrspace(1)
; IRC: [[P:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
; IRO-NOT: addrspacecast

; PTXC-DAG: ld.param.u64 [[I:%rd.*]], [ptr_as_int_aggr_param_0];
; PTXC-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_aggr_param_1];
; PTXC: cvta.to.global.u64 %[[P:rd.*]], [[I]];
; PTXC: st.global.u32 [%[[P]]], [[V]];

; PTXO-DAG: ld.param.u64 %[[P:rd.*]], [ptr_as_int_aggr_param_0];
; PTXO-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_aggr_param_1];
; PTXO: st.u32 [%[[P]]], [[V]];
  %i = load i64, ptr %s, align 8
  %p = inttoptr i64 %i to ptr
  store i32 %v, ptr %p, align 4
  ret void
}


; Function Attrs: convergent nounwind
declare dso_local ptr @escape(ptr) local_unnamed_addr