; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HSA-VI,FUNC %s

; Repeat of some problematic tests in kernel-args.ll, with the IR
; argument lowering pass disabled. Struct padding needs to be
; accounted for, as well as legalization of types changing offsets.

; An i1 passed after the output pointer. The checks expect a scalar dword
; load followed by an s_and_b32, and a 12-byte kernarg segment.
; FUNC-LABEL: {{^}}i1_arg:

; GCN: s_load_dword s
; GCN: s_and_b32

; HSA-VI: .amdhsa_kernarg_size 12
define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) #0 {
  store i1 %x, ptr addrspace(1) %out, align 1
  ret void
}

; A <3 x i8> passed after the output pointer. The vector is expected to be
; read with one dword load at kernarg offset 0x8 and the pointer with a
; dwordx2 load at offset 0x0; the expected segment size is 12 bytes.
; FUNC-LABEL: {{^}}v3i8_arg:

; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x8
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0

; HSA-VI: .amdhsa_kernarg_size 12
define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) #0 {
entry:
  store <3 x i8> %in, ptr addrspace(1) %out, align 4
  ret void
}

; The next six kernels take a single vector with five or six elements as
; their only argument. Each one is expected to be fetched from kernarg
; offset 0 with a single scalar load (dwordx2, dwordx4 or dwordx8).

; FUNC-LABEL: {{^}}v5i8_arg:
; GCN: s_load_dwordx2 s[0:1], s[8:9], 0x0
define amdgpu_kernel void @v5i8_arg(<5 x i8> %in) nounwind {
  store <5 x i8> %in, ptr addrspace(1) null
  ret void
}

; FUNC-LABEL: {{^}}v6i8_arg:
; GCN: s_load_dwordx2 s[0:1], s[8:9], 0x0
define amdgpu_kernel void @v6i8_arg(<6 x i8> %in) nounwind {
  store <6 x i8> %in, ptr addrspace(1) null
  ret void
}

; FUNC-LABEL: {{^}}v5i16_arg:
; GCN: s_load_dwordx4 s[0:3], s[8:9], 0x0
define amdgpu_kernel void @v5i16_arg(<5 x i16> %in) nounwind {
  store <5 x i16> %in, ptr addrspace(1) null
  ret void
}

; FUNC-LABEL: {{^}}v6i16_arg:
; GCN-DAG: s_load_dwordx4 s[0:3], s[8:9], 0x0
define amdgpu_kernel void @v6i16_arg(<6 x i16> %in) nounwind {
  store <6 x i16> %in, ptr addrspace(1) null
  ret void
}

; FUNC-LABEL: {{^}}v5i32_arg:
; GCN: s_load_dwordx8 s[0:7], s[8:9], 0x0
define amdgpu_kernel void @v5i32_arg(<5 x i32> %in) nounwind {
  store <5 x i32> %in, ptr addrspace(1) null
  ret void
}

; FUNC-LABEL: {{^}}v6i32_arg:
; GCN: s_load_dwordx8 s[0:7], s[8:9], 0x0
define amdgpu_kernel void @v6i32_arg(<6 x i32> %in) nounwind {
  store <6 x i32> %in, ptr addrspace(1) null
  ret void
}

; An i65 passed after the output pointer. The checks expect a dwordx4 load
; from kernarg offset 0x0 and a 24-byte kernarg segment.
; FUNC-LABEL: {{^}}i65_arg:
; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0

; HSA-VI: .amdhsa_kernarg_size 24
define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) #0 {
entry:
  store i65 %in, ptr addrspace(1) %out, align 4
  ret void
}

; An empty struct argument is expected to occupy no kernarg space.
; FUNC-LABEL: {{^}}empty_struct_arg:
; HSA-VI: .amdhsa_kernarg_size 0
define amdgpu_kernel void @empty_struct_arg({} %in) #0 {
  ret void
}

; The correct load offsets for these:
; load 4 from 0,
; load 8 from 8
; load 4 from 24
; load 8 from 32

; With the SelectionDAG argument lowering, the alignments for the
; struct members are not properly considered, making these wrong.
;
; NOTE(review): the offsets checked below (0x0, 0x8, 0x18, 0x20) are the
; same as the offsets listed above as correct, so the remark above and the
; FIXME below may be stale - confirm before relying on either.

; FIXME: Total argument size is computed wrong
; FUNC-LABEL: {{^}}struct_argument_alignment:
; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x18
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x20

; HSA-VI: .amdhsa_kernarg_size 40
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) #0 {
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg1, 0
  %val3 = extractvalue {i32, i64} %arg1, 1
  ; Volatile stores keep all four extracted members alive.
  store volatile i32 %val0, ptr addrspace(1) null
  store volatile i64 %val1, ptr addrspace(1) null
  store volatile i32 %val2, ptr addrspace(1) null
  store volatile i64 %val3, ptr addrspace(1) null
  ret void
}

; No padding between i8 and next struct, but round up at end to 4 byte
; multiple.
; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
; The first packed struct is expected to be read with scalar loads at
; kernarg offsets 0x0 and 0x4, the second one with global loads at byte
; offsets 13 and 17. The expected kernarg segment size is 28 bytes.
; HSA-VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; HSA-VI: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:13
; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:17
; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x4

; HSA-VI: .amdhsa_kernarg_size 28
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) #0 {
  %val0 = extractvalue <{i32, i64}> %arg0, 0
  %val1 = extractvalue <{i32, i64}> %arg0, 1
  %val2 = extractvalue <{i32, i64}> %arg1, 0
  %val3 = extractvalue <{i32, i64}> %arg1, 1
  ; Volatile stores keep all four extracted members alive.
  store volatile i32 %val0, ptr addrspace(1) null
  store volatile i64 %val1, ptr addrspace(1) null
  store volatile i32 %val2, ptr addrspace(1) null
  store volatile i64 %val3, ptr addrspace(1) null
  ret void
}

; Two {i32, i64} structs separated by an i8, followed by another i8 and a
; <4 x i32>. The struct members are expected at offsets 0x0, 0x8, 0x18 and
; 0x20, the trailing vector at offset 0x30, and the segment size is 64.
; GCN-LABEL: {{^}}struct_argument_alignment_after:
; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[8:9], 0x18
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x20
; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x30

; HSA-VI: .amdhsa_kernarg_size 64
define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) #0 {
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg2, 0
  %val3 = extractvalue {i32, i64} %arg2, 1
  store volatile i32 %val0, ptr addrspace(1) null
  store volatile i64 %val1, ptr addrspace(1) null
  store volatile i32 %val2, ptr addrspace(1) null
  store volatile i64 %val3, ptr addrspace(1) null
  store volatile <4 x i32> %arg4, ptr addrspace(1) null
  ret void
}

; An i16 followed by a [3 x i32] array. The checks expect a dwordx4 load
; from kernarg offset 0x0.
; GCN-LABEL: {{^}}array_3xi32:
; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0
define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
  store volatile i16 %arg0, ptr addrspace(1) undef
  store volatile [3 x i32] %arg1, ptr addrspace(1) undef
  ret void
}

; An i8 followed by a [3 x i16] array. The checks expect a dwordx2 load
; from kernarg offset 0x0.
; GCN-LABEL: {{^}}array_3xi16:
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0
define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
  store volatile i8 %arg0, ptr addrspace(1) undef
  store volatile [3 x i16] %arg1, ptr addrspace(1) undef
  ret void
}

; <2 x i15>: both elements are expected to be extracted from one loaded
; dword, one with s_bfe_u32 (operand 0x100010) and one with a 0x7fff mask.
; GCN-LABEL: {{^}}v2i15_arg:
; GCN: s_load_dword [[DWORD:s[0-9]+]]
; GCN-DAG: s_bfe_u32 [[BFE:s[0-9]+]], [[DWORD]], 0x100010{{$}}
; GCN-DAG: s_and_b32 [[AND:s[0-9]+]], [[DWORD]], 0x7fff{{$}}
define amdgpu_kernel void @v2i15_arg(ptr addrspace(1) nocapture %out, <2 x i15> %in) #0 {
entry:
  store <2 x i15> %in, ptr addrspace(1) %out, align 4
  ret void
}

; <3 x i15>: expected to be loaded with a dwordx4 and then repacked with a
; 64-bit shift, two masks and an or.
; GCN-LABEL: {{^}}v3i15_arg:
; GCN: s_load_dwordx4 [[DWORDX4:s\[[0-9]+:[0-9]+\]]]
; GCN: s_lshl_b64
; GCN: s_and_b32
; GCN: s_and_b32
; GCN: s_or_b32
define amdgpu_kernel void @v3i15_arg(ptr addrspace(1) nocapture %out, <3 x i15> %in) #0 {
entry:
  store <3 x i15> %in, ptr addrspace(1) %out, align 4
  ret void
}

; Byref pointers should only be treated as offsets from kernarg
; GCN-LABEL: {{^}}byref_constant_i8_arg:
; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GCN: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s[8:9] offset:8
; GCN: .amdhsa_kernarg_size 12
define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) #0 {
  %in = load i8, ptr addrspace(4) %in.byref
  %ext = zext i8 %in to i32
  store i32 %ext, ptr addrspace(1) %out, align 4
  ret void
}

; Same as the i8 case with an i16 pointee: a ushort load at offset 8 is
; expected.
; GCN-LABEL: {{^}}byref_constant_i16_arg:
; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GCN: global_load_ushort v{{[0-9]+}}, [[ZERO]], s[8:9] offset:8
; GCN: .amdhsa_kernarg_size 12
define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i16) %in.byref) #0 {
  %in = load i16, ptr addrspace(4) %in.byref
  %ext = zext i16 %in to i32
  store i32 %ext, ptr addrspace(1) %out, align 4
  ret void
}

; A byref(i32) followed by a plain i32. The checks expect a single dwordx4
; load from kernarg offset 0x0 and a 16-byte segment.
; GCN-LABEL: {{^}}byref_constant_i32_arg:
; GCN: s_load_dwordx4 [[LOAD:s\[[0-9]+:[0-9]+\]]], s[8:9], 0x0{{$}}
; GCN: .amdhsa_kernarg_size 16
define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in.byref, i32 %after.offset) #0 {
  %in = load i32, ptr addrspace(4) %in.byref
  store volatile i32 %in, ptr addrspace(1) %out, align 4
  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
  ret void
}

; A byref(<4 x i32>) followed by an i32. The vector is expected at kernarg
; offset 0x10 and the trailing i32 at 0x20, with a 36-byte segment.
; GCN-LABEL: {{^}}byref_constant_v4i32_arg:
; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x10{{$}}
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x20{{$}}
; GCN: .amdhsa_kernarg_size 36
define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) %in.byref, i32 %after.offset) #0 {
  %in = load <4 x i32>, ptr addrspace(4) %in.byref
  store volatile <4 x i32> %in, ptr addrspace(1) %out, align 4
  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
  ret void
}

; A byref(i32) with an explicit align(256). The byref value and the
; following i32 are expected in one dwordx2 load at kernarg offset 0x100,
; and the expected segment size is 264 bytes.
; GCN-LABEL: {{^}}byref_align_constant_i32_arg:
; GCN-DAG: s_load_dwordx2 s[[[IN:[0-9]+]]:[[AFTER_OFFSET:[0-9]+]]], s[8:9], 0x100{{$}}
; GCN-DAG: v_mov_b32_e32 [[V_IN:v[0-9]+]], s[[IN]]
; GCN-DAG: v_mov_b32_e32 [[V_AFTER_OFFSET:v[0-9]+]], s[[AFTER_OFFSET]]
; GCN: global_store_dword v{{[0-9]+}}, [[V_IN]], s
; GCN: global_store_dword v{{[0-9]+}}, [[V_AFTER_OFFSET]], s
; GCN: .amdhsa_kernarg_size 264
define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 {
  %in = load i32, ptr addrspace(4) %in.byref
  store volatile i32 %in, ptr addrspace(1) %out, align 4
  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
  ret void
}

; A byref(<16 x i32>) with align(64), placed after a pointer and an i8.
; The vector is expected at kernarg offset 0x40 and the trailing i32 at
; 0x80, with a 132-byte segment.
; GCN-LABEL: {{^}}byref_natural_align_constant_v16i32_arg:
; GCN-DAG: s_load_dword s{{[0-9]+}}, s[8:9], 0x80
; GCN-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[8:9], 0x40{{$}}
; GCN: .amdhsa_kernarg_size 132
define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) #0 {
  %in = load <16 x i32>, ptr addrspace(4) %in.byref
  store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4
  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
  ret void
}

; Also accept byref kernel arguments with other global address spaces.
; GCN-LABEL: {{^}}byref_global_i32_arg:
; GCN: s_load_dword [[IN:s[0-9]+]], s[8:9], 0x8{{$}}
; GCN: .amdhsa_kernarg_size 12
define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) %in.byref) #0 {
  %in = load i32, ptr addrspace(1) %in.byref
  store i32 %in, ptr addrspace(1) %out, align 4
  ret void
}

; Byref through a flat (default address space) pointer: a flat load with
; offset:8 is expected.
; GCN-LABEL: {{^}}byref_flat_i32_arg:
; GCN: flat_load_dword [[IN:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}} offset:8{{$}}
define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) %in.byref) #0 {
  %in = load i32, ptr %in.byref
  store i32 %in, ptr addrspace(1) %out, align 4
  ret void
}

; Byref through an addrspace(6) pointer: the expected address is built from
; s8 plus 8 as the low half and a zero high half.
; GCN-LABEL: {{^}}byref_constant_32bit_i32_arg:
; GCN: s_add_i32 s[[PTR_LO:[0-9]+]], s8, 8
; GCN: s_mov_b32 s[[PTR_HI:[0-9]+]], 0{{$}}
; GCN: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) %in.byref) #0 {
  %in = load i32, ptr addrspace(6) %in.byref
  store i32 %in, ptr addrspace(1) %out, align 4
  ret void
}

; NOTE(review): the kernel below is disabled. It uses address space 999 and
; its byref attribute has no type operand, unlike every enabled test in
; this file; presumably it documents an unsupported case - confirm before
; re-enabling.
; define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(999) byref %in.byref) {
;   %in = load i32, ptr addrspace(999) %in.byref
;   store i32 %in, ptr addrspace(1) %out, align 4
;   ret void
; }

; Two consecutive byref(i32) arguments followed by an i32. The checks
; expect a dwordx4 load from kernarg offset 0x0 and a 20-byte segment.
; GCN-LABEL: {{^}}multi_byref_constant_i32_arg:
; GCN: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, s[8:9], 0x0
; GCN: .amdhsa_kernarg_size 20
define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in0.byref, ptr addrspace(4) byref(i32) %in1.byref, i32 %after.offset) #0 {
  %in0 = load i32, ptr addrspace(4) %in0.byref
  %in1 = load i32, ptr addrspace(4) %in1.byref
  store volatile i32 %in0, ptr addrspace(1) %out, align 4
  store volatile i32 %in1, ptr addrspace(1) %out, align 4
  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
  ret void
}

; A byref(i32) as the only argument, i.e. at kernarg offset 0.
; NOTE(review): the kernarg pointer is matched as s[8:9] throughout this
; file, so the two negative checks on s4 and s5 may be left over from an
; older register assignment - confirm.
; GCN-LABEL: {{^}}byref_constant_i32_arg_offset0:
; GCN-NOT: s4
; GCN-NOT: s5
; GCN: s_load_dword {{s[0-9]+}}, s[8:9], 0x0{{$}}
; GCN: .amdhsa_kernarg_size 4
define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) %in.byref) #0 {
  %in = load i32, ptr addrspace(4) %in.byref
  store i32 %in, ptr addrspace(1) undef, align 4
  ret void
}

; Attribute group shared by the kernels marked #0.
attributes #0 = { "amdgpu-no-implicitarg-ptr" }

; Module flag selecting the AMDHSA code object version (value 400).
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}