xref: /llvm-project/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll (revision c4d89203f3822b0466f5cc58654cb016aeb86648)
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s | FileCheck -check-prefix=PRELOAD %s

5define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
6; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x(
7; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
8; NO-PRELOAD-NEXT:    [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
9; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
10; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
11; NO-PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
12; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
13; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
14; NO-PRELOAD-NEXT:    ret void
15;
16; PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x(
17; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0:[0-9]+]] {
18; PRELOAD-NEXT:    [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
19; PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
20; PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
21; PRELOAD-NEXT:    store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4
22; PRELOAD-NEXT:    ret void
23;
24  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
25  %load = load i32, ptr addrspace(4) %imp_arg_ptr
26  store i32 %load, ptr addrspace(1) %out
27  ret void
28}
29
30define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512) {
31; NO-PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x(
32; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]], i512 [[TMP0:%.*]]) #[[ATTR0]] {
33; NO-PRELOAD-NEXT:    [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
34; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
35; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
36; NO-PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
37; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
38; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
39; NO-PRELOAD-NEXT:    ret void
40;
41; PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x(
42; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] {
43; PRELOAD-NEXT:    [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
44; PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
45; PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
46; PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
47; PRELOAD-NEXT:    ret void
48;
49  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
50  %load = load i32, ptr addrspace(4) %imp_arg_ptr
51  store i32 %load, ptr addrspace(1) %out
52  ret void
53}
54
55define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) {
56; NO-PRELOAD-LABEL: define amdgpu_kernel void @preloadremainder_z(
57; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
58; NO-PRELOAD-NEXT:    [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
59; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_Z_KERNARG_SEGMENT]], i64 0
60; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
61; NO-PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
62; NO-PRELOAD-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
63; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
64; NO-PRELOAD-NEXT:    [[CONV:%.*]] = zext i16 [[LOAD]] to i32
65; NO-PRELOAD-NEXT:    store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4
66; NO-PRELOAD-NEXT:    ret void
67;
68; PRELOAD-LABEL: define amdgpu_kernel void @preloadremainder_z(
69; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] {
70; PRELOAD-NEXT:    [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
71; PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
72; PRELOAD-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
73; PRELOAD-NEXT:    [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
74; PRELOAD-NEXT:    [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32
75; PRELOAD-NEXT:    store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4
76; PRELOAD-NEXT:    ret void
77;
78  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
79  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
80  %load = load i16, ptr addrspace(4) %gep
81  %conv = zext i16 %load to i32
82  store i32 %conv, ptr addrspace(1) %out
83  ret void
84}
85
86define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) {
87; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_workgroup_size_xyz(
88; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
89; NO-PRELOAD-NEXT:    [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
90; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT]], i64 0
91; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
92; NO-PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
93; NO-PRELOAD-NEXT:    [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12
94; NO-PRELOAD-NEXT:    [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2
95; NO-PRELOAD-NEXT:    [[CONV_X:%.*]] = zext i16 [[LOAD_X]] to i32
96; NO-PRELOAD-NEXT:    [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14
97; NO-PRELOAD-NEXT:    [[LOAD_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2
98; NO-PRELOAD-NEXT:    [[CONV_Y:%.*]] = zext i16 [[LOAD_Y]] to i32
99; NO-PRELOAD-NEXT:    [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16
100; NO-PRELOAD-NEXT:    [[LOAD_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2
101; NO-PRELOAD-NEXT:    [[CONV_Z:%.*]] = zext i16 [[LOAD_Z]] to i32
102; NO-PRELOAD-NEXT:    [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0
103; NO-PRELOAD-NEXT:    [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1
104; NO-PRELOAD-NEXT:    [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2
105; NO-PRELOAD-NEXT:    store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16
106; NO-PRELOAD-NEXT:    ret void
107;
108; PRELOAD-LABEL: define amdgpu_kernel void @preload_workgroup_size_xyz(
109; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]]) #[[ATTR0]] {
110; PRELOAD-NEXT:    [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
111; PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
112; PRELOAD-NEXT:    [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12
113; PRELOAD-NEXT:    [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2
114; PRELOAD-NEXT:    [[CONV_X:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_X]] to i32
115; PRELOAD-NEXT:    [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14
116; PRELOAD-NEXT:    [[LOAD_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2
117; PRELOAD-NEXT:    [[CONV_Y:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Y]] to i32
118; PRELOAD-NEXT:    [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16
119; PRELOAD-NEXT:    [[LOAD_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2
120; PRELOAD-NEXT:    [[CONV_Z:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Z]] to i32
121; PRELOAD-NEXT:    [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0
122; PRELOAD-NEXT:    [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1
123; PRELOAD-NEXT:    [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2
124; PRELOAD-NEXT:    store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16
125; PRELOAD-NEXT:    ret void
126;
127  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
128  %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
129  %load_x = load i16, ptr addrspace(4) %gep_x
130  %conv_x = zext i16 %load_x to i32
131  %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
132  %load_y = load i16, ptr addrspace(4) %gep_y
133  %conv_y = zext i16 %load_y to i32
134  %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
135  %load_z = load i16, ptr addrspace(4) %gep_z
136  %conv_z = zext i16 %load_z to i32
137  %ins.0 =  insertelement <3 x i32> poison, i32 %conv_x, i32 0
138  %ins.1 =  insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1
139  %ins.2 =  insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2
140  store <3 x i32> %ins.2, ptr addrspace(1) %out
141  ret void
142}
143
144define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inreg %out) {
145; NO-PRELOAD-LABEL: define amdgpu_kernel void @incorrect_type_i64_block_count_x(
146; NO-PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
147; NO-PRELOAD-NEXT:    [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
148; NO-PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
149; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8
150; NO-PRELOAD-NEXT:    store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8
151; NO-PRELOAD-NEXT:    ret void
152;
153; PRELOAD-LABEL: define amdgpu_kernel void @incorrect_type_i64_block_count_x(
154; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
155; PRELOAD-NEXT:    [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
156; PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
157; PRELOAD-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8
158; PRELOAD-NEXT:    store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8
159; PRELOAD-NEXT:    ret void
160;
161  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
162  %load = load i64, ptr addrspace(4) %imp_arg_ptr
163  store i64 %load, ptr addrspace(1) %out
164  ret void
165}
166
167define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) {
168; NO-PRELOAD-LABEL: define amdgpu_kernel void @random_incorrect_offset(
169; NO-PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
170; NO-PRELOAD-NEXT:    [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
171; NO-PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
172; NO-PRELOAD-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2
173; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
174; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
175; NO-PRELOAD-NEXT:    ret void
176;
177; PRELOAD-LABEL: define amdgpu_kernel void @random_incorrect_offset(
178; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
179; PRELOAD-NEXT:    [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
180; PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
181; PRELOAD-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2
182; PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
183; PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
184; PRELOAD-NEXT:    ret void
185;
186  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
187  %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
188  %load = load i32, ptr addrspace(4) %gep
189  store i32 %load, ptr addrspace(1) %out
190  ret void
191}
192
193define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) byref(i32) %out) {
194; NO-PRELOAD-LABEL: define amdgpu_kernel void @incompatible_attribute_block_count_x(
195; NO-PRELOAD-SAME: ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0]] {
196; NO-PRELOAD-NEXT:    [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
197; NO-PRELOAD-NEXT:    [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
198; NO-PRELOAD-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
199; NO-PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
200; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
201; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4
202; NO-PRELOAD-NEXT:    ret void
203;
204; PRELOAD-LABEL: define amdgpu_kernel void @incompatible_attribute_block_count_x(
205; PRELOAD-SAME: ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0]] {
206; PRELOAD-NEXT:    [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
207; PRELOAD-NEXT:    [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
208; PRELOAD-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
209; PRELOAD-NEXT:    [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
210; PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
211; PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4
212; PRELOAD-NEXT:    ret void
213;
214  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
215  %load = load i32, ptr addrspace(4) %imp_arg_ptr
216  store i32 %load, ptr addrspace(1) %out
217  ret void
218}
219
;.
; NO-PRELOAD: [[META0]] = !{}
;.