; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll (revision 87c21bf0644c640e34c3eaa2e9a7c97eda0bf4a4)
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -o - -passes=amdgpu-lower-kernel-arguments %s | FileCheck -check-prefixes=GCN,HSA %s
; RUN: opt -mtriple=amdgcn-- -S -o - -passes=amdgpu-lower-kernel-arguments %s | FileCheck -check-prefixes=GCN,MESA %s

; "A5": allocas default to address space 5 (AMDGPU private).
target datalayout = "A5"

declare void @llvm.fake.use(...)

; Kernel with no arguments: the pass should leave the body untouched.
define amdgpu_kernel void @kern_noargs() {
; GCN-LABEL: @kern_noargs(
; GCN-NEXT:    ret void
;
  ret void
}

; A lone i8 argument is loaded as a dword from the kernarg segment and
; truncated; kernarg base offset is 0 for HSA and 36 for MESA.
define amdgpu_kernel void @kern_i8(i8 %arg) #0 {
; HSA-LABEL: @kern_i8(
; HSA-NEXT:    [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1:![0-9]+]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; HSA-NEXT:    store i8 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_i8(
; MESA-NEXT:    [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1:![0-9]+]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; MESA-NEXT:    store i8 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store i8 %arg, ptr addrspace(1) undef, align 1
  ret void
}

; A lone i16 argument: loaded as a dword then truncated to i16.
define amdgpu_kernel void @kern_i16(i16 %arg) #0 {
; HSA-LABEL: @kern_i16(
; HSA-NEXT:    [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; HSA-NEXT:    store i16 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_i16(
; MESA-NEXT:    [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; MESA-NEXT:    store i16 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store i16 %arg, ptr addrspace(1) undef, align 1
  ret void
}

; A lone half argument: dword load, truncate to i16, then bitcast to half.
define amdgpu_kernel void @kern_f16(half %arg) #0 {
; HSA-LABEL: @kern_f16(
; HSA-NEXT:    [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_F16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; HSA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
; HSA-NEXT:    store half [[ARG_LOAD]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_f16(
; MESA-NEXT:    [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_F16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; MESA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
; MESA-NEXT:    store half [[ARG_LOAD]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store half %arg, ptr addrspace(1) undef, align 1
  ret void
}

; zeroext i8: lowering matches the plain i8 case (dword load + trunc).
define amdgpu_kernel void @kern_zeroext_i8(i8 zeroext %arg) #0 {
; HSA-LABEL: @kern_zeroext_i8(
; HSA-NEXT:    [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; HSA-NEXT:    store i8 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_zeroext_i8(
; MESA-NEXT:    [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; MESA-NEXT:    store i8 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store i8 %arg, ptr addrspace(1) undef, align 1
  ret void
}

; zeroext i16: lowering matches the plain i16 case (dword load + trunc).
define amdgpu_kernel void @kern_zeroext_i16(i16 zeroext %arg) #0 {
; HSA-LABEL: @kern_zeroext_i16(
; HSA-NEXT:    [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; HSA-NEXT:    store i16 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_zeroext_i16(
; MESA-NEXT:    [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; MESA-NEXT:    store i16 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store i16 %arg, ptr addrspace(1) undef, align 1
  ret void
}

; signext i8: lowering matches the plain i8 case (dword load + trunc).
define amdgpu_kernel void @kern_signext_i8(i8 signext %arg) #0 {
; HSA-LABEL: @kern_signext_i8(
; HSA-NEXT:    [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; HSA-NEXT:    store i8 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_signext_i8(
; MESA-NEXT:    [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; MESA-NEXT:    store i8 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store i8 %arg, ptr addrspace(1) undef, align 1
  ret void
}

; signext i16: lowering matches the plain i16 case (dword load + trunc).
define amdgpu_kernel void @kern_signext_i16(i16 signext %arg) #0 {
; HSA-LABEL: @kern_signext_i16(
; HSA-NEXT:    [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; HSA-NEXT:    store i16 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_signext_i16(
; MESA-NEXT:    [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; MESA-NEXT:    store i16 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store i16 %arg, ptr addrspace(1) undef, align 1
  ret void
}

; Two i8 arguments packed into the same dword: the second is extracted
; with an lshr by 8 before truncation.
define amdgpu_kernel void @kern_i8_i8(i8 %arg0, i8 %arg1) {
; HSA-LABEL: @kern_i8_i8(
; HSA-NEXT:    [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
; HSA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_i8_i8(
; MESA-NEXT:    [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
; MESA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store volatile i8 %arg0, ptr addrspace(1) undef, align 1
  store volatile i8 %arg1, ptr addrspace(1) undef, align 1
  ret void
}

; <3 x i8> argument: dword load, truncate to i24, bitcast to the vector.
define amdgpu_kernel void @kern_v3i8(<3 x i8> %arg) {
; HSA-LABEL: @kern_v3i8(
; HSA-NEXT:    [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V3I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
; HSA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
; HSA-NEXT:    store <3 x i8> [[ARG_LOAD]], ptr addrspace(1) undef, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_v3i8(
; MESA-NEXT:    [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V3I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
; MESA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
; MESA-NEXT:    store <3 x i8> [[ARG_LOAD]], ptr addrspace(1) undef, align 4
; MESA-NEXT:    ret void
;
  store <3 x i8> %arg, ptr addrspace(1) undef, align 4
  ret void
}

; Non-byte-multiple-register i24 argument: dword load then truncate.
define amdgpu_kernel void @kern_i24(i24 %arg0) {
; HSA-LABEL: @kern_i24(
; HSA-NEXT:    [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I24_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
; HSA-NEXT:    store i24 [[TMP2]], ptr addrspace(1) undef, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_i24(
; MESA-NEXT:    [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I24_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
; MESA-NEXT:    store i24 [[TMP2]], ptr addrspace(1) undef, align 4
; MESA-NEXT:    ret void
;
  store i24 %arg0, ptr addrspace(1) undef
  ret void
}

; Naturally dword-sized i32 argument: direct load, no align-down or trunc.
define amdgpu_kernel void @kern_i32(i32 %arg0) {
; HSA-LABEL: @kern_i32(
; HSA-NEXT:    [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    store i32 [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_i32(
; MESA-NEXT:    [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    store i32 [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
; MESA-NEXT:    ret void
;
  store i32 %arg0, ptr addrspace(1) undef
  ret void
}

; range/noundef attributes on the argument are propagated to the kernarg
; load as !range and !noundef metadata.
define amdgpu_kernel void @kern_range_noundef_i32(i32 noundef range(i32 0, 8) %arg0) {
; HSA-LABEL: @kern_range_noundef_i32(
; HSA-NEXT:    [[KERN_RANGE_NOUNDEF_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_RANGE_NOUNDEF_I32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !range [[RNG2:![0-9]+]], !invariant.load [[META1]], !noundef [[META1]]
; HSA-NEXT:    call void (...) @llvm.fake.use(i32 [[ARG0_LOAD]])
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_range_noundef_i32(
; MESA-NEXT:    [[KERN_RANGE_NOUNDEF_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_RANGE_NOUNDEF_I32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !range [[RNG2:![0-9]+]], !invariant.load [[META1]], !noundef [[META1]]
; MESA-NEXT:    call void (...) @llvm.fake.use(i32 [[ARG0_LOAD]])
; MESA-NEXT:    ret void
;
  call void (...) @llvm.fake.use(i32 %arg0)
  ret void
}

; float argument: direct dword-sized load, no conversion needed.
define amdgpu_kernel void @kern_f32(float %arg0) {
; HSA-LABEL: @kern_f32(
; HSA-NEXT:    [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_F32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load float, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    store float [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_f32(
; MESA-NEXT:    [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_F32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load float, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    store float [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
; MESA-NEXT:    ret void
;
  store float %arg0, ptr addrspace(1) undef
  ret void
}

; <3 x i32> argument: loaded as a full <4 x i32> then narrowed with a
; shufflevector to the first three lanes.
define amdgpu_kernel void @kern_v3i32(<3 x i32> %arg0) {
; HSA-LABEL: @kern_v3i32(
; HSA-NEXT:    [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V3I32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
; HSA-NEXT:    store <3 x i32> [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_v3i32(
; MESA-NEXT:    [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V3I32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
; MESA-NEXT:    store <3 x i32> [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
; MESA-NEXT:    ret void
;
  store <3 x i32> %arg0, ptr addrspace(1) undef, align 4
  ret void
}

; <8 x i32>: 32-byte argument raises segment alignment to 32.
define amdgpu_kernel void @kern_v8i32(<8 x i32> %arg) #0 {
; HSA-LABEL: @kern_v8i32(
; HSA-NEXT:    [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V8I32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i32>, ptr addrspace(4) [[ARG_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    store <8 x i32> [[ARG_LOAD]], ptr addrspace(1) undef, align 32
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_v8i32(
; MESA-NEXT:    [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V8I32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i32>, ptr addrspace(4) [[ARG_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    store <8 x i32> [[ARG_LOAD]], ptr addrspace(1) undef, align 32
; MESA-NEXT:    ret void
;
  store <8 x i32> %arg, ptr addrspace(1) undef
  ret void
}

; <8 x i64>: 64-byte argument raises segment alignment to 64.
define amdgpu_kernel void @kern_v8i64(<8 x i64> %arg) #0 {
; HSA-LABEL: @kern_v8i64(
; HSA-NEXT:    [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(320) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V8I64_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i64>, ptr addrspace(4) [[ARG_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    store <8 x i64> [[ARG_LOAD]], ptr addrspace(1) undef, align 64
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_v8i64(
; MESA-NEXT:    [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(320) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V8I64_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i64>, ptr addrspace(4) [[ARG_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    store <8 x i64> [[ARG_LOAD]], ptr addrspace(1) undef, align 64
; MESA-NEXT:    ret void
;
  store <8 x i64> %arg, ptr addrspace(1) undef
  ret void
}

; <16 x i64>: 128-byte argument raises segment alignment to 128.
define amdgpu_kernel void @kern_v16i64(<16 x i64> %arg) #0 {
; HSA-LABEL: @kern_v16i64(
; HSA-NEXT:    [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(384) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V16I64_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <16 x i64>, ptr addrspace(4) [[ARG_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    store <16 x i64> [[ARG_LOAD]], ptr addrspace(1) undef, align 128
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_v16i64(
; MESA-NEXT:    [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(384) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_V16I64_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <16 x i64>, ptr addrspace(4) [[ARG_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    store <16 x i64> [[ARG_LOAD]], ptr addrspace(1) undef, align 128
; MESA-NEXT:    ret void
;
  store <16 x i64> %arg, ptr addrspace(1) undef
  ret void
}

; Mixed i32 + <3 x i32>: the vector is padded to offset 16 (52 for MESA)
; and widened to a <4 x i32> load plus shufflevector.
define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) {
; HSA-LABEL: @kern_i32_v3i32(
; HSA-NEXT:    [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 16
; HSA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
; HSA-NEXT:    store i32 [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
; HSA-NEXT:    store <3 x i32> [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_i32_v3i32(
; MESA-NEXT:    [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 52
; MESA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
; MESA-NEXT:    store i32 [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
; MESA-NEXT:    store <3 x i32> [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
; MESA-NEXT:    ret void
;
  store i32 %arg0, ptr addrspace(1) undef
  store <3 x i32> %arg1, ptr addrspace(1) undef, align 4
  ret void
}

; Aggregate types used by the struct-argument tests below.
%struct.a = type { i32, i8, [4 x i8] }
%struct.b.packed = type { i8, i32, [3 x i16], <2 x double> }

; Struct argument: loaded as the whole aggregate from the kernarg segment.
define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) {
; HSA-LABEL: @kern_struct_a(
; HSA-NEXT:    [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_A:%.*]], ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    store [[STRUCT_A]] [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_struct_a(
; MESA-NEXT:    [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_A:%.*]], ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    store [[STRUCT_A]] [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
; MESA-NEXT:    ret void
;
  store %struct.a %arg0, ptr addrspace(1) undef
  ret void
}

; Struct with vector member: also loaded as a single aggregate.
define amdgpu_kernel void @kern_struct_b_packed(%struct.b.packed %arg0) #0 {
; HSA-LABEL: @kern_struct_b_packed(
; HSA-NEXT:    [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED:%.*]], ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    store [[STRUCT_B_PACKED]] [[ARG0_LOAD]], ptr addrspace(1) undef, align 16
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_struct_b_packed(
; MESA-NEXT:    [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED:%.*]], ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    store [[STRUCT_B_PACKED]] [[ARG0_LOAD]], ptr addrspace(1) undef, align 16
; MESA-NEXT:    ret void
;
  store %struct.b.packed %arg0, ptr addrspace(1) undef
  ret void
}

; Attribute set #1 shrinks the implicit-arg area, reducing the segment's
; dereferenceable size (48 HSA / 44 MESA).
define amdgpu_kernel void @kern_implicit_arg_num_bytes(i32 %arg0) #1 {
; HSA-LABEL: @kern_implicit_arg_num_bytes(
; HSA-NEXT:    [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    store i32 [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_implicit_arg_num_bytes(
; MESA-NEXT:    [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    store i32 [[ARG0_LOAD]], ptr addrspace(1) undef, align 4
; MESA-NEXT:    ret void
;
  store i32 %arg0, ptr addrspace(1) undef
  ret void
}

; Large leading (unnamed) vector arg pushes %arg1 to offset 64 (100 MESA)
; and forces align-64 on the segment.
define amdgpu_kernel void @kernel_implicitarg_no_struct_align(<16 x i32>, i32 %arg1) #1 {
; HSA-LABEL: @kernel_implicitarg_no_struct_align(
; HSA-NEXT:    [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(112) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 64
; HSA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    store i32 [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kernel_implicitarg_no_struct_align(
; MESA-NEXT:    [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(108) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 100
; MESA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    store i32 [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
; MESA-NEXT:    ret void
;
  store i32 %arg1, ptr addrspace(1) undef
  ret void
}

; An LDS (addrspace(3)) pointer argument is lowered to an ordinary load from
; the kernarg segment (offset 0 on HSA, 36 on MESA).
define amdgpu_kernel void @kern_lds_ptr(ptr addrspace(3) %lds) #0 {
; HSA-LABEL: @kern_lds_ptr(
; HSA-NEXT:    [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[LDS_LOAD:%.*]] = load ptr addrspace(3), ptr addrspace(4) [[LDS_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    store i32 0, ptr addrspace(3) [[LDS_LOAD]], align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_lds_ptr(
; MESA-NEXT:    [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[LDS_LOAD:%.*]] = load ptr addrspace(3), ptr addrspace(4) [[LDS_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    store i32 0, ptr addrspace(3) [[LDS_LOAD]], align 4
; MESA-NEXT:    ret void
;
  store i32 0, ptr addrspace(3) %lds, align 4
  ret void
}
501
; Same LDS-pointer kernel but with attribute set #2: here the argument is used
; directly and no kernarg load is emitted, so both run lines share one GCN
; check block. NOTE(review): presumably #2 selects a subtarget (SI-style) that
; keeps the argument preloaded — the attribute body is outside this chunk.
define amdgpu_kernel void @kern_lds_ptr_si(ptr addrspace(3) %lds) #2 {
; GCN-LABEL: @kern_lds_ptr_si(
; GCN-NEXT:    [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; GCN-NEXT:    store i32 0, ptr addrspace(3) [[LDS:%.*]], align 4
; GCN-NEXT:    ret void
;
  store i32 0, ptr addrspace(3) %lds, align 4
  ret void
}
511
; Two i8 arguments share a single dword slot: each is recovered with an
; align-down i32 load, and the second is extracted with lshr 8 before trunc.
define amdgpu_kernel void @kern_realign_i8_i8(i8 %arg0, i8 %arg1) #0 {
; HSA-LABEL: @kern_realign_i8_i8(
; HSA-NEXT:    [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
; HSA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_realign_i8_i8(
; MESA-NEXT:    [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
; MESA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store volatile i8 %arg0, ptr addrspace(1) undef
  store volatile i8 %arg1, ptr addrspace(1) undef
  ret void
}
543
; Three i8 arguments packed in one dword: extracted with lshr by 0/8/16 bits
; from repeated loads of the same align-down offset.
define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #0 {
; HSA-LABEL: @kern_realign_i8_i8_i8(
; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
; HSA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i8 [[TMP8]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_realign_i8_i8_i8(
; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
; MESA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i8 [[TMP8]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store volatile i8 %arg0, ptr addrspace(1) undef
  store volatile i8 %arg1, ptr addrspace(1) undef
  store volatile i8 %arg2, ptr addrspace(1) undef
  ret void
}
586
; Four i8 arguments fill a full dword: extracted with lshr by 0/8/16/24 bits.
define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) #0 {
; HSA-LABEL: @kern_realign_i8_i8_i8_i8(
; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(4) [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
; HSA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i8 [[TMP8]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i8 [[TMP11]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_realign_i8_i8_i8_i8(
; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(4) [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
; MESA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i8 [[TMP8]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i8 [[TMP11]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store volatile i8 %arg0, ptr addrspace(1) undef
  store volatile i8 %arg1, ptr addrspace(1) undef
  store volatile i8 %arg2, ptr addrspace(1) undef
  store volatile i8 %arg3, ptr addrspace(1) undef
  ret void
}
640
; A <3 x i8> after an i8 starts at the next dword (offset 4 / 40) and is
; recovered by loading i32, truncating to i24, and bitcasting to <3 x i8>.
define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 {
; HSA-LABEL: @kern_realign_i8_v3i8(
; HSA-NEXT:    [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 4
; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
; HSA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_realign_i8_v3i8(
; MESA-NEXT:    [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 40
; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
; MESA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
; MESA-NEXT:    ret void
;
  store volatile i8 %arg0, ptr addrspace(1) undef
  store volatile <3 x i8> %arg1, ptr addrspace(1) undef
  ret void
}
672
; i8 followed by i16: the i16 sits in the upper half of the same dword and is
; extracted with lshr 16 then trunc to i16.
define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 {
; HSA-LABEL: @kern_realign_i8_i16(
; HSA-NEXT:    [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
; HSA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i16 [[TMP5]], ptr addrspace(1) undef, align 2
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_realign_i8_i16(
; MESA-NEXT:    [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
; MESA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i16 [[TMP5]], ptr addrspace(1) undef, align 2
; MESA-NEXT:    ret void
;
  store volatile i8 %arg0, ptr addrspace(1) undef
  store volatile i16 %arg1, ptr addrspace(1) undef
  ret void
}
704
; i1 arguments are packed like i8s (one byte each): the second is extracted
; with lshr 8 before trunc to i1.
define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 {
; HSA-LABEL: @kern_realign_i1_i1(
; HSA-NEXT:    [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
; HSA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i1 [[TMP5]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_realign_i1_i1(
; MESA-NEXT:    [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
; MESA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i1 [[TMP5]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store volatile i1 %arg0, ptr addrspace(1) undef
  store volatile i1 %arg1, ptr addrspace(1) undef
  ret void
}
736
; Three i1 arguments, one byte slot each: extracted with lshr by 0/8/16 bits.
define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #0 {
; HSA-LABEL: @kern_realign_i1_i1_i1(
; HSA-NEXT:    [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
; HSA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i1 [[TMP5]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i1 [[TMP8]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_realign_i1_i1_i1(
; MESA-NEXT:    [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
; MESA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i1 [[TMP5]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i1 [[TMP8]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store volatile i1 %arg0, ptr addrspace(1) undef
  store volatile i1 %arg1, ptr addrspace(1) undef
  store volatile i1 %arg2, ptr addrspace(1) undef
  ret void
}
779
; Four i1 arguments, one byte slot each: extracted with lshr by 0/8/16/24 bits.
define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3) #0 {
; HSA-LABEL: @kern_realign_i1_i1_i1_i1(
; HSA-NEXT:    [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(4) [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
; HSA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i1 [[TMP5]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i1 [[TMP8]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i1 [[TMP11]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_realign_i1_i1_i1_i1(
; MESA-NEXT:    [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(4) [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
; MESA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i1 [[TMP5]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i1 [[TMP8]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i1 [[TMP11]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store volatile i1 %arg0, ptr addrspace(1) undef
  store volatile i1 %arg1, ptr addrspace(1) undef
  store volatile i1 %arg2, ptr addrspace(1) undef
  store volatile i1 %arg3, ptr addrspace(1) undef
  ret void
}
833
; <3 x i1> after an i1: shifted down 8 bits, truncated to i3, then bitcast to
; the vector (same dword slot as the leading i1, unlike the v3i8 case).
define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 {
; HSA-LABEL: @kern_realign_i1_v3i1(
; HSA-NEXT:    [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i3
; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP5]] to <3 x i1>
; HSA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_realign_i1_v3i1(
; MESA-NEXT:    [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i3
; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP5]] to <3 x i1>
; MESA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store volatile i1 %arg0, ptr addrspace(1) undef
  store volatile <3 x i1> %arg1, ptr addrspace(1) undef
  ret void
}
867
; i1 followed by i16: the i16 occupies the upper half of the dword and is
; extracted with lshr 16 then trunc to i16.
define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 {
; HSA-LABEL: @kern_realign_i1_i16(
; HSA-NEXT:    [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
; HSA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i16 [[TMP5]], ptr addrspace(1) undef, align 2
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_realign_i1_i16(
; MESA-NEXT:    [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
; MESA-NEXT:    store volatile i1 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i16 [[TMP5]], ptr addrspace(1) undef, align 2
; MESA-NEXT:    ret void
;
  store volatile i1 %arg0, ptr addrspace(1) undef
  store volatile i16 %arg1, ptr addrspace(1) undef
  ret void
}
899
; Eight i8 args packed four-per-dword. Each arg is recovered from its dword
; (offset 0 or 4 for HSA; 36 or 40 for MESA) via lshr by byte position + trunc.
; %arg4 is intentionally never stored, so no load is emitted for it.
define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4, i8 %arg5, i8 %arg6, i8 %arg7) #0 {
; HSA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(4) [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
; HSA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
; HSA-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(4) [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP13:%.*]] = lshr i32 [[TMP12]], 8
; HSA-NEXT:    [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
; HSA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
; HSA-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(4) [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP16:%.*]] = lshr i32 [[TMP15]], 16
; HSA-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
; HSA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
; HSA-NEXT:    [[TMP18:%.*]] = load i32, ptr addrspace(4) [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP18]], 24
; HSA-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
; HSA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i8 [[TMP8]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i8 [[TMP11]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i8 [[TMP14]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i8 [[TMP17]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    store volatile i8 [[TMP20]], ptr addrspace(1) undef, align 1
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(4) [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
; MESA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
; MESA-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(4) [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP13:%.*]] = lshr i32 [[TMP12]], 8
; MESA-NEXT:    [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
; MESA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
; MESA-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(4) [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP16:%.*]] = lshr i32 [[TMP15]], 16
; MESA-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
; MESA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
; MESA-NEXT:    [[TMP18:%.*]] = load i32, ptr addrspace(4) [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP18]], 24
; MESA-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
; MESA-NEXT:    store volatile i8 [[TMP2]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i8 [[TMP5]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i8 [[TMP8]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i8 [[TMP11]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i8 [[TMP14]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i8 [[TMP17]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    store volatile i8 [[TMP20]], ptr addrspace(1) undef, align 1
; MESA-NEXT:    ret void
;
  store volatile i8 %arg0, ptr addrspace(1) undef
  store volatile i8 %arg1, ptr addrspace(1) undef
  store volatile i8 %arg2, ptr addrspace(1) undef
  store volatile i8 %arg3, ptr addrspace(1) undef
  store volatile i8 %arg5, ptr addrspace(1) undef
  store volatile i8 %arg6, ptr addrspace(1) undef
  store volatile i8 %arg7, ptr addrspace(1) undef
  ret void
}
986
; Two half args packed into one dword: each is extracted as an i16
; (the second via lshr 16) and then bitcast back to half.
define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 {
; HSA-LABEL: @kern_realign_f16_f16(
; HSA-NEXT:    [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; HSA-NEXT:    [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
; HSA-NEXT:    store volatile half [[ARG0_LOAD]], ptr addrspace(1) undef, align 2
; HSA-NEXT:    store volatile half [[ARG1_LOAD]], ptr addrspace(1) undef, align 2
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_realign_f16_f16(
; MESA-NEXT:    [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; MESA-NEXT:    [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
; MESA-NEXT:    store volatile half [[ARG0_LOAD]], ptr addrspace(1) undef, align 2
; MESA-NEXT:    store volatile half [[ARG1_LOAD]], ptr addrspace(1) undef, align 2
; MESA-NEXT:    ret void
;
  store volatile half %arg0, ptr addrspace(1) undef
  store volatile half %arg1, ptr addrspace(1) undef
  ret void
}
1022
; A plain global pointer arg is loaded directly at its natural slot
; with no extra metadata beyond !invariant.load.
define amdgpu_kernel void @kern_global_ptr(ptr addrspace(1) %ptr) #0 {
; HSA-LABEL: @kern_global_ptr(
; HSA-NEXT:    [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_global_ptr(
; MESA-NEXT:    [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
; MESA-NEXT:    ret void
;
  store volatile ptr addrspace(1) %ptr, ptr addrspace(1) undef
  ret void
}
1041
; The dereferenceable(42) parameter attribute is carried over onto the
; kernarg load as !dereferenceable metadata. (Function name spells it
; "dereferencable" [sic]; kept as-is to match the CHECK lines.)
define amdgpu_kernel void @kern_global_ptr_dereferencable(ptr addrspace(1) dereferenceable(42) %ptr) #0 {
; HSA-LABEL: @kern_global_ptr_dereferencable(
; HSA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load [[META1]], !dereferenceable [[META3:![0-9]+]]
; HSA-NEXT:    store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_global_ptr_dereferencable(
; MESA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load [[META1]], !dereferenceable [[META3:![0-9]+]]
; MESA-NEXT:    store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
; MESA-NEXT:    ret void
;
  store volatile ptr addrspace(1) %ptr, ptr addrspace(1) undef
  ret void
}
1060
; dereferenceable_or_null(128) on the arg becomes !dereferenceable_or_null
; metadata on the kernarg load.
define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(ptr addrspace(1) dereferenceable_or_null(128) %ptr) #0 {
; HSA-LABEL: @kern_global_ptr_dereferencable_or_null(
; HSA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load [[META1]], !dereferenceable_or_null [[META4:![0-9]+]]
; HSA-NEXT:    store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_global_ptr_dereferencable_or_null(
; MESA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load [[META1]], !dereferenceable_or_null [[META4:![0-9]+]]
; MESA-NEXT:    store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
; MESA-NEXT:    ret void
;
  store volatile ptr addrspace(1) %ptr, ptr addrspace(1) undef
  ret void
}
1079
; The nonnull parameter attribute becomes !nonnull metadata on the load.
define amdgpu_kernel void @kern_nonnull_global_ptr(ptr addrspace(1) nonnull %ptr) #0 {
; HSA-LABEL: @kern_nonnull_global_ptr(
; HSA-NEXT:    [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load [[META1]], !nonnull [[META1]]
; HSA-NEXT:    store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_nonnull_global_ptr(
; MESA-NEXT:    [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load [[META1]], !nonnull [[META1]]
; MESA-NEXT:    store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
; MESA-NEXT:    ret void
;
  store volatile ptr addrspace(1) %ptr, ptr addrspace(1) undef
  ret void
}
1098
; The align parameter attribute becomes !align metadata on the load.
; NOTE(review): the function name says align32 but the attribute is
; align 1024 — likely a stale name; behavior follows the attribute.
define amdgpu_kernel void @kern_align32_global_ptr(ptr addrspace(1) align 1024 %ptr) #0 {
; HSA-LABEL: @kern_align32_global_ptr(
; HSA-NEXT:    [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load [[META1]], !align [[META5:![0-9]+]]
; HSA-NEXT:    store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_align32_global_ptr(
; MESA-NEXT:    [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load [[META1]], !align [[META5:![0-9]+]]
; MESA-NEXT:    store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) undef, align 8
; MESA-NEXT:    ret void
;
  store volatile ptr addrspace(1) %ptr, ptr addrspace(1) undef
  ret void
}
1117
; noalias pointer args are NOT lowered to loads (metadata cannot express
; noalias): the CHECK lines show the argument used directly.
define amdgpu_kernel void @kern_noalias_global_ptr(ptr addrspace(1) noalias %ptr) #0 {
; GCN-LABEL: @kern_noalias_global_ptr(
; GCN-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; GCN-NEXT:    store volatile ptr addrspace(1) [[PTR:%.*]], ptr addrspace(1) undef, align 8
; GCN-NEXT:    ret void
;
  store volatile ptr addrspace(1) %ptr, ptr addrspace(1) undef
  ret void
}
1127
; Same as kern_noalias_global_ptr but with two noalias pointers; neither
; argument is replaced by a kernarg load.
define amdgpu_kernel void @kern_noalias_global_ptr_x2(ptr addrspace(1) noalias %ptr0, ptr addrspace(1) noalias %ptr1) #0 {
; GCN-LABEL: @kern_noalias_global_ptr_x2(
; GCN-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; GCN-NEXT:    store volatile ptr addrspace(1) [[PTR0:%.*]], ptr addrspace(1) undef, align 8
; GCN-NEXT:    store volatile ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(1) undef, align 8
; GCN-NEXT:    ret void
;
  store volatile ptr addrspace(1) %ptr0, ptr addrspace(1) undef
  store volatile ptr addrspace(1) %ptr1, ptr addrspace(1) undef
  ret void
}
1139
; The noundef parameter attribute becomes !noundef metadata on the load.
define amdgpu_kernel void @kern_noundef_global_ptr(ptr addrspace(1) noundef %ptr) #0 {
; HSA-LABEL: @kern_noundef_global_ptr(
; HSA-NEXT:    [[KERN_NOUNDEF_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOUNDEF_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load [[META1]], !noundef [[META1]]
; HSA-NEXT:    store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) null, align 8
; HSA-NEXT:    ret void
;
; MESA-LABEL: @kern_noundef_global_ptr(
; MESA-NEXT:    [[KERN_NOUNDEF_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOUNDEF_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load [[META1]], !noundef [[META1]]
; MESA-NEXT:    store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) null, align 8
; MESA-NEXT:    ret void
;
  store volatile ptr addrspace(1) %ptr, ptr addrspace(1) null
  ret void
}
1158
; A first-class struct arg is loaded whole as { i8, i8 } and its fields
; recovered with extractvalue.
define amdgpu_kernel void @struct_i8_i8_arg({i8, i8} %in) #0 {
; HSA-LABEL: @struct_i8_i8_arg(
; HSA-NEXT:  entry:
; HSA-NEXT:    [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i8 }, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0
; HSA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1
; HSA-NEXT:    store volatile i8 [[ELT0]], ptr addrspace(1) null, align 4
; HSA-NEXT:    store volatile i8 [[ELT1]], ptr addrspace(1) null, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @struct_i8_i8_arg(
; MESA-NEXT:  entry:
; MESA-NEXT:    [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i8 }, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0
; MESA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1
; MESA-NEXT:    store volatile i8 [[ELT0]], ptr addrspace(1) null, align 4
; MESA-NEXT:    store volatile i8 [[ELT1]], ptr addrspace(1) null, align 4
; MESA-NEXT:    ret void
;
entry:
  %elt0 = extractvalue {i8, i8} %in, 0
  %elt1 = extractvalue {i8, i8} %in, 1
  store volatile i8 %elt0, ptr addrspace(1) null, align 4
  store volatile i8 %elt1, ptr addrspace(1) null, align 4
  ret void
}
1189
; Like struct_i8_i8_arg, but with a padded { i8, i16 } struct: still loaded
; whole and split with extractvalue.
define amdgpu_kernel void @struct_i8_i16_arg({i8, i16} %in) #0 {
; HSA-LABEL: @struct_i8_i16_arg(
; HSA-NEXT:  entry:
; HSA-NEXT:    [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i16 }, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0
; HSA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1
; HSA-NEXT:    store volatile i8 [[ELT0]], ptr addrspace(1) null, align 4
; HSA-NEXT:    store volatile i16 [[ELT1]], ptr addrspace(1) null, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @struct_i8_i16_arg(
; MESA-NEXT:  entry:
; MESA-NEXT:    [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i16 }, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0
; MESA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1
; MESA-NEXT:    store volatile i8 [[ELT0]], ptr addrspace(1) null, align 4
; MESA-NEXT:    store volatile i16 [[ELT1]], ptr addrspace(1) null, align 4
; MESA-NEXT:    ret void
;
entry:
  %elt0 = extractvalue {i8, i16} %in, 0
  %elt1 = extractvalue {i8, i16} %in, 1
  store volatile i8 %elt0, ptr addrspace(1) null, align 4
  store volatile i16 %elt1, ptr addrspace(1) null, align 4
  ret void
}
1220
; First-class array args are handled like structs: one whole-value load
; followed by extractvalue per element.
define amdgpu_kernel void @array_2xi8_arg([2 x i8] %in) #0 {
; HSA-LABEL: @array_2xi8_arg(
; HSA-NEXT:  entry:
; HSA-NEXT:    [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i8], ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0
; HSA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1
; HSA-NEXT:    store volatile i8 [[ELT0]], ptr addrspace(1) null, align 4
; HSA-NEXT:    store volatile i8 [[ELT1]], ptr addrspace(1) null, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @array_2xi8_arg(
; MESA-NEXT:  entry:
; MESA-NEXT:    [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i8], ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0
; MESA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1
; MESA-NEXT:    store volatile i8 [[ELT0]], ptr addrspace(1) null, align 4
; MESA-NEXT:    store volatile i8 [[ELT1]], ptr addrspace(1) null, align 4
; MESA-NEXT:    ret void
;
entry:
  %elt0 = extractvalue [2 x i8] %in, 0
  %elt1 = extractvalue [2 x i8] %in, 1
  store volatile i8 %elt0, ptr addrspace(1) null, align 4
  store volatile i8 %elt1, ptr addrspace(1) null, align 4
  ret void
}
1251
; An array of i1 is also loaded whole ([2 x i1]) and split with
; extractvalue — no per-bit repacking like the packed-i1-vector case.
define amdgpu_kernel void @array_2xi1_arg([2 x i1] %in) #0 {
; HSA-LABEL: @array_2xi1_arg(
; HSA-NEXT:  entry:
; HSA-NEXT:    [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i1], ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0
; HSA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1
; HSA-NEXT:    store volatile i1 [[ELT0]], ptr addrspace(1) null, align 4
; HSA-NEXT:    store volatile i1 [[ELT1]], ptr addrspace(1) null, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @array_2xi1_arg(
; MESA-NEXT:  entry:
; MESA-NEXT:    [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i1], ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0
; MESA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1
; MESA-NEXT:    store volatile i1 [[ELT0]], ptr addrspace(1) null, align 4
; MESA-NEXT:    store volatile i1 [[ELT1]], ptr addrspace(1) null, align 4
; MESA-NEXT:    ret void
;
entry:
  %elt0 = extractvalue [2 x i1] %in, 0
  %elt1 = extractvalue [2 x i1] %in, 1
  store volatile i1 %elt0, ptr addrspace(1) null, align 4
  store volatile i1 %elt1, ptr addrspace(1) null, align 4
  ret void
}
1282
; A zero-sized struct arg produces no load at all; only the segment-pointer
; call remains (dereferenceable(256): implicit args only, same for both ABIs).
define amdgpu_kernel void @only_empty_struct({} %empty) #0 {
; GCN-LABEL: @only_empty_struct(
; GCN-NEXT:    [[ONLY_EMPTY_STRUCT_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; GCN-NEXT:    ret void
;
  ret void
}
1290
; A leading zero-sized struct consumes no kernarg space: %arg1 still loads
; from offset 0 (HSA) / 36 (MESA).
define amdgpu_kernel void @empty_struct_with_other({} %empty, i32 %arg1) #0 {
; HSA-LABEL: @empty_struct_with_other(
; HSA-NEXT:    [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    store i32 [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @empty_struct_with_other(
; MESA-NEXT:    [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    store i32 [[ARG1_LOAD]], ptr addrspace(1) undef, align 4
; MESA-NEXT:    ret void
;
  store i32 %arg1, ptr addrspace(1) undef
  ret void
}
1309
1310; Should insert code after the allocas
define amdgpu_kernel void @static_alloca_kern_i32(i32 %arg0) {
; HSA-LABEL: @static_alloca_kern_i32(
; HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
; HSA-NEXT:    [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    store volatile i32 [[ARG0_LOAD]], ptr addrspace(5) [[ALLOCA]], align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @static_alloca_kern_i32(
; MESA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
; MESA-NEXT:    [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    store volatile i32 [[ARG0_LOAD]], ptr addrspace(5) [[ALLOCA]], align 4
; MESA-NEXT:    ret void
;
  ; The CHECK lines verify the kernarg lowering code is emitted AFTER the
  ; static alloca, keeping allocas grouped at the top of the entry block.
  %alloca = alloca i32, addrspace(5)
  store volatile i32 %arg0, ptr addrspace(5) %alloca
  ret void
}
1332
; Make sure we don't break the IR if an alloca depends on the
; kernargs. Here %alloca1's size comes from the lowered kernarg load, so the
; alloca must be emitted after the load rather than hoisted with %alloca0.
define amdgpu_kernel void @dyn_alloca_kernarg_i32(i32 %n) {
; HSA-LABEL: @dyn_alloca_kernarg_i32(
; HSA-NEXT:    [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
; HSA-NEXT:    [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[N_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[N_LOAD:%.*]] = load i32, ptr addrspace(4) [[N_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[ALLOCA1:%.*]] = alloca i32, i32 [[N_LOAD]], align 4, addrspace(5)
; HSA-NEXT:    store volatile i32 0, ptr addrspace(5) [[ALLOCA0]], align 4
; HSA-NEXT:    store volatile i32 1, ptr addrspace(5) [[ALLOCA1]], align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @dyn_alloca_kernarg_i32(
; MESA-NEXT:    [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
; MESA-NEXT:    [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[N_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[N_LOAD:%.*]] = load i32, ptr addrspace(4) [[N_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[ALLOCA1:%.*]] = alloca i32, i32 [[N_LOAD]], align 4, addrspace(5)
; MESA-NEXT:    store volatile i32 0, ptr addrspace(5) [[ALLOCA0]], align 4
; MESA-NEXT:    store volatile i32 1, ptr addrspace(5) [[ALLOCA1]], align 4
; MESA-NEXT:    ret void
;
  %alloca0 = alloca i32, addrspace(5)
  %alloca1 = alloca i32, i32 %n, addrspace(5)
  store volatile i32 0, ptr addrspace(5) %alloca0
  store volatile i32 1, ptr addrspace(5) %alloca1
  ret void
}
1362
; Byref pointers should only be treated as offsets from kernarg: the byref
; argument is replaced by a GEP into the segment (no argument load, and the
; user load keeps its original alignment without !invariant.load).
define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) {
; HSA-LABEL: @byref_constant_i8_arg(
; HSA-NEXT:    [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 8
; HSA-NEXT:    [[IN:%.*]] = load i8, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 1
; HSA-NEXT:    [[EXT:%.*]] = zext i8 [[IN]] to i32
; HSA-NEXT:    store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @byref_constant_i8_arg(
; MESA-NEXT:    [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 44
; MESA-NEXT:    [[IN:%.*]] = load i8, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 1
; MESA-NEXT:    [[EXT:%.*]] = zext i8 [[IN]] to i32
; MESA-NEXT:    store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    ret void
;
  %in = load i8, ptr addrspace(4) %in.byref
  %ext = zext i8 %in to i32
  store i32 %ext, ptr addrspace(1) %out, align 4
  ret void
}
1390
; Same as the i8 case with an i16 byref payload; the byref slot still starts
; at the next pointer-aligned offset (8 for HSA, 44 for MESA).
define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i16) %in.byref) {
; HSA-LABEL: @byref_constant_i16_arg(
; HSA-NEXT:    [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 8
; HSA-NEXT:    [[IN:%.*]] = load i16, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 2
; HSA-NEXT:    [[EXT:%.*]] = zext i16 [[IN]] to i32
; HSA-NEXT:    store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @byref_constant_i16_arg(
; MESA-NEXT:    [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 44
; MESA-NEXT:    [[IN:%.*]] = load i16, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 2
; MESA-NEXT:    [[EXT:%.*]] = zext i16 [[IN]] to i32
; MESA-NEXT:    store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    ret void
;
  %in = load i16, ptr addrspace(4) %in.byref
  %ext = zext i16 %in to i32
  store i32 %ext, ptr addrspace(1) %out, align 4
  ret void
}
1417
; A byref i32 consumes kernarg space: %after.offset is placed 4 bytes past the
; byref slot and is still lowered to a normal invariant load.
define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in.byref, i32 %after.offset) {
; HSA-LABEL: @byref_constant_i32_arg(
; HSA-NEXT:    [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8
; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12
; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; HSA-NEXT:    [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
; HSA-NEXT:    store volatile i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @byref_constant_i32_arg(
; MESA-NEXT:    [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44
; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48
; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; MESA-NEXT:    [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
; MESA-NEXT:    store volatile i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    ret void
;
  %in = load i32, ptr addrspace(4) %in.byref
  store volatile i32 %in, ptr addrspace(1) %out, align 4
  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
  ret void
}
1448
; A 16-byte byref vector is naturally aligned in the kernarg segment
; (offset 16 for HSA, 52 for MESA), and the trailing i32 follows it.
define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) %in.byref, i32 %after.offset) {
; HSA-LABEL: @byref_constant_v4i32_arg(
; HSA-NEXT:    [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(296) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 16
; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 32
; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[IN:%.*]] = load <4 x i32>, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 16
; HSA-NEXT:    store volatile <4 x i32> [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @byref_constant_v4i32_arg(
; MESA-NEXT:    [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(292) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 52
; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 68
; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[IN:%.*]] = load <4 x i32>, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 16
; MESA-NEXT:    store volatile <4 x i32> [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    ret void
;
  %in = load <4 x i32>, ptr addrspace(4) %in.byref
  store volatile <4 x i32> %in, ptr addrspace(1) %out, align 4
  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
  ret void
}
1479
; An explicit align(256) on the byref argument raises the whole segment's
; alignment to 256 and pads the byref slot out to a 256-aligned offset.
define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
; HSA-LABEL: @byref_align_constant_i32_arg(
; HSA-NEXT:    [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(520) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 256
; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 260
; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; HSA-NEXT:    [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
; HSA-NEXT:    store volatile i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @byref_align_constant_i32_arg(
; MESA-NEXT:    [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(520) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 292
; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 296
; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 8, !invariant.load [[META1]]
; MESA-NEXT:    [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
; MESA-NEXT:    store volatile i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    ret void
;
  %in = load i32, ptr addrspace(4) %in.byref
  store volatile i32 %in, ptr addrspace(1) %out, align 4
  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
  ret void
}
1510
; A 64-byte byref vector with no explicit align() uses its natural 64-byte
; alignment; the unnamed i8 before it forces padding up to that boundary.
define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) %in.byref, i32 %after.offset) {
; HSA-LABEL: @byref_natural_align_constant_v16i32_arg(
; HSA-NEXT:    [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(392) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 64
; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 128
; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[IN:%.*]] = load <16 x i32>, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 64
; HSA-NEXT:    store volatile <16 x i32> [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @byref_natural_align_constant_v16i32_arg(
; MESA-NEXT:    [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(388) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 100
; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 164
; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[IN:%.*]] = load <16 x i32>, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 64
; MESA-NEXT:    store volatile <16 x i32> [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    ret void
;
  %in = load <16 x i32>, ptr addrspace(4) %in.byref
  store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4
  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
  ret void
}
1541
; Also accept byref kernel arguments with other global address spaces.
; The kernarg GEP (addrspace(4)) is addrspacecast to the byref pointer's
; declared address space before replacing the argument's uses.
define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) %in.byref) {
; HSA-LABEL: @byref_global_i32_arg(
; HSA-NEXT:    [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 8
; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
; HSA-NEXT:    [[IN:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4
; HSA-NEXT:    store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @byref_global_i32_arg(
; MESA-NEXT:    [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 44
; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
; MESA-NEXT:    [[IN:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4
; MESA-NEXT:    store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    ret void
;
  %in = load i32, ptr addrspace(1) %in.byref
  store i32 %in, ptr addrspace(1) %out, align 4
  ret void
}
1568
; byref with a flat (generic) pointer: the kernarg GEP is addrspacecast from
; addrspace(4) to the generic address space.
define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) %in.byref) {
; HSA-LABEL: @byref_flat_i32_arg(
; HSA-NEXT:    [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 8
; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr
; HSA-NEXT:    [[IN:%.*]] = load i32, ptr [[TMP1]], align 4
; HSA-NEXT:    store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @byref_flat_i32_arg(
; MESA-NEXT:    [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 44
; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr
; MESA-NEXT:    [[IN:%.*]] = load i32, ptr [[TMP1]], align 4
; MESA-NEXT:    store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    ret void
;
  %in = load i32, ptr %in.byref
  store i32 %in, ptr addrspace(1) %out, align 4
  ret void
}
1594
; byref with a 32-bit constant pointer (addrspace(6)); lowered with an
; addrspacecast from the addrspace(4) kernarg GEP.
define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) %in.byref) {
; HSA-LABEL: @byref_constant_32bit_i32_arg(
; HSA-NEXT:    [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 8
; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(6)
; HSA-NEXT:    [[IN:%.*]] = load i32, ptr addrspace(6) [[TMP1]], align 4
; HSA-NEXT:    store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @byref_constant_32bit_i32_arg(
; MESA-NEXT:    [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 44
; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(6)
; MESA-NEXT:    [[IN:%.*]] = load i32, ptr addrspace(6) [[TMP1]], align 4
; MESA-NEXT:    store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    ret void
;
  %in = load i32, ptr addrspace(6) %in.byref
  store i32 %in, ptr addrspace(1) %out, align 4
  ret void
}
1620
; byref with an unknown/target-unreserved address space (999) is still lowered
; via an addrspacecast from the kernarg GEP rather than rejected.
define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(999) byref(i32) %in.byref) {
; HSA-LABEL: @byref_unknown_as_i32_arg(
; HSA-NEXT:    [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 8
; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(999)
; HSA-NEXT:    [[IN:%.*]] = load i32, ptr addrspace(999) [[TMP1]], align 4
; HSA-NEXT:    store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @byref_unknown_as_i32_arg(
; MESA-NEXT:    [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 44
; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(999)
; MESA-NEXT:    [[IN:%.*]] = load i32, ptr addrspace(999) [[TMP1]], align 4
; MESA-NEXT:    store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    ret void
;
  %in = load i32, ptr addrspace(999) %in.byref
  store i32 %in, ptr addrspace(1) %out, align 4
  ret void
}
1646
; Invalid, but should not crash. A byref pointing at LDS (addrspace(3)) makes
; no sense, but the pass still emits the same GEP + addrspacecast pattern.
define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(3) byref(i32) %in.byref) {
; HSA-LABEL: @byref_local_i32_arg(
; HSA-NEXT:    [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 8
; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(3)
; HSA-NEXT:    [[IN:%.*]] = load i32, ptr addrspace(3) [[TMP1]], align 4
; HSA-NEXT:    store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @byref_local_i32_arg(
; MESA-NEXT:    [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(268) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 44
; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to ptr addrspace(3)
; MESA-NEXT:    [[IN:%.*]] = load i32, ptr addrspace(3) [[TMP1]], align 4
; MESA-NEXT:    store i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    ret void
;
  %in = load i32, ptr addrspace(3) %in.byref
  store i32 %in, ptr addrspace(1) %out, align 4
  ret void
}
1673
; Multiple byref arguments get consecutive slots (8/12 for HSA, 44/48 for
; MESA), and the ordinary i32 after them is still lowered to an invariant load.
define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in0.byref, ptr addrspace(4) byref(i32) %in1.byref, i32 %after.offset) {
; HSA-LABEL: @multi_byref_constant_i32_arg(
; HSA-NEXT:    [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[IN0_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8
; HSA-NEXT:    [[IN1_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12
; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 16
; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 16, !invariant.load [[META1]]
; HSA-NEXT:    [[IN0:%.*]] = load i32, ptr addrspace(4) [[IN0_BYREF_BYVAL_KERNARG_OFFSET]], align 4
; HSA-NEXT:    [[IN1:%.*]] = load i32, ptr addrspace(4) [[IN1_BYREF_BYVAL_KERNARG_OFFSET]], align 4
; HSA-NEXT:    store volatile i32 [[IN0]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    store volatile i32 [[IN1]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @multi_byref_constant_i32_arg(
; MESA-NEXT:    [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(276) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[IN0_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44
; MESA-NEXT:    [[IN1_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48
; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 52
; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META1]]
; MESA-NEXT:    [[IN0:%.*]] = load i32, ptr addrspace(4) [[IN0_BYREF_BYVAL_KERNARG_OFFSET]], align 4
; MESA-NEXT:    [[IN1:%.*]] = load i32, ptr addrspace(4) [[IN1_BYREF_BYVAL_KERNARG_OFFSET]], align 4
; MESA-NEXT:    store volatile i32 [[IN0]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    store volatile i32 [[IN1]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; MESA-NEXT:    ret void
;
  %in0 = load i32, ptr addrspace(4) %in0.byref
  %in1 = load i32, ptr addrspace(4) %in1.byref
  store volatile i32 %in0, ptr addrspace(1) %out, align 4
  store volatile i32 %in1, ptr addrspace(1) %out, align 4
  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
  ret void
}
1712
; A byref(i32) argument at the very start of the explicit kernarg segment is
; accessed in place through a GEP into the segment pointer (HSA: offset 0;
; MESA: offset 36 past the implicit preamble) with no copy and no
; !invariant.load on the byref load.
define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) %in.byref) {
; HSA-LABEL: @byref_constant_i32_arg_offset0(
; HSA-NEXT:    [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
; HSA-NEXT:    store i32 [[IN]], ptr addrspace(1) undef, align 4
; HSA-NEXT:    ret void
;
; MESA-LABEL: @byref_constant_i32_arg_offset0(
; MESA-NEXT:    [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4
; MESA-NEXT:    store i32 [[IN]], ptr addrspace(1) undef, align 4
; MESA-NEXT:    ret void
;
  %in = load i32, ptr addrspace(4) %in.byref
  store i32 %in, ptr addrspace(1) undef, align 4
  ret void
}
1732
; The noundef parameter attribute on a naturally-sized (32-bit) float argument
; is propagated as !noundef metadata on the generated kernarg load, alongside
; !invariant.load.
define amdgpu_kernel void @noundef_f32(float noundef %arg0) {
; HSA-LABEL: @noundef_f32(
; HSA-NEXT:    [[NOUNDEF_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NOUNDEF_F32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load float, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META1]], !noundef [[META1]]
; HSA-NEXT:    call void (...) @llvm.fake.use(float [[ARG0_LOAD]])
; HSA-NEXT:    ret void
;
; MESA-LABEL: @noundef_f32(
; MESA-NEXT:    [[NOUNDEF_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NOUNDEF_F32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load float, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load [[META1]], !noundef [[META1]]
; MESA-NEXT:    call void (...) @llvm.fake.use(float [[ARG0_LOAD]])
; MESA-NEXT:    ret void
;
  call void (...) @llvm.fake.use(float %arg0)
  ret void
}
1751
; A sub-dword half argument is loaded as an aligned-down i32, then truncated
; to i16 and bitcast to half. !noundef (and !invariant.load) must be preserved
; on the widened i32 load.
define amdgpu_kernel void @noundef_f16(half noundef %arg0) {
; HSA-LABEL: @noundef_f16(
; HSA-NEXT:    [[NOUNDEF_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NOUNDEF_F16_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META1]], !noundef [[META1]]
; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; HSA-NEXT:    [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
; HSA-NEXT:    call void (...) @llvm.fake.use(half [[ARG0_LOAD]])
; HSA-NEXT:    ret void
;
; MESA-LABEL: @noundef_f16(
; MESA-NEXT:    [[NOUNDEF_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(260) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NOUNDEF_F16_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load [[META1]], !noundef [[META1]]
; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; MESA-NEXT:    [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
; MESA-NEXT:    call void (...) @llvm.fake.use(half [[ARG0_LOAD]])
; MESA-NEXT:    ret void
;
  call void (...) @llvm.fake.use(half %arg0)
  ret void
}
1774
; noundef on a vector argument (<2 x i32>, a full 8-byte slot) is propagated
; as !noundef metadata on the direct vector kernarg load.
define amdgpu_kernel void @noundef_v2i32(<2 x i32> noundef %arg0) {
; HSA-LABEL: @noundef_v2i32(
; HSA-NEXT:    [[NOUNDEF_V2I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NOUNDEF_V2I32_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load <2 x i32>, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META1]], !noundef [[META1]]
; HSA-NEXT:    call void (...) @llvm.fake.use(<2 x i32> [[ARG0_LOAD]])
; HSA-NEXT:    ret void
;
; MESA-LABEL: @noundef_v2i32(
; MESA-NEXT:    [[NOUNDEF_V2I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NOUNDEF_V2I32_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load <2 x i32>, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load [[META1]], !noundef [[META1]]
; MESA-NEXT:    call void (...) @llvm.fake.use(<2 x i32> [[ARG0_LOAD]])
; MESA-NEXT:    ret void
;
  call void (...) @llvm.fake.use(<2 x i32> %arg0)
  ret void
}
1793
; noundef on a plain pointer argument is propagated as !noundef metadata on
; the kernarg load of the pointer value.
define amdgpu_kernel void @noundef_p0(ptr noundef %arg0) {
; HSA-LABEL: @noundef_p0(
; HSA-NEXT:    [[NOUNDEF_P0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NOUNDEF_P0_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load ptr, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META1]], !noundef [[META1]]
; HSA-NEXT:    call void (...) @llvm.fake.use(ptr [[ARG0_LOAD]])
; HSA-NEXT:    ret void
;
; MESA-LABEL: @noundef_p0(
; MESA-NEXT:    [[NOUNDEF_P0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NOUNDEF_P0_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load ptr, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load [[META1]], !noundef [[META1]]
; MESA-NEXT:    call void (...) @llvm.fake.use(ptr [[ARG0_LOAD]])
; MESA-NEXT:    ret void
;
  call void (...) @llvm.fake.use(ptr %arg0)
  ret void
}
1812
; noundef on a vector-of-pointers argument (<2 x ptr>, a 16-byte slot — note
; the larger dereferenceable(272)) is propagated as !noundef metadata on the
; kernarg load.
define amdgpu_kernel void @noundef_v2p0(<2 x ptr> noundef %arg0) {
; HSA-LABEL: @noundef_v2p0(
; HSA-NEXT:    [[NOUNDEF_V2P0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NOUNDEF_V2P0_KERNARG_SEGMENT]], i64 0
; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load <2 x ptr>, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META1]], !noundef [[META1]]
; HSA-NEXT:    call void (...) @llvm.fake.use(<2 x ptr> [[ARG0_LOAD]])
; HSA-NEXT:    ret void
;
; MESA-LABEL: @noundef_v2p0(
; MESA-NEXT:    [[NOUNDEF_V2P0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NOUNDEF_V2P0_KERNARG_SEGMENT]], i64 36
; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load <2 x ptr>, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load [[META1]], !noundef [[META1]]
; MESA-NEXT:    call void (...) @llvm.fake.use(<2 x ptr> [[ARG0_LOAD]])
; MESA-NEXT:    ret void
;
  call void (...) @llvm.fake.use(<2 x ptr> %arg0)
  ret void
}
1831
1832attributes #0 = { nounwind "target-cpu"="kaveri" }
1833attributes #1 = { nounwind "target-cpu"="kaveri" "amdgpu-implicitarg-num-bytes"="40" }
1834attributes #2 = { nounwind "target-cpu"="tahiti" }
1835
1836
1837!llvm.module.flags = !{!0}
1838!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
1839;.
1840; HSA: attributes #[[ATTR0:[0-9]+]] = { nounwind }
1841; HSA: attributes #[[ATTR1:[0-9]+]] = { nounwind "target-cpu"="kaveri" }
1842; HSA: attributes #[[ATTR2:[0-9]+]] = { nounwind "amdgpu-implicitarg-num-bytes"="40" "target-cpu"="kaveri" }
1843; HSA: attributes #[[ATTR3:[0-9]+]] = { nounwind "target-cpu"="tahiti" }
1844; HSA: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
1845;.
1846; MESA: attributes #[[ATTR0:[0-9]+]] = { nounwind }
1847; MESA: attributes #[[ATTR1:[0-9]+]] = { nounwind "target-cpu"="kaveri" }
1848; MESA: attributes #[[ATTR2:[0-9]+]] = { nounwind "amdgpu-implicitarg-num-bytes"="40" "target-cpu"="kaveri" }
1849; MESA: attributes #[[ATTR3:[0-9]+]] = { nounwind "target-cpu"="tahiti" }
1850; MESA: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
1851;.
1852; HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
1853; HSA: [[META1]] = !{}
1854; HSA: [[RNG2]] = !{i32 0, i32 8}
1855; HSA: [[META3]] = !{i64 42}
1856; HSA: [[META4]] = !{i64 128}
1857; HSA: [[META5]] = !{i64 1024}
1858;.
1859; MESA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
1860; MESA: [[META1]] = !{}
1861; MESA: [[RNG2]] = !{i32 0, i32 8}
1862; MESA: [[META3]] = !{i64 42}
1863; MESA: [[META4]] = !{i64 128}
1864; MESA: [[META5]] = !{i64 1024}
1865;.
1866