; Source: llvm-project/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll (revision f0415f2a456d54daaa231c228d2c9f4ef2ce9b89)
; RUN: opt -mtriple=amdgcn-amd-amdhsa --mcpu=hawaii -passes=load-store-vectorizer -S -o - %s | FileCheck %s
; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions

; Data layout string used by every function in this file.
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted
9
10
; Two adjacent i8 constant stores to global memory, written high address
; first; only the base store carries an explicit "align 2".
; Expected: a single <2 x i8> store with align 2.
define amdgpu_kernel void @merge_global_store_2_constants_i8(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_2_constants_i8(
; CHECK-NEXT:    store <2 x i8> <i8 -56, i8 123>, ptr addrspace(1) [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1

  store i8 123, ptr addrspace(1) %out.gep.1
  ; 456 does not fit in 8 bits; the expected output above shows it as -56.
  store i8 456, ptr addrspace(1) %out, align 2
  ret void
}
22
; Same as the i8 pair above but with no explicit alignment on either store.
; Expected: a single <2 x i8> store with align 1.
define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align(
; CHECK-NEXT:    store <2 x i8> <i8 -56, i8 123>, ptr addrspace(1) [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1

  store i8 123, ptr addrspace(1) %out.gep.1
  ; 456 does not fit in 8 bits; the expected output above shows it as -56.
  store i8 456, ptr addrspace(1) %out
  ret void
}
34
; Two adjacent i16 constant stores, high address first; the base store is
; marked align 4. Expected: a single <2 x i16> store with align 4.
define amdgpu_kernel void @merge_global_store_2_constants_i16(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_2_constants_i16(
; CHECK-NEXT:    store <2 x i16> <i16 456, i16 123>, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1

  store i16 123, ptr addrspace(1) %out.gep.1
  store i16 456, ptr addrspace(1) %out, align 4
  ret void
}
46
; Two adjacent i16 stores of zero; the base store is marked align 4.
; Expected: a single <2 x i16> zeroinitializer store with align 4.
define amdgpu_kernel void @merge_global_store_2_constants_0_i16(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_2_constants_0_i16(
; CHECK-NEXT:    store <2 x i16> zeroinitializer, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1

  store i16 0, ptr addrspace(1) %out.gep.1
  store i16 0, ptr addrspace(1) %out, align 4
  ret void
}
58
; Two adjacent i16 constant stores with no explicit alignment.
; Expected: a single <2 x i16> store with align 2.
define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align(
; CHECK-NEXT:    store <2 x i16> <i16 456, i16 123>, ptr addrspace(1) [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1

  store i16 123, ptr addrspace(1) %out.gep.1
  store i16 456, ptr addrspace(1) %out
  ret void
}
70
; Two adjacent i16 constant stores, both explicitly under-aligned (align 1).
; Expected: still merged into one <2 x i16> store, keeping align 1.
define amdgpu_kernel void @merge_global_store_2_constants_i16_align_1(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_2_constants_i16_align_1(
; CHECK-NEXT:    store <2 x i16> <i16 456, i16 123>, ptr addrspace(1) [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1

  store i16 123, ptr addrspace(1) %out.gep.1, align 1
  store i16 456, ptr addrspace(1) %out, align 1
  ret void
}
82
; Two adjacent half constant stores (2.0 at element 1, 1.0 at element 0) with
; no explicit alignment. Expected: a single <2 x half> store with align 2.
define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align(
; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xH4000>, ptr addrspace(1) [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1

  store half 2.0, ptr addrspace(1) %out.gep.1
  store half 1.0, ptr addrspace(1) %out
  ret void
}
94
; Two adjacent half constant stores, both explicitly align 1.
; Expected: a single <2 x half> store that keeps align 1.
define amdgpu_kernel void @merge_global_store_2_constants_half_align_1(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_2_constants_half_align_1(
; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xH4000>, ptr addrspace(1) [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1

  store half 2.0, ptr addrspace(1) %out.gep.1, align 1
  store half 1.0, ptr addrspace(1) %out, align 1
  ret void
}
106
; Two adjacent i32 constant stores, high address first, no explicit alignment.
; Expected: a single <2 x i32> store with align 4.
define amdgpu_kernel void @merge_global_store_2_constants_i32(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_2_constants_i32(
; CHECK-NEXT:    store <2 x i32> <i32 456, i32 123>, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1

  store i32 123, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out
  ret void
}
118
; Mixed element types: a float constant at element 1 and an i32 constant at
; element 0. Expected: a single <2 x i32> store in which the float appears as
; its bit pattern (1.0 -> 1065353216, i.e. 0x3F800000).
define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_2_constants_i32_f32(
; CHECK-NEXT:    store <2 x i32> <i32 456, i32 1065353216>, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  store float 1.0, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out
  ret void
}
129
; Mixed element types, mirrored: an i32 constant at element 1 and a float
; constant at element 0. Expected: a single <2 x i32> store in which the float
; appears as its bit pattern (4.0 -> 1082130432, i.e. 0x40800000).
define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_2_constants_f32_i32(
; CHECK-NEXT:    store <2 x i32> <i32 1082130432, i32 123>, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  store i32 123, ptr addrspace(1) %out.gep.1
  store float 4.0, ptr addrspace(1) %out
  ret void
}
140
; Four adjacent i32 constant stores; elements 1..3 are written first and the
; base element last. Expected: a single <4 x i32> store with align 4.
define amdgpu_kernel void @merge_global_store_4_constants_i32(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_4_constants_i32(
; CHECK-NEXT:    store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3

  store i32 123, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out.gep.2
  store i32 333, ptr addrspace(1) %out.gep.3
  store i32 1234, ptr addrspace(1) %out
  ret void
}
156
; Four adjacent float constant stores written in ascending address order.
; Expected: a single <4 x float> store with align 4.
define amdgpu_kernel void @merge_global_store_4_constants_f32_order(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_4_constants_f32_order(
; CHECK-NEXT:    store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3

  store float 8.0, ptr addrspace(1) %out
  store float 1.0, ptr addrspace(1) %out.gep.1
  store float 2.0, ptr addrspace(1) %out.gep.2
  store float 4.0, ptr addrspace(1) %out.gep.3
  ret void
}
172
; First store is out of order.
; Same four float constants as the "_order" variant, but the base element is
; written last. Expected: the same single <4 x float> store with align 4.
define amdgpu_kernel void @merge_global_store_4_constants_f32(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_4_constants_f32(
; CHECK-NEXT:    store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3

  store float 1.0, ptr addrspace(1) %out.gep.1
  store float 2.0, ptr addrspace(1) %out.gep.2
  store float 4.0, ptr addrspace(1) %out.gep.3
  store float 8.0, ptr addrspace(1) %out
  ret void
}
189
; Four adjacent 32-bit constant stores alternating between i32 and float.
; Expected: a single <4 x i32> store; the floats appear as their bit patterns
; (8.0 -> 1090519040, 2.0 -> 1073741824).
define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32(
; CHECK-NEXT:    store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3


  store i32 11, ptr addrspace(1) %out.gep.1
  store float 2.0, ptr addrspace(1) %out.gep.2
  store i32 17, ptr addrspace(1) %out.gep.3
  store float 8.0, ptr addrspace(1) %out
  ret void
}
206
; Three adjacent i32 constant stores, base element written last.
; Expected: a single <3 x i32> store with align 4.
define amdgpu_kernel void @merge_global_store_3_constants_i32(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_3_constants_i32(
; CHECK-NEXT:    store <3 x i32> <i32 1234, i32 123, i32 456>, ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2

  store i32 123, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out.gep.2
  store i32 1234, ptr addrspace(1) %out
  ret void
}
220
; Two adjacent i64 constant stores, high address first.
; Expected: a single <2 x i64> store with align 8.
define amdgpu_kernel void @merge_global_store_2_constants_i64(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_2_constants_i64(
; CHECK-NEXT:    store <2 x i64> <i64 456, i64 123>, ptr addrspace(1) [[OUT:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1

  store i64 123, ptr addrspace(1) %out.gep.1
  store i64 456, ptr addrspace(1) %out
  ret void
}
232
; Four adjacent i64 constant stores, base element written last.
; Expected: two <2 x i64> stores (elements 2..3, then elements 0..1) rather
; than one four-element store.
; NOTE(review): presumably split by a target limit on vector store width --
; confirm against the target's cost hooks.
define amdgpu_kernel void @merge_global_store_4_constants_i64(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_4_constants_i64(
; CHECK-NEXT:    [[OUT_GEP_2:%.*]] = getelementptr i64, ptr addrspace(1) [[OUT:%.*]], i64 2
; CHECK-NEXT:    store <2 x i64> <i64 456, i64 333>, ptr addrspace(1) [[OUT_GEP_2]], align 8
; CHECK-NEXT:    store <2 x i64> <i64 1234, i64 123>, ptr addrspace(1) [[OUT]], align 8
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1
  %out.gep.2 = getelementptr i64, ptr addrspace(1) %out, i64 2
  %out.gep.3 = getelementptr i64, ptr addrspace(1) %out, i64 3

  store i64 123, ptr addrspace(1) %out.gep.1
  store i64 456, ptr addrspace(1) %out.gep.2
  store i64 333, ptr addrspace(1) %out.gep.3
  store i64 1234, ptr addrspace(1) %out
  ret void
}
250
; Copies two adjacent i32 values from %in to %out through scalar loads and
; stores. Expected: one <2 x i32> load, extract/insert of both lanes in the
; same order, and one <2 x i32> store.
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    [[LO1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
; CHECK-NEXT:    [[HI2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LO1]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[HI2]], i32 1
; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1

  %lo = load i32, ptr addrspace(1) %in
  %hi = load i32, ptr addrspace(1) %in.gep.1

  store i32 %lo, ptr addrspace(1) %out
  store i32 %hi, ptr addrspace(1) %out.gep.1
  ret void
}
271
; Two-element copy where both the source and destination runs start at element
; index 2 instead of the incoming pointer. Expected: the index-2 GEPs are kept
; as the bases of one <2 x i32> load and one <2 x i32> store.
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32_nonzero_base(
; CHECK-NEXT:    [[IN_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[IN:%.*]], i32 2
; CHECK-NEXT:    [[OUT_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i32 2
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[IN_GEP_0]], align 4
; CHECK-NEXT:    [[LO1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
; CHECK-NEXT:    [[HI2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LO1]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[HI2]], i32 1
; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(1) [[OUT_GEP_0]], align 4
; CHECK-NEXT:    ret void
;
  %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %lo = load i32, ptr addrspace(1) %in.gep.0
  %hi = load i32, ptr addrspace(1) %in.gep.1

  store i32 %lo, ptr addrspace(1) %out.gep.0
  store i32 %hi, ptr addrspace(1) %out.gep.1
  ret void
}
296
; Two-element copy that swaps the lanes: the value loaded from element 1 is
; stored to element 0 and vice versa. Expected: one <2 x i32> load and one
; <2 x i32> store, with the insertelement sequence performing the swap.
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CHECK-LABEL: @merge_global_store_2_adjacent_loads_shuffle_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    [[LO1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
; CHECK-NEXT:    [[HI2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[HI2]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LO1]], i32 1
; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1

  %lo = load i32, ptr addrspace(1) %in
  %hi = load i32, ptr addrspace(1) %in.gep.1

  store i32 %hi, ptr addrspace(1) %out
  store i32 %lo, ptr addrspace(1) %out.gep.1
  ret void
}
317
; Copies four adjacent i32 values from %in to %out in order.
; Expected: one <4 x i32> load, four extracts, four inserts in the same lane
; order, and one <4 x i32> store.
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Y2]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Z3]], i32 2
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[W4]], i32 3
; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  store i32 %x, ptr addrspace(1) %out
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %w, ptr addrspace(1) %out.gep.3
  ret void
}
350
; Copies three adjacent i32 values from %in to %out in order.
; Expected: one <3 x i32> load and one <3 x i32> store, both align 4.
define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    [[X1:%.*]] = extractelement <3 x i32> [[TMP1]], i32 0
; CHECK-NEXT:    [[Y2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 1
; CHECK-NEXT:    [[Z3:%.*]] = extractelement <3 x i32> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <3 x i32> poison, i32 [[X1]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i32> [[TMP2]], i32 [[Y2]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[Z3]], i32 2
; CHECK-NEXT:    store <3 x i32> [[TMP4]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2

  store i32 %x, ptr addrspace(1) %out
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  ret void
}
377
; Float variant of the four-element in-order copy.
; Expected: one <4 x float> load and one <4 x float> store, both align 4.
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> poison, float [[X1]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[Y2]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[Z3]], i32 2
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[W4]], i32 3
; CHECK-NEXT:    store <4 x float> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr float, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr float, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr float, ptr addrspace(1) %in, i32 3

  %x = load float, ptr addrspace(1) %in
  %y = load float, ptr addrspace(1) %in.gep.1
  %z = load float, ptr addrspace(1) %in.gep.2
  %w = load float, ptr addrspace(1) %in.gep.3

  store float %x, ptr addrspace(1) %out
  store float %y, ptr addrspace(1) %out.gep.1
  store float %z, ptr addrspace(1) %out.gep.2
  store float %w, ptr addrspace(1) %out.gep.3
  ret void
}
410
; Four-element copy with different non-zero bases: the source run starts at
; element 11 of %in and the destination run at element 7 of %out.
; Expected: those two GEPs are kept as the bases of one <4 x i32> load and one
; <4 x i32> store.
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base(
; CHECK-NEXT:    [[IN_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[IN:%.*]], i32 11
; CHECK-NEXT:    [[OUT_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i32 7
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN_GEP_0]], align 4
; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Y2]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Z3]], i32 2
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[W4]], i32 3
; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT_GEP_0]], align 4
; CHECK-NEXT:    ret void
;
  %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 11
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 12
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 13
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 14
  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 7
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 8
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 9
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 10

  %x = load i32, ptr addrspace(1) %in.gep.0
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  store i32 %x, ptr addrspace(1) %out.gep.0
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %w, ptr addrspace(1) %out.gep.3
  ret void
}
447
; Four-element copy with a barrier call between the loads and the stores, and
; with the stores issued from the highest address down. Each value still goes
; to its matching element, so the lane order is unchanged.
; Expected: one <4 x i32> load before the barrier and one <4 x i32> store
; after it. The attribute-group number captured here as ATTR3 is reused by the
; expected output of the following shuffle test.
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier() #[[ATTR3:[0-9]+]]
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Y2]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Z3]], i32 2
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[W4]], i32 3
; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, ptr addrspace(1) %out.gep.3
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %x, ptr addrspace(1) %out

  ret void
}
485
; Four-element copy across a barrier call that reverses the lanes: the value
; loaded from element 3 is stored to element 0, and so on.
; Expected: one <4 x i32> load before the barrier, then an insertelement
; sequence in reversed lane order feeding one <4 x i32> store.
; The ATTR3 variable in the expected output is captured by the preceding
; inverse test, so this block's patterns depend on that one being present.
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier() #[[ATTR3]]
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[W4]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Z3]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Y2]], i32 2
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[X1]], i32 3
; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, ptr addrspace(1) %out
  store i32 %z, ptr addrspace(1) %out.gep.1
  store i32 %y, ptr addrspace(1) %out.gep.2
  store i32 %x, ptr addrspace(1) %out.gep.3

  ret void
}
523
; Copies four adjacent i8 values; only the first load and the first store
; carry an explicit "align 4".
; Expected: one <4 x i8> load and one <4 x i8> store, both align 4.
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr addrspace(1) [[IN:%.*]], align 4
; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[X1]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[Y2]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[Z3]], i32 2
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[W4]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
  %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
  %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
  %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
  %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
  %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3

  %x = load i8, ptr addrspace(1) %in, align 4
  %y = load i8, ptr addrspace(1) %in.gep.1
  %z = load i8, ptr addrspace(1) %in.gep.2
  %w = load i8, ptr addrspace(1) %in.gep.3

  store i8 %x, ptr addrspace(1) %out, align 4
  store i8 %y, ptr addrspace(1) %out.gep.1
  store i8 %z, ptr addrspace(1) %out.gep.2
  store i8 %w, ptr addrspace(1) %out.gep.3
  ret void
}
556
; Copies four adjacent i8 values with no explicit alignment anywhere.
; Expected: one <4 x i8> load and one <4 x i8> store, both align 1.
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr addrspace(1) [[IN:%.*]], align 1
; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[X1]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[Y2]], i32 1
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[Z3]], i32 2
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[W4]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
  %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
  %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
  %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
  %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
  %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3

  %x = load i8, ptr addrspace(1) %in
  %y = load i8, ptr addrspace(1) %in.gep.1
  %z = load i8, ptr addrspace(1) %in.gep.2
  %w = load i8, ptr addrspace(1) %in.gep.3

  store i8 %x, ptr addrspace(1) %out
  store i8 %y, ptr addrspace(1) %out.gep.1
  store i8 %z, ptr addrspace(1) %out.gep.2
  store i8 %w, ptr addrspace(1) %out.gep.3
  ret void
}
589
; The input is already a single <4 x i32> load; its four lanes are extracted
; and stored as scalars to adjacent addresses.
; Expected: the load and extracts are unchanged (the load is printed with
; align 16) and the four scalar stores become one <4 x i32> store, align 4.
define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32(
; CHECK-NEXT:    [[VEC:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 16
; CHECK-NEXT:    [[X:%.*]] = extractelement <4 x i32> [[VEC]], i32 0
; CHECK-NEXT:    [[Y:%.*]] = extractelement <4 x i32> [[VEC]], i32 1
; CHECK-NEXT:    [[Z:%.*]] = extractelement <4 x i32> [[VEC]], i32 2
; CHECK-NEXT:    [[W:%.*]] = extractelement <4 x i32> [[VEC]], i32 3
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[Y]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Z]], i32 2
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[W]], i32 3
; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr addrspace(1) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %vec = load <4 x i32>, ptr addrspace(1) %in

  %x = extractelement <4 x i32> %vec, i32 0
  %y = extractelement <4 x i32> %vec, i32 1
  %z = extractelement <4 x i32> %vec, i32 2
  %w = extractelement <4 x i32> %vec, i32 3

  store i32 %x, ptr addrspace(1) %out
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %w, ptr addrspace(1) %out.gep.3
  ret void
}
620
; Address space 3 variant of the two-element i8 constant test; the base store
; is marked align 2. Expected: a single <2 x i8> store with align 2.
define amdgpu_kernel void @merge_local_store_2_constants_i8(ptr addrspace(3) %out) #0 {
; CHECK-LABEL: @merge_local_store_2_constants_i8(
; CHECK-NEXT:    store <2 x i8> <i8 -56, i8 123>, ptr addrspace(3) [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i8, ptr addrspace(3) %out, i32 1

  store i8 123, ptr addrspace(3) %out.gep.1
  ; 456 does not fit in 8 bits; the expected output above shows it as -56.
  store i8 456, ptr addrspace(3) %out, align 2
  ret void
}
632
; Address space 3 variant of the two-element i32 constant test, no explicit
; alignment. Expected: a single <2 x i32> store with align 4.
define amdgpu_kernel void @merge_local_store_2_constants_i32(ptr addrspace(3) %out) #0 {
; CHECK-LABEL: @merge_local_store_2_constants_i32(
; CHECK-NEXT:    store <2 x i32> <i32 456, i32 123>, ptr addrspace(3) [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1

  store i32 123, ptr addrspace(3) %out.gep.1
  store i32 456, ptr addrspace(3) %out
  ret void
}
644
; Negative test: two adjacent i32 constant stores in address space 3, both
; explicitly align 2. Expected: the scalar stores are left as they are.
; NOTE(review): presumably rejected because of the under-aligned local access
; on this subtarget -- confirm against the target's legality hooks.
define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(ptr addrspace(3) %out) #0 {
; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2(
; CHECK-NEXT:    [[OUT_GEP_1:%.*]] = getelementptr i32, ptr addrspace(3) [[OUT:%.*]], i32 1
; CHECK-NEXT:    store i32 123, ptr addrspace(3) [[OUT_GEP_1]], align 2
; CHECK-NEXT:    store i32 456, ptr addrspace(3) [[OUT]], align 2
; CHECK-NEXT:    ret void
;
  %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1

  store i32 123, ptr addrspace(3) %out.gep.1, align 2
  store i32 456, ptr addrspace(3) %out, align 2
  ret void
}
658
; Four constant i32 stores covering slots 0..3 of an addrspace(3) pointer,
; issued in the order 1, 2, 3, 0 with no explicit alignment. The expected
; output is two <2 x i32> stores with align 4: slots 2-3 through a GEP at
; index 2, and slots 0-1 at %out. A single <4 x i32> store is not expected.
; NOTE(review): the split into two halves is presumably a target limit for
; 4-byte-aligned accesses in this address space -- confirm.
659define amdgpu_kernel void @merge_local_store_4_constants_i32(ptr addrspace(3) %out) #0 {
660; CHECK-LABEL: @merge_local_store_4_constants_i32(
661; CHECK-NEXT:    [[OUT_GEP_2:%.*]] = getelementptr i32, ptr addrspace(3) [[OUT:%.*]], i32 2
662; CHECK-NEXT:    store <2 x i32> <i32 456, i32 333>, ptr addrspace(3) [[OUT_GEP_2]], align 4
663; CHECK-NEXT:    store <2 x i32> <i32 1234, i32 123>, ptr addrspace(3) [[OUT]], align 4
664; CHECK-NEXT:    ret void
665;
666  %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1
667  %out.gep.2 = getelementptr i32, ptr addrspace(3) %out, i32 2
668  %out.gep.3 = getelementptr i32, ptr addrspace(3) %out, i32 3
669
; Slot 0 is deliberately written last.
670  store i32 123, ptr addrspace(3) %out.gep.1
671  store i32 456, ptr addrspace(3) %out.gep.2
672  store i32 333, ptr addrspace(3) %out.gep.3
673  store i32 1234, ptr addrspace(3) %out
674  ret void
675}
676
; Five constant i32 stores to consecutive slots 0..4 of an addrspace(1)
; pointer, in ascending order, each with align 4. The expected output merges
; slots 0-3 into one <4 x i32> store with align 4 and leaves slot 4 as a
; scalar store through the original inbounds GEP.
; Unlike most kernels in this file, this one carries no attribute group.
677define amdgpu_kernel void @merge_global_store_5_constants_i32(ptr addrspace(1) %out) {
678; CHECK-LABEL: @merge_global_store_5_constants_i32(
679; CHECK-NEXT:    store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, ptr addrspace(1) [[OUT:%.*]], align 4
680; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4
681; CHECK-NEXT:    store i32 11, ptr addrspace(1) [[IDX4]], align 4
682; CHECK-NEXT:    ret void
683;
684  store i32 9, ptr addrspace(1) %out, align 4
685  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
686  store i32 12, ptr addrspace(1) %idx1, align 4
687  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
688  store i32 16, ptr addrspace(1) %idx2, align 4
689  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
690  store i32 -12, ptr addrspace(1) %idx3, align 4
; Fifth element: expected to stay a scalar store.
691  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
692  store i32 11, ptr addrspace(1) %idx4, align 4
693  ret void
694}
695
; Six constant i32 stores to consecutive slots 0..5 of an addrspace(1)
; pointer, in ascending order, each with align 4. The expected output is a
; <4 x i32> store for slots 0-3 at %out, followed by a <2 x i32> store for
; slots 4-5 through the GEP at index 4, both with align 4.
696define amdgpu_kernel void @merge_global_store_6_constants_i32(ptr addrspace(1) %out) {
697; CHECK-LABEL: @merge_global_store_6_constants_i32(
698; CHECK-NEXT:    store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, ptr addrspace(1) [[OUT:%.*]], align 4
699; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4
700; CHECK-NEXT:    store <2 x i32> <i32 11, i32 123>, ptr addrspace(1) [[IDX4]], align 4
701; CHECK-NEXT:    ret void
702;
703  store i32 13, ptr addrspace(1) %out, align 4
704  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
705  store i32 15, ptr addrspace(1) %idx1, align 4
706  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
707  store i32 62, ptr addrspace(1) %idx2, align 4
708  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
709  store i32 63, ptr addrspace(1) %idx3, align 4
; Slots 4-5: expected to form the second, two-element vector store.
710  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
711  store i32 11, ptr addrspace(1) %idx4, align 4
712  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
713  store i32 123, ptr addrspace(1) %idx5, align 4
714  ret void
715}
716
; Seven constant i32 stores to consecutive slots 0..6 of an addrspace(1)
; pointer, in ascending order, each with align 4. The expected output is a
; <4 x i32> store for slots 0-3 at %out, followed by a <3 x i32> store for
; slots 4-6 through the GEP at index 4, both with align 4.
717define amdgpu_kernel void @merge_global_store_7_constants_i32(ptr addrspace(1) %out) {
718; CHECK-LABEL: @merge_global_store_7_constants_i32(
719; CHECK-NEXT:    store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, ptr addrspace(1) [[OUT:%.*]], align 4
720; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4
721; CHECK-NEXT:    store <3 x i32> <i32 98, i32 91, i32 212>, ptr addrspace(1) [[IDX4]], align 4
722; CHECK-NEXT:    ret void
723;
724  store i32 34, ptr addrspace(1) %out, align 4
725  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
726  store i32 999, ptr addrspace(1) %idx1, align 4
727  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
728  store i32 65, ptr addrspace(1) %idx2, align 4
729  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
730  store i32 33, ptr addrspace(1) %idx3, align 4
; Slots 4-6: expected to form the second, three-element vector store.
731  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
732  store i32 98, ptr addrspace(1) %idx4, align 4
733  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
734  store i32 91, ptr addrspace(1) %idx5, align 4
735  %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
736  store i32 212, ptr addrspace(1) %idx6, align 4
737  ret void
738}
739
; Eight constant i32 stores to consecutive slots 0..7 of an addrspace(1)
; pointer, in ascending order, each with align 4. The expected output is two
; <4 x i32> stores with align 4: slots 0-3 at %out and slots 4-7 through the
; GEP at index 4. The value 999 is stored to both slot 1 and slot 7.
740define amdgpu_kernel void @merge_global_store_8_constants_i32(ptr addrspace(1) %out) {
741; CHECK-LABEL: @merge_global_store_8_constants_i32(
742; CHECK-NEXT:    store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, ptr addrspace(1) [[OUT:%.*]], align 4
743; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4
744; CHECK-NEXT:    store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, ptr addrspace(1) [[IDX4]], align 4
745; CHECK-NEXT:    ret void
746;
747  store i32 34, ptr addrspace(1) %out, align 4
748  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
749  store i32 999, ptr addrspace(1) %idx1, align 4
750  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
751  store i32 65, ptr addrspace(1) %idx2, align 4
752  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
753  store i32 33, ptr addrspace(1) %idx3, align 4
; Slots 4-7: expected to form the second four-element vector store.
754  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
755  store i32 98, ptr addrspace(1) %idx4, align 4
756  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
757  store i32 91, ptr addrspace(1) %idx5, align 4
758  %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
759  store i32 212, ptr addrspace(1) %idx6, align 4
760  %idx7 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 7
761  store i32 999, ptr addrspace(1) %idx7, align 4
762  ret void
763}
764
; Copies one <3 x i32> between two noalias addrspace(1) pointers. The load is
; explicitly align 4; the store has no explicit alignment. The expected output
; keeps both accesses as they are. The store is printed with align 16, which
; matches the 128-bit ABI alignment the file's datalayout gives v96.
765define amdgpu_kernel void @copy_v3i32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
766; CHECK-LABEL: @copy_v3i32_align4(
767; CHECK-NEXT:    [[VEC:%.*]] = load <3 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
768; CHECK-NEXT:    store <3 x i32> [[VEC]], ptr addrspace(1) [[OUT:%.*]], align 16
769; CHECK-NEXT:    ret void
770;
771  %vec = load <3 x i32>, ptr addrspace(1) %in, align 4
772  store <3 x i32> %vec, ptr addrspace(1) %out
773  ret void
774}
775
; Copies one <3 x i64> between two noalias addrspace(1) pointers. The load is
; explicitly align 4; the store has no explicit alignment. The expected output
; keeps both accesses as they are. The store is printed with align 32, which
; matches the 256-bit ABI alignment the file's datalayout gives v192.
776define amdgpu_kernel void @copy_v3i64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
777; CHECK-LABEL: @copy_v3i64_align4(
778; CHECK-NEXT:    [[VEC:%.*]] = load <3 x i64>, ptr addrspace(1) [[IN:%.*]], align 4
779; CHECK-NEXT:    store <3 x i64> [[VEC]], ptr addrspace(1) [[OUT:%.*]], align 32
780; CHECK-NEXT:    ret void
781;
782  %vec = load <3 x i64>, ptr addrspace(1) %in, align 4
783  store <3 x i64> %vec, ptr addrspace(1) %out
784  ret void
785}
786
; Loads a <3 x float> with align 4 from a noalias addrspace(1) pointer, adds
; the constant vector <1.0, 2.0, 4.0>, and stores the result with no explicit
; alignment. The expected output keeps the load, fadd and store as they are.
; The store is printed with align 16, which matches the 128-bit ABI alignment
; the file's datalayout gives v96.
787define amdgpu_kernel void @copy_v3f32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
788; CHECK-LABEL: @copy_v3f32_align4(
789; CHECK-NEXT:    [[VEC:%.*]] = load <3 x float>, ptr addrspace(1) [[IN:%.*]], align 4
790; CHECK-NEXT:    [[FADD:%.*]] = fadd <3 x float> [[VEC]], <float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>
791; CHECK-NEXT:    store <3 x float> [[FADD]], ptr addrspace(1) [[OUT:%.*]], align 16
792; CHECK-NEXT:    ret void
793;
794  %vec = load <3 x float>, ptr addrspace(1) %in, align 4
795  %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
796  store <3 x float> %fadd, ptr addrspace(1) %out
797  ret void
798}
799
; Loads a <3 x double> with align 4 from a noalias addrspace(1) pointer, adds
; the constant vector <1.0, 2.0, 4.0>, and stores the result with no explicit
; alignment. The expected output keeps the load, fadd and store as they are.
; The store is printed with align 32, which matches the 256-bit ABI alignment
; the file's datalayout gives v192.
800define amdgpu_kernel void @copy_v3f64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
801; CHECK-LABEL: @copy_v3f64_align4(
802; CHECK-NEXT:    [[VEC:%.*]] = load <3 x double>, ptr addrspace(1) [[IN:%.*]], align 4
803; CHECK-NEXT:    [[FADD:%.*]] = fadd <3 x double> [[VEC]], <double 1.000000e+00, double 2.000000e+00, double 4.000000e+00>
804; CHECK-NEXT:    store <3 x double> [[FADD]], ptr addrspace(1) [[OUT:%.*]], align 32
805; CHECK-NEXT:    ret void
806;
807  %vec = load <3 x double>, ptr addrspace(1) %in, align 4
808  %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
809  store <3 x double> %fadd, ptr addrspace(1) %out
810  ret void
811}
812
813; Verify that we no longer hit asserts for this test case. No change expected.
; Copies two adjacent <2 x ptr> vectors between addrspace(1) pointers. The
; accesses at element 1 have no explicit alignment and those at element 0 are
; align 4. The expected output keeps all four accesses separate and in the
; same order; the only visible difference is that the unannotated accesses
; are printed with align 16.
; The signature is split across two lines, so the expected-output comments sit
; between the first parameter and the second.
814define amdgpu_kernel void @copy_vec_of_ptrs(ptr addrspace(1) %out,
815; CHECK-LABEL: @copy_vec_of_ptrs(
816; CHECK-NEXT:    [[IN_GEP_1:%.*]] = getelementptr <2 x ptr>, ptr addrspace(1) [[IN:%.*]], i32 1
817; CHECK-NEXT:    [[VEC1:%.*]] = load <2 x ptr>, ptr addrspace(1) [[IN_GEP_1]], align 16
818; CHECK-NEXT:    [[VEC2:%.*]] = load <2 x ptr>, ptr addrspace(1) [[IN]], align 4
819; CHECK-NEXT:    [[OUT_GEP_1:%.*]] = getelementptr <2 x ptr>, ptr addrspace(1) [[OUT:%.*]], i32 1
820; CHECK-NEXT:    store <2 x ptr> [[VEC1]], ptr addrspace(1) [[OUT_GEP_1]], align 16
821; CHECK-NEXT:    store <2 x ptr> [[VEC2]], ptr addrspace(1) [[OUT]], align 4
822; CHECK-NEXT:    ret void
823;
824  ptr addrspace(1) %in ) #0 {
825  %in.gep.1 = getelementptr <2 x ptr>, ptr addrspace(1) %in, i32 1
826  %vec1 = load <2 x ptr>, ptr addrspace(1) %in.gep.1
827  %vec2 = load <2 x ptr>, ptr addrspace(1) %in, align 4
828
829  %out.gep.1 = getelementptr <2 x ptr>, ptr addrspace(1) %out, i32 1
830  store <2 x ptr> %vec1, ptr addrspace(1) %out.gep.1
831  store <2 x ptr> %vec2, ptr addrspace(1) %out, align 4
832  ret void
833}
834
; Declaration of the AMDGPU barrier intrinsic, tagged with attribute group #1.
; NOTE(review): no call to it appears in this part of the file; presumably it
; is used by tests earlier in the file -- confirm before removing.
835declare void @llvm.amdgcn.s.barrier() #1
836
; Attribute groups: #0 is attached to most kernels in this file, #1 to the
; barrier declaration above.
837attributes #0 = { nounwind }
838attributes #1 = { convergent nounwind }
839