xref: /llvm-project/llvm/test/CodeGen/AMDGPU/merge-stores.ll (revision 9e9907f1cfa424366fba58d9520f9305b537cec9)
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=GCN-AA %s

; This test is mostly to test DAG store merging, so disable the vectorizer.
; Run with devices with different unaligned load restrictions.

; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted


; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
; GCN: buffer_store_short
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i8(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1

  store i8 123, ptr addrspace(1) %out.gep.1
  store i8 456, ptr addrspace(1) %out, align 2
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1

  store i8 123, ptr addrspace(1) %out.gep.1
  store i8 456, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_i16(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1

  store i16 123, ptr addrspace(1) %out.gep.1
  store i16 456, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_0_i16(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1

  store i16 0, ptr addrspace(1) %out.gep.1
  store i16 0, ptr addrspace(1) %out, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1

  store i16 123, ptr addrspace(1) %out.gep.1
  store i16 456, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @merge_global_store_2_constants_i32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1

  store i32 123, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  store float 1.0, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  store i32 123, ptr addrspace(1) %out.gep.1
  store float 4.0, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @merge_global_store_4_constants_i32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3

  store i32 123, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out.gep.2
  store i32 333, ptr addrspace(1) %out.gep.3
  store i32 1234, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32_order(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3

  store float 8.0, ptr addrspace(1) %out
  store float 1.0, ptr addrspace(1) %out.gep.1
  store float 2.0, ptr addrspace(1) %out.gep.2
  store float 4.0, ptr addrspace(1) %out.gep.3
  ret void
}

; First store is out of order.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3

  store float 1.0, ptr addrspace(1) %out.gep.1
  store float 2.0, ptr addrspace(1) %out.gep.2
  store float 4.0, ptr addrspace(1) %out.gep.3
  store float 8.0, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
; GCN-AA: buffer_store_dwordx4 v
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3


  store i32 11, ptr addrspace(1) %out.gep.1
  store float 2.0, ptr addrspace(1) %out.gep.2
  store i32 17, ptr addrspace(1) %out.gep.3
  store float 8.0, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword v
; CI-DAG: buffer_store_dwordx3
; GCN-NOT: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_constants_i32(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2

  store i32 123, ptr addrspace(1) %out.gep.1
  store i32 456, ptr addrspace(1) %out.gep.2
  store i32 1234, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_2_constants_i64(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1

  store i64 123, ptr addrspace(1) %out.gep.1
  store i64 456, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_i64(ptr addrspace(1) %out) #0 {
  %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1
  %out.gep.2 = getelementptr i64, ptr addrspace(1) %out, i64 2
  %out.gep.3 = getelementptr i64, ptr addrspace(1) %out, i64 3

  store i64 123, ptr addrspace(1) %out.gep.1
  store i64 456, ptr addrspace(1) %out.gep.2
  store i64 333, ptr addrspace(1) %out.gep.3
  store i64 1234, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[LOAD]]
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1

  %lo = load i32, ptr addrspace(1) %in
  %hi = load i32, ptr addrspace(1) %in.gep.1

  store i32 %lo, ptr addrspace(1) %out
  store i32 %hi, ptr addrspace(1) %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %lo = load i32, ptr addrspace(1) %in.gep.0
  %hi = load i32, ptr addrspace(1) %in.gep.1

  store i32 %lo, ptr addrspace(1) %out.gep.0
  store i32 %hi, ptr addrspace(1) %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx2 v
; GCN: buffer_store_dwordx2 v
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1

  %lo = load i32, ptr addrspace(1) %in
  %hi = load i32, ptr addrspace(1) %in.gep.1

  store i32 %hi, ptr addrspace(1) %out
  store i32 %lo, ptr addrspace(1) %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  store i32 %x, ptr addrspace(1) %out
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
; SI-DAG: buffer_load_dwordx2
; SI-DAG: buffer_load_dword
; CI-DAG: buffer_load_dwordx3
; GCN: s_waitcnt
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword v
; CI-DAG: buffer_store_dwordx3
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2

  store i32 %x, ptr addrspace(1) %out
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr float, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr float, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr float, ptr addrspace(1) %in, i32 3

  %x = load float, ptr addrspace(1) %in
  %y = load float, ptr addrspace(1) %in.gep.1
  %z = load float, ptr addrspace(1) %in.gep.2
  %w = load float, ptr addrspace(1) %in.gep.3

  store float %x, ptr addrspace(1) %out
  store float %y, ptr addrspace(1) %out.gep.1
  store float %z, ptr addrspace(1) %out.gep.2
  store float %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 11
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 12
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 13
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 14
  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 7
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 8
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 9
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 10

  %x = load i32, ptr addrspace(1) %in.gep.0
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  store i32 %x, ptr addrspace(1) %out.gep.0
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: s_barrier
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, ptr addrspace(1) %out.gep.3
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %x, ptr addrspace(1) %out

  ret void
}

; TODO: Re-packing of loaded register required. Maybe an IR pass
; should catch this?

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx4 v
; GCN: s_barrier
; GCN: buffer_store_dwordx4 v
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3

  %x = load i32, ptr addrspace(1) %in
  %y = load i32, ptr addrspace(1) %in.gep.1
  %z = load i32, ptr addrspace(1) %in.gep.2
  %w = load i32, ptr addrspace(1) %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, ptr addrspace(1) %out
  store i32 %z, ptr addrspace(1) %out.gep.1
  store i32 %y, ptr addrspace(1) %out.gep.2
  store i32 %x, ptr addrspace(1) %out.gep.3

  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: buffer_store_dword [[LOAD]]
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
  %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
  %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
  %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
  %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
  %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3

  %x = load i8, ptr addrspace(1) %in, align 4
  %y = load i8, ptr addrspace(1) %in.gep.1
  %z = load i8, ptr addrspace(1) %in.gep.2
  %w = load i8, ptr addrspace(1) %in.gep.3

  store i8 %x, ptr addrspace(1) %out, align 4
  store i8 %y, ptr addrspace(1) %out.gep.1
  store i8 %z, ptr addrspace(1) %out.gep.2
  store i8 %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
  %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
  %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
  %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
  %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
  %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3

  %x = load i8, ptr addrspace(1) %in
  %y = load i8, ptr addrspace(1) %in.gep.1
  %z = load i8, ptr addrspace(1) %in.gep.2
  %w = load i8, ptr addrspace(1) %in.gep.3

  store i8 %x, ptr addrspace(1) %out
  store i8 %y, ptr addrspace(1) %out.gep.1
  store i8 %z, ptr addrspace(1) %out.gep.2
  store i8 %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
  %vec = load <4 x i32>, ptr addrspace(1) %in

  %x = extractelement <4 x i32> %vec, i32 0
  %y = extractelement <4 x i32> %vec, i32 1
  %z = extractelement <4 x i32> %vec, i32 2
  %w = extractelement <4 x i32> %vec, i32 3

  store i32 %x, ptr addrspace(1) %out
  store i32 %y, ptr addrspace(1) %out.gep.1
  store i32 %z, ptr addrspace(1) %out.gep.2
  store i32 %w, ptr addrspace(1) %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
; GCN: ds_write_b16
; GCN: s_endpgm
define amdgpu_kernel void @merge_local_store_2_constants_i8(ptr addrspace(3) %out) #0 {
  %out.gep.1 = getelementptr i8, ptr addrspace(3) %out, i32 1

  store i8 123, ptr addrspace(3) %out.gep.1
  store i8 456, ptr addrspace(3) %out, align 2
  ret void
}

; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
define amdgpu_kernel void @merge_local_store_2_constants_i32(ptr addrspace(3) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1

  store i32 123, ptr addrspace(3) %out.gep.1
  store i32 456, ptr addrspace(3) %out
  ret void
}

; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3

; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1

; GCN: s_endpgm
define amdgpu_kernel void @merge_local_store_4_constants_i32(ptr addrspace(3) %out) #0 {
  %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1
  %out.gep.2 = getelementptr i32, ptr addrspace(3) %out, i32 2
  %out.gep.3 = getelementptr i32, ptr addrspace(3) %out, i32 3

  store i32 123, ptr addrspace(3) %out.gep.1
  store i32 456, ptr addrspace(3) %out.gep.2
  store i32 333, ptr addrspace(3) %out.gep.3
  store i32 1234, ptr addrspace(3) %out
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI4]]]
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dword v[[HI]]
define amdgpu_kernel void @merge_global_store_5_constants_i32(ptr addrspace(1) %out) {
  store i32 9, ptr addrspace(1) %out, align 4
  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
  store i32 12, ptr addrspace(1) %idx1, align 4
  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
  store i32 16, ptr addrspace(1) %idx2, align 4
  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
  store i32 -12, ptr addrspace(1) %idx3, align 4
  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
  store i32 11, ptr addrspace(1) %idx4, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_6_constants_i32(ptr addrspace(1) %out) {
  store i32 13, ptr addrspace(1) %out, align 4
  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
  store i32 15, ptr addrspace(1) %idx1, align 4
  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
  store i32 62, ptr addrspace(1) %idx2, align 4
  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
  store i32 63, ptr addrspace(1) %idx3, align 4
  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
  store i32 11, ptr addrspace(1) %idx4, align 4
  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
  store i32 123, ptr addrspace(1) %idx5, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
; GCN: buffer_store_dwordx4
; SI-DAG: buffer_store_dwordx2
; CI: buffer_store_dwordx3
define amdgpu_kernel void @merge_global_store_7_constants_i32(ptr addrspace(1) %out) {
  store i32 34, ptr addrspace(1) %out, align 4
  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
  store i32 999, ptr addrspace(1) %idx1, align 4
  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
  store i32 65, ptr addrspace(1) %idx2, align 4
  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
  store i32 33, ptr addrspace(1) %idx3, align 4
  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
  store i32 98, ptr addrspace(1) %idx4, align 4
  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
  store i32 91, ptr addrspace(1) %idx5, align 4
  %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
  store i32 212, ptr addrspace(1) %idx6, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_8_constants_i32(ptr addrspace(1) %out) {
  store i32 34, ptr addrspace(1) %out, align 4
  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
  store i32 999, ptr addrspace(1) %idx1, align 4
  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
  store i32 65, ptr addrspace(1) %idx2, align 4
  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
  store i32 33, ptr addrspace(1) %idx3, align 4
  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
  store i32 98, ptr addrspace(1) %idx4, align 4
  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
  store i32 91, ptr addrspace(1) %idx5, align 4
  %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
  store i32 212, ptr addrspace(1) %idx6, align 4
  %idx7 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 7
  store i32 999, ptr addrspace(1) %idx7, align 4
  ret void
}

; This requires handling of scalar_to_vector for v2i64 to avoid
; scratch usage.
; FIXME: Should do single load and store

; GCN-LABEL: {{^}}copy_v3i32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}

; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
  %vec = load <3 x i32>, ptr addrspace(1) %in, align 4
  store <3 x i32> %vec, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3i64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
  %vec = load <3 x i64>, ptr addrspace(1) %in, align 4
  store <3 x i64> %vec, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3f32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
  %vec = load <3 x float>, ptr addrspace(1) %in, align 4
  %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
  store <3 x float> %fadd, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3f64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
  %vec = load <3 x double>, ptr addrspace(1) %in, align 4
  %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
  store <3 x double> %fadd, ptr addrspace(1) %out
  ret void
}

declare void @llvm.amdgcn.s.barrier() #1

attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
