xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
6
7; FIXME: GFX6 (tahiti) is not covered; its run line below is disabled by spelling it XUN.
8; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
9
; Baseline case: stores the <4 x i32> kernel argument %x through the
; addrspace(3) pointer %out, with no explicit alignment on the store.
; Every checked target (GFX9, GFX7, GFX10, GFX11) is expected to emit a single
; 128-bit DS write (ds_write_b128, spelled ds_store_b128 in the GFX11 checks).
; Only the GFX7 checks contain `s_mov_b32 m0, -1` ahead of the DS write.
; The check lines are autogenerated; regenerate them rather than editing by hand.
10define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) {
11; GFX9-LABEL: store_lds_v4i32:
12; GFX9:       ; %bb.0:
13; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
14; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
15; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
16; GFX9-NEXT:    v_mov_b32_e32 v0, s0
17; GFX9-NEXT:    v_mov_b32_e32 v1, s1
18; GFX9-NEXT:    v_mov_b32_e32 v2, s2
19; GFX9-NEXT:    v_mov_b32_e32 v3, s3
20; GFX9-NEXT:    v_mov_b32_e32 v4, s6
21; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
22; GFX9-NEXT:    s_endpgm
23;
24; GFX7-LABEL: store_lds_v4i32:
25; GFX7:       ; %bb.0:
26; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
27; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
28; GFX7-NEXT:    s_mov_b32 m0, -1
29; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
30; GFX7-NEXT:    v_mov_b32_e32 v0, s0
31; GFX7-NEXT:    v_mov_b32_e32 v1, s1
32; GFX7-NEXT:    v_mov_b32_e32 v2, s2
33; GFX7-NEXT:    v_mov_b32_e32 v3, s3
34; GFX7-NEXT:    v_mov_b32_e32 v4, s4
35; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
36; GFX7-NEXT:    s_endpgm
37;
38; GFX10-LABEL: store_lds_v4i32:
39; GFX10:       ; %bb.0:
40; GFX10-NEXT:    s_clause 0x1
41; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
42; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
43; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
44; GFX10-NEXT:    v_mov_b32_e32 v0, s0
45; GFX10-NEXT:    v_mov_b32_e32 v1, s1
46; GFX10-NEXT:    v_mov_b32_e32 v2, s2
47; GFX10-NEXT:    v_mov_b32_e32 v3, s3
48; GFX10-NEXT:    v_mov_b32_e32 v4, s6
49; GFX10-NEXT:    ds_write_b128 v4, v[0:3]
50; GFX10-NEXT:    s_endpgm
51;
52; GFX11-LABEL: store_lds_v4i32:
53; GFX11:       ; %bb.0:
54; GFX11-NEXT:    s_clause 0x1
55; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
56; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x0
57; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
58; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
59; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
60; GFX11-NEXT:    v_mov_b32_e32 v4, s4
61; GFX11-NEXT:    ds_store_b128 v4, v[0:3]
62; GFX11-NEXT:    s_endpgm
63  store <4 x i32> %x, ptr addrspace(3) %out
64  ret void
65}
66
; Same <4 x i32> store to addrspace(3), but marked `align 1`.
; All checked targets are expected to break the store into sixteen single-byte
; DS writes (ds_write_b8, spelled ds_store_b8 in the GFX11 checks) covering
; offsets 0 through 15 from the base address.
; The byte-extraction sequences differ per target: the GFX7 checks use
; s_bfe_u32 plus s_lshr_b32 by 16 and 24, while the GFX9/GFX10/GFX11 checks use
; s_and_b32 with 0xffff followed by s_lshr_b32 by 8 and 16.
; The order in which the byte writes are issued also differs between targets.
67define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) {
68; GFX9-LABEL: store_lds_v4i32_align1:
69; GFX9:       ; %bb.0:
70; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
71; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
72; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
73; GFX9-NEXT:    s_and_b32 s5, 0xffff, s0
74; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
75; GFX9-NEXT:    v_mov_b32_e32 v0, s0
76; GFX9-NEXT:    v_mov_b32_e32 v1, s6
77; GFX9-NEXT:    s_lshr_b32 s0, s5, 8
78; GFX9-NEXT:    ds_write_b8 v1, v0
79; GFX9-NEXT:    v_mov_b32_e32 v0, s0
80; GFX9-NEXT:    ds_write_b8 v1, v0 offset:1
81; GFX9-NEXT:    s_lshr_b32 s0, s4, 8
82; GFX9-NEXT:    v_mov_b32_e32 v0, s4
83; GFX9-NEXT:    ds_write_b8 v1, v0 offset:2
84; GFX9-NEXT:    v_mov_b32_e32 v0, s0
85; GFX9-NEXT:    s_and_b32 s4, 0xffff, s1
86; GFX9-NEXT:    ds_write_b8 v1, v0 offset:3
87; GFX9-NEXT:    s_lshr_b32 s4, s4, 8
88; GFX9-NEXT:    v_mov_b32_e32 v0, s1
89; GFX9-NEXT:    s_lshr_b32 s0, s1, 16
90; GFX9-NEXT:    ds_write_b8 v1, v0 offset:4
91; GFX9-NEXT:    v_mov_b32_e32 v0, s4
92; GFX9-NEXT:    ds_write_b8 v1, v0 offset:5
93; GFX9-NEXT:    s_lshr_b32 s1, s0, 8
94; GFX9-NEXT:    v_mov_b32_e32 v0, s0
95; GFX9-NEXT:    ds_write_b8 v1, v0 offset:6
96; GFX9-NEXT:    v_mov_b32_e32 v0, s1
97; GFX9-NEXT:    s_and_b32 s1, 0xffff, s2
98; GFX9-NEXT:    ds_write_b8 v1, v0 offset:7
99; GFX9-NEXT:    s_lshr_b32 s1, s1, 8
100; GFX9-NEXT:    v_mov_b32_e32 v0, s2
101; GFX9-NEXT:    s_lshr_b32 s0, s2, 16
102; GFX9-NEXT:    ds_write_b8 v1, v0 offset:8
103; GFX9-NEXT:    v_mov_b32_e32 v0, s1
104; GFX9-NEXT:    ds_write_b8 v1, v0 offset:9
105; GFX9-NEXT:    s_lshr_b32 s1, s0, 8
106; GFX9-NEXT:    v_mov_b32_e32 v0, s0
107; GFX9-NEXT:    ds_write_b8 v1, v0 offset:10
108; GFX9-NEXT:    v_mov_b32_e32 v0, s1
109; GFX9-NEXT:    s_and_b32 s1, 0xffff, s3
110; GFX9-NEXT:    ds_write_b8 v1, v0 offset:11
111; GFX9-NEXT:    s_lshr_b32 s1, s1, 8
112; GFX9-NEXT:    v_mov_b32_e32 v0, s3
113; GFX9-NEXT:    s_lshr_b32 s0, s3, 16
114; GFX9-NEXT:    ds_write_b8 v1, v0 offset:12
115; GFX9-NEXT:    v_mov_b32_e32 v0, s1
116; GFX9-NEXT:    ds_write_b8 v1, v0 offset:13
117; GFX9-NEXT:    s_lshr_b32 s1, s0, 8
118; GFX9-NEXT:    v_mov_b32_e32 v0, s0
119; GFX9-NEXT:    ds_write_b8 v1, v0 offset:14
120; GFX9-NEXT:    v_mov_b32_e32 v0, s1
121; GFX9-NEXT:    ds_write_b8 v1, v0 offset:15
122; GFX9-NEXT:    s_endpgm
123;
124; GFX7-LABEL: store_lds_v4i32_align1:
125; GFX7:       ; %bb.0:
126; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
127; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
128; GFX7-NEXT:    s_mov_b32 m0, -1
129; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
130; GFX7-NEXT:    s_bfe_u32 s6, s0, 0x80008
131; GFX7-NEXT:    v_mov_b32_e32 v0, s0
132; GFX7-NEXT:    v_mov_b32_e32 v1, s4
133; GFX7-NEXT:    s_lshr_b32 s5, s0, 16
134; GFX7-NEXT:    ds_write_b8 v1, v0
135; GFX7-NEXT:    v_mov_b32_e32 v0, s6
136; GFX7-NEXT:    ds_write_b8 v1, v0 offset:1
137; GFX7-NEXT:    s_lshr_b32 s0, s0, 24
138; GFX7-NEXT:    v_mov_b32_e32 v0, s5
139; GFX7-NEXT:    ds_write_b8 v1, v0 offset:2
140; GFX7-NEXT:    v_mov_b32_e32 v0, s0
141; GFX7-NEXT:    ds_write_b8 v1, v0 offset:3
142; GFX7-NEXT:    s_bfe_u32 s4, s1, 0x80008
143; GFX7-NEXT:    v_mov_b32_e32 v0, s1
144; GFX7-NEXT:    s_lshr_b32 s0, s1, 16
145; GFX7-NEXT:    ds_write_b8 v1, v0 offset:4
146; GFX7-NEXT:    v_mov_b32_e32 v0, s4
147; GFX7-NEXT:    ds_write_b8 v1, v0 offset:5
148; GFX7-NEXT:    s_lshr_b32 s1, s1, 24
149; GFX7-NEXT:    v_mov_b32_e32 v0, s0
150; GFX7-NEXT:    ds_write_b8 v1, v0 offset:6
151; GFX7-NEXT:    v_mov_b32_e32 v0, s1
152; GFX7-NEXT:    ds_write_b8 v1, v0 offset:7
153; GFX7-NEXT:    s_bfe_u32 s1, s2, 0x80008
154; GFX7-NEXT:    v_mov_b32_e32 v0, s2
155; GFX7-NEXT:    s_lshr_b32 s0, s2, 16
156; GFX7-NEXT:    ds_write_b8 v1, v0 offset:8
157; GFX7-NEXT:    v_mov_b32_e32 v0, s1
158; GFX7-NEXT:    ds_write_b8 v1, v0 offset:9
159; GFX7-NEXT:    s_lshr_b32 s1, s2, 24
160; GFX7-NEXT:    v_mov_b32_e32 v0, s0
161; GFX7-NEXT:    ds_write_b8 v1, v0 offset:10
162; GFX7-NEXT:    v_mov_b32_e32 v0, s1
163; GFX7-NEXT:    ds_write_b8 v1, v0 offset:11
164; GFX7-NEXT:    s_bfe_u32 s1, s3, 0x80008
165; GFX7-NEXT:    v_mov_b32_e32 v0, s3
166; GFX7-NEXT:    s_lshr_b32 s0, s3, 16
167; GFX7-NEXT:    ds_write_b8 v1, v0 offset:12
168; GFX7-NEXT:    v_mov_b32_e32 v0, s1
169; GFX7-NEXT:    ds_write_b8 v1, v0 offset:13
170; GFX7-NEXT:    s_lshr_b32 s1, s3, 24
171; GFX7-NEXT:    v_mov_b32_e32 v0, s0
172; GFX7-NEXT:    ds_write_b8 v1, v0 offset:14
173; GFX7-NEXT:    v_mov_b32_e32 v0, s1
174; GFX7-NEXT:    ds_write_b8 v1, v0 offset:15
175; GFX7-NEXT:    s_endpgm
176;
177; GFX10-LABEL: store_lds_v4i32_align1:
178; GFX10:       ; %bb.0:
179; GFX10-NEXT:    s_clause 0x1
180; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
181; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
182; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
184; GFX10-NEXT:    s_and_b32 s5, 0xffff, s0
185; GFX10-NEXT:    v_mov_b32_e32 v0, s0
186; GFX10-NEXT:    v_mov_b32_e32 v1, s6
187; GFX10-NEXT:    s_lshr_b32 s0, s1, 16
188; GFX10-NEXT:    s_and_b32 s6, 0xffff, s1
189; GFX10-NEXT:    v_mov_b32_e32 v2, s1
190; GFX10-NEXT:    s_lshr_b32 s1, s2, 16
191; GFX10-NEXT:    s_and_b32 s7, 0xffff, s2
192; GFX10-NEXT:    v_mov_b32_e32 v3, s2
193; GFX10-NEXT:    s_lshr_b32 s2, s5, 8
194; GFX10-NEXT:    v_mov_b32_e32 v4, s4
195; GFX10-NEXT:    s_lshr_b32 s5, s4, 8
196; GFX10-NEXT:    s_lshr_b32 s4, s6, 8
197; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
198; GFX10-NEXT:    v_mov_b32_e32 v5, s0
199; GFX10-NEXT:    v_mov_b32_e32 v6, s2
200; GFX10-NEXT:    s_lshr_b32 s0, s7, 8
201; GFX10-NEXT:    v_mov_b32_e32 v7, s5
202; GFX10-NEXT:    v_mov_b32_e32 v8, s4
203; GFX10-NEXT:    v_mov_b32_e32 v9, s6
204; GFX10-NEXT:    ds_write_b8 v1, v0
205; GFX10-NEXT:    ds_write_b8 v1, v2 offset:4
206; GFX10-NEXT:    ds_write_b8 v1, v4 offset:2
207; GFX10-NEXT:    ds_write_b8 v1, v5 offset:6
208; GFX10-NEXT:    ds_write_b8 v1, v6 offset:1
209; GFX10-NEXT:    ds_write_b8 v1, v7 offset:3
210; GFX10-NEXT:    ds_write_b8 v1, v8 offset:5
211; GFX10-NEXT:    v_mov_b32_e32 v0, s1
212; GFX10-NEXT:    v_mov_b32_e32 v10, s0
213; GFX10-NEXT:    s_lshr_b32 s0, s1, 8
214; GFX10-NEXT:    ds_write_b8 v1, v9 offset:7
215; GFX10-NEXT:    ds_write_b8 v1, v3 offset:8
216; GFX10-NEXT:    ds_write_b8 v1, v10 offset:9
217; GFX10-NEXT:    ds_write_b8 v1, v0 offset:10
218; GFX10-NEXT:    v_mov_b32_e32 v0, s0
219; GFX10-NEXT:    s_and_b32 s0, 0xffff, s3
220; GFX10-NEXT:    s_lshr_b32 s1, s3, 16
221; GFX10-NEXT:    s_lshr_b32 s0, s0, 8
222; GFX10-NEXT:    v_mov_b32_e32 v2, s3
223; GFX10-NEXT:    v_mov_b32_e32 v3, s0
224; GFX10-NEXT:    s_lshr_b32 s0, s1, 8
225; GFX10-NEXT:    v_mov_b32_e32 v4, s1
226; GFX10-NEXT:    v_mov_b32_e32 v5, s0
227; GFX10-NEXT:    ds_write_b8 v1, v0 offset:11
228; GFX10-NEXT:    ds_write_b8 v1, v2 offset:12
229; GFX10-NEXT:    ds_write_b8 v1, v3 offset:13
230; GFX10-NEXT:    ds_write_b8 v1, v4 offset:14
231; GFX10-NEXT:    ds_write_b8 v1, v5 offset:15
232; GFX10-NEXT:    s_endpgm
233;
234; GFX11-LABEL: store_lds_v4i32_align1:
235; GFX11:       ; %bb.0:
236; GFX11-NEXT:    s_clause 0x1
237; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
238; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x0
239; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
240; GFX11-NEXT:    s_and_b32 s6, 0xffff, s0
241; GFX11-NEXT:    s_lshr_b32 s5, s0, 16
242; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4
243; GFX11-NEXT:    s_lshr_b32 s0, s1, 16
244; GFX11-NEXT:    s_and_b32 s4, 0xffff, s1
245; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
246; GFX11-NEXT:    s_lshr_b32 s1, s2, 16
247; GFX11-NEXT:    s_and_b32 s7, 0xffff, s2
248; GFX11-NEXT:    s_lshr_b32 s2, s6, 8
249; GFX11-NEXT:    s_lshr_b32 s6, s5, 8
250; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
251; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s6
252; GFX11-NEXT:    v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s0
253; GFX11-NEXT:    s_lshr_b32 s4, s4, 8
254; GFX11-NEXT:    s_lshr_b32 s5, s0, 8
255; GFX11-NEXT:    s_lshr_b32 s0, s7, 8
256; GFX11-NEXT:    v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5
257; GFX11-NEXT:    ds_store_b8 v1, v0
258; GFX11-NEXT:    ds_store_b8 v1, v6 offset:1
259; GFX11-NEXT:    ds_store_b8 v1, v4 offset:2
260; GFX11-NEXT:    ds_store_b8 v1, v7 offset:3
261; GFX11-NEXT:    ds_store_b8 v1, v2 offset:4
262; GFX11-NEXT:    ds_store_b8 v1, v8 offset:5
263; GFX11-NEXT:    ds_store_b8 v1, v5 offset:6
264; GFX11-NEXT:    ds_store_b8 v1, v9 offset:7
265; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s3
266; GFX11-NEXT:    s_lshr_b32 s0, s1, 8
267; GFX11-NEXT:    v_mov_b32_e32 v2, s1
268; GFX11-NEXT:    v_mov_b32_e32 v4, s0
269; GFX11-NEXT:    s_and_b32 s0, 0xffff, s3
270; GFX11-NEXT:    s_lshr_b32 s1, s3, 16
271; GFX11-NEXT:    s_lshr_b32 s0, s0, 8
272; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
273; GFX11-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
274; GFX11-NEXT:    s_lshr_b32 s0, s1, 8
275; GFX11-NEXT:    v_mov_b32_e32 v8, s0
276; GFX11-NEXT:    ds_store_b8 v1, v3 offset:8
277; GFX11-NEXT:    ds_store_b8 v1, v0 offset:9
278; GFX11-NEXT:    ds_store_b8 v1, v2 offset:10
279; GFX11-NEXT:    ds_store_b8 v1, v4 offset:11
280; GFX11-NEXT:    ds_store_b8 v1, v5 offset:12
281; GFX11-NEXT:    ds_store_b8 v1, v6 offset:13
282; GFX11-NEXT:    ds_store_b8 v1, v7 offset:14
283; GFX11-NEXT:    ds_store_b8 v1, v8 offset:15
284; GFX11-NEXT:    s_endpgm
285  store <4 x i32> %x, ptr addrspace(3) %out, align 1
286  ret void
287}
288
; Same <4 x i32> store to addrspace(3), but marked `align 2`.
; All checked targets are expected to break the store into eight 16-bit DS
; writes (ds_write_b16, spelled ds_store_b16 in the GFX11 checks) at the even
; offsets 0 through 14. The upper half of each dword is produced with
; s_lshr_b32 by 16 before being moved into a VGPR.
; The GFX10 checks issue the low-half writes (offsets 0, 4, 8, 12) before the
; high-half writes (offsets 2, 6, 10, 14); the other targets write in
; ascending offset order.
289define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i32> %x) {
290; GFX9-LABEL: store_lds_v4i32_align2:
291; GFX9:       ; %bb.0:
292; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
293; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
294; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
295; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
296; GFX9-NEXT:    v_mov_b32_e32 v0, s0
297; GFX9-NEXT:    v_mov_b32_e32 v1, s6
298; GFX9-NEXT:    ds_write_b16 v1, v0
299; GFX9-NEXT:    v_mov_b32_e32 v0, s4
300; GFX9-NEXT:    ds_write_b16 v1, v0 offset:2
301; GFX9-NEXT:    s_lshr_b32 s0, s1, 16
302; GFX9-NEXT:    v_mov_b32_e32 v0, s1
303; GFX9-NEXT:    ds_write_b16 v1, v0 offset:4
304; GFX9-NEXT:    v_mov_b32_e32 v0, s0
305; GFX9-NEXT:    ds_write_b16 v1, v0 offset:6
306; GFX9-NEXT:    s_lshr_b32 s0, s2, 16
307; GFX9-NEXT:    v_mov_b32_e32 v0, s2
308; GFX9-NEXT:    ds_write_b16 v1, v0 offset:8
309; GFX9-NEXT:    v_mov_b32_e32 v0, s0
310; GFX9-NEXT:    ds_write_b16 v1, v0 offset:10
311; GFX9-NEXT:    s_lshr_b32 s0, s3, 16
312; GFX9-NEXT:    v_mov_b32_e32 v0, s3
313; GFX9-NEXT:    ds_write_b16 v1, v0 offset:12
314; GFX9-NEXT:    v_mov_b32_e32 v0, s0
315; GFX9-NEXT:    ds_write_b16 v1, v0 offset:14
316; GFX9-NEXT:    s_endpgm
317;
318; GFX7-LABEL: store_lds_v4i32_align2:
319; GFX7:       ; %bb.0:
320; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
321; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
322; GFX7-NEXT:    s_mov_b32 m0, -1
323; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
324; GFX7-NEXT:    s_lshr_b32 s5, s0, 16
325; GFX7-NEXT:    v_mov_b32_e32 v0, s0
326; GFX7-NEXT:    v_mov_b32_e32 v1, s4
327; GFX7-NEXT:    ds_write_b16 v1, v0
328; GFX7-NEXT:    v_mov_b32_e32 v0, s5
329; GFX7-NEXT:    ds_write_b16 v1, v0 offset:2
330; GFX7-NEXT:    s_lshr_b32 s0, s1, 16
331; GFX7-NEXT:    v_mov_b32_e32 v0, s1
332; GFX7-NEXT:    ds_write_b16 v1, v0 offset:4
333; GFX7-NEXT:    v_mov_b32_e32 v0, s0
334; GFX7-NEXT:    ds_write_b16 v1, v0 offset:6
335; GFX7-NEXT:    s_lshr_b32 s0, s2, 16
336; GFX7-NEXT:    v_mov_b32_e32 v0, s2
337; GFX7-NEXT:    ds_write_b16 v1, v0 offset:8
338; GFX7-NEXT:    v_mov_b32_e32 v0, s0
339; GFX7-NEXT:    ds_write_b16 v1, v0 offset:10
340; GFX7-NEXT:    s_lshr_b32 s0, s3, 16
341; GFX7-NEXT:    v_mov_b32_e32 v0, s3
342; GFX7-NEXT:    ds_write_b16 v1, v0 offset:12
343; GFX7-NEXT:    v_mov_b32_e32 v0, s0
344; GFX7-NEXT:    ds_write_b16 v1, v0 offset:14
345; GFX7-NEXT:    s_endpgm
346;
347; GFX10-LABEL: store_lds_v4i32_align2:
348; GFX10:       ; %bb.0:
349; GFX10-NEXT:    s_clause 0x1
350; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
351; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
352; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX10-NEXT:    v_mov_b32_e32 v0, s0
354; GFX10-NEXT:    v_mov_b32_e32 v1, s6
355; GFX10-NEXT:    v_mov_b32_e32 v2, s1
356; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
357; GFX10-NEXT:    v_mov_b32_e32 v3, s2
358; GFX10-NEXT:    s_lshr_b32 s0, s1, 16
359; GFX10-NEXT:    s_lshr_b32 s1, s2, 16
360; GFX10-NEXT:    s_lshr_b32 s2, s3, 16
361; GFX10-NEXT:    v_mov_b32_e32 v4, s3
362; GFX10-NEXT:    v_mov_b32_e32 v5, s4
363; GFX10-NEXT:    v_mov_b32_e32 v6, s0
364; GFX10-NEXT:    v_mov_b32_e32 v7, s1
365; GFX10-NEXT:    v_mov_b32_e32 v8, s2
366; GFX10-NEXT:    ds_write_b16 v1, v0
367; GFX10-NEXT:    ds_write_b16 v1, v2 offset:4
368; GFX10-NEXT:    ds_write_b16 v1, v3 offset:8
369; GFX10-NEXT:    ds_write_b16 v1, v4 offset:12
370; GFX10-NEXT:    ds_write_b16 v1, v5 offset:2
371; GFX10-NEXT:    ds_write_b16 v1, v6 offset:6
372; GFX10-NEXT:    ds_write_b16 v1, v7 offset:10
373; GFX10-NEXT:    ds_write_b16 v1, v8 offset:14
374; GFX10-NEXT:    s_endpgm
375;
376; GFX11-LABEL: store_lds_v4i32_align2:
377; GFX11:       ; %bb.0:
378; GFX11-NEXT:    s_clause 0x1
379; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
380; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x0
381; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
382; GFX11-NEXT:    s_lshr_b32 s5, s0, 16
383; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4
384; GFX11-NEXT:    s_lshr_b32 s0, s1, 16
385; GFX11-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s5
386; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
387; GFX11-NEXT:    s_lshr_b32 s1, s2, 16
388; GFX11-NEXT:    s_lshr_b32 s2, s3, 16
389; GFX11-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
390; GFX11-NEXT:    v_mov_b32_e32 v8, s2
391; GFX11-NEXT:    ds_store_b16 v1, v0
392; GFX11-NEXT:    ds_store_b16 v1, v5 offset:2
393; GFX11-NEXT:    ds_store_b16 v1, v2 offset:4
394; GFX11-NEXT:    ds_store_b16 v1, v6 offset:6
395; GFX11-NEXT:    ds_store_b16 v1, v3 offset:8
396; GFX11-NEXT:    ds_store_b16 v1, v7 offset:10
397; GFX11-NEXT:    ds_store_b16 v1, v4 offset:12
398; GFX11-NEXT:    ds_store_b16 v1, v8 offset:14
399; GFX11-NEXT:    s_endpgm
400  store <4 x i32> %x, ptr addrspace(3) %out, align 2
401  ret void
402}
403
; Same <4 x i32> store to addrspace(3), but marked `align 4`.
; All checked targets are expected to emit two two-address 32-bit DS writes
; (ds_write2_b32, spelled ds_store_2addr_b32 in the GFX11 checks): the first
; with `offset1:1` and the second with `offset0:2 offset1:3`, so each
; instruction carries two of the four vector elements.
404define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i32> %x) {
405; GFX9-LABEL: store_lds_v4i32_align4:
406; GFX9:       ; %bb.0:
407; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
408; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
409; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
410; GFX9-NEXT:    v_mov_b32_e32 v0, s0
411; GFX9-NEXT:    v_mov_b32_e32 v1, s6
412; GFX9-NEXT:    v_mov_b32_e32 v2, s1
413; GFX9-NEXT:    v_mov_b32_e32 v3, s2
414; GFX9-NEXT:    ds_write2_b32 v1, v0, v2 offset1:1
415; GFX9-NEXT:    v_mov_b32_e32 v0, s3
416; GFX9-NEXT:    ds_write2_b32 v1, v3, v0 offset0:2 offset1:3
417; GFX9-NEXT:    s_endpgm
418;
419; GFX7-LABEL: store_lds_v4i32_align4:
420; GFX7:       ; %bb.0:
421; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
422; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
423; GFX7-NEXT:    s_mov_b32 m0, -1
424; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
425; GFX7-NEXT:    v_mov_b32_e32 v0, s0
426; GFX7-NEXT:    v_mov_b32_e32 v1, s4
427; GFX7-NEXT:    v_mov_b32_e32 v2, s1
428; GFX7-NEXT:    ds_write2_b32 v1, v0, v2 offset1:1
429; GFX7-NEXT:    v_mov_b32_e32 v0, s2
430; GFX7-NEXT:    v_mov_b32_e32 v2, s3
431; GFX7-NEXT:    ds_write2_b32 v1, v0, v2 offset0:2 offset1:3
432; GFX7-NEXT:    s_endpgm
433;
434; GFX10-LABEL: store_lds_v4i32_align4:
435; GFX10:       ; %bb.0:
436; GFX10-NEXT:    s_clause 0x1
437; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
438; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
439; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
440; GFX10-NEXT:    v_mov_b32_e32 v0, s0
441; GFX10-NEXT:    v_mov_b32_e32 v1, s6
442; GFX10-NEXT:    v_mov_b32_e32 v2, s1
443; GFX10-NEXT:    v_mov_b32_e32 v3, s2
444; GFX10-NEXT:    v_mov_b32_e32 v4, s3
445; GFX10-NEXT:    ds_write2_b32 v1, v0, v2 offset1:1
446; GFX10-NEXT:    ds_write2_b32 v1, v3, v4 offset0:2 offset1:3
447; GFX10-NEXT:    s_endpgm
448;
449; GFX11-LABEL: store_lds_v4i32_align4:
450; GFX11:       ; %bb.0:
451; GFX11-NEXT:    s_clause 0x1
452; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
453; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x0
454; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
455; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4
456; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
457; GFX11-NEXT:    v_mov_b32_e32 v4, s3
458; GFX11-NEXT:    ds_store_2addr_b32 v1, v0, v2 offset1:1
459; GFX11-NEXT:    ds_store_2addr_b32 v1, v3, v4 offset0:2 offset1:3
460; GFX11-NEXT:    s_endpgm
461  store <4 x i32> %x, ptr addrspace(3) %out, align 4
462  ret void
463}
464
; Same <4 x i32> store to addrspace(3), but marked `align 8`.
; The GFX9, GFX7 and GFX11 checks expect one two-address 64-bit DS write
; (ds_write2_b64, spelled ds_store_2addr_b64 in the GFX11 checks) with
; `offset1:1`, taking the vector as two 64-bit register pairs.
; The GFX10 checks instead expect the same two ds_write2_b32 instructions as
; the `align 4` test in this file.
; NOTE(review): the GFX10 difference is presumably a gfx1010-specific LDS
; alignment restriction that makes the store split into dwords - confirm
; against llc output before changing these lines.
465define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i32> %x) {
466; GFX9-LABEL: store_lds_v4i32_align8:
467; GFX9:       ; %bb.0:
468; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
469; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
470; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
471; GFX9-NEXT:    v_mov_b32_e32 v0, s0
472; GFX9-NEXT:    v_mov_b32_e32 v1, s1
473; GFX9-NEXT:    v_mov_b32_e32 v2, s2
474; GFX9-NEXT:    v_mov_b32_e32 v3, s3
475; GFX9-NEXT:    v_mov_b32_e32 v4, s6
476; GFX9-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
477; GFX9-NEXT:    s_endpgm
478;
479; GFX7-LABEL: store_lds_v4i32_align8:
480; GFX7:       ; %bb.0:
481; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
482; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
483; GFX7-NEXT:    s_mov_b32 m0, -1
484; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
485; GFX7-NEXT:    v_mov_b32_e32 v0, s0
486; GFX7-NEXT:    v_mov_b32_e32 v1, s1
487; GFX7-NEXT:    v_mov_b32_e32 v2, s2
488; GFX7-NEXT:    v_mov_b32_e32 v3, s3
489; GFX7-NEXT:    v_mov_b32_e32 v4, s4
490; GFX7-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
491; GFX7-NEXT:    s_endpgm
492;
493; GFX10-LABEL: store_lds_v4i32_align8:
494; GFX10:       ; %bb.0:
495; GFX10-NEXT:    s_clause 0x1
496; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
497; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
498; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
499; GFX10-NEXT:    v_mov_b32_e32 v0, s0
500; GFX10-NEXT:    v_mov_b32_e32 v1, s6
501; GFX10-NEXT:    v_mov_b32_e32 v2, s1
502; GFX10-NEXT:    v_mov_b32_e32 v3, s2
503; GFX10-NEXT:    v_mov_b32_e32 v4, s3
504; GFX10-NEXT:    ds_write2_b32 v1, v0, v2 offset1:1
505; GFX10-NEXT:    ds_write2_b32 v1, v3, v4 offset0:2 offset1:3
506; GFX10-NEXT:    s_endpgm
507;
508; GFX11-LABEL: store_lds_v4i32_align8:
509; GFX11:       ; %bb.0:
510; GFX11-NEXT:    s_clause 0x1
511; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
512; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x0
513; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
514; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
515; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
516; GFX11-NEXT:    v_mov_b32_e32 v4, s4
517; GFX11-NEXT:    ds_store_2addr_b64 v4, v[0:1], v[2:3] offset1:1
518; GFX11-NEXT:    s_endpgm
519  store <4 x i32> %x, ptr addrspace(3) %out, align 8
520  ret void
521}
522
; Same <4 x i32> store to addrspace(3), but marked `align 16`.
; All checked targets are expected to emit a single 128-bit DS write
; (ds_write_b128, spelled ds_store_b128 in the GFX11 checks). The expected
; instruction sequences match those of @store_lds_v4i32, the variant in this
; file whose store has no explicit alignment.
523define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i32> %x) {
524; GFX9-LABEL: store_lds_v4i32_align16:
525; GFX9:       ; %bb.0:
526; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
527; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
528; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
529; GFX9-NEXT:    v_mov_b32_e32 v0, s0
530; GFX9-NEXT:    v_mov_b32_e32 v1, s1
531; GFX9-NEXT:    v_mov_b32_e32 v2, s2
532; GFX9-NEXT:    v_mov_b32_e32 v3, s3
533; GFX9-NEXT:    v_mov_b32_e32 v4, s6
534; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
535; GFX9-NEXT:    s_endpgm
536;
537; GFX7-LABEL: store_lds_v4i32_align16:
538; GFX7:       ; %bb.0:
539; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
540; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
541; GFX7-NEXT:    s_mov_b32 m0, -1
542; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
543; GFX7-NEXT:    v_mov_b32_e32 v0, s0
544; GFX7-NEXT:    v_mov_b32_e32 v1, s1
545; GFX7-NEXT:    v_mov_b32_e32 v2, s2
546; GFX7-NEXT:    v_mov_b32_e32 v3, s3
547; GFX7-NEXT:    v_mov_b32_e32 v4, s4
548; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
549; GFX7-NEXT:    s_endpgm
550;
551; GFX10-LABEL: store_lds_v4i32_align16:
552; GFX10:       ; %bb.0:
553; GFX10-NEXT:    s_clause 0x1
554; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
555; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
556; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
557; GFX10-NEXT:    v_mov_b32_e32 v0, s0
558; GFX10-NEXT:    v_mov_b32_e32 v1, s1
559; GFX10-NEXT:    v_mov_b32_e32 v2, s2
560; GFX10-NEXT:    v_mov_b32_e32 v3, s3
561; GFX10-NEXT:    v_mov_b32_e32 v4, s6
562; GFX10-NEXT:    ds_write_b128 v4, v[0:3]
563; GFX10-NEXT:    s_endpgm
564;
565; GFX11-LABEL: store_lds_v4i32_align16:
566; GFX11:       ; %bb.0:
567; GFX11-NEXT:    s_clause 0x1
568; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
569; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x0
570; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
571; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
572; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
573; GFX11-NEXT:    v_mov_b32_e32 v4, s4
574; GFX11-NEXT:    ds_store_b128 v4, v[0:3]
575; GFX11-NEXT:    s_endpgm
576  store <4 x i32> %x, ptr addrspace(3) %out, align 16
577  ret void
578}
579