xref: /llvm-project/llvm/test/CodeGen/AMDGPU/store-local.128.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
7
; Baseline case: stores a <4 x i32> kernel argument through an addrspace(3)
; pointer with no explicit alignment on the store instruction.
; Lowering expected by the autogenerated assertions below:
;   - the GFX9, GFX7 and GFX10 runs emit a single ds_write_b128;
;   - the GFX11 run emits a single ds_store_b128;
;   - the GFX6 run (tahiti) emits one ds_write2_b64 with offset1:1, i.e. the
;     vector is written as two 64-bit register pairs.
8define amdgpu_kernel void @store_lds_v4i32(ptr addrspace(3) %out, <4 x i32> %x) {
9; GFX9-LABEL: store_lds_v4i32:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
12; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
13; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX9-NEXT:    v_mov_b32_e32 v4, s6
15; GFX9-NEXT:    v_mov_b32_e32 v0, s0
16; GFX9-NEXT:    v_mov_b32_e32 v1, s1
17; GFX9-NEXT:    v_mov_b32_e32 v2, s2
18; GFX9-NEXT:    v_mov_b32_e32 v3, s3
19; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
20; GFX9-NEXT:    s_endpgm
21;
22; GFX7-LABEL: store_lds_v4i32:
23; GFX7:       ; %bb.0:
24; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
25; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
26; GFX7-NEXT:    s_mov_b32 m0, -1
27; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
28; GFX7-NEXT:    v_mov_b32_e32 v4, s6
29; GFX7-NEXT:    v_mov_b32_e32 v0, s0
30; GFX7-NEXT:    v_mov_b32_e32 v1, s1
31; GFX7-NEXT:    v_mov_b32_e32 v2, s2
32; GFX7-NEXT:    v_mov_b32_e32 v3, s3
33; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
34; GFX7-NEXT:    s_endpgm
35;
36; GFX6-LABEL: store_lds_v4i32:
37; GFX6:       ; %bb.0:
38; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
39; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x0
40; GFX6-NEXT:    s_mov_b32 m0, -1
41; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
42; GFX6-NEXT:    v_mov_b32_e32 v0, s2
43; GFX6-NEXT:    v_mov_b32_e32 v1, s3
44; GFX6-NEXT:    v_mov_b32_e32 v4, s4
45; GFX6-NEXT:    v_mov_b32_e32 v2, s0
46; GFX6-NEXT:    v_mov_b32_e32 v3, s1
47; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
48; GFX6-NEXT:    s_endpgm
49;
50; GFX10-LABEL: store_lds_v4i32:
51; GFX10:       ; %bb.0:
52; GFX10-NEXT:    s_clause 0x1
53; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
54; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
55; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
56; GFX10-NEXT:    v_mov_b32_e32 v4, s6
57; GFX10-NEXT:    v_mov_b32_e32 v0, s0
58; GFX10-NEXT:    v_mov_b32_e32 v1, s1
59; GFX10-NEXT:    v_mov_b32_e32 v2, s2
60; GFX10-NEXT:    v_mov_b32_e32 v3, s3
61; GFX10-NEXT:    ds_write_b128 v4, v[0:3]
62; GFX10-NEXT:    s_endpgm
63;
64; GFX11-LABEL: store_lds_v4i32:
65; GFX11:       ; %bb.0:
66; GFX11-NEXT:    s_clause 0x1
67; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
68; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
69; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
70; GFX11-NEXT:    v_mov_b32_e32 v4, s6
71; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
72; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
73; GFX11-NEXT:    ds_store_b128 v4, v[0:3]
74; GFX11-NEXT:    s_endpgm
75  store <4 x i32> %x, ptr addrspace(3) %out
76  ret void
77}
78
; Stores a <4 x i32> kernel argument through an addrspace(3) pointer with
; `align 1`. Every run is expected to split the store into sixteen single-byte
; DS stores (ds_write_b8, or ds_store_b8 in the GFX11 run), one per byte
; offset 0..15.
;   - The GFX9, GFX10 and GFX11 runs use the *_d16_hi store variants for the
;     bytes at offsets 2, 6, 10 and 14, and s_lshr_b32 by 8 and by 24 to
;     produce the bytes at the odd offsets.
;   - The GFX7 and GFX6 runs have no d16_hi stores in their expected output;
;     they shift each dword by 8, 16 and 24 with s_lshr_b32 instead.
79define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) {
80; GFX9-LABEL: store_lds_v4i32_align1:
81; GFX9:       ; %bb.0:
82; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
83; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
84; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
85; GFX9-NEXT:    v_mov_b32_e32 v0, s6
86; GFX9-NEXT:    v_mov_b32_e32 v1, s3
87; GFX9-NEXT:    v_mov_b32_e32 v2, s2
88; GFX9-NEXT:    ds_write_b8 v0, v1 offset:12
89; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:14
90; GFX9-NEXT:    ds_write_b8 v0, v2 offset:8
91; GFX9-NEXT:    ds_write_b8_d16_hi v0, v2 offset:10
92; GFX9-NEXT:    v_mov_b32_e32 v1, s1
93; GFX9-NEXT:    ds_write_b8 v0, v1 offset:4
94; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:6
95; GFX9-NEXT:    v_mov_b32_e32 v1, s0
96; GFX9-NEXT:    s_lshr_b32 s4, s3, 8
97; GFX9-NEXT:    ds_write_b8 v0, v1
98; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:2
99; GFX9-NEXT:    v_mov_b32_e32 v1, s4
100; GFX9-NEXT:    s_lshr_b32 s3, s3, 24
101; GFX9-NEXT:    ds_write_b8 v0, v1 offset:13
102; GFX9-NEXT:    v_mov_b32_e32 v1, s3
103; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
104; GFX9-NEXT:    ds_write_b8 v0, v1 offset:15
105; GFX9-NEXT:    v_mov_b32_e32 v1, s3
106; GFX9-NEXT:    s_lshr_b32 s2, s2, 24
107; GFX9-NEXT:    ds_write_b8 v0, v1 offset:9
108; GFX9-NEXT:    v_mov_b32_e32 v1, s2
109; GFX9-NEXT:    s_lshr_b32 s2, s1, 8
110; GFX9-NEXT:    ds_write_b8 v0, v1 offset:11
111; GFX9-NEXT:    v_mov_b32_e32 v1, s2
112; GFX9-NEXT:    s_lshr_b32 s1, s1, 24
113; GFX9-NEXT:    ds_write_b8 v0, v1 offset:5
114; GFX9-NEXT:    v_mov_b32_e32 v1, s1
115; GFX9-NEXT:    s_lshr_b32 s1, s0, 8
116; GFX9-NEXT:    ds_write_b8 v0, v1 offset:7
117; GFX9-NEXT:    v_mov_b32_e32 v1, s1
118; GFX9-NEXT:    s_lshr_b32 s0, s0, 24
119; GFX9-NEXT:    ds_write_b8 v0, v1 offset:1
120; GFX9-NEXT:    v_mov_b32_e32 v1, s0
121; GFX9-NEXT:    ds_write_b8 v0, v1 offset:3
122; GFX9-NEXT:    s_endpgm
123;
124; GFX7-LABEL: store_lds_v4i32_align1:
125; GFX7:       ; %bb.0:
126; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
127; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
128; GFX7-NEXT:    s_mov_b32 m0, -1
129; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
130; GFX7-NEXT:    v_mov_b32_e32 v0, s6
131; GFX7-NEXT:    v_mov_b32_e32 v1, s3
132; GFX7-NEXT:    v_mov_b32_e32 v2, s2
133; GFX7-NEXT:    ds_write_b8 v0, v1 offset:12
134; GFX7-NEXT:    ds_write_b8 v0, v2 offset:8
135; GFX7-NEXT:    v_mov_b32_e32 v1, s1
136; GFX7-NEXT:    ds_write_b8 v0, v1 offset:4
137; GFX7-NEXT:    v_mov_b32_e32 v1, s0
138; GFX7-NEXT:    s_lshr_b32 s4, s3, 8
139; GFX7-NEXT:    ds_write_b8 v0, v1
140; GFX7-NEXT:    v_mov_b32_e32 v1, s4
141; GFX7-NEXT:    s_lshr_b32 s4, s3, 24
142; GFX7-NEXT:    ds_write_b8 v0, v1 offset:13
143; GFX7-NEXT:    v_mov_b32_e32 v1, s4
144; GFX7-NEXT:    s_lshr_b32 s3, s3, 16
145; GFX7-NEXT:    ds_write_b8 v0, v1 offset:15
146; GFX7-NEXT:    v_mov_b32_e32 v1, s3
147; GFX7-NEXT:    s_lshr_b32 s3, s2, 8
148; GFX7-NEXT:    ds_write_b8 v0, v1 offset:14
149; GFX7-NEXT:    v_mov_b32_e32 v1, s3
150; GFX7-NEXT:    s_lshr_b32 s3, s2, 24
151; GFX7-NEXT:    ds_write_b8 v0, v1 offset:9
152; GFX7-NEXT:    v_mov_b32_e32 v1, s3
153; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
154; GFX7-NEXT:    ds_write_b8 v0, v1 offset:11
155; GFX7-NEXT:    v_mov_b32_e32 v1, s2
156; GFX7-NEXT:    s_lshr_b32 s2, s1, 8
157; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
158; GFX7-NEXT:    v_mov_b32_e32 v1, s2
159; GFX7-NEXT:    s_lshr_b32 s2, s1, 24
160; GFX7-NEXT:    ds_write_b8 v0, v1 offset:5
161; GFX7-NEXT:    v_mov_b32_e32 v1, s2
162; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
163; GFX7-NEXT:    ds_write_b8 v0, v1 offset:7
164; GFX7-NEXT:    v_mov_b32_e32 v1, s1
165; GFX7-NEXT:    s_lshr_b32 s1, s0, 8
166; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
167; GFX7-NEXT:    v_mov_b32_e32 v1, s1
168; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
169; GFX7-NEXT:    ds_write_b8 v0, v1 offset:1
170; GFX7-NEXT:    v_mov_b32_e32 v1, s1
171; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
172; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
173; GFX7-NEXT:    v_mov_b32_e32 v1, s0
174; GFX7-NEXT:    ds_write_b8 v0, v1 offset:2
175; GFX7-NEXT:    s_endpgm
176;
177; GFX6-LABEL: store_lds_v4i32_align1:
178; GFX6:       ; %bb.0:
179; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
180; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
181; GFX6-NEXT:    s_mov_b32 m0, -1
182; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX6-NEXT:    v_mov_b32_e32 v0, s6
184; GFX6-NEXT:    v_mov_b32_e32 v1, s3
185; GFX6-NEXT:    v_mov_b32_e32 v2, s2
186; GFX6-NEXT:    ds_write_b8 v0, v1 offset:12
187; GFX6-NEXT:    ds_write_b8 v0, v2 offset:8
188; GFX6-NEXT:    v_mov_b32_e32 v1, s1
189; GFX6-NEXT:    ds_write_b8 v0, v1 offset:4
190; GFX6-NEXT:    v_mov_b32_e32 v1, s0
191; GFX6-NEXT:    s_lshr_b32 s4, s3, 8
192; GFX6-NEXT:    ds_write_b8 v0, v1
193; GFX6-NEXT:    v_mov_b32_e32 v1, s4
194; GFX6-NEXT:    s_lshr_b32 s4, s3, 24
195; GFX6-NEXT:    ds_write_b8 v0, v1 offset:13
196; GFX6-NEXT:    v_mov_b32_e32 v1, s4
197; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
198; GFX6-NEXT:    ds_write_b8 v0, v1 offset:15
199; GFX6-NEXT:    v_mov_b32_e32 v1, s3
200; GFX6-NEXT:    s_lshr_b32 s3, s2, 8
201; GFX6-NEXT:    ds_write_b8 v0, v1 offset:14
202; GFX6-NEXT:    v_mov_b32_e32 v1, s3
203; GFX6-NEXT:    s_lshr_b32 s3, s2, 24
204; GFX6-NEXT:    ds_write_b8 v0, v1 offset:9
205; GFX6-NEXT:    v_mov_b32_e32 v1, s3
206; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
207; GFX6-NEXT:    ds_write_b8 v0, v1 offset:11
208; GFX6-NEXT:    v_mov_b32_e32 v1, s2
209; GFX6-NEXT:    s_lshr_b32 s2, s1, 8
210; GFX6-NEXT:    ds_write_b8 v0, v1 offset:10
211; GFX6-NEXT:    v_mov_b32_e32 v1, s2
212; GFX6-NEXT:    s_lshr_b32 s2, s1, 24
213; GFX6-NEXT:    ds_write_b8 v0, v1 offset:5
214; GFX6-NEXT:    v_mov_b32_e32 v1, s2
215; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
216; GFX6-NEXT:    ds_write_b8 v0, v1 offset:7
217; GFX6-NEXT:    v_mov_b32_e32 v1, s1
218; GFX6-NEXT:    s_lshr_b32 s1, s0, 8
219; GFX6-NEXT:    ds_write_b8 v0, v1 offset:6
220; GFX6-NEXT:    v_mov_b32_e32 v1, s1
221; GFX6-NEXT:    s_lshr_b32 s1, s0, 24
222; GFX6-NEXT:    ds_write_b8 v0, v1 offset:1
223; GFX6-NEXT:    v_mov_b32_e32 v1, s1
224; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
225; GFX6-NEXT:    ds_write_b8 v0, v1 offset:3
226; GFX6-NEXT:    v_mov_b32_e32 v1, s0
227; GFX6-NEXT:    ds_write_b8 v0, v1 offset:2
228; GFX6-NEXT:    s_endpgm
229;
230; GFX10-LABEL: store_lds_v4i32_align1:
231; GFX10:       ; %bb.0:
232; GFX10-NEXT:    s_clause 0x1
233; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
234; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
235; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
236; GFX10-NEXT:    v_mov_b32_e32 v0, s6
237; GFX10-NEXT:    v_mov_b32_e32 v1, s3
238; GFX10-NEXT:    v_mov_b32_e32 v2, s2
239; GFX10-NEXT:    s_lshr_b32 s5, s2, 8
240; GFX10-NEXT:    s_lshr_b32 s2, s2, 24
241; GFX10-NEXT:    v_mov_b32_e32 v3, s1
242; GFX10-NEXT:    s_lshr_b32 s4, s3, 8
243; GFX10-NEXT:    s_lshr_b32 s3, s3, 24
244; GFX10-NEXT:    s_lshr_b32 s6, s1, 8
245; GFX10-NEXT:    s_lshr_b32 s1, s1, 24
246; GFX10-NEXT:    v_mov_b32_e32 v8, s2
247; GFX10-NEXT:    v_mov_b32_e32 v4, s0
248; GFX10-NEXT:    v_mov_b32_e32 v5, s4
249; GFX10-NEXT:    v_mov_b32_e32 v6, s3
250; GFX10-NEXT:    v_mov_b32_e32 v7, s5
251; GFX10-NEXT:    v_mov_b32_e32 v9, s6
252; GFX10-NEXT:    ds_write_b8 v0, v1 offset:12
253; GFX10-NEXT:    ds_write_b8_d16_hi v0, v1 offset:14
254; GFX10-NEXT:    ds_write_b8 v0, v2 offset:8
255; GFX10-NEXT:    ds_write_b8_d16_hi v0, v2 offset:10
256; GFX10-NEXT:    ds_write_b8 v0, v3 offset:4
257; GFX10-NEXT:    ds_write_b8_d16_hi v0, v3 offset:6
258; GFX10-NEXT:    ds_write_b8 v0, v4
259; GFX10-NEXT:    ds_write_b8_d16_hi v0, v4 offset:2
260; GFX10-NEXT:    ds_write_b8 v0, v5 offset:13
261; GFX10-NEXT:    ds_write_b8 v0, v6 offset:15
262; GFX10-NEXT:    ds_write_b8 v0, v7 offset:9
263; GFX10-NEXT:    v_mov_b32_e32 v1, s1
264; GFX10-NEXT:    s_lshr_b32 s1, s0, 8
265; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
266; GFX10-NEXT:    v_mov_b32_e32 v2, s1
267; GFX10-NEXT:    v_mov_b32_e32 v3, s0
268; GFX10-NEXT:    ds_write_b8 v0, v8 offset:11
269; GFX10-NEXT:    ds_write_b8 v0, v9 offset:5
270; GFX10-NEXT:    ds_write_b8 v0, v1 offset:7
271; GFX10-NEXT:    ds_write_b8 v0, v2 offset:1
272; GFX10-NEXT:    ds_write_b8 v0, v3 offset:3
273; GFX10-NEXT:    s_endpgm
274;
275; GFX11-LABEL: store_lds_v4i32_align1:
276; GFX11:       ; %bb.0:
277; GFX11-NEXT:    s_clause 0x1
278; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
279; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
280; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
281; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s3
282; GFX11-NEXT:    s_lshr_b32 s4, s3, 8
283; GFX11-NEXT:    s_lshr_b32 s3, s3, 24
284; GFX11-NEXT:    s_lshr_b32 s5, s2, 8
285; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s1
286; GFX11-NEXT:    s_lshr_b32 s2, s2, 24
287; GFX11-NEXT:    s_lshr_b32 s6, s1, 8
288; GFX11-NEXT:    v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v7, s5
289; GFX11-NEXT:    v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v9, s6
290; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s4
291; GFX11-NEXT:    s_lshr_b32 s1, s1, 24
292; GFX11-NEXT:    s_lshr_b32 s7, s0, 8
293; GFX11-NEXT:    s_lshr_b32 s0, s0, 24
294; GFX11-NEXT:    ds_store_b8 v0, v2 offset:8
295; GFX11-NEXT:    ds_store_b8_d16_hi v0, v2 offset:10
296; GFX11-NEXT:    ds_store_b8 v0, v1 offset:12
297; GFX11-NEXT:    ds_store_b8 v0, v4
298; GFX11-NEXT:    ds_store_b8_d16_hi v0, v4 offset:2
299; GFX11-NEXT:    ds_store_b8 v0, v3 offset:4
300; GFX11-NEXT:    ds_store_b8 v0, v5 offset:13
301; GFX11-NEXT:    ds_store_b8_d16_hi v0, v1 offset:14
302; GFX11-NEXT:    ds_store_b8 v0, v6 offset:15
303; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v10, s7
304; GFX11-NEXT:    v_mov_b32_e32 v11, s0
305; GFX11-NEXT:    ds_store_b8 v0, v7 offset:9
306; GFX11-NEXT:    ds_store_b8 v0, v8 offset:11
307; GFX11-NEXT:    ds_store_b8 v0, v9 offset:5
308; GFX11-NEXT:    ds_store_b8_d16_hi v0, v3 offset:6
309; GFX11-NEXT:    ds_store_b8 v0, v1 offset:7
310; GFX11-NEXT:    ds_store_b8 v0, v10 offset:1
311; GFX11-NEXT:    ds_store_b8 v0, v11 offset:3
312; GFX11-NEXT:    s_endpgm
313  store <4 x i32> %x, ptr addrspace(3) %out, align 1
314  ret void
315}
316
; Stores a <4 x i32> kernel argument through an addrspace(3) pointer with
; `align 2`. Every run is expected to split the store into eight 16-bit DS
; stores (ds_write_b16, or ds_store_b16 in the GFX11 run) at byte offsets
; 0, 2, ..., 14.
;   - The GFX9, GFX10 and GFX11 runs store the upper half of each dword with
;     the *_d16_hi variants, so no shift instructions appear.
;   - The GFX7 and GFX6 runs have no d16_hi stores in their expected output;
;     they extract each upper half with s_lshr_b32 by 16 first.
317define amdgpu_kernel void @store_lds_v4i32_align2(ptr addrspace(3) %out, <4 x i32> %x) {
318; GFX9-LABEL: store_lds_v4i32_align2:
319; GFX9:       ; %bb.0:
320; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
321; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
322; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX9-NEXT:    v_mov_b32_e32 v0, s6
324; GFX9-NEXT:    v_mov_b32_e32 v1, s3
325; GFX9-NEXT:    v_mov_b32_e32 v2, s2
326; GFX9-NEXT:    ds_write_b16 v0, v1 offset:12
327; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:14
328; GFX9-NEXT:    ds_write_b16 v0, v2 offset:8
329; GFX9-NEXT:    ds_write_b16_d16_hi v0, v2 offset:10
330; GFX9-NEXT:    v_mov_b32_e32 v1, s1
331; GFX9-NEXT:    ds_write_b16 v0, v1 offset:4
332; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:6
333; GFX9-NEXT:    v_mov_b32_e32 v1, s0
334; GFX9-NEXT:    ds_write_b16 v0, v1
335; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:2
336; GFX9-NEXT:    s_endpgm
337;
338; GFX7-LABEL: store_lds_v4i32_align2:
339; GFX7:       ; %bb.0:
340; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
341; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
342; GFX7-NEXT:    s_mov_b32 m0, -1
343; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
344; GFX7-NEXT:    v_mov_b32_e32 v0, s6
345; GFX7-NEXT:    v_mov_b32_e32 v1, s3
346; GFX7-NEXT:    v_mov_b32_e32 v2, s2
347; GFX7-NEXT:    ds_write_b16 v0, v1 offset:12
348; GFX7-NEXT:    ds_write_b16 v0, v2 offset:8
349; GFX7-NEXT:    v_mov_b32_e32 v1, s1
350; GFX7-NEXT:    ds_write_b16 v0, v1 offset:4
351; GFX7-NEXT:    v_mov_b32_e32 v1, s0
352; GFX7-NEXT:    s_lshr_b32 s3, s3, 16
353; GFX7-NEXT:    ds_write_b16 v0, v1
354; GFX7-NEXT:    v_mov_b32_e32 v1, s3
355; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
356; GFX7-NEXT:    ds_write_b16 v0, v1 offset:14
357; GFX7-NEXT:    v_mov_b32_e32 v1, s2
358; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
359; GFX7-NEXT:    ds_write_b16 v0, v1 offset:10
360; GFX7-NEXT:    v_mov_b32_e32 v1, s1
361; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
362; GFX7-NEXT:    ds_write_b16 v0, v1 offset:6
363; GFX7-NEXT:    v_mov_b32_e32 v1, s0
364; GFX7-NEXT:    ds_write_b16 v0, v1 offset:2
365; GFX7-NEXT:    s_endpgm
366;
367; GFX6-LABEL: store_lds_v4i32_align2:
368; GFX6:       ; %bb.0:
369; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
370; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
371; GFX6-NEXT:    s_mov_b32 m0, -1
372; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
373; GFX6-NEXT:    v_mov_b32_e32 v0, s6
374; GFX6-NEXT:    v_mov_b32_e32 v1, s3
375; GFX6-NEXT:    v_mov_b32_e32 v2, s2
376; GFX6-NEXT:    ds_write_b16 v0, v1 offset:12
377; GFX6-NEXT:    ds_write_b16 v0, v2 offset:8
378; GFX6-NEXT:    v_mov_b32_e32 v1, s1
379; GFX6-NEXT:    ds_write_b16 v0, v1 offset:4
380; GFX6-NEXT:    v_mov_b32_e32 v1, s0
381; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
382; GFX6-NEXT:    ds_write_b16 v0, v1
383; GFX6-NEXT:    v_mov_b32_e32 v1, s3
384; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
385; GFX6-NEXT:    ds_write_b16 v0, v1 offset:14
386; GFX6-NEXT:    v_mov_b32_e32 v1, s2
387; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
388; GFX6-NEXT:    ds_write_b16 v0, v1 offset:10
389; GFX6-NEXT:    v_mov_b32_e32 v1, s1
390; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
391; GFX6-NEXT:    ds_write_b16 v0, v1 offset:6
392; GFX6-NEXT:    v_mov_b32_e32 v1, s0
393; GFX6-NEXT:    ds_write_b16 v0, v1 offset:2
394; GFX6-NEXT:    s_endpgm
395;
396; GFX10-LABEL: store_lds_v4i32_align2:
397; GFX10:       ; %bb.0:
398; GFX10-NEXT:    s_clause 0x1
399; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
400; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
401; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
402; GFX10-NEXT:    v_mov_b32_e32 v0, s6
403; GFX10-NEXT:    v_mov_b32_e32 v1, s3
404; GFX10-NEXT:    v_mov_b32_e32 v2, s2
405; GFX10-NEXT:    v_mov_b32_e32 v3, s1
406; GFX10-NEXT:    v_mov_b32_e32 v4, s0
407; GFX10-NEXT:    ds_write_b16 v0, v1 offset:12
408; GFX10-NEXT:    ds_write_b16_d16_hi v0, v1 offset:14
409; GFX10-NEXT:    ds_write_b16 v0, v2 offset:8
410; GFX10-NEXT:    ds_write_b16_d16_hi v0, v2 offset:10
411; GFX10-NEXT:    ds_write_b16 v0, v3 offset:4
412; GFX10-NEXT:    ds_write_b16_d16_hi v0, v3 offset:6
413; GFX10-NEXT:    ds_write_b16 v0, v4
414; GFX10-NEXT:    ds_write_b16_d16_hi v0, v4 offset:2
415; GFX10-NEXT:    s_endpgm
416;
417; GFX11-LABEL: store_lds_v4i32_align2:
418; GFX11:       ; %bb.0:
419; GFX11-NEXT:    s_clause 0x1
420; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
421; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
422; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
423; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s3
424; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
425; GFX11-NEXT:    v_mov_b32_e32 v4, s2
426; GFX11-NEXT:    ds_store_b16_d16_hi v0, v1 offset:14
427; GFX11-NEXT:    ds_store_b16 v0, v2
428; GFX11-NEXT:    ds_store_b16 v0, v3 offset:4
429; GFX11-NEXT:    ds_store_b16 v0, v4 offset:8
430; GFX11-NEXT:    ds_store_b16 v0, v1 offset:12
431; GFX11-NEXT:    ds_store_b16_d16_hi v0, v4 offset:10
432; GFX11-NEXT:    ds_store_b16_d16_hi v0, v3 offset:6
433; GFX11-NEXT:    ds_store_b16_d16_hi v0, v2 offset:2
434; GFX11-NEXT:    s_endpgm
435  store <4 x i32> %x, ptr addrspace(3) %out, align 2
436  ret void
437}
438
; Stores a <4 x i32> kernel argument through an addrspace(3) pointer with
; `align 4`. Every run is expected to emit two paired 32-bit DS stores:
; ds_write2_b32 (ds_store_2addr_b32 in the GFX11 run), one with offset1:1 and
; one with offset0:2 offset1:3. The order of the two stores differs between
; runs (the GFX6 and GFX10 runs emit the offset0:2 pair first).
439define amdgpu_kernel void @store_lds_v4i32_align4(ptr addrspace(3) %out, <4 x i32> %x) {
440; GFX9-LABEL: store_lds_v4i32_align4:
441; GFX9:       ; %bb.0:
442; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
443; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
444; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
445; GFX9-NEXT:    v_mov_b32_e32 v0, s6
446; GFX9-NEXT:    v_mov_b32_e32 v1, s0
447; GFX9-NEXT:    v_mov_b32_e32 v2, s1
448; GFX9-NEXT:    v_mov_b32_e32 v3, s2
449; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
450; GFX9-NEXT:    v_mov_b32_e32 v1, s3
451; GFX9-NEXT:    ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
452; GFX9-NEXT:    s_endpgm
453;
454; GFX7-LABEL: store_lds_v4i32_align4:
455; GFX7:       ; %bb.0:
456; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
457; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
458; GFX7-NEXT:    s_mov_b32 m0, -1
459; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
460; GFX7-NEXT:    v_mov_b32_e32 v0, s6
461; GFX7-NEXT:    v_mov_b32_e32 v1, s0
462; GFX7-NEXT:    v_mov_b32_e32 v2, s1
463; GFX7-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
464; GFX7-NEXT:    v_mov_b32_e32 v1, s2
465; GFX7-NEXT:    v_mov_b32_e32 v2, s3
466; GFX7-NEXT:    ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
467; GFX7-NEXT:    s_endpgm
468;
469; GFX6-LABEL: store_lds_v4i32_align4:
470; GFX6:       ; %bb.0:
471; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
472; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
473; GFX6-NEXT:    s_mov_b32 m0, -1
474; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
475; GFX6-NEXT:    v_mov_b32_e32 v0, s6
476; GFX6-NEXT:    v_mov_b32_e32 v1, s2
477; GFX6-NEXT:    v_mov_b32_e32 v2, s3
478; GFX6-NEXT:    ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
479; GFX6-NEXT:    v_mov_b32_e32 v1, s0
480; GFX6-NEXT:    v_mov_b32_e32 v2, s1
481; GFX6-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
482; GFX6-NEXT:    s_endpgm
483;
484; GFX10-LABEL: store_lds_v4i32_align4:
485; GFX10:       ; %bb.0:
486; GFX10-NEXT:    s_clause 0x1
487; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
488; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
489; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
490; GFX10-NEXT:    v_mov_b32_e32 v0, s6
491; GFX10-NEXT:    v_mov_b32_e32 v1, s2
492; GFX10-NEXT:    v_mov_b32_e32 v2, s3
493; GFX10-NEXT:    v_mov_b32_e32 v3, s0
494; GFX10-NEXT:    v_mov_b32_e32 v4, s1
495; GFX10-NEXT:    ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
496; GFX10-NEXT:    ds_write2_b32 v0, v3, v4 offset1:1
497; GFX10-NEXT:    s_endpgm
498;
499; GFX11-LABEL: store_lds_v4i32_align4:
500; GFX11:       ; %bb.0:
501; GFX11-NEXT:    s_clause 0x1
502; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
503; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
504; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
505; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s0
506; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
507; GFX11-NEXT:    v_mov_b32_e32 v4, s3
508; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset1:1
509; GFX11-NEXT:    ds_store_2addr_b32 v0, v3, v4 offset0:2 offset1:3
510; GFX11-NEXT:    s_endpgm
511  store <4 x i32> %x, ptr addrspace(3) %out, align 4
512  ret void
513}
514
; Stores a <4 x i32> kernel argument through an addrspace(3) pointer with
; `align 8`. Every run is expected to emit a single paired 64-bit DS store
; with offset1:1: ds_write2_b64 in the GFX9, GFX7, GFX6 and GFX10 runs, and
; ds_store_2addr_b64 in the GFX11 run. No run uses a 128-bit store here.
515define amdgpu_kernel void @store_lds_v4i32_align8(ptr addrspace(3) %out, <4 x i32> %x) {
516; GFX9-LABEL: store_lds_v4i32_align8:
517; GFX9:       ; %bb.0:
518; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
519; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
520; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
521; GFX9-NEXT:    v_mov_b32_e32 v4, s6
522; GFX9-NEXT:    v_mov_b32_e32 v0, s0
523; GFX9-NEXT:    v_mov_b32_e32 v2, s2
524; GFX9-NEXT:    v_mov_b32_e32 v1, s1
525; GFX9-NEXT:    v_mov_b32_e32 v3, s3
526; GFX9-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
527; GFX9-NEXT:    s_endpgm
528;
529; GFX7-LABEL: store_lds_v4i32_align8:
530; GFX7:       ; %bb.0:
531; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
532; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
533; GFX7-NEXT:    s_mov_b32 m0, -1
534; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
535; GFX7-NEXT:    v_mov_b32_e32 v4, s6
536; GFX7-NEXT:    v_mov_b32_e32 v0, s0
537; GFX7-NEXT:    v_mov_b32_e32 v2, s2
538; GFX7-NEXT:    v_mov_b32_e32 v1, s1
539; GFX7-NEXT:    v_mov_b32_e32 v3, s3
540; GFX7-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
541; GFX7-NEXT:    s_endpgm
542;
543; GFX6-LABEL: store_lds_v4i32_align8:
544; GFX6:       ; %bb.0:
545; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
546; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x0
547; GFX6-NEXT:    s_mov_b32 m0, -1
548; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
549; GFX6-NEXT:    v_mov_b32_e32 v0, s0
550; GFX6-NEXT:    v_mov_b32_e32 v1, s1
551; GFX6-NEXT:    v_mov_b32_e32 v4, s4
552; GFX6-NEXT:    v_mov_b32_e32 v2, s2
553; GFX6-NEXT:    v_mov_b32_e32 v3, s3
554; GFX6-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
555; GFX6-NEXT:    s_endpgm
556;
557; GFX10-LABEL: store_lds_v4i32_align8:
558; GFX10:       ; %bb.0:
559; GFX10-NEXT:    s_clause 0x1
560; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
561; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
562; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
563; GFX10-NEXT:    v_mov_b32_e32 v0, s0
564; GFX10-NEXT:    v_mov_b32_e32 v1, s1
565; GFX10-NEXT:    v_mov_b32_e32 v4, s6
566; GFX10-NEXT:    v_mov_b32_e32 v2, s2
567; GFX10-NEXT:    v_mov_b32_e32 v3, s3
568; GFX10-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
569; GFX10-NEXT:    s_endpgm
570;
571; GFX11-LABEL: store_lds_v4i32_align8:
572; GFX11:       ; %bb.0:
573; GFX11-NEXT:    s_clause 0x1
574; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
575; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
576; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
577; GFX11-NEXT:    v_mov_b32_e32 v4, s6
578; GFX11-NEXT:    v_mov_b32_e32 v0, s0
579; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
580; GFX11-NEXT:    v_mov_b32_e32 v1, s1
581; GFX11-NEXT:    ds_store_2addr_b64 v4, v[0:1], v[2:3] offset1:1
582; GFX11-NEXT:    s_endpgm
583  store <4 x i32> %x, ptr addrspace(3) %out, align 8
584  ret void
585}
586
; Stores a <4 x i32> kernel argument through an addrspace(3) pointer with an
; explicit `align 16`.
; Lowering expected by the autogenerated assertions below:
;   - the GFX9, GFX7 and GFX10 runs emit a single ds_write_b128;
;   - the GFX11 run emits a single ds_store_b128;
;   - the GFX6 run (tahiti) emits one ds_write2_b64 with offset1:1 rather
;     than a 128-bit store.
587define amdgpu_kernel void @store_lds_v4i32_align16(ptr addrspace(3) %out, <4 x i32> %x) {
588; GFX9-LABEL: store_lds_v4i32_align16:
589; GFX9:       ; %bb.0:
590; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
591; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
592; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
593; GFX9-NEXT:    v_mov_b32_e32 v4, s6
594; GFX9-NEXT:    v_mov_b32_e32 v0, s0
595; GFX9-NEXT:    v_mov_b32_e32 v1, s1
596; GFX9-NEXT:    v_mov_b32_e32 v2, s2
597; GFX9-NEXT:    v_mov_b32_e32 v3, s3
598; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
599; GFX9-NEXT:    s_endpgm
600;
601; GFX7-LABEL: store_lds_v4i32_align16:
602; GFX7:       ; %bb.0:
603; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
604; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
605; GFX7-NEXT:    s_mov_b32 m0, -1
606; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
607; GFX7-NEXT:    v_mov_b32_e32 v4, s6
608; GFX7-NEXT:    v_mov_b32_e32 v0, s0
609; GFX7-NEXT:    v_mov_b32_e32 v1, s1
610; GFX7-NEXT:    v_mov_b32_e32 v2, s2
611; GFX7-NEXT:    v_mov_b32_e32 v3, s3
612; GFX7-NEXT:    ds_write_b128 v4, v[0:3]
613; GFX7-NEXT:    s_endpgm
614;
615; GFX6-LABEL: store_lds_v4i32_align16:
616; GFX6:       ; %bb.0:
617; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
618; GFX6-NEXT:    s_load_dword s4, s[4:5], 0x0
619; GFX6-NEXT:    s_mov_b32 m0, -1
620; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
621; GFX6-NEXT:    v_mov_b32_e32 v0, s2
622; GFX6-NEXT:    v_mov_b32_e32 v1, s3
623; GFX6-NEXT:    v_mov_b32_e32 v4, s4
624; GFX6-NEXT:    v_mov_b32_e32 v2, s0
625; GFX6-NEXT:    v_mov_b32_e32 v3, s1
626; GFX6-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
627; GFX6-NEXT:    s_endpgm
628;
629; GFX10-LABEL: store_lds_v4i32_align16:
630; GFX10:       ; %bb.0:
631; GFX10-NEXT:    s_clause 0x1
632; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
633; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
634; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
635; GFX10-NEXT:    v_mov_b32_e32 v4, s6
636; GFX10-NEXT:    v_mov_b32_e32 v0, s0
637; GFX10-NEXT:    v_mov_b32_e32 v1, s1
638; GFX10-NEXT:    v_mov_b32_e32 v2, s2
639; GFX10-NEXT:    v_mov_b32_e32 v3, s3
640; GFX10-NEXT:    ds_write_b128 v4, v[0:3]
641; GFX10-NEXT:    s_endpgm
642;
643; GFX11-LABEL: store_lds_v4i32_align16:
644; GFX11:       ; %bb.0:
645; GFX11-NEXT:    s_clause 0x1
646; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
647; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
648; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
649; GFX11-NEXT:    v_mov_b32_e32 v4, s6
650; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
651; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
652; GFX11-NEXT:    ds_store_b128 v4, v[0:3]
653; GFX11-NEXT:    s_endpgm
654  store <4 x i32> %x, ptr addrspace(3) %out, align 16
655  ret void
656}
657