xref: /llvm-project/llvm/test/CodeGen/AMDGPU/store-local.96.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
7
; Store a <3 x i32> kernel argument through the addrspace(3) pointer %out with
; no alignment given on the store instruction.
;
; What the generated assertions below pin down, per -mcpu value:
;   * gfx900, hawaii, gfx1010: one 96-bit DS write (ds_write_b96).
;   * gfx1100: one 96-bit DS write, spelled ds_store_b96.
;   * tahiti: no 96-bit write; a ds_write_b32 at offset 8 and a ds_write_b64.
; hawaii and tahiti additionally set m0 to -1 before the DS write.
;
; The assertion lines are produced by utils/update_llc_test_checks.py (see the
; first line of this file); regenerate them with that script rather than
; editing them by hand.
8define amdgpu_kernel void @store_lds_v3i32(ptr addrspace(3) %out, <3 x i32> %x) {
9; GFX9-LABEL: store_lds_v3i32:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
12; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
13; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x0
14; GFX9-NEXT:    ; kill: killed $sgpr4_sgpr5
15; GFX9-NEXT:    v_mov_b32_e32 v0, s0
16; GFX9-NEXT:    v_mov_b32_e32 v1, s1
17; GFX9-NEXT:    v_mov_b32_e32 v2, s2
18; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
19; GFX9-NEXT:    v_mov_b32_e32 v3, s3
20; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
21; GFX9-NEXT:    s_endpgm
22;
23; GFX7-LABEL: store_lds_v3i32:
24; GFX7:       ; %bb.0:
25; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
26; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
27; GFX7-NEXT:    s_load_dword s3, s[4:5], 0x0
28; GFX7-NEXT:    s_mov_b32 m0, -1
29; GFX7-NEXT:    v_mov_b32_e32 v0, s0
30; GFX7-NEXT:    v_mov_b32_e32 v1, s1
31; GFX7-NEXT:    v_mov_b32_e32 v2, s2
32; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX7-NEXT:    v_mov_b32_e32 v3, s3
34; GFX7-NEXT:    ds_write_b96 v3, v[0:2]
35; GFX7-NEXT:    s_endpgm
36;
37; GFX6-LABEL: store_lds_v3i32:
38; GFX6:       ; %bb.0:
39; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
40; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
41; GFX6-NEXT:    s_mov_b32 m0, -1
42; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
43; GFX6-NEXT:    v_mov_b32_e32 v2, s6
44; GFX6-NEXT:    v_mov_b32_e32 v1, s2
45; GFX6-NEXT:    v_mov_b32_e32 v0, s0
46; GFX6-NEXT:    ds_write_b32 v2, v1 offset:8
47; GFX6-NEXT:    v_mov_b32_e32 v1, s1
48; GFX6-NEXT:    ds_write_b64 v2, v[0:1]
49; GFX6-NEXT:    s_endpgm
50;
51; GFX10-LABEL: store_lds_v3i32:
52; GFX10:       ; %bb.0:
53; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
54; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
55; GFX10-NEXT:    s_load_dword s3, s[4:5], 0x0
56; GFX10-NEXT:    ; kill: killed $sgpr4_sgpr5
57; GFX10-NEXT:    v_mov_b32_e32 v0, s0
58; GFX10-NEXT:    v_mov_b32_e32 v1, s1
59; GFX10-NEXT:    v_mov_b32_e32 v2, s2
60; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
61; GFX10-NEXT:    v_mov_b32_e32 v3, s3
62; GFX10-NEXT:    ds_write_b96 v3, v[0:2]
63; GFX10-NEXT:    s_endpgm
64;
65; GFX11-LABEL: store_lds_v3i32:
66; GFX11:       ; %bb.0:
67; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
68; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x0
70; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
71; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
72; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
73; GFX11-NEXT:    ds_store_b96 v3, v[0:2]
74; GFX11-NEXT:    s_endpgm
75  store <3 x i32> %x, ptr addrspace(3) %out
76  ret void
77}
78
; Store a <3 x i32> kernel argument through the addrspace(3) pointer %out with
; `align 1` on the store.
;
; The generated assertions below expect twelve single-byte DS writes on every
; target (offsets 0 through 11):
;   * gfx900, gfx1010, gfx1100: byte 2 of each dword (offsets 2, 6, 10) is
;     written with the d16_hi form (ds_write_b8_d16_hi / ds_store_b8_d16_hi)
;     from the VGPR that holds the whole dword; bytes 1 and 3 come from
;     s_lshr_b32 by 8 and by 24.
;   * hawaii, tahiti: no d16_hi form is used; every non-zero byte is produced
;     with s_lshr_b32 (by 8, 16 or 24) and copied to a VGPR before its
;     ds_write_b8. m0 is set to -1 first.
;
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
79define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) {
80; GFX9-LABEL: store_lds_v3i32_align1:
81; GFX9:       ; %bb.0:
82; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
83; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
84; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
85; GFX9-NEXT:    v_mov_b32_e32 v0, s6
86; GFX9-NEXT:    v_mov_b32_e32 v1, s2
87; GFX9-NEXT:    v_mov_b32_e32 v2, s1
88; GFX9-NEXT:    ds_write_b8 v0, v1 offset:8
89; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:10
90; GFX9-NEXT:    ds_write_b8 v0, v2 offset:4
91; GFX9-NEXT:    ds_write_b8_d16_hi v0, v2 offset:6
92; GFX9-NEXT:    v_mov_b32_e32 v1, s0
93; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
94; GFX9-NEXT:    ds_write_b8 v0, v1
95; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:2
96; GFX9-NEXT:    v_mov_b32_e32 v1, s3
97; GFX9-NEXT:    s_lshr_b32 s2, s2, 24
98; GFX9-NEXT:    ds_write_b8 v0, v1 offset:9
99; GFX9-NEXT:    v_mov_b32_e32 v1, s2
100; GFX9-NEXT:    s_lshr_b32 s2, s1, 8
101; GFX9-NEXT:    ds_write_b8 v0, v1 offset:11
102; GFX9-NEXT:    v_mov_b32_e32 v1, s2
103; GFX9-NEXT:    s_lshr_b32 s1, s1, 24
104; GFX9-NEXT:    ds_write_b8 v0, v1 offset:5
105; GFX9-NEXT:    v_mov_b32_e32 v1, s1
106; GFX9-NEXT:    s_lshr_b32 s1, s0, 8
107; GFX9-NEXT:    ds_write_b8 v0, v1 offset:7
108; GFX9-NEXT:    v_mov_b32_e32 v1, s1
109; GFX9-NEXT:    s_lshr_b32 s0, s0, 24
110; GFX9-NEXT:    ds_write_b8 v0, v1 offset:1
111; GFX9-NEXT:    v_mov_b32_e32 v1, s0
112; GFX9-NEXT:    ds_write_b8 v0, v1 offset:3
113; GFX9-NEXT:    s_endpgm
114;
115; GFX7-LABEL: store_lds_v3i32_align1:
116; GFX7:       ; %bb.0:
117; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
118; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
119; GFX7-NEXT:    s_mov_b32 m0, -1
120; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
121; GFX7-NEXT:    v_mov_b32_e32 v0, s6
122; GFX7-NEXT:    v_mov_b32_e32 v1, s2
123; GFX7-NEXT:    v_mov_b32_e32 v2, s1
124; GFX7-NEXT:    ds_write_b8 v0, v1 offset:8
125; GFX7-NEXT:    ds_write_b8 v0, v2 offset:4
126; GFX7-NEXT:    v_mov_b32_e32 v1, s0
127; GFX7-NEXT:    s_lshr_b32 s3, s2, 8
128; GFX7-NEXT:    ds_write_b8 v0, v1
129; GFX7-NEXT:    v_mov_b32_e32 v1, s3
130; GFX7-NEXT:    s_lshr_b32 s3, s2, 24
131; GFX7-NEXT:    ds_write_b8 v0, v1 offset:9
132; GFX7-NEXT:    v_mov_b32_e32 v1, s3
133; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
134; GFX7-NEXT:    ds_write_b8 v0, v1 offset:11
135; GFX7-NEXT:    v_mov_b32_e32 v1, s2
136; GFX7-NEXT:    s_lshr_b32 s2, s1, 8
137; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
138; GFX7-NEXT:    v_mov_b32_e32 v1, s2
139; GFX7-NEXT:    s_lshr_b32 s2, s1, 24
140; GFX7-NEXT:    ds_write_b8 v0, v1 offset:5
141; GFX7-NEXT:    v_mov_b32_e32 v1, s2
142; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
143; GFX7-NEXT:    ds_write_b8 v0, v1 offset:7
144; GFX7-NEXT:    v_mov_b32_e32 v1, s1
145; GFX7-NEXT:    s_lshr_b32 s1, s0, 8
146; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
147; GFX7-NEXT:    v_mov_b32_e32 v1, s1
148; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
149; GFX7-NEXT:    ds_write_b8 v0, v1 offset:1
150; GFX7-NEXT:    v_mov_b32_e32 v1, s1
151; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
152; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
153; GFX7-NEXT:    v_mov_b32_e32 v1, s0
154; GFX7-NEXT:    ds_write_b8 v0, v1 offset:2
155; GFX7-NEXT:    s_endpgm
156;
157; GFX6-LABEL: store_lds_v3i32_align1:
158; GFX6:       ; %bb.0:
159; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
160; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
161; GFX6-NEXT:    s_mov_b32 m0, -1
162; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
163; GFX6-NEXT:    v_mov_b32_e32 v0, s6
164; GFX6-NEXT:    v_mov_b32_e32 v1, s2
165; GFX6-NEXT:    v_mov_b32_e32 v2, s1
166; GFX6-NEXT:    ds_write_b8 v0, v1 offset:8
167; GFX6-NEXT:    ds_write_b8 v0, v2 offset:4
168; GFX6-NEXT:    v_mov_b32_e32 v1, s0
169; GFX6-NEXT:    s_lshr_b32 s3, s2, 8
170; GFX6-NEXT:    ds_write_b8 v0, v1
171; GFX6-NEXT:    v_mov_b32_e32 v1, s3
172; GFX6-NEXT:    s_lshr_b32 s3, s2, 24
173; GFX6-NEXT:    ds_write_b8 v0, v1 offset:9
174; GFX6-NEXT:    v_mov_b32_e32 v1, s3
175; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
176; GFX6-NEXT:    ds_write_b8 v0, v1 offset:11
177; GFX6-NEXT:    v_mov_b32_e32 v1, s2
178; GFX6-NEXT:    s_lshr_b32 s2, s1, 8
179; GFX6-NEXT:    ds_write_b8 v0, v1 offset:10
180; GFX6-NEXT:    v_mov_b32_e32 v1, s2
181; GFX6-NEXT:    s_lshr_b32 s2, s1, 24
182; GFX6-NEXT:    ds_write_b8 v0, v1 offset:5
183; GFX6-NEXT:    v_mov_b32_e32 v1, s2
184; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
185; GFX6-NEXT:    ds_write_b8 v0, v1 offset:7
186; GFX6-NEXT:    v_mov_b32_e32 v1, s1
187; GFX6-NEXT:    s_lshr_b32 s1, s0, 8
188; GFX6-NEXT:    ds_write_b8 v0, v1 offset:6
189; GFX6-NEXT:    v_mov_b32_e32 v1, s1
190; GFX6-NEXT:    s_lshr_b32 s1, s0, 24
191; GFX6-NEXT:    ds_write_b8 v0, v1 offset:1
192; GFX6-NEXT:    v_mov_b32_e32 v1, s1
193; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
194; GFX6-NEXT:    ds_write_b8 v0, v1 offset:3
195; GFX6-NEXT:    v_mov_b32_e32 v1, s0
196; GFX6-NEXT:    ds_write_b8 v0, v1 offset:2
197; GFX6-NEXT:    s_endpgm
198;
199; GFX10-LABEL: store_lds_v3i32_align1:
200; GFX10:       ; %bb.0:
201; GFX10-NEXT:    s_clause 0x1
202; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
203; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
204; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX10-NEXT:    v_mov_b32_e32 v0, s6
206; GFX10-NEXT:    v_mov_b32_e32 v1, s2
207; GFX10-NEXT:    v_mov_b32_e32 v2, s1
208; GFX10-NEXT:    v_mov_b32_e32 v3, s0
209; GFX10-NEXT:    s_lshr_b32 s3, s2, 8
210; GFX10-NEXT:    s_lshr_b32 s2, s2, 24
211; GFX10-NEXT:    s_lshr_b32 s4, s1, 8
212; GFX10-NEXT:    s_lshr_b32 s1, s1, 24
213; GFX10-NEXT:    s_lshr_b32 s5, s0, 8
214; GFX10-NEXT:    s_lshr_b32 s0, s0, 24
215; GFX10-NEXT:    v_mov_b32_e32 v4, s3
216; GFX10-NEXT:    v_mov_b32_e32 v5, s2
217; GFX10-NEXT:    v_mov_b32_e32 v6, s4
218; GFX10-NEXT:    v_mov_b32_e32 v7, s1
219; GFX10-NEXT:    v_mov_b32_e32 v8, s5
220; GFX10-NEXT:    v_mov_b32_e32 v9, s0
221; GFX10-NEXT:    ds_write_b8 v0, v1 offset:8
222; GFX10-NEXT:    ds_write_b8_d16_hi v0, v1 offset:10
223; GFX10-NEXT:    ds_write_b8 v0, v2 offset:4
224; GFX10-NEXT:    ds_write_b8_d16_hi v0, v2 offset:6
225; GFX10-NEXT:    ds_write_b8 v0, v3
226; GFX10-NEXT:    ds_write_b8_d16_hi v0, v3 offset:2
227; GFX10-NEXT:    ds_write_b8 v0, v4 offset:9
228; GFX10-NEXT:    ds_write_b8 v0, v5 offset:11
229; GFX10-NEXT:    ds_write_b8 v0, v6 offset:5
230; GFX10-NEXT:    ds_write_b8 v0, v7 offset:7
231; GFX10-NEXT:    ds_write_b8 v0, v8 offset:1
232; GFX10-NEXT:    ds_write_b8 v0, v9 offset:3
233; GFX10-NEXT:    s_endpgm
234;
235; GFX11-LABEL: store_lds_v3i32_align1:
236; GFX11:       ; %bb.0:
237; GFX11-NEXT:    s_clause 0x1
238; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
239; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
240; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
241; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s2
242; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0
243; GFX11-NEXT:    s_lshr_b32 s3, s2, 8
244; GFX11-NEXT:    s_lshr_b32 s2, s2, 24
245; GFX11-NEXT:    s_lshr_b32 s4, s1, 8
246; GFX11-NEXT:    s_lshr_b32 s1, s1, 24
247; GFX11-NEXT:    s_lshr_b32 s5, s0, 8
248; GFX11-NEXT:    s_lshr_b32 s0, s0, 24
249; GFX11-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s2
250; GFX11-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s1
251; GFX11-NEXT:    v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s0
252; GFX11-NEXT:    ds_store_b8 v0, v1 offset:8
253; GFX11-NEXT:    ds_store_b8 v0, v3
254; GFX11-NEXT:    ds_store_b8_d16_hi v0, v3 offset:2
255; GFX11-NEXT:    ds_store_b8 v0, v2 offset:4
256; GFX11-NEXT:    ds_store_b8 v0, v4 offset:9
257; GFX11-NEXT:    ds_store_b8_d16_hi v0, v1 offset:10
258; GFX11-NEXT:    ds_store_b8 v0, v5 offset:11
259; GFX11-NEXT:    ds_store_b8 v0, v6 offset:5
260; GFX11-NEXT:    ds_store_b8_d16_hi v0, v2 offset:6
261; GFX11-NEXT:    ds_store_b8 v0, v7 offset:7
262; GFX11-NEXT:    ds_store_b8 v0, v8 offset:1
263; GFX11-NEXT:    ds_store_b8 v0, v9 offset:3
264; GFX11-NEXT:    s_endpgm
265  store <3 x i32> %x, ptr addrspace(3) %out, align 1
266  ret void
267}
268
; Store a <3 x i32> kernel argument through the addrspace(3) pointer %out with
; `align 2` on the store.
;
; The generated assertions below expect six 16-bit DS writes on every target
; (offsets 0, 2, 4, 6, 8, 10):
;   * gfx900, gfx1010, gfx1100: the upper half of each dword (offsets 2, 6,
;     10) is written with the d16_hi form (ds_write_b16_d16_hi /
;     ds_store_b16_d16_hi) from the VGPR holding the whole dword, so no shift
;     instructions appear.
;   * hawaii, tahiti: each upper half is produced with s_lshr_b32 by 16 and
;     copied to a VGPR before a plain ds_write_b16. m0 is set to -1 first.
;
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
269define amdgpu_kernel void @store_lds_v3i32_align2(ptr addrspace(3) %out, <3 x i32> %x) {
270; GFX9-LABEL: store_lds_v3i32_align2:
271; GFX9:       ; %bb.0:
272; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
273; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
274; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
275; GFX9-NEXT:    v_mov_b32_e32 v0, s6
276; GFX9-NEXT:    v_mov_b32_e32 v1, s2
277; GFX9-NEXT:    v_mov_b32_e32 v2, s1
278; GFX9-NEXT:    ds_write_b16 v0, v1 offset:8
279; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:10
280; GFX9-NEXT:    ds_write_b16 v0, v2 offset:4
281; GFX9-NEXT:    ds_write_b16_d16_hi v0, v2 offset:6
282; GFX9-NEXT:    v_mov_b32_e32 v1, s0
283; GFX9-NEXT:    ds_write_b16 v0, v1
284; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:2
285; GFX9-NEXT:    s_endpgm
286;
287; GFX7-LABEL: store_lds_v3i32_align2:
288; GFX7:       ; %bb.0:
289; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
290; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
291; GFX7-NEXT:    s_mov_b32 m0, -1
292; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
293; GFX7-NEXT:    v_mov_b32_e32 v0, s6
294; GFX7-NEXT:    v_mov_b32_e32 v1, s2
295; GFX7-NEXT:    v_mov_b32_e32 v2, s1
296; GFX7-NEXT:    ds_write_b16 v0, v1 offset:8
297; GFX7-NEXT:    ds_write_b16 v0, v2 offset:4
298; GFX7-NEXT:    v_mov_b32_e32 v1, s0
299; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
300; GFX7-NEXT:    ds_write_b16 v0, v1
301; GFX7-NEXT:    v_mov_b32_e32 v1, s2
302; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
303; GFX7-NEXT:    ds_write_b16 v0, v1 offset:10
304; GFX7-NEXT:    v_mov_b32_e32 v1, s1
305; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
306; GFX7-NEXT:    ds_write_b16 v0, v1 offset:6
307; GFX7-NEXT:    v_mov_b32_e32 v1, s0
308; GFX7-NEXT:    ds_write_b16 v0, v1 offset:2
309; GFX7-NEXT:    s_endpgm
310;
311; GFX6-LABEL: store_lds_v3i32_align2:
312; GFX6:       ; %bb.0:
313; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
314; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
315; GFX6-NEXT:    s_mov_b32 m0, -1
316; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
317; GFX6-NEXT:    v_mov_b32_e32 v0, s6
318; GFX6-NEXT:    v_mov_b32_e32 v1, s2
319; GFX6-NEXT:    v_mov_b32_e32 v2, s1
320; GFX6-NEXT:    ds_write_b16 v0, v1 offset:8
321; GFX6-NEXT:    ds_write_b16 v0, v2 offset:4
322; GFX6-NEXT:    v_mov_b32_e32 v1, s0
323; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
324; GFX6-NEXT:    ds_write_b16 v0, v1
325; GFX6-NEXT:    v_mov_b32_e32 v1, s2
326; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
327; GFX6-NEXT:    ds_write_b16 v0, v1 offset:10
328; GFX6-NEXT:    v_mov_b32_e32 v1, s1
329; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
330; GFX6-NEXT:    ds_write_b16 v0, v1 offset:6
331; GFX6-NEXT:    v_mov_b32_e32 v1, s0
332; GFX6-NEXT:    ds_write_b16 v0, v1 offset:2
333; GFX6-NEXT:    s_endpgm
334;
335; GFX10-LABEL: store_lds_v3i32_align2:
336; GFX10:       ; %bb.0:
337; GFX10-NEXT:    s_clause 0x1
338; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
339; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
340; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
341; GFX10-NEXT:    v_mov_b32_e32 v0, s6
342; GFX10-NEXT:    v_mov_b32_e32 v1, s2
343; GFX10-NEXT:    v_mov_b32_e32 v2, s1
344; GFX10-NEXT:    v_mov_b32_e32 v3, s0
345; GFX10-NEXT:    ds_write_b16 v0, v1 offset:8
346; GFX10-NEXT:    ds_write_b16_d16_hi v0, v1 offset:10
347; GFX10-NEXT:    ds_write_b16 v0, v2 offset:4
348; GFX10-NEXT:    ds_write_b16_d16_hi v0, v2 offset:6
349; GFX10-NEXT:    ds_write_b16 v0, v3
350; GFX10-NEXT:    ds_write_b16_d16_hi v0, v3 offset:2
351; GFX10-NEXT:    s_endpgm
352;
353; GFX11-LABEL: store_lds_v3i32_align2:
354; GFX11:       ; %bb.0:
355; GFX11-NEXT:    s_clause 0x1
356; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
357; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
358; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
359; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s2
360; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
361; GFX11-NEXT:    ds_store_b16_d16_hi v0, v1 offset:10
362; GFX11-NEXT:    ds_store_b16 v0, v2
363; GFX11-NEXT:    ds_store_b16 v0, v3 offset:4
364; GFX11-NEXT:    ds_store_b16 v0, v1 offset:8
365; GFX11-NEXT:    ds_store_b16_d16_hi v0, v3 offset:6
366; GFX11-NEXT:    ds_store_b16_d16_hi v0, v2 offset:2
367; GFX11-NEXT:    s_endpgm
368  store <3 x i32> %x, ptr addrspace(3) %out, align 2
369  ret void
370}
371
; Store a <3 x i32> kernel argument through the addrspace(3) pointer %out with
; `align 4` on the store.
;
; The generated assertions below expect two DS instructions on every target:
;   * a two-address 32-bit write with offset1:1 (ds_write2_b32, spelled
;     ds_store_2addr_b32 for gfx1100) carrying the values loaded into s0 and
;     s1, and
;   * a single 32-bit write at offset 8 (ds_write_b32 / ds_store_b32)
;     carrying the value loaded into s2.
; The order of the two instructions differs between targets (two-address write
; first for gfx900, hawaii and gfx1100; offset-8 write first for tahiti and
; gfx1010). hawaii and tahiti set m0 to -1 first.
;
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
372define amdgpu_kernel void @store_lds_v3i32_align4(ptr addrspace(3) %out, <3 x i32> %x) {
373; GFX9-LABEL: store_lds_v3i32_align4:
374; GFX9:       ; %bb.0:
375; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
376; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
377; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
378; GFX9-NEXT:    v_mov_b32_e32 v0, s6
379; GFX9-NEXT:    v_mov_b32_e32 v1, s0
380; GFX9-NEXT:    v_mov_b32_e32 v2, s1
381; GFX9-NEXT:    v_mov_b32_e32 v3, s2
382; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
383; GFX9-NEXT:    ds_write_b32 v0, v3 offset:8
384; GFX9-NEXT:    s_endpgm
385;
386; GFX7-LABEL: store_lds_v3i32_align4:
387; GFX7:       ; %bb.0:
388; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
389; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
390; GFX7-NEXT:    s_mov_b32 m0, -1
391; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX7-NEXT:    v_mov_b32_e32 v0, s6
393; GFX7-NEXT:    v_mov_b32_e32 v1, s0
394; GFX7-NEXT:    v_mov_b32_e32 v2, s1
395; GFX7-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
396; GFX7-NEXT:    v_mov_b32_e32 v1, s2
397; GFX7-NEXT:    ds_write_b32 v0, v1 offset:8
398; GFX7-NEXT:    s_endpgm
399;
400; GFX6-LABEL: store_lds_v3i32_align4:
401; GFX6:       ; %bb.0:
402; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
403; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
404; GFX6-NEXT:    s_mov_b32 m0, -1
405; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
406; GFX6-NEXT:    v_mov_b32_e32 v0, s6
407; GFX6-NEXT:    v_mov_b32_e32 v1, s2
408; GFX6-NEXT:    v_mov_b32_e32 v2, s0
409; GFX6-NEXT:    ds_write_b32 v0, v1 offset:8
410; GFX6-NEXT:    v_mov_b32_e32 v1, s1
411; GFX6-NEXT:    ds_write2_b32 v0, v2, v1 offset1:1
412; GFX6-NEXT:    s_endpgm
413;
414; GFX10-LABEL: store_lds_v3i32_align4:
415; GFX10:       ; %bb.0:
416; GFX10-NEXT:    s_clause 0x1
417; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
418; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
419; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
420; GFX10-NEXT:    v_mov_b32_e32 v0, s6
421; GFX10-NEXT:    v_mov_b32_e32 v1, s2
422; GFX10-NEXT:    v_mov_b32_e32 v2, s0
423; GFX10-NEXT:    v_mov_b32_e32 v3, s1
424; GFX10-NEXT:    ds_write_b32 v0, v1 offset:8
425; GFX10-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
426; GFX10-NEXT:    s_endpgm
427;
428; GFX11-LABEL: store_lds_v3i32_align4:
429; GFX11:       ; %bb.0:
430; GFX11-NEXT:    s_clause 0x1
431; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
432; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
433; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
434; GFX11-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s0
435; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
436; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset1:1
437; GFX11-NEXT:    ds_store_b32 v0, v3 offset:8
438; GFX11-NEXT:    s_endpgm
439  store <3 x i32> %x, ptr addrspace(3) %out, align 4
440  ret void
441}
442
; Store a <3 x i32> kernel argument through the addrspace(3) pointer %out with
; `align 8` on the store.
;
; The generated assertions below expect the same two-instruction shape on every
; target: a 32-bit DS write at offset 8 carrying the value loaded into s2,
; followed by a 64-bit DS write of the register pair built from s0 and s1
; (ds_write_b32 + ds_write_b64; spelled ds_store_b32 + ds_store_b64 for
; gfx1100). No target is expected to use a 96-bit write here. hawaii and tahiti
; set m0 to -1 first.
;
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
443define amdgpu_kernel void @store_lds_v3i32_align8(ptr addrspace(3) %out, <3 x i32> %x) {
444; GFX9-LABEL: store_lds_v3i32_align8:
445; GFX9:       ; %bb.0:
446; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
447; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
448; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX9-NEXT:    v_mov_b32_e32 v2, s6
450; GFX9-NEXT:    v_mov_b32_e32 v3, s2
451; GFX9-NEXT:    v_mov_b32_e32 v0, s0
452; GFX9-NEXT:    v_mov_b32_e32 v1, s1
453; GFX9-NEXT:    ds_write_b32 v2, v3 offset:8
454; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
455; GFX9-NEXT:    s_endpgm
456;
457; GFX7-LABEL: store_lds_v3i32_align8:
458; GFX7:       ; %bb.0:
459; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
460; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
461; GFX7-NEXT:    s_mov_b32 m0, -1
462; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
463; GFX7-NEXT:    v_mov_b32_e32 v2, s6
464; GFX7-NEXT:    v_mov_b32_e32 v1, s2
465; GFX7-NEXT:    v_mov_b32_e32 v0, s0
466; GFX7-NEXT:    ds_write_b32 v2, v1 offset:8
467; GFX7-NEXT:    v_mov_b32_e32 v1, s1
468; GFX7-NEXT:    ds_write_b64 v2, v[0:1]
469; GFX7-NEXT:    s_endpgm
470;
471; GFX6-LABEL: store_lds_v3i32_align8:
472; GFX6:       ; %bb.0:
473; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
474; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
475; GFX6-NEXT:    s_mov_b32 m0, -1
476; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
477; GFX6-NEXT:    v_mov_b32_e32 v2, s6
478; GFX6-NEXT:    v_mov_b32_e32 v1, s2
479; GFX6-NEXT:    v_mov_b32_e32 v0, s0
480; GFX6-NEXT:    ds_write_b32 v2, v1 offset:8
481; GFX6-NEXT:    v_mov_b32_e32 v1, s1
482; GFX6-NEXT:    ds_write_b64 v2, v[0:1]
483; GFX6-NEXT:    s_endpgm
484;
485; GFX10-LABEL: store_lds_v3i32_align8:
486; GFX10:       ; %bb.0:
487; GFX10-NEXT:    s_clause 0x1
488; GFX10-NEXT:    s_load_dword s6, s[4:5], 0x0
489; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
490; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
491; GFX10-NEXT:    v_mov_b32_e32 v2, s6
492; GFX10-NEXT:    v_mov_b32_e32 v3, s2
493; GFX10-NEXT:    v_mov_b32_e32 v0, s0
494; GFX10-NEXT:    v_mov_b32_e32 v1, s1
495; GFX10-NEXT:    ds_write_b32 v2, v3 offset:8
496; GFX10-NEXT:    ds_write_b64 v2, v[0:1]
497; GFX10-NEXT:    s_endpgm
498;
499; GFX11-LABEL: store_lds_v3i32_align8:
500; GFX11:       ; %bb.0:
501; GFX11-NEXT:    s_clause 0x1
502; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x0
503; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
504; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
505; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s2
506; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
507; GFX11-NEXT:    ds_store_b32 v2, v3 offset:8
508; GFX11-NEXT:    ds_store_b64 v2, v[0:1]
509; GFX11-NEXT:    s_endpgm
510  store <3 x i32> %x, ptr addrspace(3) %out, align 8
511  ret void
512}
513
; Store a <3 x i32> kernel argument through the addrspace(3) pointer %out with
; `align 16` on the store.
;
; The generated assertions below expect the same instruction sequences as the
; store in @store_lds_v3i32, which gives no alignment on the store:
;   * gfx900, hawaii, gfx1010: one 96-bit DS write (ds_write_b96).
;   * gfx1100: one 96-bit DS write, spelled ds_store_b96.
;   * tahiti: no 96-bit write; a ds_write_b32 at offset 8 and a ds_write_b64.
; hawaii and tahiti additionally set m0 to -1 before the DS write.
;
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
514define amdgpu_kernel void @store_lds_v3i32_align16(ptr addrspace(3) %out, <3 x i32> %x) {
515; GFX9-LABEL: store_lds_v3i32_align16:
516; GFX9:       ; %bb.0:
517; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
518; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
519; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x0
520; GFX9-NEXT:    ; kill: killed $sgpr4_sgpr5
521; GFX9-NEXT:    v_mov_b32_e32 v0, s0
522; GFX9-NEXT:    v_mov_b32_e32 v1, s1
523; GFX9-NEXT:    v_mov_b32_e32 v2, s2
524; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
525; GFX9-NEXT:    v_mov_b32_e32 v3, s3
526; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
527; GFX9-NEXT:    s_endpgm
528;
529; GFX7-LABEL: store_lds_v3i32_align16:
530; GFX7:       ; %bb.0:
531; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
532; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
533; GFX7-NEXT:    s_load_dword s3, s[4:5], 0x0
534; GFX7-NEXT:    s_mov_b32 m0, -1
535; GFX7-NEXT:    v_mov_b32_e32 v0, s0
536; GFX7-NEXT:    v_mov_b32_e32 v1, s1
537; GFX7-NEXT:    v_mov_b32_e32 v2, s2
538; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
539; GFX7-NEXT:    v_mov_b32_e32 v3, s3
540; GFX7-NEXT:    ds_write_b96 v3, v[0:2]
541; GFX7-NEXT:    s_endpgm
542;
543; GFX6-LABEL: store_lds_v3i32_align16:
544; GFX6:       ; %bb.0:
545; GFX6-NEXT:    s_load_dword s6, s[4:5], 0x0
546; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
547; GFX6-NEXT:    s_mov_b32 m0, -1
548; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
549; GFX6-NEXT:    v_mov_b32_e32 v2, s6
550; GFX6-NEXT:    v_mov_b32_e32 v1, s2
551; GFX6-NEXT:    v_mov_b32_e32 v0, s0
552; GFX6-NEXT:    ds_write_b32 v2, v1 offset:8
553; GFX6-NEXT:    v_mov_b32_e32 v1, s1
554; GFX6-NEXT:    ds_write_b64 v2, v[0:1]
555; GFX6-NEXT:    s_endpgm
556;
557; GFX10-LABEL: store_lds_v3i32_align16:
558; GFX10:       ; %bb.0:
559; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
560; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
561; GFX10-NEXT:    s_load_dword s3, s[4:5], 0x0
562; GFX10-NEXT:    ; kill: killed $sgpr4_sgpr5
563; GFX10-NEXT:    v_mov_b32_e32 v0, s0
564; GFX10-NEXT:    v_mov_b32_e32 v1, s1
565; GFX10-NEXT:    v_mov_b32_e32 v2, s2
566; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
567; GFX10-NEXT:    v_mov_b32_e32 v3, s3
568; GFX10-NEXT:    ds_write_b96 v3, v[0:2]
569; GFX10-NEXT:    s_endpgm
570;
571; GFX11-LABEL: store_lds_v3i32_align16:
572; GFX11:       ; %bb.0:
573; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
574; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
575; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x0
576; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
577; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
578; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
579; GFX11-NEXT:    ds_store_b96 v3, v[0:2]
580; GFX11-NEXT:    s_endpgm
581  store <3 x i32> %x, ptr addrspace(3) %out, align 16
582  ret void
583}
584