xref: /llvm-project/llvm/test/CodeGen/AMDGPU/load-local.128.ll (revision f2c164c8150548d983565c4ddc0fde790f9e2a5b)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
7
; load_lds_v4i32: load a <4 x i32> through an addrspace(3) pointer with no
; explicit alignment on the load instruction, and return the loaded vector.
; Expected code, per -mcpu value used by the llc invocations in this file:
;   gfx900, gfx1010 - a single ds_read_b128
;   hawaii          - m0 is set to -1, then a single ds_read_b128
;   gfx1100         - a single ds_load_b128
;   tahiti          - m0 is set to -1, then two ds_read_b64, the second one
;                     from an address computed as base + 8
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
8define <4 x i32> @load_lds_v4i32(ptr addrspace(3) %ptr) {
9; GFX9-LABEL: load_lds_v4i32:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; GFX9-NEXT:    ds_read_b128 v[0:3], v0
13; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX9-NEXT:    s_setpc_b64 s[30:31]
15;
16; GFX7-LABEL: load_lds_v4i32:
17; GFX7:       ; %bb.0:
18; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19; GFX7-NEXT:    s_mov_b32 m0, -1
20; GFX7-NEXT:    ds_read_b128 v[0:3], v0
21; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
22; GFX7-NEXT:    s_setpc_b64 s[30:31]
23;
24; GFX6-LABEL: load_lds_v4i32:
25; GFX6:       ; %bb.0:
26; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27; GFX6-NEXT:    v_mov_b32_e32 v2, v0
28; GFX6-NEXT:    s_mov_b32 m0, -1
29; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
30; GFX6-NEXT:    ds_read_b64 v[0:1], v0
31; GFX6-NEXT:    ds_read_b64 v[2:3], v2
32; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX6-NEXT:    s_setpc_b64 s[30:31]
34;
35; GFX10-LABEL: load_lds_v4i32:
36; GFX10:       ; %bb.0:
37; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38; GFX10-NEXT:    ds_read_b128 v[0:3], v0
39; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX10-NEXT:    s_setpc_b64 s[30:31]
41;
42; GFX11-LABEL: load_lds_v4i32:
43; GFX11:       ; %bb.0:
44; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
45; GFX11-NEXT:    ds_load_b128 v[0:3], v0
46; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
47; GFX11-NEXT:    s_setpc_b64 s[30:31]
48  %load = load <4 x i32>, ptr addrspace(3) %ptr
49  ret <4 x i32> %load
50}
51
; load_lds_v4i32_align1: load a <4 x i32> through an addrspace(3) pointer with
; "align 1" on the load, and return the loaded vector.
; Every target's expected code reads the 16 bytes one at a time and rebuilds
; the four dwords in v0..v3 with shifts and ors:
;   gfx900, gfx1010 - sixteen ds_read_u8 using the offset field, combined with
;                     v_lshl_or_b32
;   gfx1100         - the same shape with ds_load_u8, plus one s_delay_alu
;   hawaii          - m0 is set to -1; sixteen ds_read_u8 using the offset
;                     field, combined with separate v_lshlrev_b32 / v_or_b32
;   tahiti          - m0 is set to -1; each byte address other than the base is
;                     computed with v_add_i32 (no offset field is used), then
;                     combined with separate v_lshlrev_b32 / v_or_b32
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
52define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
53; GFX9-LABEL: load_lds_v4i32_align1:
54; GFX9:       ; %bb.0:
55; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56; GFX9-NEXT:    ds_read_u8 v1, v0
57; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
58; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
59; GFX9-NEXT:    ds_read_u8 v4, v0 offset:3
60; GFX9-NEXT:    ds_read_u8 v5, v0 offset:4
61; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
62; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
63; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
64; GFX9-NEXT:    ds_read_u8 v9, v0 offset:8
65; GFX9-NEXT:    ds_read_u8 v10, v0 offset:9
66; GFX9-NEXT:    ds_read_u8 v11, v0 offset:10
67; GFX9-NEXT:    ds_read_u8 v12, v0 offset:11
68; GFX9-NEXT:    ds_read_u8 v13, v0 offset:12
69; GFX9-NEXT:    ds_read_u8 v14, v0 offset:13
70; GFX9-NEXT:    ds_read_u8 v15, v0 offset:14
71; GFX9-NEXT:    ds_read_u8 v16, v0 offset:15
72; GFX9-NEXT:    s_waitcnt lgkmcnt(14)
73; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
74; GFX9-NEXT:    s_waitcnt lgkmcnt(12)
75; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
76; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
77; GFX9-NEXT:    s_waitcnt lgkmcnt(10)
78; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
79; GFX9-NEXT:    s_waitcnt lgkmcnt(8)
80; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 8, v7
81; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
82; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
83; GFX9-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
84; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
85; GFX9-NEXT:    v_lshl_or_b32 v3, v12, 8, v11
86; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
87; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
88; GFX9-NEXT:    v_lshl_or_b32 v3, v14, 8, v13
89; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
90; GFX9-NEXT:    v_lshl_or_b32 v4, v16, 8, v15
91; GFX9-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
92; GFX9-NEXT:    s_setpc_b64 s[30:31]
93;
94; GFX7-LABEL: load_lds_v4i32_align1:
95; GFX7:       ; %bb.0:
96; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97; GFX7-NEXT:    s_mov_b32 m0, -1
98; GFX7-NEXT:    ds_read_u8 v1, v0 offset:6
99; GFX7-NEXT:    ds_read_u8 v2, v0 offset:4
100; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
101; GFX7-NEXT:    ds_read_u8 v4, v0 offset:1
102; GFX7-NEXT:    ds_read_u8 v5, v0
103; GFX7-NEXT:    ds_read_u8 v6, v0 offset:3
104; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
105; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
106; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
107; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
108; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
109; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
110; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
111; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
112; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
113; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
114; GFX7-NEXT:    v_or_b32_e32 v4, v3, v4
115; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
116; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
117; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
118; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
119; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
120; GFX7-NEXT:    ds_read_u8 v5, v0 offset:15
121; GFX7-NEXT:    ds_read_u8 v6, v0 offset:14
122; GFX7-NEXT:    ds_read_u8 v7, v0 offset:13
123; GFX7-NEXT:    ds_read_u8 v8, v0 offset:12
124; GFX7-NEXT:    ds_read_u8 v9, v0 offset:11
125; GFX7-NEXT:    ds_read_u8 v10, v0 offset:10
126; GFX7-NEXT:    ds_read_u8 v11, v0 offset:9
127; GFX7-NEXT:    ds_read_u8 v0, v0 offset:8
128; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
129; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
130; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
131; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
132; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v11
133; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
134; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
135; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v9
136; GFX7-NEXT:    v_or_b32_e32 v2, v2, v10
137; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
138; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
139; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
140; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
141; GFX7-NEXT:    v_or_b32_e32 v3, v3, v6
142; GFX7-NEXT:    v_or_b32_e32 v0, v0, v8
143; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
144; GFX7-NEXT:    v_or_b32_e32 v3, v3, v0
145; GFX7-NEXT:    v_mov_b32_e32 v0, v4
146; GFX7-NEXT:    s_setpc_b64 s[30:31]
147;
148; GFX6-LABEL: load_lds_v4i32_align1:
149; GFX6:       ; %bb.0:
150; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 5, v0
152; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
153; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 7, v0
154; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 6, v0
155; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 9, v0
156; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
157; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 11, v0
158; GFX6-NEXT:    s_mov_b32 m0, -1
159; GFX6-NEXT:    ds_read_u8 v1, v1
160; GFX6-NEXT:    ds_read_u8 v2, v2
161; GFX6-NEXT:    ds_read_u8 v3, v3
162; GFX6-NEXT:    ds_read_u8 v4, v4
163; GFX6-NEXT:    ds_read_u8 v5, v5
164; GFX6-NEXT:    ds_read_u8 v6, v6
165; GFX6-NEXT:    ds_read_u8 v7, v7
166; GFX6-NEXT:    ds_read_u8 v8, v0
167; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
168; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
169; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
170; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
171; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
172; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
173; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
174; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
175; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
176; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
177; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
178; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
179; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
180; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
181; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
182; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
183; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 10, v0
184; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 13, v0
185; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
186; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 15, v0
187; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 14, v0
188; GFX6-NEXT:    v_add_i32_e32 v10, vcc, 3, v0
189; GFX6-NEXT:    v_add_i32_e32 v11, vcc, 2, v0
190; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
191; GFX6-NEXT:    ds_read_u8 v4, v4
192; GFX6-NEXT:    ds_read_u8 v5, v5
193; GFX6-NEXT:    ds_read_u8 v6, v6
194; GFX6-NEXT:    ds_read_u8 v7, v7
195; GFX6-NEXT:    ds_read_u8 v9, v9
196; GFX6-NEXT:    ds_read_u8 v10, v10
197; GFX6-NEXT:    ds_read_u8 v11, v11
198; GFX6-NEXT:    ds_read_u8 v0, v0
199; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
200; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
201; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
202; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
203; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v7
204; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
205; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
206; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
207; GFX6-NEXT:    v_or_b32_e32 v4, v4, v9
208; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
209; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
210; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
211; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
212; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
213; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
214; GFX6-NEXT:    v_or_b32_e32 v4, v4, v11
215; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
216; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
217; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
218; GFX6-NEXT:    v_or_b32_e32 v0, v0, v8
219; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
220; GFX6-NEXT:    s_setpc_b64 s[30:31]
221;
222; GFX10-LABEL: load_lds_v4i32_align1:
223; GFX10:       ; %bb.0:
224; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
225; GFX10-NEXT:    ds_read_u8 v1, v0
226; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
227; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
228; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
229; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
230; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
231; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
232; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
233; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
234; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
235; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
236; GFX10-NEXT:    ds_read_u8 v12, v0 offset:11
237; GFX10-NEXT:    ds_read_u8 v13, v0 offset:12
238; GFX10-NEXT:    ds_read_u8 v14, v0 offset:13
239; GFX10-NEXT:    ds_read_u8 v15, v0 offset:14
240; GFX10-NEXT:    ds_read_u8 v0, v0 offset:15
241; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
242; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
243; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
244; GFX10-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
245; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
246; GFX10-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
247; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
248; GFX10-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
249; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
250; GFX10-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
251; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
252; GFX10-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
253; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
254; GFX10-NEXT:    v_lshl_or_b32 v7, v14, 8, v13
255; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX10-NEXT:    v_lshl_or_b32 v8, v0, 8, v15
257; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
258; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
259; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
260; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
261; GFX10-NEXT:    s_setpc_b64 s[30:31]
262;
263; GFX11-LABEL: load_lds_v4i32_align1:
264; GFX11:       ; %bb.0:
265; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
266; GFX11-NEXT:    ds_load_u8 v1, v0
267; GFX11-NEXT:    ds_load_u8 v2, v0 offset:1
268; GFX11-NEXT:    ds_load_u8 v3, v0 offset:2
269; GFX11-NEXT:    ds_load_u8 v4, v0 offset:3
270; GFX11-NEXT:    ds_load_u8 v5, v0 offset:4
271; GFX11-NEXT:    ds_load_u8 v6, v0 offset:5
272; GFX11-NEXT:    ds_load_u8 v7, v0 offset:6
273; GFX11-NEXT:    ds_load_u8 v8, v0 offset:7
274; GFX11-NEXT:    ds_load_u8 v9, v0 offset:8
275; GFX11-NEXT:    ds_load_u8 v10, v0 offset:9
276; GFX11-NEXT:    ds_load_u8 v11, v0 offset:10
277; GFX11-NEXT:    ds_load_u8 v12, v0 offset:11
278; GFX11-NEXT:    ds_load_u8 v13, v0 offset:12
279; GFX11-NEXT:    ds_load_u8 v14, v0 offset:13
280; GFX11-NEXT:    ds_load_u8 v15, v0 offset:14
281; GFX11-NEXT:    ds_load_u8 v0, v0 offset:15
282; GFX11-NEXT:    s_waitcnt lgkmcnt(14)
283; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
284; GFX11-NEXT:    s_waitcnt lgkmcnt(12)
285; GFX11-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
286; GFX11-NEXT:    s_waitcnt lgkmcnt(10)
287; GFX11-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
288; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
289; GFX11-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
290; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
291; GFX11-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
292; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
293; GFX11-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
294; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
295; GFX11-NEXT:    v_lshl_or_b32 v7, v14, 8, v13
296; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
297; GFX11-NEXT:    v_lshl_or_b32 v8, v0, 8, v15
298; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
299; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
300; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
301; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
302; GFX11-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
303; GFX11-NEXT:    s_setpc_b64 s[30:31]
304  %load = load <4 x i32>, ptr addrspace(3) %ptr, align 1
305  ret <4 x i32> %load
306}
307
; load_lds_v4i32_align2: load a <4 x i32> through an addrspace(3) pointer with
; "align 2" on the load, and return the loaded vector.
; Every target's expected code reads eight 16-bit halves and merges each pair
; into one dword of v0..v3 (high half shifted left by 16, then or'ed in):
;   gfx900, gfx1010 - eight ds_read_u16 using the offset field, merged with
;                     v_lshl_or_b32
;   gfx1100         - the same shape with ds_load_u16
;   hawaii          - m0 is set to -1; eight ds_read_u16 using the offset
;                     field, merged with separate v_lshlrev_b32 / v_or_b32
;   tahiti          - m0 is set to -1; the seven non-base addresses are
;                     computed with v_add_i32 (no offset field is used), then
;                     merged with separate v_lshlrev_b32 / v_or_b32
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
308define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
309; GFX9-LABEL: load_lds_v4i32_align2:
310; GFX9:       ; %bb.0:
311; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312; GFX9-NEXT:    ds_read_u16 v1, v0
313; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
314; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
315; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
316; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
317; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
318; GFX9-NEXT:    ds_read_u16 v7, v0 offset:12
319; GFX9-NEXT:    ds_read_u16 v8, v0 offset:14
320; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
321; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
322; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
323; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
324; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
325; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
326; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
327; GFX9-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
328; GFX9-NEXT:    s_setpc_b64 s[30:31]
329;
330; GFX7-LABEL: load_lds_v4i32_align2:
331; GFX7:       ; %bb.0:
332; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
333; GFX7-NEXT:    s_mov_b32 m0, -1
334; GFX7-NEXT:    ds_read_u16 v3, v0 offset:12
335; GFX7-NEXT:    ds_read_u16 v2, v0 offset:8
336; GFX7-NEXT:    ds_read_u16 v1, v0 offset:4
337; GFX7-NEXT:    ds_read_u16 v4, v0 offset:2
338; GFX7-NEXT:    ds_read_u16 v5, v0
339; GFX7-NEXT:    ds_read_u16 v6, v0 offset:6
340; GFX7-NEXT:    ds_read_u16 v7, v0 offset:10
341; GFX7-NEXT:    ds_read_u16 v8, v0 offset:14
342; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
343; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
344; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
345; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
346; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
347; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
348; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
349; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
350; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
351; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
352; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
354; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
355; GFX7-NEXT:    s_setpc_b64 s[30:31]
356;
357; GFX6-LABEL: load_lds_v4i32_align2:
358; GFX6:       ; %bb.0:
359; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
360; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 6, v0
361; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
362; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 10, v0
363; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
364; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 14, v0
365; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
366; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 2, v0
367; GFX6-NEXT:    s_mov_b32 m0, -1
368; GFX6-NEXT:    ds_read_u16 v1, v1
369; GFX6-NEXT:    ds_read_u16 v2, v2
370; GFX6-NEXT:    ds_read_u16 v3, v3
371; GFX6-NEXT:    ds_read_u16 v4, v4
372; GFX6-NEXT:    ds_read_u16 v5, v5
373; GFX6-NEXT:    ds_read_u16 v6, v6
374; GFX6-NEXT:    ds_read_u16 v7, v7
375; GFX6-NEXT:    ds_read_u16 v0, v0
376; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
377; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
378; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
379; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
380; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
381; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
382; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
383; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
384; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
385; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
386; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
387; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
388; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
389; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
390; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
391; GFX6-NEXT:    s_setpc_b64 s[30:31]
392;
393; GFX10-LABEL: load_lds_v4i32_align2:
394; GFX10:       ; %bb.0:
395; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
396; GFX10-NEXT:    ds_read_u16 v1, v0
397; GFX10-NEXT:    ds_read_u16 v2, v0 offset:2
398; GFX10-NEXT:    ds_read_u16 v3, v0 offset:4
399; GFX10-NEXT:    ds_read_u16 v4, v0 offset:6
400; GFX10-NEXT:    ds_read_u16 v5, v0 offset:8
401; GFX10-NEXT:    ds_read_u16 v6, v0 offset:10
402; GFX10-NEXT:    ds_read_u16 v7, v0 offset:12
403; GFX10-NEXT:    ds_read_u16 v8, v0 offset:14
404; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
405; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
406; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
407; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
408; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
409; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
410; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
411; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
412; GFX10-NEXT:    s_setpc_b64 s[30:31]
413;
414; GFX11-LABEL: load_lds_v4i32_align2:
415; GFX11:       ; %bb.0:
416; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
417; GFX11-NEXT:    ds_load_u16 v1, v0
418; GFX11-NEXT:    ds_load_u16 v2, v0 offset:2
419; GFX11-NEXT:    ds_load_u16 v3, v0 offset:4
420; GFX11-NEXT:    ds_load_u16 v4, v0 offset:6
421; GFX11-NEXT:    ds_load_u16 v5, v0 offset:8
422; GFX11-NEXT:    ds_load_u16 v6, v0 offset:10
423; GFX11-NEXT:    ds_load_u16 v7, v0 offset:12
424; GFX11-NEXT:    ds_load_u16 v8, v0 offset:14
425; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
426; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
427; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
428; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
429; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
430; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
431; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
432; GFX11-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
433; GFX11-NEXT:    s_setpc_b64 s[30:31]
434  %load = load <4 x i32>, ptr addrspace(3) %ptr, align 2
435  ret <4 x i32> %load
436}
437
; load_lds_v4i32_align4: load a <4 x i32> through an addrspace(3) pointer with
; "align 4" on the load, and return the loaded vector.
; Expected code, per -mcpu value used by the llc invocations in this file:
;   gfx900, gfx1010 - the base address is copied to v2, then two ds_read2_b32
;                     (offsets 0/1 and 2/3) fill v[0:1] and v[2:3]
;   hawaii          - the same, with m0 set to -1 before the reads
;   gfx1100         - the same shape using ds_load_2addr_b32
;   tahiti          - m0 is set to -1; three v_add_i32 compute base+4, +8 and
;                     +12, then four separate ds_read_b32 are issued
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
438define <4 x i32> @load_lds_v4i32_align4(ptr addrspace(3) %ptr) {
439; GFX9-LABEL: load_lds_v4i32_align4:
440; GFX9:       ; %bb.0:
441; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
442; GFX9-NEXT:    v_mov_b32_e32 v2, v0
443; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
444; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
445; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
446; GFX9-NEXT:    s_setpc_b64 s[30:31]
447;
448; GFX7-LABEL: load_lds_v4i32_align4:
449; GFX7:       ; %bb.0:
450; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
451; GFX7-NEXT:    v_mov_b32_e32 v2, v0
452; GFX7-NEXT:    s_mov_b32 m0, -1
453; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
454; GFX7-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
455; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
456; GFX7-NEXT:    s_setpc_b64 s[30:31]
457;
458; GFX6-LABEL: load_lds_v4i32_align4:
459; GFX6:       ; %bb.0:
460; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
461; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
462; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
463; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
464; GFX6-NEXT:    s_mov_b32 m0, -1
465; GFX6-NEXT:    ds_read_b32 v2, v2
466; GFX6-NEXT:    ds_read_b32 v3, v3
467; GFX6-NEXT:    ds_read_b32 v0, v0
468; GFX6-NEXT:    ds_read_b32 v1, v1
469; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
470; GFX6-NEXT:    s_setpc_b64 s[30:31]
471;
472; GFX10-LABEL: load_lds_v4i32_align4:
473; GFX10:       ; %bb.0:
474; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
475; GFX10-NEXT:    v_mov_b32_e32 v2, v0
476; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
477; GFX10-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
478; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX10-NEXT:    s_setpc_b64 s[30:31]
480;
481; GFX11-LABEL: load_lds_v4i32_align4:
482; GFX11:       ; %bb.0:
483; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
484; GFX11-NEXT:    v_mov_b32_e32 v2, v0
485; GFX11-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
486; GFX11-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
487; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
488; GFX11-NEXT:    s_setpc_b64 s[30:31]
489  %load = load <4 x i32>, ptr addrspace(3) %ptr, align 4
490  ret <4 x i32> %load
491}
492
; load_lds_v4i32_align8: load a <4 x i32> through an addrspace(3) pointer with
; "align 8" on the load, and return the loaded vector.
; Expected code, per -mcpu value used by the llc invocations in this file:
;   gfx900, gfx1010 - a single ds_read2_b64 (offset1:1) into v[0:3]
;   hawaii          - m0 is set to -1, then a single ds_read2_b64
;   gfx1100         - a single ds_load_2addr_b64
;   tahiti          - m0 is set to -1, then two ds_read_b64, the second one
;                     from an address computed as base + 8
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
493define <4 x i32> @load_lds_v4i32_align8(ptr addrspace(3) %ptr) {
494; GFX9-LABEL: load_lds_v4i32_align8:
495; GFX9:       ; %bb.0:
496; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
497; GFX9-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
498; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
499; GFX9-NEXT:    s_setpc_b64 s[30:31]
500;
501; GFX7-LABEL: load_lds_v4i32_align8:
502; GFX7:       ; %bb.0:
503; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
504; GFX7-NEXT:    s_mov_b32 m0, -1
505; GFX7-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
506; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
507; GFX7-NEXT:    s_setpc_b64 s[30:31]
508;
509; GFX6-LABEL: load_lds_v4i32_align8:
510; GFX6:       ; %bb.0:
511; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
512; GFX6-NEXT:    v_mov_b32_e32 v2, v0
513; GFX6-NEXT:    s_mov_b32 m0, -1
514; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
515; GFX6-NEXT:    ds_read_b64 v[0:1], v0
516; GFX6-NEXT:    ds_read_b64 v[2:3], v2
517; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
518; GFX6-NEXT:    s_setpc_b64 s[30:31]
519;
520; GFX10-LABEL: load_lds_v4i32_align8:
521; GFX10:       ; %bb.0:
522; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
523; GFX10-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
524; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
525; GFX10-NEXT:    s_setpc_b64 s[30:31]
526;
527; GFX11-LABEL: load_lds_v4i32_align8:
528; GFX11:       ; %bb.0:
529; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
530; GFX11-NEXT:    ds_load_2addr_b64 v[0:3], v0 offset1:1
531; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
532; GFX11-NEXT:    s_setpc_b64 s[30:31]
533  %load = load <4 x i32>, ptr addrspace(3) %ptr, align 8
534  ret <4 x i32> %load
535}
536
; load_lds_v4i32_align16: load a <4 x i32> through an addrspace(3) pointer with
; "align 16" on the load, and return the loaded vector.
; Expected code, per -mcpu value used by the llc invocations in this file:
;   gfx900, gfx1010 - a single ds_read_b128 into v[0:3]
;   hawaii          - m0 is set to -1, then a single ds_read_b128
;   gfx1100         - a single ds_load_b128
;   tahiti          - m0 is set to -1, then two ds_read_b64, the second one
;                     from an address computed as base + 8
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
537define <4 x i32> @load_lds_v4i32_align16(ptr addrspace(3) %ptr) {
538; GFX9-LABEL: load_lds_v4i32_align16:
539; GFX9:       ; %bb.0:
540; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
541; GFX9-NEXT:    ds_read_b128 v[0:3], v0
542; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
543; GFX9-NEXT:    s_setpc_b64 s[30:31]
544;
545; GFX7-LABEL: load_lds_v4i32_align16:
546; GFX7:       ; %bb.0:
547; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
548; GFX7-NEXT:    s_mov_b32 m0, -1
549; GFX7-NEXT:    ds_read_b128 v[0:3], v0
550; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
551; GFX7-NEXT:    s_setpc_b64 s[30:31]
552;
553; GFX6-LABEL: load_lds_v4i32_align16:
554; GFX6:       ; %bb.0:
555; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
556; GFX6-NEXT:    v_mov_b32_e32 v2, v0
557; GFX6-NEXT:    s_mov_b32 m0, -1
558; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
559; GFX6-NEXT:    ds_read_b64 v[0:1], v0
560; GFX6-NEXT:    ds_read_b64 v[2:3], v2
561; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
562; GFX6-NEXT:    s_setpc_b64 s[30:31]
563;
564; GFX10-LABEL: load_lds_v4i32_align16:
565; GFX10:       ; %bb.0:
566; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
567; GFX10-NEXT:    ds_read_b128 v[0:3], v0
568; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
569; GFX10-NEXT:    s_setpc_b64 s[30:31]
570;
571; GFX11-LABEL: load_lds_v4i32_align16:
572; GFX11:       ; %bb.0:
573; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
574; GFX11-NEXT:    ds_load_b128 v[0:3], v0
575; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
576; GFX11-NEXT:    s_setpc_b64 s[30:31]
577  %load = load <4 x i32>, ptr addrspace(3) %ptr, align 16
578  ret <4 x i32> %load
579}
580