xref: /llvm-project/llvm/test/CodeGen/AMDGPU/load-local.96.ll (revision f2c164c8150548d983565c4ddc0fde790f9e2a5b)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
7
; Loads a <3 x i32> through an addrspace(3) pointer with no explicit
; alignment on the load, and returns it. Per the checks below, GFX9, GFX7 and
; GFX10 expect a single ds_read_b96 and GFX11 expects a single ds_load_b96,
; while GFX6 expects a ds_read_b64 plus a ds_read_b32 whose address is the
; pointer plus 8. GFX7 and GFX6 also expect m0 to be set to -1 before the reads.
8define <3 x i32> @load_lds_v3i32(ptr addrspace(3) %ptr) {
9; GFX9-LABEL: load_lds_v3i32:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; GFX9-NEXT:    ds_read_b96 v[0:2], v0
13; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX9-NEXT:    s_setpc_b64 s[30:31]
15;
16; GFX7-LABEL: load_lds_v3i32:
17; GFX7:       ; %bb.0:
18; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19; GFX7-NEXT:    s_mov_b32 m0, -1
20; GFX7-NEXT:    ds_read_b96 v[0:2], v0
21; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
22; GFX7-NEXT:    s_setpc_b64 s[30:31]
23;
24; GFX6-LABEL: load_lds_v3i32:
25; GFX6:       ; %bb.0:
26; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27; GFX6-NEXT:    v_mov_b32_e32 v2, v0
28; GFX6-NEXT:    s_mov_b32 m0, -1
29; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
30; GFX6-NEXT:    ds_read_b64 v[0:1], v0
31; GFX6-NEXT:    ds_read_b32 v2, v2
32; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX6-NEXT:    s_setpc_b64 s[30:31]
34;
35; GFX10-LABEL: load_lds_v3i32:
36; GFX10:       ; %bb.0:
37; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38; GFX10-NEXT:    ds_read_b96 v[0:2], v0
39; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX10-NEXT:    s_setpc_b64 s[30:31]
41;
42; GFX11-LABEL: load_lds_v3i32:
43; GFX11:       ; %bb.0:
44; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
45; GFX11-NEXT:    ds_load_b96 v[0:2], v0
46; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
47; GFX11-NEXT:    s_setpc_b64 s[30:31]
48  %load = load <3 x i32>, ptr addrspace(3) %ptr
49  ret <3 x i32> %load
50}
51
; Loads a <3 x i32> through an addrspace(3) pointer with the load marked
; align 1, and returns it. Per the checks below, every target expects the
; value to be assembled from twelve single-byte reads (ds_read_u8, or
; ds_load_u8 on GFX11). GFX9, GFX10 and GFX11 combine the bytes with
; v_lshl_or_b32, whereas GFX7 and GFX6 use separate v_lshlrev_b32 and v_or_b32
; instructions. GFX6 computes each byte address with v_add_i32 instead of
; using an offset on the read, and GFX7 and GFX6 set m0 to -1 first.
52define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
53; GFX9-LABEL: load_lds_v3i32_align1:
54; GFX9:       ; %bb.0:
55; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56; GFX9-NEXT:    ds_read_u8 v1, v0
57; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
58; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
59; GFX9-NEXT:    ds_read_u8 v4, v0 offset:3
60; GFX9-NEXT:    ds_read_u8 v5, v0 offset:4
61; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
62; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
63; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
64; GFX9-NEXT:    ds_read_u8 v9, v0 offset:8
65; GFX9-NEXT:    ds_read_u8 v10, v0 offset:9
66; GFX9-NEXT:    ds_read_u8 v11, v0 offset:10
67; GFX9-NEXT:    ds_read_u8 v12, v0 offset:11
68; GFX9-NEXT:    s_waitcnt lgkmcnt(10)
69; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
70; GFX9-NEXT:    s_waitcnt lgkmcnt(8)
71; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
72; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
73; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
74; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
75; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
76; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 8, v7
77; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
78; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
79; GFX9-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
80; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
81; GFX9-NEXT:    v_lshl_or_b32 v3, v12, 8, v11
82; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
83; GFX9-NEXT:    s_setpc_b64 s[30:31]
84;
85; GFX7-LABEL: load_lds_v3i32_align1:
86; GFX7:       ; %bb.0:
87; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
88; GFX7-NEXT:    s_mov_b32 m0, -1
89; GFX7-NEXT:    ds_read_u8 v1, v0 offset:6
90; GFX7-NEXT:    ds_read_u8 v2, v0 offset:4
91; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
92; GFX7-NEXT:    ds_read_u8 v4, v0 offset:1
93; GFX7-NEXT:    ds_read_u8 v5, v0
94; GFX7-NEXT:    ds_read_u8 v6, v0 offset:3
95; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
96; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
97; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
98; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
99; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
100; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
101; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
102; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
103; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
104; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
105; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
106; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
107; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v7
108; GFX7-NEXT:    ds_read_u8 v5, v0 offset:11
109; GFX7-NEXT:    ds_read_u8 v6, v0 offset:10
110; GFX7-NEXT:    ds_read_u8 v7, v0 offset:9
111; GFX7-NEXT:    ds_read_u8 v0, v0 offset:8
112; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
113; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
114; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
115; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
116; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
117; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
118; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
119; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v7
120; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
121; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
122; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
123; GFX7-NEXT:    v_or_b32_e32 v2, v2, v6
124; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
125; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
126; GFX7-NEXT:    v_mov_b32_e32 v0, v3
127; GFX7-NEXT:    s_setpc_b64 s[30:31]
128;
129; GFX6-LABEL: load_lds_v3i32_align1:
130; GFX6:       ; %bb.0:
131; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
132; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 5, v0
133; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
134; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 7, v0
135; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 6, v0
136; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 9, v0
137; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
138; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 11, v0
139; GFX6-NEXT:    s_mov_b32 m0, -1
140; GFX6-NEXT:    ds_read_u8 v1, v1
141; GFX6-NEXT:    ds_read_u8 v2, v2
142; GFX6-NEXT:    ds_read_u8 v3, v3
143; GFX6-NEXT:    ds_read_u8 v4, v4
144; GFX6-NEXT:    ds_read_u8 v5, v5
145; GFX6-NEXT:    ds_read_u8 v6, v6
146; GFX6-NEXT:    ds_read_u8 v7, v7
147; GFX6-NEXT:    ds_read_u8 v8, v0
148; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
149; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
150; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
151; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
152; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
153; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
154; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
155; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
156; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 10, v0
157; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
158; GFX6-NEXT:    ds_read_u8 v4, v4
159; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
160; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
161; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
162; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
163; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
164; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 3, v0
165; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 2, v0
166; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
167; GFX6-NEXT:    ds_read_u8 v5, v5
168; GFX6-NEXT:    ds_read_u8 v6, v6
169; GFX6-NEXT:    ds_read_u8 v0, v0
170; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
171; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
172; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
173; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
174; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
175; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
176; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
177; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
178; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
179; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
180; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
181; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
182; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
183; GFX6-NEXT:    v_or_b32_e32 v0, v0, v8
184; GFX6-NEXT:    v_or_b32_e32 v0, v3, v0
185; GFX6-NEXT:    s_setpc_b64 s[30:31]
186;
187; GFX10-LABEL: load_lds_v3i32_align1:
188; GFX10:       ; %bb.0:
189; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190; GFX10-NEXT:    ds_read_u8 v1, v0
191; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
192; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
193; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
194; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
195; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
196; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
197; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
198; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
199; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
200; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
201; GFX10-NEXT:    ds_read_u8 v0, v0 offset:11
202; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
203; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
204; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
205; GFX10-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
206; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
207; GFX10-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
208; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
209; GFX10-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
210; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
211; GFX10-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
212; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
213; GFX10-NEXT:    v_lshl_or_b32 v6, v0, 8, v11
214; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
215; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
216; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
217; GFX10-NEXT:    s_setpc_b64 s[30:31]
218;
219; GFX11-LABEL: load_lds_v3i32_align1:
220; GFX11:       ; %bb.0:
221; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
222; GFX11-NEXT:    ds_load_u8 v1, v0
223; GFX11-NEXT:    ds_load_u8 v2, v0 offset:1
224; GFX11-NEXT:    ds_load_u8 v3, v0 offset:2
225; GFX11-NEXT:    ds_load_u8 v4, v0 offset:3
226; GFX11-NEXT:    ds_load_u8 v5, v0 offset:4
227; GFX11-NEXT:    ds_load_u8 v6, v0 offset:5
228; GFX11-NEXT:    ds_load_u8 v7, v0 offset:6
229; GFX11-NEXT:    ds_load_u8 v8, v0 offset:7
230; GFX11-NEXT:    ds_load_u8 v9, v0 offset:8
231; GFX11-NEXT:    ds_load_u8 v10, v0 offset:9
232; GFX11-NEXT:    ds_load_u8 v11, v0 offset:10
233; GFX11-NEXT:    ds_load_u8 v0, v0 offset:11
234; GFX11-NEXT:    s_waitcnt lgkmcnt(10)
235; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
236; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
237; GFX11-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
238; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
239; GFX11-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
240; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
241; GFX11-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
242; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
243; GFX11-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
244; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
245; GFX11-NEXT:    v_lshl_or_b32 v6, v0, 8, v11
246; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
247; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
248; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
249; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
250; GFX11-NEXT:    s_setpc_b64 s[30:31]
251  %load = load <3 x i32>, ptr addrspace(3) %ptr, align 1
252  ret <3 x i32> %load
253}
254
; Loads a <3 x i32> through an addrspace(3) pointer with the load marked
; align 2, and returns it. Per the checks below, every target expects six
; 16-bit reads (ds_read_u16, or ds_load_u16 on GFX11) that are merged in pairs
; with a shift by 16 and an or. GFX9, GFX10 and GFX11 use v_lshl_or_b32 for the
; merge, GFX7 and GFX6 use v_lshlrev_b32 followed by v_or_b32, and GFX6
; computes the addresses with v_add_i32 instead of using read offsets.
255define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
256; GFX9-LABEL: load_lds_v3i32_align2:
257; GFX9:       ; %bb.0:
258; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
259; GFX9-NEXT:    ds_read_u16 v1, v0
260; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
261; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
262; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
263; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
264; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
265; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
266; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
267; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
268; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
269; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
270; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
271; GFX9-NEXT:    s_setpc_b64 s[30:31]
272;
273; GFX7-LABEL: load_lds_v3i32_align2:
274; GFX7:       ; %bb.0:
275; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276; GFX7-NEXT:    s_mov_b32 m0, -1
277; GFX7-NEXT:    ds_read_u16 v2, v0 offset:8
278; GFX7-NEXT:    ds_read_u16 v1, v0 offset:4
279; GFX7-NEXT:    ds_read_u16 v3, v0 offset:2
280; GFX7-NEXT:    ds_read_u16 v4, v0
281; GFX7-NEXT:    ds_read_u16 v5, v0 offset:6
282; GFX7-NEXT:    ds_read_u16 v6, v0 offset:10
283; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
284; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
285; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
286; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
287; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
288; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
289; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
290; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
291; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
292; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
293; GFX7-NEXT:    s_setpc_b64 s[30:31]
294;
295; GFX6-LABEL: load_lds_v3i32_align2:
296; GFX6:       ; %bb.0:
297; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
298; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 6, v0
299; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
300; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 10, v0
301; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
302; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
303; GFX6-NEXT:    s_mov_b32 m0, -1
304; GFX6-NEXT:    ds_read_u16 v1, v1
305; GFX6-NEXT:    ds_read_u16 v2, v2
306; GFX6-NEXT:    ds_read_u16 v3, v3
307; GFX6-NEXT:    ds_read_u16 v4, v4
308; GFX6-NEXT:    ds_read_u16 v5, v5
309; GFX6-NEXT:    ds_read_u16 v0, v0
310; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
311; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
312; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
313; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
314; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
315; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
316; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
317; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
318; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
319; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX6-NEXT:    v_or_b32_e32 v0, v3, v0
321; GFX6-NEXT:    s_setpc_b64 s[30:31]
322;
323; GFX10-LABEL: load_lds_v3i32_align2:
324; GFX10:       ; %bb.0:
325; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
326; GFX10-NEXT:    ds_read_u16 v1, v0
327; GFX10-NEXT:    ds_read_u16 v2, v0 offset:2
328; GFX10-NEXT:    ds_read_u16 v3, v0 offset:4
329; GFX10-NEXT:    ds_read_u16 v4, v0 offset:6
330; GFX10-NEXT:    ds_read_u16 v5, v0 offset:8
331; GFX10-NEXT:    ds_read_u16 v6, v0 offset:10
332; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
333; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
334; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
335; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
336; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
337; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
338; GFX10-NEXT:    s_setpc_b64 s[30:31]
339;
340; GFX11-LABEL: load_lds_v3i32_align2:
341; GFX11:       ; %bb.0:
342; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
343; GFX11-NEXT:    ds_load_u16 v1, v0
344; GFX11-NEXT:    ds_load_u16 v2, v0 offset:2
345; GFX11-NEXT:    ds_load_u16 v3, v0 offset:4
346; GFX11-NEXT:    ds_load_u16 v4, v0 offset:6
347; GFX11-NEXT:    ds_load_u16 v5, v0 offset:8
348; GFX11-NEXT:    ds_load_u16 v6, v0 offset:10
349; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
350; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
351; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
352; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
353; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
354; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
355; GFX11-NEXT:    s_setpc_b64 s[30:31]
356  %load = load <3 x i32>, ptr addrspace(3) %ptr, align 2
357  ret <3 x i32> %load
358}
359
; Loads a <3 x i32> through an addrspace(3) pointer with the load marked
; align 4, and returns it. Per the checks below, GFX9, GFX7 and GFX10 expect a
; ds_read2_b32 for the first two dwords plus a ds_read_b32 at byte offset 8,
; GFX11 expects the equivalent ds_load_2addr_b32 plus ds_load_b32, and GFX6
; expects three separate ds_read_b32 reads with the addresses for bytes 4 and
; 8 computed by v_add_i32.
360define <3 x i32> @load_lds_v3i32_align4(ptr addrspace(3) %ptr) {
361; GFX9-LABEL: load_lds_v3i32_align4:
362; GFX9:       ; %bb.0:
363; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364; GFX9-NEXT:    v_mov_b32_e32 v2, v0
365; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
366; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
367; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
368; GFX9-NEXT:    s_setpc_b64 s[30:31]
369;
370; GFX7-LABEL: load_lds_v3i32_align4:
371; GFX7:       ; %bb.0:
372; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
373; GFX7-NEXT:    v_mov_b32_e32 v2, v0
374; GFX7-NEXT:    s_mov_b32 m0, -1
375; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
376; GFX7-NEXT:    ds_read_b32 v2, v2 offset:8
377; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
378; GFX7-NEXT:    s_setpc_b64 s[30:31]
379;
380; GFX6-LABEL: load_lds_v3i32_align4:
381; GFX6:       ; %bb.0:
382; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
384; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
385; GFX6-NEXT:    s_mov_b32 m0, -1
386; GFX6-NEXT:    ds_read_b32 v2, v2
387; GFX6-NEXT:    ds_read_b32 v0, v0
388; GFX6-NEXT:    ds_read_b32 v1, v1
389; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
390; GFX6-NEXT:    s_setpc_b64 s[30:31]
391;
392; GFX10-LABEL: load_lds_v3i32_align4:
393; GFX10:       ; %bb.0:
394; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
395; GFX10-NEXT:    v_mov_b32_e32 v2, v0
396; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
397; GFX10-NEXT:    ds_read_b32 v2, v2 offset:8
398; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX10-NEXT:    s_setpc_b64 s[30:31]
400;
401; GFX11-LABEL: load_lds_v3i32_align4:
402; GFX11:       ; %bb.0:
403; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
404; GFX11-NEXT:    v_mov_b32_e32 v2, v0
405; GFX11-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
406; GFX11-NEXT:    ds_load_b32 v2, v2 offset:8
407; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
408; GFX11-NEXT:    s_setpc_b64 s[30:31]
409  %load = load <3 x i32>, ptr addrspace(3) %ptr, align 4
410  ret <3 x i32> %load
411}
412
; Loads a <3 x i32> through an addrspace(3) pointer with the load marked
; align 8, and returns it. Per the checks below, GFX9, GFX7 and GFX10 expect a
; ds_read_b64 for the first two dwords plus a ds_read_b32 at byte offset 8,
; and GFX11 expects the equivalent ds_load_b64 plus ds_load_b32. GFX6 expects
; the same pair of reads but with the third dword address computed by
; v_add_i32 instead of a read offset.
413define <3 x i32> @load_lds_v3i32_align8(ptr addrspace(3) %ptr) {
414; GFX9-LABEL: load_lds_v3i32_align8:
415; GFX9:       ; %bb.0:
416; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
417; GFX9-NEXT:    v_mov_b32_e32 v2, v0
418; GFX9-NEXT:    ds_read_b64 v[0:1], v0
419; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
420; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
421; GFX9-NEXT:    s_setpc_b64 s[30:31]
422;
423; GFX7-LABEL: load_lds_v3i32_align8:
424; GFX7:       ; %bb.0:
425; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
426; GFX7-NEXT:    v_mov_b32_e32 v2, v0
427; GFX7-NEXT:    s_mov_b32 m0, -1
428; GFX7-NEXT:    ds_read_b64 v[0:1], v0
429; GFX7-NEXT:    ds_read_b32 v2, v2 offset:8
430; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
431; GFX7-NEXT:    s_setpc_b64 s[30:31]
432;
433; GFX6-LABEL: load_lds_v3i32_align8:
434; GFX6:       ; %bb.0:
435; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
436; GFX6-NEXT:    v_mov_b32_e32 v2, v0
437; GFX6-NEXT:    s_mov_b32 m0, -1
438; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
439; GFX6-NEXT:    ds_read_b64 v[0:1], v0
440; GFX6-NEXT:    ds_read_b32 v2, v2
441; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
442; GFX6-NEXT:    s_setpc_b64 s[30:31]
443;
444; GFX10-LABEL: load_lds_v3i32_align8:
445; GFX10:       ; %bb.0:
446; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
447; GFX10-NEXT:    v_mov_b32_e32 v2, v0
448; GFX10-NEXT:    ds_read_b64 v[0:1], v0
449; GFX10-NEXT:    ds_read_b32 v2, v2 offset:8
450; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
451; GFX10-NEXT:    s_setpc_b64 s[30:31]
452;
453; GFX11-LABEL: load_lds_v3i32_align8:
454; GFX11:       ; %bb.0:
455; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
456; GFX11-NEXT:    v_mov_b32_e32 v2, v0
457; GFX11-NEXT:    ds_load_b64 v[0:1], v0
458; GFX11-NEXT:    ds_load_b32 v2, v2 offset:8
459; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
460; GFX11-NEXT:    s_setpc_b64 s[30:31]
461  %load = load <3 x i32>, ptr addrspace(3) %ptr, align 8
462  ret <3 x i32> %load
463}
464
; Loads a <3 x i32> through an addrspace(3) pointer with the load marked
; align 16, and returns it. Per the checks below, GFX9, GFX7 and GFX10 expect
; a single ds_read_b96 and GFX11 expects a single ds_load_b96, while GFX6
; expects a ds_read_b64 plus a ds_read_b32 whose address is the pointer plus 8.
; GFX7 and GFX6 also expect m0 to be set to -1 before the reads.
465define <3 x i32> @load_lds_v3i32_align16(ptr addrspace(3) %ptr) {
466; GFX9-LABEL: load_lds_v3i32_align16:
467; GFX9:       ; %bb.0:
468; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
469; GFX9-NEXT:    ds_read_b96 v[0:2], v0
470; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
471; GFX9-NEXT:    s_setpc_b64 s[30:31]
472;
473; GFX7-LABEL: load_lds_v3i32_align16:
474; GFX7:       ; %bb.0:
475; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
476; GFX7-NEXT:    s_mov_b32 m0, -1
477; GFX7-NEXT:    ds_read_b96 v[0:2], v0
478; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX7-NEXT:    s_setpc_b64 s[30:31]
480;
481; GFX6-LABEL: load_lds_v3i32_align16:
482; GFX6:       ; %bb.0:
483; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
484; GFX6-NEXT:    v_mov_b32_e32 v2, v0
485; GFX6-NEXT:    s_mov_b32 m0, -1
486; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
487; GFX6-NEXT:    ds_read_b64 v[0:1], v0
488; GFX6-NEXT:    ds_read_b32 v2, v2
489; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
490; GFX6-NEXT:    s_setpc_b64 s[30:31]
491;
492; GFX10-LABEL: load_lds_v3i32_align16:
493; GFX10:       ; %bb.0:
494; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
495; GFX10-NEXT:    ds_read_b96 v[0:2], v0
496; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
497; GFX10-NEXT:    s_setpc_b64 s[30:31]
498;
499; GFX11-LABEL: load_lds_v3i32_align16:
500; GFX11:       ; %bb.0:
501; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
502; GFX11-NEXT:    ds_load_b96 v[0:2], v0
503; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
504; GFX11-NEXT:    s_setpc_b64 s[30:31]
505  %load = load <3 x i32>, ptr addrspace(3) %ptr, align 16
506  ret <3 x i32> %load
507}
508