xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll (revision ba52f06f9d92c7ca04b440f618f8d352ea121fcc)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-UNALIGNED %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-NOUNALIGNED %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s
6; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s
7; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-NOUNALIGNED %s
8
9; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
10
; Loads a <3 x i32> through an addrspace(4) pointer held in v[0:1], with only
; 1-byte alignment. With +unaligned-access-mode the expected code is a single
; 96-bit load. With -unaligned-access-mode, and on the tahiti run, twelve
; single-byte loads are expected, reassembled into three dwords with shifts
; and ors.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with that script instead of editing them by hand.
11define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
12; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
13; GFX12-UNALIGNED:       ; %bb.0:
14; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt_dscnt 0x0
15; GFX12-UNALIGNED-NEXT:    s_wait_expcnt 0x0
16; GFX12-UNALIGNED-NEXT:    s_wait_samplecnt 0x0
17; GFX12-UNALIGNED-NEXT:    s_wait_bvhcnt 0x0
18; GFX12-UNALIGNED-NEXT:    s_wait_kmcnt 0x0
19; GFX12-UNALIGNED-NEXT:    global_load_b96 v[0:2], v[0:1], off
20; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt 0x0
21; GFX12-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
22;
23; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
24; GFX12-NOUNALIGNED:       ; %bb.0:
25; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt_dscnt 0x0
26; GFX12-NOUNALIGNED-NEXT:    s_wait_expcnt 0x0
27; GFX12-NOUNALIGNED-NEXT:    s_wait_samplecnt 0x0
28; GFX12-NOUNALIGNED-NEXT:    s_wait_bvhcnt 0x0
29; GFX12-NOUNALIGNED-NEXT:    s_wait_kmcnt 0x0
30; GFX12-NOUNALIGNED-NEXT:    s_clause 0xb
31; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v2, v[0:1], off
32; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v3, v[0:1], off offset:1
33; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v4, v[0:1], off offset:2
34; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v5, v[0:1], off offset:3
35; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v6, v[0:1], off offset:4
36; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v7, v[0:1], off offset:5
37; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v8, v[0:1], off offset:6
38; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v9, v[0:1], off offset:7
39; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v10, v[0:1], off offset:8
40; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v11, v[0:1], off offset:9
41; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v12, v[0:1], off offset:11
42; GFX12-NOUNALIGNED-NEXT:    global_load_u8 v0, v[0:1], off offset:10
43; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0xa
44; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v3, 8, v2
45; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x9
46; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
47; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x8
48; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
49; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x6
50; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v4, v7, 8, v6
51; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x5
52; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
53; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x4
54; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 24, v9
55; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x2
56; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v7, v11, 8, v10
57; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x1
58; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 24, v12
59; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x0
60; GFX12-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
61; GFX12-NOUNALIGNED-NEXT:    v_or3_b32 v0, v2, v3, v1
62; GFX12-NOUNALIGNED-NEXT:    v_or3_b32 v1, v5, v6, v4
63; GFX12-NOUNALIGNED-NEXT:    s_delay_alu instid0(VALU_DEP_3)
64; GFX12-NOUNALIGNED-NEXT:    v_or3_b32 v2, v8, v9, v7
65; GFX12-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
66;
67; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
68; GFX9-UNALIGNED:       ; %bb.0:
69; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70; GFX9-UNALIGNED-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
71; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
72; GFX9-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
73;
74; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
75; GFX9-NOUNALIGNED:       ; %bb.0:
76; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
77; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v2, v[0:1], off
78; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v3, v[0:1], off offset:1
79; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v4, v[0:1], off offset:2
80; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v5, v[0:1], off offset:3
81; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v6, v[0:1], off offset:4
82; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v7, v[0:1], off offset:5
83; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v8, v[0:1], off offset:6
84; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v9, v[0:1], off offset:7
85; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v10, v[0:1], off offset:8
86; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v11, v[0:1], off offset:9
87; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v12, v[0:1], off offset:11
88; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v13, v[0:1], off offset:10
89; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
90; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 8, v2
91; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
92; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
93; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
94; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 24, v5
95; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v1, v2, v0
96; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
97; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v3, v7, 8, v6
98; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
99; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v8
100; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
101; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 24, v9
102; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v1, v4, v5, v3
103; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
104; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v6, v11, 8, v10
105; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
106; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 24, v12
107; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
108; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
109; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v2, v7, v8, v6
110; GFX9-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
111;
112; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
113; GFX7-UNALIGNED:       ; %bb.0:
114; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115; GFX7-UNALIGNED-NEXT:    s_mov_b32 s6, 0
116; GFX7-UNALIGNED-NEXT:    s_mov_b32 s7, 0xf000
117; GFX7-UNALIGNED-NEXT:    s_mov_b64 s[4:5], 0
118; GFX7-UNALIGNED-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
119; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
120; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
121;
122; GFX7-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
123; GFX7-NOUNALIGNED:       ; %bb.0:
124; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s6, 0
126; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s7, 0xf000
127; GFX7-NOUNALIGNED-NEXT:    s_mov_b64 s[4:5], 0
128; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
129; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
130; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
131; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:5
132; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:7
133; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:6
134; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:9
135; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:11
136; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v10, v[0:1], s[4:7], 0 addr64 offset:10
137; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64
138; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:4
139; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:8
140; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(11)
141; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
142; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
143; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
144; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
145; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
146; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
147; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
148; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(7)
149; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
150; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
151; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
152; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
153; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 8, v8
154; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
155; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 24, v9
156; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
157; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
158; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
159; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v1, v11
160; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v2, v3
161; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
162; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v3, v4, v12
163; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v4, v5, v6
164; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
165; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v5, v7, v0
166; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v6, v8, v9
167; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v2, v1
168; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v4, v3
169; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v6, v5
170; GFX7-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
171;
172; GFX6-LABEL: v_load_constant_v3i32_align1:
173; GFX6:       ; %bb.0:
174; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
175; GFX6-NEXT:    s_mov_b32 s6, 0
176; GFX6-NEXT:    s_mov_b32 s7, 0xf000
177; GFX6-NEXT:    s_mov_b64 s[4:5], 0
178; GFX6-NEXT:    buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
179; GFX6-NEXT:    buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
180; GFX6-NEXT:    buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
181; GFX6-NEXT:    buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:5
182; GFX6-NEXT:    buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:7
183; GFX6-NEXT:    buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:6
184; GFX6-NEXT:    buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:9
185; GFX6-NEXT:    buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:11
186; GFX6-NEXT:    buffer_load_ubyte v10, v[0:1], s[4:7], 0 addr64 offset:10
187; GFX6-NEXT:    buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64
188; GFX6-NEXT:    buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:4
189; GFX6-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:8
190; GFX6-NEXT:    s_waitcnt vmcnt(11)
191; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
192; GFX6-NEXT:    s_waitcnt vmcnt(10)
193; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
194; GFX6-NEXT:    s_waitcnt vmcnt(9)
195; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
196; GFX6-NEXT:    s_waitcnt vmcnt(8)
197; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v5
198; GFX6-NEXT:    s_waitcnt vmcnt(7)
199; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
200; GFX6-NEXT:    s_waitcnt vmcnt(6)
201; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
202; GFX6-NEXT:    s_waitcnt vmcnt(5)
203; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 8, v8
204; GFX6-NEXT:    s_waitcnt vmcnt(4)
205; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 24, v9
206; GFX6-NEXT:    s_waitcnt vmcnt(3)
207; GFX6-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
208; GFX6-NEXT:    s_waitcnt vmcnt(2)
209; GFX6-NEXT:    v_or_b32_e32 v1, v1, v11
210; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
211; GFX6-NEXT:    s_waitcnt vmcnt(1)
212; GFX6-NEXT:    v_or_b32_e32 v3, v4, v12
213; GFX6-NEXT:    v_or_b32_e32 v4, v5, v6
214; GFX6-NEXT:    s_waitcnt vmcnt(0)
215; GFX6-NEXT:    v_or_b32_e32 v5, v7, v0
216; GFX6-NEXT:    v_or_b32_e32 v6, v8, v9
217; GFX6-NEXT:    v_or_b32_e32 v0, v2, v1
218; GFX6-NEXT:    v_or_b32_e32 v1, v4, v3
219; GFX6-NEXT:    v_or_b32_e32 v2, v6, v5
220; GFX6-NEXT:    s_setpc_b64 s[30:31]
221  %load = load <3 x i32>, ptr addrspace(4) %ptr, align 1
222  ret <3 x i32> %load
223}
224
; Same <3 x i32> load through an addrspace(4) pointer in v[0:1], but with
; 2-byte alignment. With +unaligned-access-mode the expected code is still a
; single 96-bit load. With -unaligned-access-mode, and on the tahiti run, six
; 16-bit loads are expected, paired into three dwords with a shift-by-16 and
; an or.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with that script instead of editing them by hand.
225define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
226; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
227; GFX12-UNALIGNED:       ; %bb.0:
228; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt_dscnt 0x0
229; GFX12-UNALIGNED-NEXT:    s_wait_expcnt 0x0
230; GFX12-UNALIGNED-NEXT:    s_wait_samplecnt 0x0
231; GFX12-UNALIGNED-NEXT:    s_wait_bvhcnt 0x0
232; GFX12-UNALIGNED-NEXT:    s_wait_kmcnt 0x0
233; GFX12-UNALIGNED-NEXT:    global_load_b96 v[0:2], v[0:1], off
234; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt 0x0
235; GFX12-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
236;
237; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
238; GFX12-NOUNALIGNED:       ; %bb.0:
239; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt_dscnt 0x0
240; GFX12-NOUNALIGNED-NEXT:    s_wait_expcnt 0x0
241; GFX12-NOUNALIGNED-NEXT:    s_wait_samplecnt 0x0
242; GFX12-NOUNALIGNED-NEXT:    s_wait_bvhcnt 0x0
243; GFX12-NOUNALIGNED-NEXT:    s_wait_kmcnt 0x0
244; GFX12-NOUNALIGNED-NEXT:    s_clause 0x5
245; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v2, v[0:1], off
246; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v3, v[0:1], off offset:2
247; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v4, v[0:1], off offset:4
248; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v5, v[0:1], off offset:6
249; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v6, v[0:1], off offset:8
250; GFX12-NOUNALIGNED-NEXT:    global_load_u16 v7, v[0:1], off offset:10
251; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x4
252; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
253; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x2
254; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
255; GFX12-NOUNALIGNED-NEXT:    s_wait_loadcnt 0x0
256; GFX12-NOUNALIGNED-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
257; GFX12-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
258;
259; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
260; GFX9-UNALIGNED:       ; %bb.0:
261; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
262; GFX9-UNALIGNED-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
263; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
264; GFX9-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
265;
266; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
267; GFX9-NOUNALIGNED:       ; %bb.0:
268; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
269; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v2, v[0:1], off
270; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v3, v[0:1], off offset:2
271; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v4, v[0:1], off offset:4
272; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v5, v[0:1], off offset:6
273; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v6, v[0:1], off offset:8
274; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v7, v[0:1], off offset:10
275; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
276; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
277; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
278; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
279; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
280; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
281; GFX9-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
282;
283; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
284; GFX7-UNALIGNED:       ; %bb.0:
285; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
286; GFX7-UNALIGNED-NEXT:    s_mov_b32 s6, 0
287; GFX7-UNALIGNED-NEXT:    s_mov_b32 s7, 0xf000
288; GFX7-UNALIGNED-NEXT:    s_mov_b64 s[4:5], 0
289; GFX7-UNALIGNED-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
290; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
291; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
292;
293; GFX7-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
294; GFX7-NOUNALIGNED:       ; %bb.0:
295; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
296; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s6, 0
297; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s7, 0xf000
298; GFX7-NOUNALIGNED-NEXT:    s_mov_b64 s[4:5], 0
299; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2
300; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:6
301; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10
302; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64
303; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:4
304; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v7, v[0:1], s[4:7], 0 addr64 offset:8
305; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
306; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
307; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
308; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
309; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
310; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
311; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
312; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v5
313; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
314; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v1, v6
315; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
316; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v2, v7
317; GFX7-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
318;
319; GFX6-LABEL: v_load_constant_v3i32_align2:
320; GFX6:       ; %bb.0:
321; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
322; GFX6-NEXT:    s_mov_b32 s6, 0
323; GFX6-NEXT:    s_mov_b32 s7, 0xf000
324; GFX6-NEXT:    s_mov_b64 s[4:5], 0
325; GFX6-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2
326; GFX6-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:6
327; GFX6-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10
328; GFX6-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64
329; GFX6-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:4
330; GFX6-NEXT:    buffer_load_ushort v7, v[0:1], s[4:7], 0 addr64 offset:8
331; GFX6-NEXT:    s_waitcnt vmcnt(5)
332; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
333; GFX6-NEXT:    s_waitcnt vmcnt(4)
334; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
335; GFX6-NEXT:    s_waitcnt vmcnt(3)
336; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
337; GFX6-NEXT:    s_waitcnt vmcnt(2)
338; GFX6-NEXT:    v_or_b32_e32 v0, v0, v5
339; GFX6-NEXT:    s_waitcnt vmcnt(1)
340; GFX6-NEXT:    v_or_b32_e32 v1, v1, v6
341; GFX6-NEXT:    s_waitcnt vmcnt(0)
342; GFX6-NEXT:    v_or_b32_e32 v2, v2, v7
343; GFX6-NEXT:    s_setpc_b64 s[30:31]
344  %load = load <3 x i32>, ptr addrspace(4) %ptr, align 2
345  ret <3 x i32> %load
346}
347
; <3 x i32> load through an addrspace(4) pointer in v[0:1] with 4-byte
; alignment. The expected code no longer depends on unaligned-access-mode, so
; the shared per-target prefixes are used. The gfx1200, gfx900 and hawaii runs
; expect a single 96-bit load; the tahiti run expects a dwordx2 load plus a
; dword load, with the first two dwords copied into v0/v1 afterwards.
; NOTE(review): presumably tahiti has no dwordx3 load - confirm.
348define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
349; GFX12-LABEL: v_load_constant_v3i32_align4:
350; GFX12:       ; %bb.0:
351; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
352; GFX12-NEXT:    s_wait_expcnt 0x0
353; GFX12-NEXT:    s_wait_samplecnt 0x0
354; GFX12-NEXT:    s_wait_bvhcnt 0x0
355; GFX12-NEXT:    s_wait_kmcnt 0x0
356; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
357; GFX12-NEXT:    s_wait_loadcnt 0x0
358; GFX12-NEXT:    s_setpc_b64 s[30:31]
359;
360; GFX9-LABEL: v_load_constant_v3i32_align4:
361; GFX9:       ; %bb.0:
362; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
363; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
364; GFX9-NEXT:    s_waitcnt vmcnt(0)
365; GFX9-NEXT:    s_setpc_b64 s[30:31]
366;
367; GFX7-LABEL: v_load_constant_v3i32_align4:
368; GFX7:       ; %bb.0:
369; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
370; GFX7-NEXT:    s_mov_b32 s6, 0
371; GFX7-NEXT:    s_mov_b32 s7, 0xf000
372; GFX7-NEXT:    s_mov_b64 s[4:5], 0
373; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
374; GFX7-NEXT:    s_waitcnt vmcnt(0)
375; GFX7-NEXT:    s_setpc_b64 s[30:31]
376;
377; GFX6-LABEL: v_load_constant_v3i32_align4:
378; GFX6:       ; %bb.0:
379; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
380; GFX6-NEXT:    s_mov_b32 s6, 0
381; GFX6-NEXT:    s_mov_b32 s7, 0xf000
382; GFX6-NEXT:    s_mov_b64 s[4:5], 0
383; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
384; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
385; GFX6-NEXT:    s_waitcnt vmcnt(1)
386; GFX6-NEXT:    v_mov_b32_e32 v0, v3
387; GFX6-NEXT:    v_mov_b32_e32 v1, v4
388; GFX6-NEXT:    s_waitcnt vmcnt(0)
389; GFX6-NEXT:    s_setpc_b64 s[30:31]
390  %load = load <3 x i32>, ptr addrspace(4) %ptr, align 4
391  ret <3 x i32> %load
392}
393
; Scalar i96 load through an addrspace(4) pointer in v[0:1] with 8-byte
; alignment. The expected instructions are the same as for the 4-byte-aligned
; <3 x i32> case in this file: one 96-bit load on the gfx1200, gfx900 and
; hawaii runs, and a dwordx2 + dword pair followed by copies into v0/v1 on the
; tahiti run.
394define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
395; GFX12-LABEL: v_load_constant_i96_align8:
396; GFX12:       ; %bb.0:
397; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
398; GFX12-NEXT:    s_wait_expcnt 0x0
399; GFX12-NEXT:    s_wait_samplecnt 0x0
400; GFX12-NEXT:    s_wait_bvhcnt 0x0
401; GFX12-NEXT:    s_wait_kmcnt 0x0
402; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
403; GFX12-NEXT:    s_wait_loadcnt 0x0
404; GFX12-NEXT:    s_setpc_b64 s[30:31]
405;
406; GFX9-LABEL: v_load_constant_i96_align8:
407; GFX9:       ; %bb.0:
408; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
409; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
410; GFX9-NEXT:    s_waitcnt vmcnt(0)
411; GFX9-NEXT:    s_setpc_b64 s[30:31]
412;
413; GFX7-LABEL: v_load_constant_i96_align8:
414; GFX7:       ; %bb.0:
415; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
416; GFX7-NEXT:    s_mov_b32 s6, 0
417; GFX7-NEXT:    s_mov_b32 s7, 0xf000
418; GFX7-NEXT:    s_mov_b64 s[4:5], 0
419; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
420; GFX7-NEXT:    s_waitcnt vmcnt(0)
421; GFX7-NEXT:    s_setpc_b64 s[30:31]
422;
423; GFX6-LABEL: v_load_constant_i96_align8:
424; GFX6:       ; %bb.0:
425; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
426; GFX6-NEXT:    s_mov_b32 s6, 0
427; GFX6-NEXT:    s_mov_b32 s7, 0xf000
428; GFX6-NEXT:    s_mov_b64 s[4:5], 0
429; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
430; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
431; GFX6-NEXT:    s_waitcnt vmcnt(1)
432; GFX6-NEXT:    v_mov_b32_e32 v0, v3
433; GFX6-NEXT:    v_mov_b32_e32 v1, v4
434; GFX6-NEXT:    s_waitcnt vmcnt(0)
435; GFX6-NEXT:    s_setpc_b64 s[30:31]
436  %load = load i96, ptr addrspace(4) %ptr, align 8
437  ret i96 %load
438}
439
; <3 x i32> load through an addrspace(4) pointer in v[0:1] with 8-byte
; alignment. The gfx1200, gfx900 and hawaii runs expect one 96-bit load
; straight into v[0:2]. The tahiti run expects a dwordx2 load into v[3:4] plus
; a dword load into v2, then copies v3/v4 into v0/v1.
440define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
441; GFX12-LABEL: v_load_constant_v3i32_align8:
442; GFX12:       ; %bb.0:
443; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
444; GFX12-NEXT:    s_wait_expcnt 0x0
445; GFX12-NEXT:    s_wait_samplecnt 0x0
446; GFX12-NEXT:    s_wait_bvhcnt 0x0
447; GFX12-NEXT:    s_wait_kmcnt 0x0
448; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
449; GFX12-NEXT:    s_wait_loadcnt 0x0
450; GFX12-NEXT:    s_setpc_b64 s[30:31]
451;
452; GFX9-LABEL: v_load_constant_v3i32_align8:
453; GFX9:       ; %bb.0:
454; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
455; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
456; GFX9-NEXT:    s_waitcnt vmcnt(0)
457; GFX9-NEXT:    s_setpc_b64 s[30:31]
458;
459; GFX7-LABEL: v_load_constant_v3i32_align8:
460; GFX7:       ; %bb.0:
461; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
462; GFX7-NEXT:    s_mov_b32 s6, 0
463; GFX7-NEXT:    s_mov_b32 s7, 0xf000
464; GFX7-NEXT:    s_mov_b64 s[4:5], 0
465; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
466; GFX7-NEXT:    s_waitcnt vmcnt(0)
467; GFX7-NEXT:    s_setpc_b64 s[30:31]
468;
469; GFX6-LABEL: v_load_constant_v3i32_align8:
470; GFX6:       ; %bb.0:
471; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
472; GFX6-NEXT:    s_mov_b32 s6, 0
473; GFX6-NEXT:    s_mov_b32 s7, 0xf000
474; GFX6-NEXT:    s_mov_b64 s[4:5], 0
475; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64
476; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8
477; GFX6-NEXT:    s_waitcnt vmcnt(1)
478; GFX6-NEXT:    v_mov_b32_e32 v0, v3
479; GFX6-NEXT:    v_mov_b32_e32 v1, v4
480; GFX6-NEXT:    s_waitcnt vmcnt(0)
481; GFX6-NEXT:    s_setpc_b64 s[30:31]
482  %load = load <3 x i32>, ptr addrspace(4) %ptr, align 8
483  ret <3 x i32> %load
484}
485
; <6 x i16> load through an addrspace(4) pointer in v[0:1] with 8-byte
; alignment. The gfx1200 and gfx900 runs expect one 96-bit load into v[0:2]
; and nothing else before the return. The hawaii and tahiti runs additionally
; expect each loaded dword to be split across two registers: the dword itself
; in the even register and a copy shifted right by 16 in the odd one
; (v0..v5). The tahiti run also splits the load into dwordx2 + dword.
486define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
487; GFX12-LABEL: v_load_constant_v6i16_align8:
488; GFX12:       ; %bb.0:
489; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
490; GFX12-NEXT:    s_wait_expcnt 0x0
491; GFX12-NEXT:    s_wait_samplecnt 0x0
492; GFX12-NEXT:    s_wait_bvhcnt 0x0
493; GFX12-NEXT:    s_wait_kmcnt 0x0
494; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
495; GFX12-NEXT:    s_wait_loadcnt 0x0
496; GFX12-NEXT:    s_setpc_b64 s[30:31]
497;
498; GFX9-LABEL: v_load_constant_v6i16_align8:
499; GFX9:       ; %bb.0:
500; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
501; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
502; GFX9-NEXT:    s_waitcnt vmcnt(0)
503; GFX9-NEXT:    s_setpc_b64 s[30:31]
504;
505; GFX7-LABEL: v_load_constant_v6i16_align8:
506; GFX7:       ; %bb.0:
507; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
508; GFX7-NEXT:    s_mov_b32 s6, 0
509; GFX7-NEXT:    s_mov_b32 s7, 0xf000
510; GFX7-NEXT:    s_mov_b64 s[4:5], 0
511; GFX7-NEXT:    buffer_load_dwordx3 v[6:8], v[0:1], s[4:7], 0 addr64
512; GFX7-NEXT:    s_waitcnt vmcnt(0)
513; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
514; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
515; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
516; GFX7-NEXT:    v_mov_b32_e32 v0, v6
517; GFX7-NEXT:    v_mov_b32_e32 v2, v7
518; GFX7-NEXT:    v_mov_b32_e32 v4, v8
519; GFX7-NEXT:    s_setpc_b64 s[30:31]
520;
521; GFX6-LABEL: v_load_constant_v6i16_align8:
522; GFX6:       ; %bb.0:
523; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
524; GFX6-NEXT:    s_mov_b32 s6, 0
525; GFX6-NEXT:    s_mov_b32 s7, 0xf000
526; GFX6-NEXT:    s_mov_b64 s[4:5], 0
527; GFX6-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
528; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8
529; GFX6-NEXT:    s_waitcnt vmcnt(1)
530; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
531; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
532; GFX6-NEXT:    s_waitcnt vmcnt(0)
533; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
534; GFX6-NEXT:    v_mov_b32_e32 v0, v6
535; GFX6-NEXT:    v_mov_b32_e32 v2, v7
536; GFX6-NEXT:    s_setpc_b64 s[30:31]
537  %load = load <6 x i16>, ptr addrspace(4) %ptr, align 8
538  ret <6 x i16> %load
539}
540
; <12 x i8> load through an addrspace(4) pointer in v[0:1] with 8-byte
; alignment. Every run expects the three loaded dwords to be spread over
; v0..v11: each dword is kept in one register and copies shifted right by 8,
; 16 and 24 go into the following three. The gfx1200, gfx900 and hawaii runs
; use one 96-bit load; the tahiti run uses a dwordx2 load plus a dword load.
541define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
542; GFX12-LABEL: v_load_constant_v12i8_align8:
543; GFX12:       ; %bb.0:
544; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
545; GFX12-NEXT:    s_wait_expcnt 0x0
546; GFX12-NEXT:    s_wait_samplecnt 0x0
547; GFX12-NEXT:    s_wait_bvhcnt 0x0
548; GFX12-NEXT:    s_wait_kmcnt 0x0
549; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
550; GFX12-NEXT:    s_wait_loadcnt 0x0
551; GFX12-NEXT:    v_lshrrev_b32_e32 v13, 8, v0
552; GFX12-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
553; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
554; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
555; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
556; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
557; GFX12-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
558; GFX12-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
559; GFX12-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
560; GFX12-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13
561; GFX12-NEXT:    v_mov_b32_e32 v8, v2
562; GFX12-NEXT:    v_mov_b32_e32 v2, v12
563; GFX12-NEXT:    s_setpc_b64 s[30:31]
564;
565; GFX9-LABEL: v_load_constant_v12i8_align8:
566; GFX9:       ; %bb.0:
567; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
568; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
569; GFX9-NEXT:    s_waitcnt vmcnt(0)
570; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v0
571; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
572; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
573; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
574; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
575; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
576; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
577; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
578; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
579; GFX9-NEXT:    v_mov_b32_e32 v4, v1
580; GFX9-NEXT:    v_mov_b32_e32 v8, v2
581; GFX9-NEXT:    v_mov_b32_e32 v1, v13
582; GFX9-NEXT:    v_mov_b32_e32 v2, v12
583; GFX9-NEXT:    s_setpc_b64 s[30:31]
584;
585; GFX7-LABEL: v_load_constant_v12i8_align8:
586; GFX7:       ; %bb.0:
587; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
588; GFX7-NEXT:    s_mov_b32 s6, 0
589; GFX7-NEXT:    s_mov_b32 s7, 0xf000
590; GFX7-NEXT:    s_mov_b64 s[4:5], 0
591; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
592; GFX7-NEXT:    s_waitcnt vmcnt(0)
593; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 8, v0
594; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
595; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
596; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
597; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
598; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
599; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
600; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
601; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
602; GFX7-NEXT:    v_mov_b32_e32 v4, v1
603; GFX7-NEXT:    v_mov_b32_e32 v8, v2
604; GFX7-NEXT:    v_mov_b32_e32 v1, v13
605; GFX7-NEXT:    v_mov_b32_e32 v2, v12
606; GFX7-NEXT:    s_setpc_b64 s[30:31]
607;
608; GFX6-LABEL: v_load_constant_v12i8_align8:
609; GFX6:       ; %bb.0:
610; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
611; GFX6-NEXT:    s_mov_b32 s6, 0
612; GFX6-NEXT:    s_mov_b32 s7, 0xf000
613; GFX6-NEXT:    s_mov_b64 s[4:5], 0
614; GFX6-NEXT:    buffer_load_dwordx2 v[12:13], v[0:1], s[4:7], 0 addr64
615; GFX6-NEXT:    buffer_load_dword v8, v[0:1], s[4:7], 0 addr64 offset:8
616; GFX6-NEXT:    s_waitcnt vmcnt(1)
617; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 8, v12
618; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v12
619; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 24, v12
620; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 8, v13
621; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v13
622; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v13
623; GFX6-NEXT:    s_waitcnt vmcnt(0)
624; GFX6-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
625; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
626; GFX6-NEXT:    v_lshrrev_b32_e32 v11, 24, v8
627; GFX6-NEXT:    v_mov_b32_e32 v0, v12
628; GFX6-NEXT:    v_mov_b32_e32 v4, v13
629; GFX6-NEXT:    s_setpc_b64 s[30:31]
630  %load = load <12 x i8>, ptr addrspace(4) %ptr, align 8
631  ret <12 x i8> %load
632}
633
; <3 x i32> load through an addrspace(4) pointer in v[0:1] with 16-byte
; alignment. The gfx1200, gfx900 and hawaii runs expect one 96-bit load into
; v[0:2]. The tahiti run expects a single dwordx4 load into v[0:3] rather than
; the dwordx2 + dword split it uses at smaller alignments.
; NOTE(review): presumably the 16-byte alignment is what permits widening the
; load to four dwords - confirm.
634define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) {
635; GFX12-LABEL: v_load_constant_v3i32_align16:
636; GFX12:       ; %bb.0:
637; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
638; GFX12-NEXT:    s_wait_expcnt 0x0
639; GFX12-NEXT:    s_wait_samplecnt 0x0
640; GFX12-NEXT:    s_wait_bvhcnt 0x0
641; GFX12-NEXT:    s_wait_kmcnt 0x0
642; GFX12-NEXT:    global_load_b96 v[0:2], v[0:1], off
643; GFX12-NEXT:    s_wait_loadcnt 0x0
644; GFX12-NEXT:    s_setpc_b64 s[30:31]
645;
646; GFX9-LABEL: v_load_constant_v3i32_align16:
647; GFX9:       ; %bb.0:
648; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
649; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
650; GFX9-NEXT:    s_waitcnt vmcnt(0)
651; GFX9-NEXT:    s_setpc_b64 s[30:31]
652;
653; GFX7-LABEL: v_load_constant_v3i32_align16:
654; GFX7:       ; %bb.0:
655; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
656; GFX7-NEXT:    s_mov_b32 s6, 0
657; GFX7-NEXT:    s_mov_b32 s7, 0xf000
658; GFX7-NEXT:    s_mov_b64 s[4:5], 0
659; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
660; GFX7-NEXT:    s_waitcnt vmcnt(0)
661; GFX7-NEXT:    s_setpc_b64 s[30:31]
662;
663; GFX6-LABEL: v_load_constant_v3i32_align16:
664; GFX6:       ; %bb.0:
665; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
666; GFX6-NEXT:    s_mov_b32 s6, 0
667; GFX6-NEXT:    s_mov_b32 s7, 0xf000
668; GFX6-NEXT:    s_mov_b64 s[4:5], 0
669; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
670; GFX6-NEXT:    s_waitcnt vmcnt(0)
671; GFX6-NEXT:    s_setpc_b64 s[30:31]
672  %load = load <3 x i32>, ptr addrspace(4) %ptr, align 16
673  ret <3 x i32> %load
674}
675
; amdgpu_ps function loading a <3 x i32> through an inreg addrspace(4) pointer
; with only 1-byte alignment; the result is returned in s0-s2.
; With the *-UNALIGNED prefixes the checks expect one 96-bit vector load whose
; lanes are moved to SGPRs with v_readfirstlane_b32. With the *-NOUNALIGNED
; prefixes and on GFX6 the checks expect twelve single-byte loads that are
; recombined with shifts and ORs (scalar s_load_u8 on GFX12, vector ubyte loads
; followed by v_readfirstlane_b32 on the older targets).
676define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg %ptr) {
677; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
678; GFX12-UNALIGNED:       ; %bb.0:
679; GFX12-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
680; GFX12-UNALIGNED-NEXT:    global_load_b96 v[0:2], v0, s[0:1]
681; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt 0x0
682; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
683; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
684; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
685; GFX12-UNALIGNED-NEXT:    ; return to shader part epilog
686;
687; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
688; GFX12-NOUNALIGNED:       ; %bb.0:
689; GFX12-NOUNALIGNED-NEXT:    s_clause 0xb
690; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s2, s[0:1], 0x1
691; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s3, s[0:1], 0x3
692; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s4, s[0:1], 0x2
693; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s5, s[0:1], 0x5
694; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s6, s[0:1], 0x7
695; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s7, s[0:1], 0x6
696; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s8, s[0:1], 0x9
697; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s9, s[0:1], 0xb
698; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s10, s[0:1], 0x0
699; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s11, s[0:1], 0x4
700; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s12, s[0:1], 0xa
701; GFX12-NOUNALIGNED-NEXT:    s_load_u8 s1, s[0:1], 0x8
702; GFX12-NOUNALIGNED-NEXT:    s_wait_kmcnt 0x0
703; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s0, s2, 8
704; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s2, s3, 24
705; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s3, s4, 16
706; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s4, s5, 8
707; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s2, s2, s3
708; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s5, s6, 24
709; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s6, s7, 16
710; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s7, s8, 8
711; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s0, s0, s10
712; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s8, s9, 24
713; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s0, s2, s0
714; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s2, s12, 16
715; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s3, s4, s11
716; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s4, s5, s6
717; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s5, s7, s1
718; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s2, s8, s2
719; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s1, s4, s3
720; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s2, s2, s5
721; GFX12-NOUNALIGNED-NEXT:    ; return to shader part epilog
722;
723; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
724; GFX9-UNALIGNED:       ; %bb.0:
725; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
726; GFX9-UNALIGNED-NEXT:    global_load_dwordx3 v[0:2], v0, s[0:1]
727; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
728; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
729; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
730; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
731; GFX9-UNALIGNED-NEXT:    ; return to shader part epilog
732;
733; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
734; GFX9-NOUNALIGNED:       ; %bb.0:
735; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
736; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v1, v0, s[0:1]
737; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:1
738; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:2
739; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:3
740; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:4
741; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:5
742; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:6
743; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:7
744; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:8
745; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:9
746; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:11
747; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:10
748; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
749; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
750; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
751; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
752; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
753; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 24, v4
754; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v0, v1, v2, v0
755; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
756; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
757; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
758; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
759; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
760; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
761; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v1, v4, v5, v3
762; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
763; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v6, v10, 8, v9
764; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
765; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 24, v11
766; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
767; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
768; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v2, v7, v8, v6
769; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
770; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
771; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
772; GFX9-NOUNALIGNED-NEXT:    ; return to shader part epilog
773;
774; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
775; GFX7-UNALIGNED:       ; %bb.0:
776; GFX7-UNALIGNED-NEXT:    s_mov_b32 s2, -1
777; GFX7-UNALIGNED-NEXT:    s_mov_b32 s3, 0xf000
778; GFX7-UNALIGNED-NEXT:    buffer_load_dwordx3 v[0:2], off, s[0:3], 0
779; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
780; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
781; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
782; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
783; GFX7-UNALIGNED-NEXT:    ; return to shader part epilog
784;
785; GFX7-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
786; GFX7-NOUNALIGNED:       ; %bb.0:
787; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s2, -1
788; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s3, 0xf000
789; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:1
790; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0 offset:3
791; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v2, off, s[0:3], 0 offset:2
792; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v3, off, s[0:3], 0 offset:5
793; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v4, off, s[0:3], 0 offset:7
794; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v5, off, s[0:3], 0 offset:6
795; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v6, off, s[0:3], 0 offset:9
796; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v7, off, s[0:3], 0 offset:11
797; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v8, off, s[0:3], 0 offset:10
798; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v9, off, s[0:3], 0
799; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v10, off, s[0:3], 0 offset:4
800; GFX7-NOUNALIGNED-NEXT:    buffer_load_ubyte v11, off, s[0:3], 0 offset:8
801; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(11)
802; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
803; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(10)
804; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
805; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(9)
806; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
807; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(8)
808; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
809; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(7)
810; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
811; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(6)
812; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
813; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
814; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
815; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
816; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
817; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
818; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
819; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
820; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v9
821; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v1, v2
822; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
823; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v3, v10
824; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v3, v4, v5
825; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
826; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v4, v6, v11
827; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v5, v7, v8
828; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v1, v0
829; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v3, v2
830; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v5, v4
831; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
832; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
833; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
834; GFX7-NOUNALIGNED-NEXT:    ; return to shader part epilog
835;
836; GFX6-LABEL: s_load_constant_v3i32_align1:
837; GFX6:       ; %bb.0:
838; GFX6-NEXT:    s_mov_b32 s2, -1
839; GFX6-NEXT:    s_mov_b32 s3, 0xf000
840; GFX6-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:1
841; GFX6-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0 offset:3
842; GFX6-NEXT:    buffer_load_ubyte v2, off, s[0:3], 0 offset:2
843; GFX6-NEXT:    buffer_load_ubyte v3, off, s[0:3], 0 offset:5
844; GFX6-NEXT:    buffer_load_ubyte v4, off, s[0:3], 0 offset:7
845; GFX6-NEXT:    buffer_load_ubyte v5, off, s[0:3], 0 offset:6
846; GFX6-NEXT:    buffer_load_ubyte v6, off, s[0:3], 0 offset:9
847; GFX6-NEXT:    buffer_load_ubyte v7, off, s[0:3], 0 offset:11
848; GFX6-NEXT:    buffer_load_ubyte v8, off, s[0:3], 0 offset:10
849; GFX6-NEXT:    buffer_load_ubyte v9, off, s[0:3], 0
850; GFX6-NEXT:    buffer_load_ubyte v10, off, s[0:3], 0 offset:4
851; GFX6-NEXT:    buffer_load_ubyte v11, off, s[0:3], 0 offset:8
852; GFX6-NEXT:    s_waitcnt vmcnt(11)
853; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
854; GFX6-NEXT:    s_waitcnt vmcnt(10)
855; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
856; GFX6-NEXT:    s_waitcnt vmcnt(9)
857; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
858; GFX6-NEXT:    s_waitcnt vmcnt(8)
859; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
860; GFX6-NEXT:    s_waitcnt vmcnt(7)
861; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
862; GFX6-NEXT:    s_waitcnt vmcnt(6)
863; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
864; GFX6-NEXT:    s_waitcnt vmcnt(5)
865; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
866; GFX6-NEXT:    s_waitcnt vmcnt(4)
867; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
868; GFX6-NEXT:    s_waitcnt vmcnt(3)
869; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
870; GFX6-NEXT:    s_waitcnt vmcnt(2)
871; GFX6-NEXT:    v_or_b32_e32 v0, v0, v9
872; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
873; GFX6-NEXT:    s_waitcnt vmcnt(1)
874; GFX6-NEXT:    v_or_b32_e32 v2, v3, v10
875; GFX6-NEXT:    v_or_b32_e32 v3, v4, v5
876; GFX6-NEXT:    s_waitcnt vmcnt(0)
877; GFX6-NEXT:    v_or_b32_e32 v4, v6, v11
878; GFX6-NEXT:    v_or_b32_e32 v5, v7, v8
879; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
880; GFX6-NEXT:    v_or_b32_e32 v1, v3, v2
881; GFX6-NEXT:    v_or_b32_e32 v2, v5, v4
882; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
883; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
884; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
885; GFX6-NEXT:    ; return to shader part epilog
886  %load = load <3 x i32>, ptr addrspace(4) %ptr, align 1
887  ret <3 x i32> %load
888}
889
; amdgpu_ps function loading a <3 x i32> through an inreg addrspace(4) pointer
; with 2-byte alignment; the result is returned in s0-s2.
; With the *-UNALIGNED prefixes the checks expect one 96-bit vector load plus
; v_readfirstlane_b32 copies. With the *-NOUNALIGNED prefixes and on GFX6 the
; checks expect six 16-bit loads, each pair merged into one dword by shifting
; the high half left by 16 and OR-ing in the low half.
890define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg %ptr) {
891; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
892; GFX12-UNALIGNED:       ; %bb.0:
893; GFX12-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
894; GFX12-UNALIGNED-NEXT:    global_load_b96 v[0:2], v0, s[0:1]
895; GFX12-UNALIGNED-NEXT:    s_wait_loadcnt 0x0
896; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
897; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
898; GFX12-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
899; GFX12-UNALIGNED-NEXT:    ; return to shader part epilog
900;
901; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
902; GFX12-NOUNALIGNED:       ; %bb.0:
903; GFX12-NOUNALIGNED-NEXT:    s_clause 0x5
904; GFX12-NOUNALIGNED-NEXT:    s_load_u16 s2, s[0:1], 0x2
905; GFX12-NOUNALIGNED-NEXT:    s_load_u16 s3, s[0:1], 0x6
906; GFX12-NOUNALIGNED-NEXT:    s_load_u16 s4, s[0:1], 0xa
907; GFX12-NOUNALIGNED-NEXT:    s_load_u16 s5, s[0:1], 0x0
908; GFX12-NOUNALIGNED-NEXT:    s_load_u16 s6, s[0:1], 0x4
909; GFX12-NOUNALIGNED-NEXT:    s_load_u16 s7, s[0:1], 0x8
910; GFX12-NOUNALIGNED-NEXT:    s_wait_kmcnt 0x0
911; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s0, s2, 16
912; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s1, s3, 16
913; GFX12-NOUNALIGNED-NEXT:    s_lshl_b32 s2, s4, 16
914; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s0, s0, s5
915; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s1, s1, s6
916; GFX12-NOUNALIGNED-NEXT:    s_or_b32 s2, s2, s7
917; GFX12-NOUNALIGNED-NEXT:    ; return to shader part epilog
918;
919; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
920; GFX9-UNALIGNED:       ; %bb.0:
921; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
922; GFX9-UNALIGNED-NEXT:    global_load_dwordx3 v[0:2], v0, s[0:1]
923; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
924; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
925; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
926; GFX9-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
927; GFX9-UNALIGNED-NEXT:    ; return to shader part epilog
928;
929; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
930; GFX9-NOUNALIGNED:       ; %bb.0:
931; GFX9-NOUNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
932; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v1, v0, s[0:1]
933; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2
934; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v3, v0, s[0:1] offset:4
935; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v4, v0, s[0:1] offset:6
936; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v5, v0, s[0:1] offset:8
937; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v6, v0, s[0:1] offset:10
938; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
939; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
940; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
941; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
942; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
943; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
944; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
945; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
946; GFX9-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
947; GFX9-NOUNALIGNED-NEXT:    ; return to shader part epilog
948;
949; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
950; GFX7-UNALIGNED:       ; %bb.0:
951; GFX7-UNALIGNED-NEXT:    s_mov_b32 s2, -1
952; GFX7-UNALIGNED-NEXT:    s_mov_b32 s3, 0xf000
953; GFX7-UNALIGNED-NEXT:    buffer_load_dwordx3 v[0:2], off, s[0:3], 0
954; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
955; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
956; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
957; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
958; GFX7-UNALIGNED-NEXT:    ; return to shader part epilog
959;
960; GFX7-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
961; GFX7-NOUNALIGNED:       ; %bb.0:
962; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s2, -1
963; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s3, 0xf000
964; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
965; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:6
966; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v2, off, s[0:3], 0 offset:10
967; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v3, off, s[0:3], 0
968; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 offset:4
969; GFX7-NOUNALIGNED-NEXT:    buffer_load_ushort v5, off, s[0:3], 0 offset:8
970; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(5)
971; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
972; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(4)
973; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
974; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(3)
975; GFX7-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
976; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(2)
977; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v3
978; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(1)
979; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v1, v4
980; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
981; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v2, v5
982; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
983; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
984; GFX7-NOUNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
985; GFX7-NOUNALIGNED-NEXT:    ; return to shader part epilog
986;
987; GFX6-LABEL: s_load_constant_v3i32_align2:
988; GFX6:       ; %bb.0:
989; GFX6-NEXT:    s_mov_b32 s2, -1
990; GFX6-NEXT:    s_mov_b32 s3, 0xf000
991; GFX6-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
992; GFX6-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:6
993; GFX6-NEXT:    buffer_load_ushort v2, off, s[0:3], 0 offset:10
994; GFX6-NEXT:    buffer_load_ushort v3, off, s[0:3], 0
995; GFX6-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 offset:4
996; GFX6-NEXT:    buffer_load_ushort v5, off, s[0:3], 0 offset:8
997; GFX6-NEXT:    s_waitcnt vmcnt(5)
998; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
999; GFX6-NEXT:    s_waitcnt vmcnt(4)
1000; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1001; GFX6-NEXT:    s_waitcnt vmcnt(3)
1002; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1003; GFX6-NEXT:    s_waitcnt vmcnt(2)
1004; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
1005; GFX6-NEXT:    s_waitcnt vmcnt(1)
1006; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
1007; GFX6-NEXT:    s_waitcnt vmcnt(0)
1008; GFX6-NEXT:    v_or_b32_e32 v2, v2, v5
1009; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
1010; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
1011; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
1012; GFX6-NEXT:    ; return to shader part epilog
1013  %load = load <3 x i32>, ptr addrspace(4) %ptr, align 2
1014  ret <3 x i32> %load
1015}
1016
; amdgpu_ps function loading a <3 x i32> through an inreg addrspace(4) pointer
; with 4-byte alignment. The checks expect scalar loads on every target: one
; s_load_b96 on GFX12, and an s_load_dwordx2 plus s_load_dword pair on GFX9,
; GFX7 and GFX6, followed by s_mov_b32 copies into s0/s1.
; NOTE(review): the second load's immediate is 0x8 on GFX9 but 0x2 on GFX7/GFX6;
; presumably a byte offset versus a dword offset - confirm against the ISA docs.
1017define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg %ptr) {
1018; GFX12-LABEL: s_load_constant_v3i32_align4:
1019; GFX12:       ; %bb.0:
1020; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
1021; GFX12-NEXT:    s_wait_kmcnt 0x0
1022; GFX12-NEXT:    ; return to shader part epilog
1023;
1024; GFX9-LABEL: s_load_constant_v3i32_align4:
1025; GFX9:       ; %bb.0:
1026; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1027; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x8
1028; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1029; GFX9-NEXT:    s_mov_b32 s0, s4
1030; GFX9-NEXT:    s_mov_b32 s1, s5
1031; GFX9-NEXT:    ; return to shader part epilog
1032;
1033; GFX7-LABEL: s_load_constant_v3i32_align4:
1034; GFX7:       ; %bb.0:
1035; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1036; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x2
1037; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1038; GFX7-NEXT:    s_mov_b32 s0, s4
1039; GFX7-NEXT:    s_mov_b32 s1, s5
1040; GFX7-NEXT:    ; return to shader part epilog
1041;
1042; GFX6-LABEL: s_load_constant_v3i32_align4:
1043; GFX6:       ; %bb.0:
1044; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1045; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
1046; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1047; GFX6-NEXT:    s_mov_b32 s0, s4
1048; GFX6-NEXT:    s_mov_b32 s1, s5
1049; GFX6-NEXT:    ; return to shader part epilog
1050  %load = load <3 x i32>, ptr addrspace(4) %ptr, align 4
1051  ret <3 x i32> %load
1052}
1053
; amdgpu_ps function loading a scalar i96 (rather than a vector) through an
; inreg addrspace(4) pointer with 8-byte alignment. The checks expect one
; s_load_b96 on GFX12, and an s_load_dwordx2 plus s_load_dword pair with
; s_mov_b32 copies into s0/s1 on GFX9, GFX7 and GFX6.
1054define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
1055; GFX12-LABEL: s_load_constant_i96_align8:
1056; GFX12:       ; %bb.0:
1057; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
1058; GFX12-NEXT:    s_wait_kmcnt 0x0
1059; GFX12-NEXT:    ; return to shader part epilog
1060;
1061; GFX9-LABEL: s_load_constant_i96_align8:
1062; GFX9:       ; %bb.0:
1063; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1064; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x8
1065; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1066; GFX9-NEXT:    s_mov_b32 s0, s4
1067; GFX9-NEXT:    s_mov_b32 s1, s5
1068; GFX9-NEXT:    ; return to shader part epilog
1069;
1070; GFX7-LABEL: s_load_constant_i96_align8:
1071; GFX7:       ; %bb.0:
1072; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1073; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x2
1074; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1075; GFX7-NEXT:    s_mov_b32 s0, s4
1076; GFX7-NEXT:    s_mov_b32 s1, s5
1077; GFX7-NEXT:    ; return to shader part epilog
1078;
1079; GFX6-LABEL: s_load_constant_i96_align8:
1080; GFX6:       ; %bb.0:
1081; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1082; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
1083; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1084; GFX6-NEXT:    s_mov_b32 s0, s4
1085; GFX6-NEXT:    s_mov_b32 s1, s5
1086; GFX6-NEXT:    ; return to shader part epilog
1087  %load = load i96, ptr addrspace(4) %ptr, align 8
1088  ret i96 %load
1089}
1090
; amdgpu_ps function loading a <3 x i32> through an inreg addrspace(4) pointer
; with 8-byte alignment. The checks expect one s_load_b96 on GFX12, and an
; s_load_dwordx2 plus s_load_dword pair with s_mov_b32 copies into s0/s1 on
; GFX9, GFX7 and GFX6.
1091define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg %ptr) {
1092; GFX12-LABEL: s_load_constant_v3i32_align8:
1093; GFX12:       ; %bb.0:
1094; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
1095; GFX12-NEXT:    s_wait_kmcnt 0x0
1096; GFX12-NEXT:    ; return to shader part epilog
1097;
1098; GFX9-LABEL: s_load_constant_v3i32_align8:
1099; GFX9:       ; %bb.0:
1100; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1101; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x8
1102; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1103; GFX9-NEXT:    s_mov_b32 s0, s4
1104; GFX9-NEXT:    s_mov_b32 s1, s5
1105; GFX9-NEXT:    ; return to shader part epilog
1106;
1107; GFX7-LABEL: s_load_constant_v3i32_align8:
1108; GFX7:       ; %bb.0:
1109; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1110; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x2
1111; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1112; GFX7-NEXT:    s_mov_b32 s0, s4
1113; GFX7-NEXT:    s_mov_b32 s1, s5
1114; GFX7-NEXT:    ; return to shader part epilog
1115;
1116; GFX6-LABEL: s_load_constant_v3i32_align8:
1117; GFX6:       ; %bb.0:
1118; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1119; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
1120; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1121; GFX6-NEXT:    s_mov_b32 s0, s4
1122; GFX6-NEXT:    s_mov_b32 s1, s5
1123; GFX6-NEXT:    ; return to shader part epilog
1124  %load = load <3 x i32>, ptr addrspace(4) %ptr, align 8
1125  ret <3 x i32> %load
1126}
1127
; amdgpu_ps function loading a <6 x i16> through an inreg addrspace(4) pointer
; with 8-byte alignment and bitcasting it to <3 x i32> for the return value.
; The checks expect one s_load_b96 on GFX12, and an s_load_dwordx2 plus
; s_load_dword pair with s_mov_b32 copies into s0/s1 on GFX9, GFX7 and GFX6;
; no extra instructions are expected for the bitcast.
1128define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg %ptr) {
1129; GFX12-LABEL: s_load_constant_v6i16_align8:
1130; GFX12:       ; %bb.0:
1131; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
1132; GFX12-NEXT:    s_wait_kmcnt 0x0
1133; GFX12-NEXT:    ; return to shader part epilog
1134;
1135; GFX9-LABEL: s_load_constant_v6i16_align8:
1136; GFX9:       ; %bb.0:
1137; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1138; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x8
1139; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1140; GFX9-NEXT:    s_mov_b32 s0, s4
1141; GFX9-NEXT:    s_mov_b32 s1, s5
1142; GFX9-NEXT:    ; return to shader part epilog
1143;
1144; GFX7-LABEL: s_load_constant_v6i16_align8:
1145; GFX7:       ; %bb.0:
1146; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1147; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x2
1148; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1149; GFX7-NEXT:    s_mov_b32 s0, s4
1150; GFX7-NEXT:    s_mov_b32 s1, s5
1151; GFX7-NEXT:    ; return to shader part epilog
1152;
1153; GFX6-LABEL: s_load_constant_v6i16_align8:
1154; GFX6:       ; %bb.0:
1155; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1156; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x2
1157; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1158; GFX6-NEXT:    s_mov_b32 s0, s4
1159; GFX6-NEXT:    s_mov_b32 s1, s5
1160; GFX6-NEXT:    ; return to shader part epilog
1161  %load = load <6 x i16>, ptr addrspace(4) %ptr, align 8
1162  %cast = bitcast <6 x i16> %load to <3 x i32>
1163  ret <3 x i32> %cast
1164}
1165
; amdgpu_ps function loading a <12 x i8> through an inreg addrspace(4) pointer
; with 8-byte alignment and returning it as twelve separate values in s0-s11.
; The checks expect the same scalar loads as the dword-typed align-8 cases
; (s_load_b96 on GFX12, s_load_dwordx2 plus s_load_dword on older targets),
; followed by s_lshr_b32 by 8, 16 and 24 on each loaded dword to produce the
; per-byte results.
1166define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg %ptr) {
1167; GFX12-LABEL: s_load_constant_v12i8_align8:
1168; GFX12:       ; %bb.0:
1169; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
1170; GFX12-NEXT:    s_wait_kmcnt 0x0
1171; GFX12-NEXT:    s_lshr_b32 s13, s0, 8
1172; GFX12-NEXT:    s_lshr_b32 s12, s0, 16
1173; GFX12-NEXT:    s_lshr_b32 s3, s0, 24
1174; GFX12-NEXT:    s_lshr_b32 s5, s1, 8
1175; GFX12-NEXT:    s_lshr_b32 s6, s1, 16
1176; GFX12-NEXT:    s_lshr_b32 s7, s1, 24
1177; GFX12-NEXT:    s_lshr_b32 s9, s2, 8
1178; GFX12-NEXT:    s_lshr_b32 s10, s2, 16
1179; GFX12-NEXT:    s_lshr_b32 s11, s2, 24
1180; GFX12-NEXT:    s_mov_b32 s4, s1
1181; GFX12-NEXT:    s_mov_b32 s8, s2
1182; GFX12-NEXT:    s_mov_b32 s1, s13
1183; GFX12-NEXT:    s_mov_b32 s2, s12
1184; GFX12-NEXT:    ; return to shader part epilog
1185;
1186; GFX9-LABEL: s_load_constant_v12i8_align8:
1187; GFX9:       ; %bb.0:
1188; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
1189; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x8
1190; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1191; GFX9-NEXT:    s_lshr_b32 s1, s12, 8
1192; GFX9-NEXT:    s_lshr_b32 s2, s12, 16
1193; GFX9-NEXT:    s_lshr_b32 s3, s12, 24
1194; GFX9-NEXT:    s_lshr_b32 s5, s13, 8
1195; GFX9-NEXT:    s_lshr_b32 s6, s13, 16
1196; GFX9-NEXT:    s_lshr_b32 s7, s13, 24
1197; GFX9-NEXT:    s_lshr_b32 s9, s8, 8
1198; GFX9-NEXT:    s_lshr_b32 s10, s8, 16
1199; GFX9-NEXT:    s_lshr_b32 s11, s8, 24
1200; GFX9-NEXT:    s_mov_b32 s0, s12
1201; GFX9-NEXT:    s_mov_b32 s4, s13
1202; GFX9-NEXT:    ; return to shader part epilog
1203;
1204; GFX7-LABEL: s_load_constant_v12i8_align8:
1205; GFX7:       ; %bb.0:
1206; GFX7-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
1207; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x2
1208; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1209; GFX7-NEXT:    s_lshr_b32 s1, s12, 8
1210; GFX7-NEXT:    s_lshr_b32 s2, s12, 16
1211; GFX7-NEXT:    s_lshr_b32 s3, s12, 24
1212; GFX7-NEXT:    s_lshr_b32 s5, s13, 8
1213; GFX7-NEXT:    s_lshr_b32 s6, s13, 16
1214; GFX7-NEXT:    s_lshr_b32 s7, s13, 24
1215; GFX7-NEXT:    s_lshr_b32 s9, s8, 8
1216; GFX7-NEXT:    s_lshr_b32 s10, s8, 16
1217; GFX7-NEXT:    s_lshr_b32 s11, s8, 24
1218; GFX7-NEXT:    s_mov_b32 s0, s12
1219; GFX7-NEXT:    s_mov_b32 s4, s13
1220; GFX7-NEXT:    ; return to shader part epilog
1221;
1222; GFX6-LABEL: s_load_constant_v12i8_align8:
1223; GFX6:       ; %bb.0:
1224; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x0
1225; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x2
1226; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1227; GFX6-NEXT:    s_lshr_b32 s1, s12, 8
1228; GFX6-NEXT:    s_lshr_b32 s2, s12, 16
1229; GFX6-NEXT:    s_lshr_b32 s3, s12, 24
1230; GFX6-NEXT:    s_lshr_b32 s5, s13, 8
1231; GFX6-NEXT:    s_lshr_b32 s6, s13, 16
1232; GFX6-NEXT:    s_lshr_b32 s7, s13, 24
1233; GFX6-NEXT:    s_lshr_b32 s9, s8, 8
1234; GFX6-NEXT:    s_lshr_b32 s10, s8, 16
1235; GFX6-NEXT:    s_lshr_b32 s11, s8, 24
1236; GFX6-NEXT:    s_mov_b32 s0, s12
1237; GFX6-NEXT:    s_mov_b32 s4, s13
1238; GFX6-NEXT:    ; return to shader part epilog
1239  %load = load <12 x i8>, ptr addrspace(4) %ptr, align 8
1240  ret <12 x i8> %load
1241}
1242
; amdgpu_ps function loading a <3 x i32> through an inreg addrspace(4) pointer
; with 16-byte alignment. The checks expect one s_load_b96 on GFX12; every
; target covered by the shared GCN prefix expects a single s_load_dwordx4 into
; s[0:3], with no follow-up register copies.
1243define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(ptr addrspace(4) inreg %ptr) {
1244; GFX12-LABEL: s_load_constant_v3i32_align16:
1245; GFX12:       ; %bb.0:
1246; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
1247; GFX12-NEXT:    s_wait_kmcnt 0x0
1248; GFX12-NEXT:    ; return to shader part epilog
1249;
1250; GCN-LABEL: s_load_constant_v3i32_align16:
1251; GCN:       ; %bb.0:
1252; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
1253; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1254; GCN-NEXT:    ; return to shader part epilog
1255  %load = load <3 x i32>, ptr addrspace(4) %ptr, align 16
1256  ret <3 x i32> %load
1257}
1258