xref: /llvm-project/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll (revision e28e93550a74752714db6fffe50233aa96e536a5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GX900 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX940 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
6
; Mask <2, 3, undef, undef>: lanes 0-1 take elements 2-3 of %val0 and lanes 2-3
; are left undefined. The checks expect this to collapse to a single dword load
; at offset:4 into v0 on every target, with no load through the v[2:3] address.
7define <4 x half> @shuffle_v4f16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
8; GFX9-LABEL: shuffle_v4f16_23uu:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
12; GFX9-NEXT:    s_waitcnt vmcnt(0)
13; GFX9-NEXT:    s_setpc_b64 s[30:31]
14;
15; GFX10-LABEL: shuffle_v4f16_23uu:
16; GFX10:       ; %bb.0:
17; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
19; GFX10-NEXT:    s_waitcnt vmcnt(0)
20; GFX10-NEXT:    s_setpc_b64 s[30:31]
21;
22; GFX11-LABEL: shuffle_v4f16_23uu:
23; GFX11:       ; %bb.0:
24; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
26; GFX11-NEXT:    s_waitcnt vmcnt(0)
27; GFX11-NEXT:    s_setpc_b64 s[30:31]
28  %val0 = load <4 x half>, ptr addrspace(1) %arg0
29  %val1 = load <4 x half>, ptr addrspace(1) %arg1
30  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
31  ret <4 x half> %shuffle
32}
33
; Mask <2, 3, 4, undef>: lanes 0-1 are elements 2-3 of %val0, lane 2 is element 0
; of %val1, lane 3 is undefined. The checks expect a dword load at offset:4 plus
; a 64-bit load through the v[2:3] address. gfx1100 loads straight into v0 and
; v[1:2]; the other targets load into temporaries and copy with v_mov_b32.
34define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
35; GX900-LABEL: shuffle_v4f16_234u:
36; GX900:       ; %bb.0:
37; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
39; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
40; GX900-NEXT:    s_waitcnt vmcnt(1)
41; GX900-NEXT:    v_mov_b32_e32 v0, v6
42; GX900-NEXT:    s_waitcnt vmcnt(0)
43; GX900-NEXT:    v_mov_b32_e32 v1, v4
44; GX900-NEXT:    s_setpc_b64 s[30:31]
45;
46; GFX940-LABEL: shuffle_v4f16_234u:
47; GFX940:       ; %bb.0:
48; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
50; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
51; GFX940-NEXT:    s_waitcnt vmcnt(1)
52; GFX940-NEXT:    v_mov_b32_e32 v0, v4
53; GFX940-NEXT:    s_waitcnt vmcnt(0)
54; GFX940-NEXT:    v_mov_b32_e32 v1, v6
55; GFX940-NEXT:    s_setpc_b64 s[30:31]
56;
57; GFX10-LABEL: shuffle_v4f16_234u:
58; GFX10:       ; %bb.0:
59; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
60; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
61; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
62; GFX10-NEXT:    s_waitcnt vmcnt(1)
63; GFX10-NEXT:    v_mov_b32_e32 v0, v6
64; GFX10-NEXT:    s_waitcnt vmcnt(0)
65; GFX10-NEXT:    v_mov_b32_e32 v1, v4
66; GFX10-NEXT:    s_setpc_b64 s[30:31]
67;
68; GFX11-LABEL: shuffle_v4f16_234u:
69; GFX11:       ; %bb.0:
70; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
71; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
72; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
73; GFX11-NEXT:    s_waitcnt vmcnt(0)
74; GFX11-NEXT:    s_setpc_b64 s[30:31]
75  %val0 = load <4 x half>, ptr addrspace(1) %arg0
76  %val1 = load <4 x half>, ptr addrspace(1) %arg1
77  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
78  ret <4 x half> %shuffle
79}
80
; Mask <undef, 1, undef, 3>: the defined lanes (1 and 3) keep their own positions
; in %val0, so the shuffle can be treated as an identity. The checks expect only
; one 64-bit load into v[0:1] and no data-movement instructions on any target.
81define <4 x half> @shuffle_v4f16_u1u3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
82; GFX9-LABEL: shuffle_v4f16_u1u3:
83; GFX9:       ; %bb.0:
84; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
85; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
86; GFX9-NEXT:    s_waitcnt vmcnt(0)
87; GFX9-NEXT:    s_setpc_b64 s[30:31]
88;
89; GFX10-LABEL: shuffle_v4f16_u1u3:
90; GFX10:       ; %bb.0:
91; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
93; GFX10-NEXT:    s_waitcnt vmcnt(0)
94; GFX10-NEXT:    s_setpc_b64 s[30:31]
95;
96; GFX11-LABEL: shuffle_v4f16_u1u3:
97; GFX11:       ; %bb.0:
98; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
99; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
100; GFX11-NEXT:    s_waitcnt vmcnt(0)
101; GFX11-NEXT:    s_setpc_b64 s[30:31]
102  %val0 = load <4 x half>, ptr addrspace(1) %arg0
103  %val1 = load <4 x half>, ptr addrspace(1) %arg1
104  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
105  ret <4 x half> %shuffle
106}
107
; Mask <undef, 3, undef, 1>: lane 1 takes element 3 and lane 3 takes element 1 of
; %val0, which the checks show being lowered as a swap of the two loaded dwords.
; gfx900/gfx1010/gfx1100 load into v[1:2] and need one v_mov_b32; gfx940 loads
; into v[2:3] and needs two moves.
; NOTE(review): the gfx940 difference is presumably due to its 64-bit VGPR tuple
; alignment requirement - confirm against the AMDGPU backend documentation.
108define <4 x half> @shuffle_v4f16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
109; GX900-LABEL: shuffle_v4f16_u3u1:
110; GX900:       ; %bb.0:
111; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
112; GX900-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
113; GX900-NEXT:    s_waitcnt vmcnt(0)
114; GX900-NEXT:    v_mov_b32_e32 v0, v2
115; GX900-NEXT:    s_setpc_b64 s[30:31]
116;
117; GFX940-LABEL: shuffle_v4f16_u3u1:
118; GFX940:       ; %bb.0:
119; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
120; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
121; GFX940-NEXT:    s_waitcnt vmcnt(0)
122; GFX940-NEXT:    v_mov_b32_e32 v0, v3
123; GFX940-NEXT:    v_mov_b32_e32 v1, v2
124; GFX940-NEXT:    s_setpc_b64 s[30:31]
125;
126; GFX10-LABEL: shuffle_v4f16_u3u1:
127; GFX10:       ; %bb.0:
128; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
129; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
130; GFX10-NEXT:    s_waitcnt vmcnt(0)
131; GFX10-NEXT:    v_mov_b32_e32 v0, v2
132; GFX10-NEXT:    s_setpc_b64 s[30:31]
133;
134; GFX11-LABEL: shuffle_v4f16_u3u1:
135; GFX11:       ; %bb.0:
136; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
137; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
138; GFX11-NEXT:    s_waitcnt vmcnt(0)
139; GFX11-NEXT:    v_mov_b32_e32 v0, v2
140; GFX11-NEXT:    s_setpc_b64 s[30:31]
141  %val0 = load <4 x half>, ptr addrspace(1) %arg0
142  %val1 = load <4 x half>, ptr addrspace(1) %arg1
143  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
144  ret <4 x half> %shuffle
145}
146
; Mask <undef, 3, undef, undef>: only lane 1 is defined (element 3 of %val0).
; The checks expect a single dword load at offset:4 into v0 and nothing else,
; i.e. the same output as the <2, 3, undef, undef> case.
147define <4 x half> @shuffle_v4f16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
148; GFX9-LABEL: shuffle_v4f16_u3uu:
149; GFX9:       ; %bb.0:
150; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
152; GFX9-NEXT:    s_waitcnt vmcnt(0)
153; GFX9-NEXT:    s_setpc_b64 s[30:31]
154;
155; GFX10-LABEL: shuffle_v4f16_u3uu:
156; GFX10:       ; %bb.0:
157; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
158; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
159; GFX10-NEXT:    s_waitcnt vmcnt(0)
160; GFX10-NEXT:    s_setpc_b64 s[30:31]
161;
162; GFX11-LABEL: shuffle_v4f16_u3uu:
163; GFX11:       ; %bb.0:
164; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
166; GFX11-NEXT:    s_waitcnt vmcnt(0)
167; GFX11-NEXT:    s_setpc_b64 s[30:31]
168  %val0 = load <4 x half>, ptr addrspace(1) %arg0
169  %val1 = load <4 x half>, ptr addrspace(1) %arg1
170  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
171  ret <4 x half> %shuffle
172}
173
; Mask <3, undef, 6, undef>: lane 0 is element 3 of %val0 and lane 2 is element 2
; of %val1. The checks expect one dword load at offset:4 from each input. The
; first is passed through v_alignbit_b32 with a shift of 16 to form result dword
; 0; the second is used unchanged as result dword 1.
; The SGPR source of v_alignbit_b32 (s4 or s0) is never written in the checked
; sequence; presumably it stands in for the undefined lane - TODO confirm.
174define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
175; GX900-LABEL: shuffle_v4f16_3u6u:
176; GX900:       ; %bb.0:
177; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
178; GX900-NEXT:    global_load_dword v5, v[0:1], off offset:4
179; GX900-NEXT:    global_load_dword v4, v[2:3], off offset:4
180; GX900-NEXT:    s_waitcnt vmcnt(1)
181; GX900-NEXT:    v_alignbit_b32 v0, s4, v5, 16
182; GX900-NEXT:    s_waitcnt vmcnt(0)
183; GX900-NEXT:    v_mov_b32_e32 v1, v4
184; GX900-NEXT:    s_setpc_b64 s[30:31]
185;
186; GFX940-LABEL: shuffle_v4f16_3u6u:
187; GFX940:       ; %bb.0:
188; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:4
190; GFX940-NEXT:    global_load_dword v4, v[2:3], off offset:4
191; GFX940-NEXT:    s_waitcnt vmcnt(1)
192; GFX940-NEXT:    v_alignbit_b32 v0, s0, v5, 16
193; GFX940-NEXT:    s_waitcnt vmcnt(0)
194; GFX940-NEXT:    v_mov_b32_e32 v1, v4
195; GFX940-NEXT:    s_setpc_b64 s[30:31]
196;
197; GFX10-LABEL: shuffle_v4f16_3u6u:
198; GFX10:       ; %bb.0:
199; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
200; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
201; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
202; GFX10-NEXT:    s_waitcnt vmcnt(1)
203; GFX10-NEXT:    v_alignbit_b32 v0, s4, v5, 16
204; GFX10-NEXT:    s_waitcnt vmcnt(0)
205; GFX10-NEXT:    v_mov_b32_e32 v1, v4
206; GFX10-NEXT:    s_setpc_b64 s[30:31]
207;
208; GFX11-LABEL: shuffle_v4f16_3u6u:
209; GFX11:       ; %bb.0:
210; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
211; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
212; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
213; GFX11-NEXT:    s_waitcnt vmcnt(1)
214; GFX11-NEXT:    v_alignbit_b32 v0, s0, v0, 16
215; GFX11-NEXT:    s_waitcnt vmcnt(0)
216; GFX11-NEXT:    s_setpc_b64 s[30:31]
217  %val0 = load <4 x half>, ptr addrspace(1) %arg0
218  %val1 = load <4 x half>, ptr addrspace(1) %arg1
219  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
220  ret <4 x half> %shuffle
221}
222
; Mask <3, undef, undef, 7>: lane 0 is element 3 of %val0 and lane 3 is element 3
; of %val1. The checked output has the same shape as the <3, undef, 6, undef>
; case: one dword load at offset:4 per input, v_alignbit_b32 by 16 on the first,
; and the second used unchanged as result dword 1.
223define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
224; GX900-LABEL: shuffle_v4f16_3uu7:
225; GX900:       ; %bb.0:
226; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
227; GX900-NEXT:    global_load_dword v5, v[0:1], off offset:4
228; GX900-NEXT:    global_load_dword v4, v[2:3], off offset:4
229; GX900-NEXT:    s_waitcnt vmcnt(1)
230; GX900-NEXT:    v_alignbit_b32 v0, s4, v5, 16
231; GX900-NEXT:    s_waitcnt vmcnt(0)
232; GX900-NEXT:    v_mov_b32_e32 v1, v4
233; GX900-NEXT:    s_setpc_b64 s[30:31]
234;
235; GFX940-LABEL: shuffle_v4f16_3uu7:
236; GFX940:       ; %bb.0:
237; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
238; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:4
239; GFX940-NEXT:    global_load_dword v4, v[2:3], off offset:4
240; GFX940-NEXT:    s_waitcnt vmcnt(1)
241; GFX940-NEXT:    v_alignbit_b32 v0, s0, v5, 16
242; GFX940-NEXT:    s_waitcnt vmcnt(0)
243; GFX940-NEXT:    v_mov_b32_e32 v1, v4
244; GFX940-NEXT:    s_setpc_b64 s[30:31]
245;
246; GFX10-LABEL: shuffle_v4f16_3uu7:
247; GFX10:       ; %bb.0:
248; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
250; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
251; GFX10-NEXT:    s_waitcnt vmcnt(1)
252; GFX10-NEXT:    v_alignbit_b32 v0, s4, v5, 16
253; GFX10-NEXT:    s_waitcnt vmcnt(0)
254; GFX10-NEXT:    v_mov_b32_e32 v1, v4
255; GFX10-NEXT:    s_setpc_b64 s[30:31]
256;
257; GFX11-LABEL: shuffle_v4f16_3uu7:
258; GFX11:       ; %bb.0:
259; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
260; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
261; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
262; GFX11-NEXT:    s_waitcnt vmcnt(1)
263; GFX11-NEXT:    v_alignbit_b32 v0, s0, v0, 16
264; GFX11-NEXT:    s_waitcnt vmcnt(0)
265; GFX11-NEXT:    s_setpc_b64 s[30:31]
266  %val0 = load <4 x half>, ptr addrspace(1) %arg0
267  %val1 = load <4 x half>, ptr addrspace(1) %arg1
268  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
269  ret <4 x half> %shuffle
270}
271
; Mask <3, 5, undef, 5>: lane 0 is element 3 of %val0, lanes 1 and 3 are element 1
; of %val1, lane 2 is undefined. The checks expect a dword load at offset:4 from
; the first input and a dword load without offset from the second. v_perm_b32
; with selector 0x7060302 builds result dword 0, and the second loaded dword is
; returned as result dword 1.
; gfx900/gfx940 first place the selector in an SGPR with s_mov_b32; gfx1010 and
; gfx1100 pass it as a literal operand.
272define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
273; GX900-LABEL: shuffle_v4f16_35u5:
274; GX900:       ; %bb.0:
275; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276; GX900-NEXT:    global_load_dword v5, v[0:1], off offset:4
277; GX900-NEXT:    global_load_dword v4, v[2:3], off
278; GX900-NEXT:    s_mov_b32 s4, 0x7060302
279; GX900-NEXT:    s_waitcnt vmcnt(0)
280; GX900-NEXT:    v_perm_b32 v0, v4, v5, s4
281; GX900-NEXT:    v_mov_b32_e32 v1, v4
282; GX900-NEXT:    s_setpc_b64 s[30:31]
283;
284; GFX940-LABEL: shuffle_v4f16_35u5:
285; GFX940:       ; %bb.0:
286; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
287; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:4
288; GFX940-NEXT:    global_load_dword v4, v[2:3], off
289; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
290; GFX940-NEXT:    s_waitcnt vmcnt(0)
291; GFX940-NEXT:    v_perm_b32 v0, v4, v5, s0
292; GFX940-NEXT:    v_mov_b32_e32 v1, v4
293; GFX940-NEXT:    s_setpc_b64 s[30:31]
294;
295; GFX10-LABEL: shuffle_v4f16_35u5:
296; GFX10:       ; %bb.0:
297; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
298; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
299; GFX10-NEXT:    global_load_dword v4, v[2:3], off
300; GFX10-NEXT:    s_waitcnt vmcnt(0)
301; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x7060302
302; GFX10-NEXT:    v_mov_b32_e32 v1, v4
303; GFX10-NEXT:    s_setpc_b64 s[30:31]
304;
305; GFX11-LABEL: shuffle_v4f16_35u5:
306; GFX11:       ; %bb.0:
307; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
308; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
309; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
310; GFX11-NEXT:    s_waitcnt vmcnt(0)
311; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
312; GFX11-NEXT:    s_setpc_b64 s[30:31]
313  %val0 = load <4 x half>, ptr addrspace(1) %arg0
314  %val1 = load <4 x half>, ptr addrspace(1) %arg1
315  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
316  ret <4 x half> %shuffle
317}
318
; Mask <3, 5, 7, undef>: lane 0 is element 3 of %val0, lanes 1-2 are elements 1
; and 3 of %val1, lane 3 is undefined. The checks expect a 64-bit load of the
; second input and a dword load at offset:4 of the first. v_alignbit_b32 by 16 on
; the upper loaded dword of the second input gives result dword 1; v_perm_b32
; with selector 0x7060302 gives result dword 0.
; On gfx900/gfx940 the SGPR source of v_alignbit_b32 is the register holding the
; perm selector; on gfx1010/gfx1100 it is never written in the checked sequence.
; Presumably its value is irrelevant because lane 3 is undefined - TODO confirm.
319define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
320; GX900-LABEL: shuffle_v4f16_357u:
321; GX900:       ; %bb.0:
322; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
323; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
324; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
325; GX900-NEXT:    s_mov_b32 s4, 0x7060302
326; GX900-NEXT:    s_waitcnt vmcnt(1)
327; GX900-NEXT:    v_alignbit_b32 v1, s4, v5, 16
328; GX900-NEXT:    s_waitcnt vmcnt(0)
329; GX900-NEXT:    v_perm_b32 v0, v4, v6, s4
330; GX900-NEXT:    s_setpc_b64 s[30:31]
331;
332; GFX940-LABEL: shuffle_v4f16_357u:
333; GFX940:       ; %bb.0:
334; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
335; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
336; GFX940-NEXT:    global_load_dword v6, v[0:1], off offset:4
337; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
338; GFX940-NEXT:    s_waitcnt vmcnt(1)
339; GFX940-NEXT:    v_alignbit_b32 v1, s0, v5, 16
340; GFX940-NEXT:    s_waitcnt vmcnt(0)
341; GFX940-NEXT:    v_perm_b32 v0, v4, v6, s0
342; GFX940-NEXT:    s_setpc_b64 s[30:31]
343;
344; GFX10-LABEL: shuffle_v4f16_357u:
345; GFX10:       ; %bb.0:
346; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
347; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
348; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
349; GFX10-NEXT:    s_waitcnt vmcnt(1)
350; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
351; GFX10-NEXT:    s_waitcnt vmcnt(0)
352; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
353; GFX10-NEXT:    s_setpc_b64 s[30:31]
354;
355; GFX11-LABEL: shuffle_v4f16_357u:
356; GFX11:       ; %bb.0:
357; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
358; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
359; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
360; GFX11-NEXT:    s_waitcnt vmcnt(1)
361; GFX11-NEXT:    v_alignbit_b32 v1, s0, v3, 16
362; GFX11-NEXT:    s_waitcnt vmcnt(0)
363; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
364; GFX11-NEXT:    s_setpc_b64 s[30:31]
365  %val0 = load <4 x half>, ptr addrspace(1) %arg0
366  %val1 = load <4 x half>, ptr addrspace(1) %arg1
367  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
368  ret <4 x half> %shuffle
369}
370
; Mask <0, 1, 0, 1>: elements 0-1 of %val0 are repeated in both halves of the
; result. The checks expect one dword load without offset into v0, followed by a
; v_mov_b32 that duplicates it into v1, on every target.
371define <4 x half> @shuffle_v4f16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
372; GFX9-LABEL: shuffle_v4f16_0101:
373; GFX9:       ; %bb.0:
374; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
375; GFX9-NEXT:    global_load_dword v0, v[0:1], off
376; GFX9-NEXT:    s_waitcnt vmcnt(0)
377; GFX9-NEXT:    v_mov_b32_e32 v1, v0
378; GFX9-NEXT:    s_setpc_b64 s[30:31]
379;
380; GFX10-LABEL: shuffle_v4f16_0101:
381; GFX10:       ; %bb.0:
382; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383; GFX10-NEXT:    global_load_dword v0, v[0:1], off
384; GFX10-NEXT:    s_waitcnt vmcnt(0)
385; GFX10-NEXT:    v_mov_b32_e32 v1, v0
386; GFX10-NEXT:    s_setpc_b64 s[30:31]
387;
388; GFX11-LABEL: shuffle_v4f16_0101:
389; GFX11:       ; %bb.0:
390; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
391; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
392; GFX11-NEXT:    s_waitcnt vmcnt(0)
393; GFX11-NEXT:    v_mov_b32_e32 v1, v0
394; GFX11-NEXT:    s_setpc_b64 s[30:31]
395  %val0 = load <4 x half>, ptr addrspace(1) %arg0
396  %val1 = load <4 x half>, ptr addrspace(1) %arg1
397  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
398  ret <4 x half> %shuffle
399}
400
; Mask <0, 1, 2, 3>: identity shuffle that returns %val0 unchanged. The checks
; expect only one 64-bit load into v[0:1] and no load through the v[2:3] address.
401define <4 x half> @shuffle_v4f16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
402; GFX9-LABEL: shuffle_v4f16_0123:
403; GFX9:       ; %bb.0:
404; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
405; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
406; GFX9-NEXT:    s_waitcnt vmcnt(0)
407; GFX9-NEXT:    s_setpc_b64 s[30:31]
408;
409; GFX10-LABEL: shuffle_v4f16_0123:
410; GFX10:       ; %bb.0:
411; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
412; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
413; GFX10-NEXT:    s_waitcnt vmcnt(0)
414; GFX10-NEXT:    s_setpc_b64 s[30:31]
415;
416; GFX11-LABEL: shuffle_v4f16_0123:
417; GFX11:       ; %bb.0:
418; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
419; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
420; GFX11-NEXT:    s_waitcnt vmcnt(0)
421; GFX11-NEXT:    s_setpc_b64 s[30:31]
422  %val0 = load <4 x half>, ptr addrspace(1) %arg0
423  %val1 = load <4 x half>, ptr addrspace(1) %arg1
424  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
425  ret <4 x half> %shuffle
426}
427
; Mask <0, 1, 4, 5>: elements 0-1 of %val0 followed by elements 0-1 of %val1. The
; checks expect one dword load without offset from each input. gfx900/gfx940 and
; gfx1010 load into v4/v5 and copy to v0/v1; gfx1100 loads directly into v0/v1.
428define <4 x half> @shuffle_v4f16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
429; GFX9-LABEL: shuffle_v4f16_0145:
430; GFX9:       ; %bb.0:
431; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
432; GFX9-NEXT:    global_load_dword v4, v[0:1], off
433; GFX9-NEXT:    global_load_dword v5, v[2:3], off
434; GFX9-NEXT:    s_waitcnt vmcnt(1)
435; GFX9-NEXT:    v_mov_b32_e32 v0, v4
436; GFX9-NEXT:    s_waitcnt vmcnt(0)
437; GFX9-NEXT:    v_mov_b32_e32 v1, v5
438; GFX9-NEXT:    s_setpc_b64 s[30:31]
439;
440; GFX10-LABEL: shuffle_v4f16_0145:
441; GFX10:       ; %bb.0:
442; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
443; GFX10-NEXT:    global_load_dword v4, v[0:1], off
444; GFX10-NEXT:    global_load_dword v5, v[2:3], off
445; GFX10-NEXT:    s_waitcnt vmcnt(1)
446; GFX10-NEXT:    v_mov_b32_e32 v0, v4
447; GFX10-NEXT:    s_waitcnt vmcnt(0)
448; GFX10-NEXT:    v_mov_b32_e32 v1, v5
449; GFX10-NEXT:    s_setpc_b64 s[30:31]
450;
451; GFX11-LABEL: shuffle_v4f16_0145:
452; GFX11:       ; %bb.0:
453; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
454; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
455; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
456; GFX11-NEXT:    s_waitcnt vmcnt(0)
457; GFX11-NEXT:    s_setpc_b64 s[30:31]
458  %val0 = load <4 x half>, ptr addrspace(1) %arg0
459  %val1 = load <4 x half>, ptr addrspace(1) %arg1
460  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
461  ret <4 x half> %shuffle
462}
463
; Mask <0, 1, 6, 7>: elements 0-1 of %val0 followed by elements 2-3 of %val1. The
; checks expect a dword load without offset from the first input and a dword
; load at offset:4 from the second. Only gfx1100 loads directly into v0/v1; the
; other targets copy from v4/v5.
464define <4 x half> @shuffle_v4f16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
465; GFX9-LABEL: shuffle_v4f16_0167:
466; GFX9:       ; %bb.0:
467; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
468; GFX9-NEXT:    global_load_dword v4, v[0:1], off
469; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
470; GFX9-NEXT:    s_waitcnt vmcnt(1)
471; GFX9-NEXT:    v_mov_b32_e32 v0, v4
472; GFX9-NEXT:    s_waitcnt vmcnt(0)
473; GFX9-NEXT:    v_mov_b32_e32 v1, v5
474; GFX9-NEXT:    s_setpc_b64 s[30:31]
475;
476; GFX10-LABEL: shuffle_v4f16_0167:
477; GFX10:       ; %bb.0:
478; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
479; GFX10-NEXT:    global_load_dword v4, v[0:1], off
480; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
481; GFX10-NEXT:    s_waitcnt vmcnt(1)
482; GFX10-NEXT:    v_mov_b32_e32 v0, v4
483; GFX10-NEXT:    s_waitcnt vmcnt(0)
484; GFX10-NEXT:    v_mov_b32_e32 v1, v5
485; GFX10-NEXT:    s_setpc_b64 s[30:31]
486;
487; GFX11-LABEL: shuffle_v4f16_0167:
488; GFX11:       ; %bb.0:
489; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
490; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
491; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
492; GFX11-NEXT:    s_waitcnt vmcnt(0)
493; GFX11-NEXT:    s_setpc_b64 s[30:31]
494  %val0 = load <4 x half>, ptr addrspace(1) %arg0
495  %val1 = load <4 x half>, ptr addrspace(1) %arg1
496  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
497  ret <4 x half> %shuffle
498}
499
; Mask <2, 3, 0, 1>: swaps the two 32-bit halves of %val0. The checks expect one
; 64-bit load followed by register moves. gfx900/gfx1010/gfx1100 load into v[1:2]
; so a single v_mov_b32 (v0 <- v2) completes the swap; gfx940 loads into v[2:3]
; and needs two moves. The output matches the <undef, 3, undef, 1> case.
500define <4 x half> @shuffle_v4f16_2301(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
501; GX900-LABEL: shuffle_v4f16_2301:
502; GX900:       ; %bb.0:
503; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
504; GX900-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
505; GX900-NEXT:    s_waitcnt vmcnt(0)
506; GX900-NEXT:    v_mov_b32_e32 v0, v2
507; GX900-NEXT:    s_setpc_b64 s[30:31]
508;
509; GFX940-LABEL: shuffle_v4f16_2301:
510; GFX940:       ; %bb.0:
511; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
512; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
513; GFX940-NEXT:    s_waitcnt vmcnt(0)
514; GFX940-NEXT:    v_mov_b32_e32 v0, v3
515; GFX940-NEXT:    v_mov_b32_e32 v1, v2
516; GFX940-NEXT:    s_setpc_b64 s[30:31]
517;
518; GFX10-LABEL: shuffle_v4f16_2301:
519; GFX10:       ; %bb.0:
520; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
521; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
522; GFX10-NEXT:    s_waitcnt vmcnt(0)
523; GFX10-NEXT:    v_mov_b32_e32 v0, v2
524; GFX10-NEXT:    s_setpc_b64 s[30:31]
525;
526; GFX11-LABEL: shuffle_v4f16_2301:
527; GFX11:       ; %bb.0:
528; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
529; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
530; GFX11-NEXT:    s_waitcnt vmcnt(0)
531; GFX11-NEXT:    v_mov_b32_e32 v0, v2
532; GFX11-NEXT:    s_setpc_b64 s[30:31]
533  %val0 = load <4 x half>, ptr addrspace(1) %arg0
534  %val1 = load <4 x half>, ptr addrspace(1) %arg1
535  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
536  ret <4 x half> %shuffle
537}
538
; Mask <2, 3, 2, 3>: elements 2-3 of %val0 are repeated in both halves of the
; result. The checks expect one dword load at offset:4 into v0, then a v_mov_b32
; that duplicates it into v1, on every target.
539define <4 x half> @shuffle_v4f16_2323(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
540; GFX9-LABEL: shuffle_v4f16_2323:
541; GFX9:       ; %bb.0:
542; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
543; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
544; GFX9-NEXT:    s_waitcnt vmcnt(0)
545; GFX9-NEXT:    v_mov_b32_e32 v1, v0
546; GFX9-NEXT:    s_setpc_b64 s[30:31]
547;
548; GFX10-LABEL: shuffle_v4f16_2323:
549; GFX10:       ; %bb.0:
550; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
551; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
552; GFX10-NEXT:    s_waitcnt vmcnt(0)
553; GFX10-NEXT:    v_mov_b32_e32 v1, v0
554; GFX10-NEXT:    s_setpc_b64 s[30:31]
555;
556; GFX11-LABEL: shuffle_v4f16_2323:
557; GFX11:       ; %bb.0:
558; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
559; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
560; GFX11-NEXT:    s_waitcnt vmcnt(0)
561; GFX11-NEXT:    v_mov_b32_e32 v1, v0
562; GFX11-NEXT:    s_setpc_b64 s[30:31]
563  %val0 = load <4 x half>, ptr addrspace(1) %arg0
564  %val1 = load <4 x half>, ptr addrspace(1) %arg1
565  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
566  ret <4 x half> %shuffle
567}
568
; Mask <2, 3, 4, 5>: elements 2-3 of %val0 followed by elements 0-1 of %val1. The
; checks expect a dword load at offset:4 from the first input and a dword load
; without offset from the second. Only gfx1100 loads directly into v0/v1; the
; other targets copy from v4/v5.
569define <4 x half> @shuffle_v4f16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
570; GFX9-LABEL: shuffle_v4f16_2345:
571; GFX9:       ; %bb.0:
572; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
573; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
574; GFX9-NEXT:    global_load_dword v5, v[2:3], off
575; GFX9-NEXT:    s_waitcnt vmcnt(1)
576; GFX9-NEXT:    v_mov_b32_e32 v0, v4
577; GFX9-NEXT:    s_waitcnt vmcnt(0)
578; GFX9-NEXT:    v_mov_b32_e32 v1, v5
579; GFX9-NEXT:    s_setpc_b64 s[30:31]
580;
581; GFX10-LABEL: shuffle_v4f16_2345:
582; GFX10:       ; %bb.0:
583; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
584; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
585; GFX10-NEXT:    global_load_dword v5, v[2:3], off
586; GFX10-NEXT:    s_waitcnt vmcnt(1)
587; GFX10-NEXT:    v_mov_b32_e32 v0, v4
588; GFX10-NEXT:    s_waitcnt vmcnt(0)
589; GFX10-NEXT:    v_mov_b32_e32 v1, v5
590; GFX10-NEXT:    s_setpc_b64 s[30:31]
591;
592; GFX11-LABEL: shuffle_v4f16_2345:
593; GFX11:       ; %bb.0:
594; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
595; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
596; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
597; GFX11-NEXT:    s_waitcnt vmcnt(0)
598; GFX11-NEXT:    s_setpc_b64 s[30:31]
599  %val0 = load <4 x half>, ptr addrspace(1) %arg0
600  %val1 = load <4 x half>, ptr addrspace(1) %arg1
601  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
602  ret <4 x half> %shuffle
603}
604
; Mask <2, 3, 6, 7>: elements 2-3 of %val0 followed by elements 2-3 of %val1. The
; checks expect one dword load at offset:4 from each input. Only gfx1100 loads
; directly into v0/v1; the other targets copy from v4/v5.
605define <4 x half> @shuffle_v4f16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
606; GFX9-LABEL: shuffle_v4f16_2367:
607; GFX9:       ; %bb.0:
608; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
609; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
610; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
611; GFX9-NEXT:    s_waitcnt vmcnt(1)
612; GFX9-NEXT:    v_mov_b32_e32 v0, v4
613; GFX9-NEXT:    s_waitcnt vmcnt(0)
614; GFX9-NEXT:    v_mov_b32_e32 v1, v5
615; GFX9-NEXT:    s_setpc_b64 s[30:31]
616;
617; GFX10-LABEL: shuffle_v4f16_2367:
618; GFX10:       ; %bb.0:
619; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
620; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
621; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
622; GFX10-NEXT:    s_waitcnt vmcnt(1)
623; GFX10-NEXT:    v_mov_b32_e32 v0, v4
624; GFX10-NEXT:    s_waitcnt vmcnt(0)
625; GFX10-NEXT:    v_mov_b32_e32 v1, v5
626; GFX10-NEXT:    s_setpc_b64 s[30:31]
627;
628; GFX11-LABEL: shuffle_v4f16_2367:
629; GFX11:       ; %bb.0:
630; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
631; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
632; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
633; GFX11-NEXT:    s_waitcnt vmcnt(0)
634; GFX11-NEXT:    s_setpc_b64 s[30:31]
635  %val0 = load <4 x half>, ptr addrspace(1) %arg0
636  %val1 = load <4 x half>, ptr addrspace(1) %arg1
637  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
638  ret <4 x half> %shuffle
639}
640
; Mask <4, 5, 0, 1>: elements 0-1 of %val1 followed by elements 0-1 of %val0. The
; checks expect one dword load without offset from each input, with the load
; through v[2:3] issued first. gfx900/gfx940 and gfx1010 copy both results with
; v_mov_b32; gfx1100 loads result dword 1 directly into v1 and needs one move
; (v0 <- v2).
641define <4 x half> @shuffle_v4f16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
642; GFX9-LABEL: shuffle_v4f16_4501:
643; GFX9:       ; %bb.0:
644; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
645; GFX9-NEXT:    global_load_dword v4, v[2:3], off
646; GFX9-NEXT:    global_load_dword v5, v[0:1], off
647; GFX9-NEXT:    s_waitcnt vmcnt(1)
648; GFX9-NEXT:    v_mov_b32_e32 v0, v4
649; GFX9-NEXT:    s_waitcnt vmcnt(0)
650; GFX9-NEXT:    v_mov_b32_e32 v1, v5
651; GFX9-NEXT:    s_setpc_b64 s[30:31]
652;
653; GFX10-LABEL: shuffle_v4f16_4501:
654; GFX10:       ; %bb.0:
655; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
656; GFX10-NEXT:    global_load_dword v4, v[2:3], off
657; GFX10-NEXT:    global_load_dword v5, v[0:1], off
658; GFX10-NEXT:    s_waitcnt vmcnt(1)
659; GFX10-NEXT:    v_mov_b32_e32 v0, v4
660; GFX10-NEXT:    s_waitcnt vmcnt(0)
661; GFX10-NEXT:    v_mov_b32_e32 v1, v5
662; GFX10-NEXT:    s_setpc_b64 s[30:31]
663;
664; GFX11-LABEL: shuffle_v4f16_4501:
665; GFX11:       ; %bb.0:
666; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
667; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
668; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
669; GFX11-NEXT:    s_waitcnt vmcnt(1)
670; GFX11-NEXT:    v_mov_b32_e32 v0, v2
671; GFX11-NEXT:    s_waitcnt vmcnt(0)
672; GFX11-NEXT:    s_setpc_b64 s[30:31]
673  %val0 = load <4 x half>, ptr addrspace(1) %arg0
674  %val1 = load <4 x half>, ptr addrspace(1) %arg1
675  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
676  ret <4 x half> %shuffle
677}
678
; Mask <4, 5, 2, 3>: elements 0-1 of %val1 followed by elements 2-3 of %val0. The
; checks expect a dword load without offset through v[2:3] and a dword load at
; offset:4 through v[0:1]. gfx1100 loads result dword 1 directly into v1 and
; needs one move (v0 <- v2); the other targets copy both results.
679define <4 x half> @shuffle_v4f16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
680; GFX9-LABEL: shuffle_v4f16_4523:
681; GFX9:       ; %bb.0:
682; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
683; GFX9-NEXT:    global_load_dword v4, v[2:3], off
684; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
685; GFX9-NEXT:    s_waitcnt vmcnt(1)
686; GFX9-NEXT:    v_mov_b32_e32 v0, v4
687; GFX9-NEXT:    s_waitcnt vmcnt(0)
688; GFX9-NEXT:    v_mov_b32_e32 v1, v5
689; GFX9-NEXT:    s_setpc_b64 s[30:31]
690;
691; GFX10-LABEL: shuffle_v4f16_4523:
692; GFX10:       ; %bb.0:
693; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
694; GFX10-NEXT:    global_load_dword v4, v[2:3], off
695; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
696; GFX10-NEXT:    s_waitcnt vmcnt(1)
697; GFX10-NEXT:    v_mov_b32_e32 v0, v4
698; GFX10-NEXT:    s_waitcnt vmcnt(0)
699; GFX10-NEXT:    v_mov_b32_e32 v1, v5
700; GFX10-NEXT:    s_setpc_b64 s[30:31]
701;
702; GFX11-LABEL: shuffle_v4f16_4523:
703; GFX11:       ; %bb.0:
704; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
705; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
706; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
707; GFX11-NEXT:    s_waitcnt vmcnt(1)
708; GFX11-NEXT:    v_mov_b32_e32 v0, v2
709; GFX11-NEXT:    s_waitcnt vmcnt(0)
710; GFX11-NEXT:    s_setpc_b64 s[30:31]
711  %val0 = load <4 x half>, ptr addrspace(1) %arg0
712  %val1 = load <4 x half>, ptr addrspace(1) %arg1
713  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
714  ret <4 x half> %shuffle
715}
716
; Mask <4, 5, 4, 5>: elements 0-1 of %val1 are repeated in both halves of the
; result, so %val0 is unused. The checks expect one dword load through v[2:3]
; into v0, then a v_mov_b32 that duplicates it into v1, and no load through
; v[0:1].
717define <4 x half> @shuffle_v4f16_4545(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
718; GFX9-LABEL: shuffle_v4f16_4545:
719; GFX9:       ; %bb.0:
720; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
721; GFX9-NEXT:    global_load_dword v0, v[2:3], off
722; GFX9-NEXT:    s_waitcnt vmcnt(0)
723; GFX9-NEXT:    v_mov_b32_e32 v1, v0
724; GFX9-NEXT:    s_setpc_b64 s[30:31]
725;
726; GFX10-LABEL: shuffle_v4f16_4545:
727; GFX10:       ; %bb.0:
728; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
729; GFX10-NEXT:    global_load_dword v0, v[2:3], off
730; GFX10-NEXT:    s_waitcnt vmcnt(0)
731; GFX10-NEXT:    v_mov_b32_e32 v1, v0
732; GFX10-NEXT:    s_setpc_b64 s[30:31]
733;
734; GFX11-LABEL: shuffle_v4f16_4545:
735; GFX11:       ; %bb.0:
736; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
737; GFX11-NEXT:    global_load_b32 v0, v[2:3], off
738; GFX11-NEXT:    s_waitcnt vmcnt(0)
739; GFX11-NEXT:    v_mov_b32_e32 v1, v0
740; GFX11-NEXT:    s_setpc_b64 s[30:31]
741  %val0 = load <4 x half>, ptr addrspace(1) %arg0
742  %val1 = load <4 x half>, ptr addrspace(1) %arg1
743  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
744  ret <4 x half> %shuffle
745}
746
; Mask <4, 5, 6, 7>: the result is %val1 unchanged.
; Every expected sequence is a single 64-bit load through v[2:3] directly
; into v[0:1] with no moves afterwards; nothing is loaded for %val0.
747define <4 x half> @shuffle_v4f16_4567(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
748; GFX9-LABEL: shuffle_v4f16_4567:
749; GFX9:       ; %bb.0:
750; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
751; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
752; GFX9-NEXT:    s_waitcnt vmcnt(0)
753; GFX9-NEXT:    s_setpc_b64 s[30:31]
754;
755; GFX10-LABEL: shuffle_v4f16_4567:
756; GFX10:       ; %bb.0:
757; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
758; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
759; GFX10-NEXT:    s_waitcnt vmcnt(0)
760; GFX10-NEXT:    s_setpc_b64 s[30:31]
761;
762; GFX11-LABEL: shuffle_v4f16_4567:
763; GFX11:       ; %bb.0:
764; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
765; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
766; GFX11-NEXT:    s_waitcnt vmcnt(0)
767; GFX11-NEXT:    s_setpc_b64 s[30:31]
768  %val0 = load <4 x half>, ptr addrspace(1) %arg0
769  %val1 = load <4 x half>, ptr addrspace(1) %arg1
770  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
771  ret <4 x half> %shuffle
772}
773
; Mask <6, 7, 0, 1>: the low result dword is elements 2-3 of %val1 and the
; high result dword is elements 0-1 of %val0.
; Expected code is two independent dword loads (v[2:3] at byte offset 4 and
; v[0:1] at offset 0). The gfx9 and gfx1010 sequences load into v4/v5 and
; copy each into place after its own wait; the gfx1100 sequence loads the
; second value straight into v1 and needs only one copy.
774define <4 x half> @shuffle_v4f16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
775; GFX9-LABEL: shuffle_v4f16_6701:
776; GFX9:       ; %bb.0:
777; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
778; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
779; GFX9-NEXT:    global_load_dword v5, v[0:1], off
780; GFX9-NEXT:    s_waitcnt vmcnt(1)
781; GFX9-NEXT:    v_mov_b32_e32 v0, v4
782; GFX9-NEXT:    s_waitcnt vmcnt(0)
783; GFX9-NEXT:    v_mov_b32_e32 v1, v5
784; GFX9-NEXT:    s_setpc_b64 s[30:31]
785;
786; GFX10-LABEL: shuffle_v4f16_6701:
787; GFX10:       ; %bb.0:
788; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
789; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
790; GFX10-NEXT:    global_load_dword v5, v[0:1], off
791; GFX10-NEXT:    s_waitcnt vmcnt(1)
792; GFX10-NEXT:    v_mov_b32_e32 v0, v4
793; GFX10-NEXT:    s_waitcnt vmcnt(0)
794; GFX10-NEXT:    v_mov_b32_e32 v1, v5
795; GFX10-NEXT:    s_setpc_b64 s[30:31]
796;
797; GFX11-LABEL: shuffle_v4f16_6701:
798; GFX11:       ; %bb.0:
799; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
800; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
801; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
802; GFX11-NEXT:    s_waitcnt vmcnt(1)
803; GFX11-NEXT:    v_mov_b32_e32 v0, v2
804; GFX11-NEXT:    s_waitcnt vmcnt(0)
805; GFX11-NEXT:    s_setpc_b64 s[30:31]
806  %val0 = load <4 x half>, ptr addrspace(1) %arg0
807  %val1 = load <4 x half>, ptr addrspace(1) %arg1
808  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
809  ret <4 x half> %shuffle
810}
811
; Mask <6, 7, 2, 3>: the low result dword is elements 2-3 of %val1 and the
; high result dword is elements 2-3 of %val0.
; Expected code is two dword loads, both at byte offset 4 (through v[2:3]
; and v[0:1]). The gfx9 and gfx1010 sequences copy v4/v5 into v0/v1; the
; gfx1100 sequence loads the second value straight into v1.
812define <4 x half> @shuffle_v4f16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
813; GFX9-LABEL: shuffle_v4f16_6723:
814; GFX9:       ; %bb.0:
815; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
816; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
817; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
818; GFX9-NEXT:    s_waitcnt vmcnt(1)
819; GFX9-NEXT:    v_mov_b32_e32 v0, v4
820; GFX9-NEXT:    s_waitcnt vmcnt(0)
821; GFX9-NEXT:    v_mov_b32_e32 v1, v5
822; GFX9-NEXT:    s_setpc_b64 s[30:31]
823;
824; GFX10-LABEL: shuffle_v4f16_6723:
825; GFX10:       ; %bb.0:
826; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
827; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
828; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
829; GFX10-NEXT:    s_waitcnt vmcnt(1)
830; GFX10-NEXT:    v_mov_b32_e32 v0, v4
831; GFX10-NEXT:    s_waitcnt vmcnt(0)
832; GFX10-NEXT:    v_mov_b32_e32 v1, v5
833; GFX10-NEXT:    s_setpc_b64 s[30:31]
834;
835; GFX11-LABEL: shuffle_v4f16_6723:
836; GFX11:       ; %bb.0:
837; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
838; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
839; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
840; GFX11-NEXT:    s_waitcnt vmcnt(1)
841; GFX11-NEXT:    v_mov_b32_e32 v0, v2
842; GFX11-NEXT:    s_waitcnt vmcnt(0)
843; GFX11-NEXT:    s_setpc_b64 s[30:31]
844  %val0 = load <4 x half>, ptr addrspace(1) %arg0
845  %val1 = load <4 x half>, ptr addrspace(1) %arg1
846  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
847  ret <4 x half> %shuffle
848}
849
; Mask <6, 7, 4, 5>: the two dwords of %val1 swapped.
; The gfx900, gfx1010 and gfx1100 sequences load the 64-bit value into
; v[1:2], so the low dword is already in v1 and only v2 -> v0 is copied.
; The gfx940 sequence loads into v[2:3] and needs two copies
; (presumably because 64-bit VGPR pairs must start on an even register
; there - TODO confirm).
850define <4 x half> @shuffle_v4f16_6745(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
851; GX900-LABEL: shuffle_v4f16_6745:
852; GX900:       ; %bb.0:
853; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
854; GX900-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
855; GX900-NEXT:    s_waitcnt vmcnt(0)
856; GX900-NEXT:    v_mov_b32_e32 v0, v2
857; GX900-NEXT:    s_setpc_b64 s[30:31]
858;
859; GFX940-LABEL: shuffle_v4f16_6745:
860; GFX940:       ; %bb.0:
861; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
862; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
863; GFX940-NEXT:    s_waitcnt vmcnt(0)
864; GFX940-NEXT:    v_mov_b32_e32 v0, v3
865; GFX940-NEXT:    v_mov_b32_e32 v1, v2
866; GFX940-NEXT:    s_setpc_b64 s[30:31]
867;
868; GFX10-LABEL: shuffle_v4f16_6745:
869; GFX10:       ; %bb.0:
870; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
871; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
872; GFX10-NEXT:    s_waitcnt vmcnt(0)
873; GFX10-NEXT:    v_mov_b32_e32 v0, v2
874; GFX10-NEXT:    s_setpc_b64 s[30:31]
875;
876; GFX11-LABEL: shuffle_v4f16_6745:
877; GFX11:       ; %bb.0:
878; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
879; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
880; GFX11-NEXT:    s_waitcnt vmcnt(0)
881; GFX11-NEXT:    v_mov_b32_e32 v0, v2
882; GFX11-NEXT:    s_setpc_b64 s[30:31]
883  %val0 = load <4 x half>, ptr addrspace(1) %arg0
884  %val1 = load <4 x half>, ptr addrspace(1) %arg1
885  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
886  ret <4 x half> %shuffle
887}
888
; Mask <6, 7, 6, 7>: both dwords of the result are elements 2-3 of %val1.
; Every expected sequence is one dword load through v[2:3] at byte offset 4
; followed by a copy of v0 into v1; nothing is loaded for %val0.
889define <4 x half> @shuffle_v4f16_6767(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
890; GFX9-LABEL: shuffle_v4f16_6767:
891; GFX9:       ; %bb.0:
892; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
893; GFX9-NEXT:    global_load_dword v0, v[2:3], off offset:4
894; GFX9-NEXT:    s_waitcnt vmcnt(0)
895; GFX9-NEXT:    v_mov_b32_e32 v1, v0
896; GFX9-NEXT:    s_setpc_b64 s[30:31]
897;
898; GFX10-LABEL: shuffle_v4f16_6767:
899; GFX10:       ; %bb.0:
900; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
901; GFX10-NEXT:    global_load_dword v0, v[2:3], off offset:4
902; GFX10-NEXT:    s_waitcnt vmcnt(0)
903; GFX10-NEXT:    v_mov_b32_e32 v1, v0
904; GFX10-NEXT:    s_setpc_b64 s[30:31]
905;
906; GFX11-LABEL: shuffle_v4f16_6767:
907; GFX11:       ; %bb.0:
908; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
909; GFX11-NEXT:    global_load_b32 v0, v[2:3], off offset:4
910; GFX11-NEXT:    s_waitcnt vmcnt(0)
911; GFX11-NEXT:    v_mov_b32_e32 v1, v0
912; GFX11-NEXT:    s_setpc_b64 s[30:31]
913  %val0 = load <4 x half>, ptr addrspace(1) %arg0
914  %val1 = load <4 x half>, ptr addrspace(1) %arg1
915  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
916  ret <4 x half> %shuffle
917}
918
; Mask <2, 3, 5, 6>: the low result dword is elements 2-3 of %val0; the
; high result dword is elements 1-2 of %val1, which straddle the two dwords
; of %val1.
; Expected code loads all 64 bits of %val1 plus the upper dword of %val0,
; then uses v_alignbit_b32 with a shift of 16 to pull the unaligned pair out
; of the %val1 register pair. The 64-bit load lands in v[5:6] on gfx900 and
; gfx1010, in v[6:7] on gfx940, and in v[2:3] on gfx1100 (which also loads
; the %val0 dword directly into v0, so no copy is needed).
919define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
920; GX900-LABEL: shuffle_v4f16_2356:
921; GX900:       ; %bb.0:
922; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
923; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
924; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
925; GX900-NEXT:    s_waitcnt vmcnt(1)
926; GX900-NEXT:    v_alignbit_b32 v1, v6, v5, 16
927; GX900-NEXT:    s_waitcnt vmcnt(0)
928; GX900-NEXT:    v_mov_b32_e32 v0, v4
929; GX900-NEXT:    s_setpc_b64 s[30:31]
930;
931; GFX940-LABEL: shuffle_v4f16_2356:
932; GFX940:       ; %bb.0:
933; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
934; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
935; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
936; GFX940-NEXT:    s_waitcnt vmcnt(1)
937; GFX940-NEXT:    v_alignbit_b32 v1, v7, v6, 16
938; GFX940-NEXT:    s_waitcnt vmcnt(0)
939; GFX940-NEXT:    v_mov_b32_e32 v0, v4
940; GFX940-NEXT:    s_setpc_b64 s[30:31]
941;
942; GFX10-LABEL: shuffle_v4f16_2356:
943; GFX10:       ; %bb.0:
944; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
945; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
946; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
947; GFX10-NEXT:    s_waitcnt vmcnt(1)
948; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
949; GFX10-NEXT:    s_waitcnt vmcnt(0)
950; GFX10-NEXT:    v_mov_b32_e32 v0, v4
951; GFX10-NEXT:    s_setpc_b64 s[30:31]
952;
953; GFX11-LABEL: shuffle_v4f16_2356:
954; GFX11:       ; %bb.0:
955; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
956; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
957; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
958; GFX11-NEXT:    s_waitcnt vmcnt(1)
959; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
960; GFX11-NEXT:    s_waitcnt vmcnt(0)
961; GFX11-NEXT:    s_setpc_b64 s[30:31]
962  %val0 = load <4 x half>, ptr addrspace(1) %arg0
963  %val1 = load <4 x half>, ptr addrspace(1) %arg1
964  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
965  ret <4 x half> %shuffle
966}
967
; Mask <5, 6, 2, 3>: the low result dword is elements 1-2 of %val1
; (straddling its two dwords) and the high result dword is elements 2-3 of
; %val0.
; Expected code uses the same loads as shuffle_v4f16_2356 with the
; destinations swapped: v_alignbit_b32 with a shift of 16 writes v0, and the
; %val0 dword ends up in v1 (by a copy on gfx9/gfx1010, loaded in place on
; gfx1100).
968define <4 x half> @shuffle_v4f16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
969; GX900-LABEL: shuffle_v4f16_5623:
970; GX900:       ; %bb.0:
971; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
972; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
973; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
974; GX900-NEXT:    s_waitcnt vmcnt(1)
975; GX900-NEXT:    v_alignbit_b32 v0, v6, v5, 16
976; GX900-NEXT:    s_waitcnt vmcnt(0)
977; GX900-NEXT:    v_mov_b32_e32 v1, v4
978; GX900-NEXT:    s_setpc_b64 s[30:31]
979;
980; GFX940-LABEL: shuffle_v4f16_5623:
981; GFX940:       ; %bb.0:
982; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
983; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
984; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
985; GFX940-NEXT:    s_waitcnt vmcnt(1)
986; GFX940-NEXT:    v_alignbit_b32 v0, v7, v6, 16
987; GFX940-NEXT:    s_waitcnt vmcnt(0)
988; GFX940-NEXT:    v_mov_b32_e32 v1, v4
989; GFX940-NEXT:    s_setpc_b64 s[30:31]
990;
991; GFX10-LABEL: shuffle_v4f16_5623:
992; GFX10:       ; %bb.0:
993; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
994; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
995; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
996; GFX10-NEXT:    s_waitcnt vmcnt(1)
997; GFX10-NEXT:    v_alignbit_b32 v0, v6, v5, 16
998; GFX10-NEXT:    s_waitcnt vmcnt(0)
999; GFX10-NEXT:    v_mov_b32_e32 v1, v4
1000; GFX10-NEXT:    s_setpc_b64 s[30:31]
1001;
1002; GFX11-LABEL: shuffle_v4f16_5623:
1003; GFX11:       ; %bb.0:
1004; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1005; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
1006; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
1007; GFX11-NEXT:    s_waitcnt vmcnt(1)
1008; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
1009; GFX11-NEXT:    s_waitcnt vmcnt(0)
1010; GFX11-NEXT:    s_setpc_b64 s[30:31]
1011  %val0 = load <4 x half>, ptr addrspace(1) %arg0
1012  %val1 = load <4 x half>, ptr addrspace(1) %arg1
1013  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
1014  ret <4 x half> %shuffle
1015}
1016
; Mask <3, 4, 5, 6>: a contiguous 4-element window starting at the last
; element of %val0 and running through elements 0-2 of %val1. Neither
; result dword is dword-aligned in the concatenated input.
; Expected code loads all of %val1 and the upper dword of %val0, then uses
; two v_alignbit_b32 instructions with a shift of 16: one across the two
; %val1 dwords (result v1) and one across the %val0 dword and the low %val1
; dword (result v0).
1017define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1018; GFX9-LABEL: shuffle_v4f16_3456:
1019; GFX9:       ; %bb.0:
1020; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1021; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
1022; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
1023; GFX9-NEXT:    s_waitcnt vmcnt(1)
1024; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
1025; GFX9-NEXT:    s_waitcnt vmcnt(0)
1026; GFX9-NEXT:    v_alignbit_b32 v0, v4, v6, 16
1027; GFX9-NEXT:    s_setpc_b64 s[30:31]
1028;
1029; GFX10-LABEL: shuffle_v4f16_3456:
1030; GFX10:       ; %bb.0:
1031; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1032; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
1033; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
1034; GFX10-NEXT:    s_waitcnt vmcnt(1)
1035; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
1036; GFX10-NEXT:    s_waitcnt vmcnt(0)
1037; GFX10-NEXT:    v_alignbit_b32 v0, v4, v6, 16
1038; GFX10-NEXT:    s_setpc_b64 s[30:31]
1039;
1040; GFX11-LABEL: shuffle_v4f16_3456:
1041; GFX11:       ; %bb.0:
1042; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1043; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
1044; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
1045; GFX11-NEXT:    s_waitcnt vmcnt(1)
1046; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
1047; GFX11-NEXT:    s_waitcnt vmcnt(0)
1048; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
1049; GFX11-NEXT:    s_setpc_b64 s[30:31]
1050  %val0 = load <4 x half>, ptr addrspace(1) %arg0
1051  %val1 = load <4 x half>, ptr addrspace(1) %arg1
1052  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1053  ret <4 x half> %shuffle
1054}
1055
; Mask <5, 6, 3, 4>: the same two unaligned element pairs as
; shuffle_v4f16_3456, with the result dwords swapped. The low result dword
; is elements 1-2 of %val1; the high result dword is element 3 of %val0
; followed by element 0 of %val1.
; Expected code is the same loads followed by two v_alignbit_b32
; instructions with a shift of 16, writing v0 and v1 respectively.
1056define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1057; GFX9-LABEL: shuffle_v4f16_5634:
1058; GFX9:       ; %bb.0:
1059; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1060; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
1061; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
1062; GFX9-NEXT:    s_waitcnt vmcnt(1)
1063; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
1064; GFX9-NEXT:    s_waitcnt vmcnt(0)
1065; GFX9-NEXT:    v_alignbit_b32 v1, v4, v6, 16
1066; GFX9-NEXT:    s_setpc_b64 s[30:31]
1067;
1068; GFX10-LABEL: shuffle_v4f16_5634:
1069; GFX10:       ; %bb.0:
1070; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1071; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
1072; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
1073; GFX10-NEXT:    s_waitcnt vmcnt(1)
1074; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
1075; GFX10-NEXT:    s_waitcnt vmcnt(0)
1076; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
1077; GFX10-NEXT:    s_setpc_b64 s[30:31]
1078;
1079; GFX11-LABEL: shuffle_v4f16_5634:
1080; GFX11:       ; %bb.0:
1081; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1082; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
1083; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
1084; GFX11-NEXT:    s_waitcnt vmcnt(1)
1085; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
1086; GFX11-NEXT:    s_waitcnt vmcnt(0)
1087; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
1088; GFX11-NEXT:    s_setpc_b64 s[30:31]
1089  %val0 = load <4 x half>, ptr addrspace(1) %arg0
1090  %val1 = load <4 x half>, ptr addrspace(1) %arg1
1091  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
1092  ret <4 x half> %shuffle
1093}
1094
; Mask <5, 7, 3, 4>: the low result dword is elements 1 and 3 of %val1
; (the upper half of each %val1 dword); the high result dword is element 3
; of %val0 followed by element 0 of %val1.
; Expected code builds the low dword with v_perm_b32 using byte selector
; 0x7060302 and the high dword with v_alignbit_b32 shifted by 16. The
; gfx900 and gfx940 sequences first move the selector into an SGPR (s4 and
; s0 respectively); the gfx1010 and gfx1100 sequences pass it as a literal
; operand.
1095define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1096; GX900-LABEL: shuffle_v4f16_5734:
1097; GX900:       ; %bb.0:
1098; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1099; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
1100; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
1101; GX900-NEXT:    s_mov_b32 s4, 0x7060302
1102; GX900-NEXT:    s_waitcnt vmcnt(1)
1103; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
1104; GX900-NEXT:    s_waitcnt vmcnt(0)
1105; GX900-NEXT:    v_alignbit_b32 v1, v4, v6, 16
1106; GX900-NEXT:    s_setpc_b64 s[30:31]
1107;
1108; GFX940-LABEL: shuffle_v4f16_5734:
1109; GFX940:       ; %bb.0:
1110; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1111; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
1112; GFX940-NEXT:    global_load_dword v6, v[0:1], off offset:4
1113; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
1114; GFX940-NEXT:    s_waitcnt vmcnt(1)
1115; GFX940-NEXT:    v_perm_b32 v0, v5, v4, s0
1116; GFX940-NEXT:    s_waitcnt vmcnt(0)
1117; GFX940-NEXT:    v_alignbit_b32 v1, v4, v6, 16
1118; GFX940-NEXT:    s_setpc_b64 s[30:31]
1119;
1120; GFX10-LABEL: shuffle_v4f16_5734:
1121; GFX10:       ; %bb.0:
1122; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1123; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
1124; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
1125; GFX10-NEXT:    s_waitcnt vmcnt(1)
1126; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
1127; GFX10-NEXT:    s_waitcnt vmcnt(0)
1128; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
1129; GFX10-NEXT:    s_setpc_b64 s[30:31]
1130;
1131; GFX11-LABEL: shuffle_v4f16_5734:
1132; GFX11:       ; %bb.0:
1133; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1134; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
1135; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
1136; GFX11-NEXT:    s_waitcnt vmcnt(1)
1137; GFX11-NEXT:    v_perm_b32 v0, v3, v2, 0x7060302
1138; GFX11-NEXT:    s_waitcnt vmcnt(0)
1139; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
1140; GFX11-NEXT:    s_setpc_b64 s[30:31]
1141  %val0 = load <4 x half>, ptr addrspace(1) %arg0
1142  %val1 = load <4 x half>, ptr addrspace(1) %arg1
1143  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
1144  ret <4 x half> %shuffle
1145}
1146
; Integer (<4 x i16>) counterpart of shuffle_v4f16_2356, using the same
; mask <2, 3, 5, 6>: the low result dword is elements 2-3 of %val0 and the
; high result dword is elements 1-2 of %val1.
; The expected sequences match the half-precision variant instruction for
; instruction: a 64-bit load of %val1, a dword load of %val0 at byte offset
; 4, and v_alignbit_b32 with a shift of 16 for the unaligned pair.
1147define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1148; GX900-LABEL: shuffle_v4i16_2356:
1149; GX900:       ; %bb.0:
1150; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1151; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
1152; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
1153; GX900-NEXT:    s_waitcnt vmcnt(1)
1154; GX900-NEXT:    v_alignbit_b32 v1, v6, v5, 16
1155; GX900-NEXT:    s_waitcnt vmcnt(0)
1156; GX900-NEXT:    v_mov_b32_e32 v0, v4
1157; GX900-NEXT:    s_setpc_b64 s[30:31]
1158;
1159; GFX940-LABEL: shuffle_v4i16_2356:
1160; GFX940:       ; %bb.0:
1161; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1162; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
1163; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
1164; GFX940-NEXT:    s_waitcnt vmcnt(1)
1165; GFX940-NEXT:    v_alignbit_b32 v1, v7, v6, 16
1166; GFX940-NEXT:    s_waitcnt vmcnt(0)
1167; GFX940-NEXT:    v_mov_b32_e32 v0, v4
1168; GFX940-NEXT:    s_setpc_b64 s[30:31]
1169;
1170; GFX10-LABEL: shuffle_v4i16_2356:
1171; GFX10:       ; %bb.0:
1172; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1173; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
1174; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
1175; GFX10-NEXT:    s_waitcnt vmcnt(1)
1176; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
1177; GFX10-NEXT:    s_waitcnt vmcnt(0)
1178; GFX10-NEXT:    v_mov_b32_e32 v0, v4
1179; GFX10-NEXT:    s_setpc_b64 s[30:31]
1180;
1181; GFX11-LABEL: shuffle_v4i16_2356:
1182; GFX11:       ; %bb.0:
1183; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1184; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
1185; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
1186; GFX11-NEXT:    s_waitcnt vmcnt(1)
1187; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
1188; GFX11-NEXT:    s_waitcnt vmcnt(0)
1189; GFX11-NEXT:    s_setpc_b64 s[30:31]
1190  %val0 = load <4 x i16>, ptr addrspace(1) %arg0
1191  %val1 = load <4 x i16>, ptr addrspace(1) %arg1
1192  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
1193  ret <4 x i16> %shuffle
1194}
1195
; Mask <0, 1, 6, 7> on <4 x i16>: the low result dword is elements 0-1 of
; %val0 and the high result dword is elements 2-3 of %val1.
; Expected code is two dword loads (v[0:1] at offset 0, v[2:3] at byte
; offset 4). The gfx9 and gfx1010 sequences load into v4/v5 and copy each
; into place after its own wait; the gfx1100 sequence loads directly into
; v0 and v1 and needs only a single wait and no copies.
1196define <4 x i16> @shuffle_v4i16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1197; GFX9-LABEL: shuffle_v4i16_0167:
1198; GFX9:       ; %bb.0:
1199; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1200; GFX9-NEXT:    global_load_dword v4, v[0:1], off
1201; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
1202; GFX9-NEXT:    s_waitcnt vmcnt(1)
1203; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1204; GFX9-NEXT:    s_waitcnt vmcnt(0)
1205; GFX9-NEXT:    v_mov_b32_e32 v1, v5
1206; GFX9-NEXT:    s_setpc_b64 s[30:31]
1207;
1208; GFX10-LABEL: shuffle_v4i16_0167:
1209; GFX10:       ; %bb.0:
1210; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1211; GFX10-NEXT:    global_load_dword v4, v[0:1], off
1212; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
1213; GFX10-NEXT:    s_waitcnt vmcnt(1)
1214; GFX10-NEXT:    v_mov_b32_e32 v0, v4
1215; GFX10-NEXT:    s_waitcnt vmcnt(0)
1216; GFX10-NEXT:    v_mov_b32_e32 v1, v5
1217; GFX10-NEXT:    s_setpc_b64 s[30:31]
1218;
1219; GFX11-LABEL: shuffle_v4i16_0167:
1220; GFX11:       ; %bb.0:
1221; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1222; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
1223; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
1224; GFX11-NEXT:    s_waitcnt vmcnt(0)
1225; GFX11-NEXT:    s_setpc_b64 s[30:31]
1226  %val0 = load <4 x i16>, ptr addrspace(1) %arg0
1227  %val1 = load <4 x i16>, ptr addrspace(1) %arg1
1228  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
1229  ret <4 x i16> %shuffle
1230}
1231
; Mask zeroinitializer (<0, 0, 0, 0>): splat of element 0 of %val0.
; Expected code loads all 64 bits of %val0 (only v0 is read afterwards),
; uses v_perm_b32 with byte selector 0x5040100 on (v0, v0) to replicate the
; low 16 bits into both halves of v0, then copies v0 into v1. The gfx900
; and gfx940 sequences move the selector into an SGPR first; the gfx1100
; sequence inserts s_delay_alu between the dependent v_perm_b32 and v_mov.
1232define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1233; GX900-LABEL: shuffle_v4f16_0000:
1234; GX900:       ; %bb.0:
1235; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1236; GX900-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1237; GX900-NEXT:    s_mov_b32 s4, 0x5040100
1238; GX900-NEXT:    s_waitcnt vmcnt(0)
1239; GX900-NEXT:    v_perm_b32 v0, v0, v0, s4
1240; GX900-NEXT:    v_mov_b32_e32 v1, v0
1241; GX900-NEXT:    s_setpc_b64 s[30:31]
1242;
1243; GFX940-LABEL: shuffle_v4f16_0000:
1244; GFX940:       ; %bb.0:
1245; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1246; GFX940-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1247; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
1248; GFX940-NEXT:    s_waitcnt vmcnt(0)
1249; GFX940-NEXT:    v_perm_b32 v0, v0, v0, s0
1250; GFX940-NEXT:    v_mov_b32_e32 v1, v0
1251; GFX940-NEXT:    s_setpc_b64 s[30:31]
1252;
1253; GFX10-LABEL: shuffle_v4f16_0000:
1254; GFX10:       ; %bb.0:
1255; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1256; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1257; GFX10-NEXT:    s_waitcnt vmcnt(0)
1258; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
1259; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1260; GFX10-NEXT:    s_setpc_b64 s[30:31]
1261;
1262; GFX11-LABEL: shuffle_v4f16_0000:
1263; GFX11:       ; %bb.0:
1264; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1265; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
1266; GFX11-NEXT:    s_waitcnt vmcnt(0)
1267; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
1268; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1269; GFX11-NEXT:    v_mov_b32_e32 v1, v0
1270; GFX11-NEXT:    s_setpc_b64 s[30:31]
1271  %val0 = load <4 x half>, ptr addrspace(1) %arg0
1272  %val1 = load <4 x half>, ptr addrspace(1) %arg1
1273  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer
1274  ret <4 x half> %shuffle
1275}
1276
; Mask <1, 0, 1, 0>: elements 0 and 1 of %val0 swapped, repeated in both
; result dwords.
; Expected code loads all 64 bits of %val0 (only v0 is read afterwards),
; swaps the two halves of v0 with v_alignbit_b32 v0, v0, v0, 16, and copies
; the result into v1. The gfx1100 sequence inserts s_delay_alu between the
; two dependent VALU instructions.
1277define <4 x half> @shuffle_v4f16_1010(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1278; GFX9-LABEL: shuffle_v4f16_1010:
1279; GFX9:       ; %bb.0:
1280; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1281; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1282; GFX9-NEXT:    s_waitcnt vmcnt(0)
1283; GFX9-NEXT:    v_alignbit_b32 v0, v0, v0, 16
1284; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1285; GFX9-NEXT:    s_setpc_b64 s[30:31]
1286;
1287; GFX10-LABEL: shuffle_v4f16_1010:
1288; GFX10:       ; %bb.0:
1289; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1290; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1291; GFX10-NEXT:    s_waitcnt vmcnt(0)
1292; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
1293; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1294; GFX10-NEXT:    s_setpc_b64 s[30:31]
1295;
1296; GFX11-LABEL: shuffle_v4f16_1010:
1297; GFX11:       ; %bb.0:
1298; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1299; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
1300; GFX11-NEXT:    s_waitcnt vmcnt(0)
1301; GFX11-NEXT:    v_alignbit_b32 v0, v0, v0, 16
1302; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1303; GFX11-NEXT:    v_mov_b32_e32 v1, v0
1304; GFX11-NEXT:    s_setpc_b64 s[30:31]
1305  %val0 = load <4 x half>, ptr addrspace(1) %arg0
1306  %val1 = load <4 x half>, ptr addrspace(1) %arg1
1307  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
1308  ret <4 x half> %shuffle
1309}
1310
; Mask <1, 1, 0, 0>: the low result dword is element 1 of %val0 duplicated
; and the high result dword is element 0 of %val0 duplicated.
; Expected code loads 64 bits of %val0 and applies v_perm_b32 twice to the
; first loaded dword: selector 0x7060302 replicates its upper half into v0
; and selector 0x5040100 replicates its lower half into v1. The gfx900 and
; gfx940 sequences move both selectors into SGPRs first; the gfx1010 and
; gfx1100 sequences pass them as literal operands.
1311define <4 x half> @shuffle_v4f16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1312; GX900-LABEL: shuffle_v4f16_1100:
1313; GX900:       ; %bb.0:
1314; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1315; GX900-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
1316; GX900-NEXT:    s_mov_b32 s4, 0x7060302
1317; GX900-NEXT:    s_mov_b32 s5, 0x5040100
1318; GX900-NEXT:    s_waitcnt vmcnt(0)
1319; GX900-NEXT:    v_perm_b32 v0, v1, v1, s4
1320; GX900-NEXT:    v_perm_b32 v1, v1, v1, s5
1321; GX900-NEXT:    s_setpc_b64 s[30:31]
1322;
1323; GFX940-LABEL: shuffle_v4f16_1100:
1324; GFX940:       ; %bb.0:
1325; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1326; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
1327; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
1328; GFX940-NEXT:    s_mov_b32 s1, 0x5040100
1329; GFX940-NEXT:    s_waitcnt vmcnt(0)
1330; GFX940-NEXT:    v_perm_b32 v0, v2, v2, s0
1331; GFX940-NEXT:    v_perm_b32 v1, v2, v2, s1
1332; GFX940-NEXT:    s_setpc_b64 s[30:31]
1333;
1334; GFX10-LABEL: shuffle_v4f16_1100:
1335; GFX10:       ; %bb.0:
1336; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1337; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
1338; GFX10-NEXT:    s_waitcnt vmcnt(0)
1339; GFX10-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
1340; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
1341; GFX10-NEXT:    s_setpc_b64 s[30:31]
1342;
1343; GFX11-LABEL: shuffle_v4f16_1100:
1344; GFX11:       ; %bb.0:
1345; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1346; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
1347; GFX11-NEXT:    s_waitcnt vmcnt(0)
1348; GFX11-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
1349; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
1350; GFX11-NEXT:    s_setpc_b64 s[30:31]
1351  %val0 = load <4 x half>, ptr addrspace(1) %arg0
1352  %val1 = load <4 x half>, ptr addrspace(1) %arg1
1353  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
1354  ret <4 x half> %shuffle
1355}
1356
; Mask <6, 1, 6, 1>: each result dword is element 2 of %val1 in the low
; half and element 1 of %val0 in the high half.
; Expected code loads dword 0 of %val0 and dword 1 of %val1, merges them
; with v_bfi_b32 using mask 0xffff (low 16 bits from the %val1 dword, high
; 16 bits from the %val0 dword), and copies the result into v1. The gfx900
; and gfx940 sequences move the mask into an SGPR first; the gfx1100
; sequence inserts s_delay_alu before the dependent copy.
1357define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1358; GX900-LABEL: shuffle_v4f16_6161:
1359; GX900:       ; %bb.0:
1360; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1361; GX900-NEXT:    global_load_dword v4, v[0:1], off
1362; GX900-NEXT:    global_load_dword v5, v[2:3], off offset:4
1363; GX900-NEXT:    s_mov_b32 s4, 0xffff
1364; GX900-NEXT:    s_waitcnt vmcnt(0)
1365; GX900-NEXT:    v_bfi_b32 v0, s4, v5, v4
1366; GX900-NEXT:    v_mov_b32_e32 v1, v0
1367; GX900-NEXT:    s_setpc_b64 s[30:31]
1368;
1369; GFX940-LABEL: shuffle_v4f16_6161:
1370; GFX940:       ; %bb.0:
1371; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1372; GFX940-NEXT:    global_load_dword v4, v[0:1], off
1373; GFX940-NEXT:    global_load_dword v5, v[2:3], off offset:4
1374; GFX940-NEXT:    s_mov_b32 s0, 0xffff
1375; GFX940-NEXT:    s_waitcnt vmcnt(0)
1376; GFX940-NEXT:    v_bfi_b32 v0, s0, v5, v4
1377; GFX940-NEXT:    v_mov_b32_e32 v1, v0
1378; GFX940-NEXT:    s_setpc_b64 s[30:31]
1379;
1380; GFX10-LABEL: shuffle_v4f16_6161:
1381; GFX10:       ; %bb.0:
1382; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1383; GFX10-NEXT:    global_load_dword v4, v[0:1], off
1384; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
1385; GFX10-NEXT:    s_waitcnt vmcnt(0)
1386; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v5, v4
1387; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1388; GFX10-NEXT:    s_setpc_b64 s[30:31]
1389;
1390; GFX11-LABEL: shuffle_v4f16_6161:
1391; GFX11:       ; %bb.0:
1392; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1393; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
1394; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
1395; GFX11-NEXT:    s_waitcnt vmcnt(0)
1396; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
1397; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1398; GFX11-NEXT:    v_mov_b32_e32 v1, v0
1399; GFX11-NEXT:    s_setpc_b64 s[30:31]
1400  %val0 = load <4 x half>, ptr addrspace(1) %arg0
1401  %val1 = load <4 x half>, ptr addrspace(1) %arg1
1402  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
1403  ret <4 x half> %shuffle
1404}
1405
; Mask <2, 3, 3, 3>: the low result dword is elements 2-3 of %val0
; unchanged; the high result dword is element 3 of %val0 duplicated.
; Expected code loads only the upper dword of %val0 (byte offset 4) into
; v0, leaves it in place as the low result dword, and produces v1 with
; v_perm_b32 using byte selector 0x7060302 to replicate the upper half.
; The gfx900 and gfx940 sequences move the selector into an SGPR first.
1406define <4 x half> @shuffle_v4f16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1407; GX900-LABEL: shuffle_v4f16_2333:
1408; GX900:       ; %bb.0:
1409; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1410; GX900-NEXT:    global_load_dword v0, v[0:1], off offset:4
1411; GX900-NEXT:    s_mov_b32 s4, 0x7060302
1412; GX900-NEXT:    s_waitcnt vmcnt(0)
1413; GX900-NEXT:    v_perm_b32 v1, v0, v0, s4
1414; GX900-NEXT:    s_setpc_b64 s[30:31]
1415;
1416; GFX940-LABEL: shuffle_v4f16_2333:
1417; GFX940:       ; %bb.0:
1418; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1419; GFX940-NEXT:    global_load_dword v0, v[0:1], off offset:4
1420; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
1421; GFX940-NEXT:    s_waitcnt vmcnt(0)
1422; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s0
1423; GFX940-NEXT:    s_setpc_b64 s[30:31]
1424;
1425; GFX10-LABEL: shuffle_v4f16_2333:
1426; GFX10:       ; %bb.0:
1427; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1428; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
1429; GFX10-NEXT:    s_waitcnt vmcnt(0)
1430; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
1431; GFX10-NEXT:    s_setpc_b64 s[30:31]
1432;
1433; GFX11-LABEL: shuffle_v4f16_2333:
1434; GFX11:       ; %bb.0:
1435; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1436; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
1437; GFX11-NEXT:    s_waitcnt vmcnt(0)
1438; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
1439; GFX11-NEXT:    s_setpc_b64 s[30:31]
1440  %val0 = load <4 x half>, ptr addrspace(1) %arg0
1441  %val1 = load <4 x half>, ptr addrspace(1) %arg1
1442  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
1443  ret <4 x half> %shuffle
1444}
1445
; FIXME(review): the function name encodes mask <6, 6, 6, 7>, but the
; shufflevector below uses <2, 3, 3, 3> - the same mask as
; shuffle_v4f16_2333. The expected sequences agree with the IR as written:
; they load through v[0:1] (the address registers used for %arg0 elsewhere
; in this file) and are identical to those of shuffle_v4f16_2333, so this
; function currently duplicates that test and no <6, 6, 6, 7> pattern on
; the second operand is covered.
; If the mask is corrected to match the name, the assertions must be
; regenerated with utils/update_llc_test_checks.py rather than edited by
; hand.
1446define <4 x half> @shuffle_v4f16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1447; GX900-LABEL: shuffle_v4f16_6667:
1448; GX900:       ; %bb.0:
1449; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1450; GX900-NEXT:    global_load_dword v0, v[0:1], off offset:4
1451; GX900-NEXT:    s_mov_b32 s4, 0x7060302
1452; GX900-NEXT:    s_waitcnt vmcnt(0)
1453; GX900-NEXT:    v_perm_b32 v1, v0, v0, s4
1454; GX900-NEXT:    s_setpc_b64 s[30:31]
1455;
1456; GFX940-LABEL: shuffle_v4f16_6667:
1457; GFX940:       ; %bb.0:
1458; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1459; GFX940-NEXT:    global_load_dword v0, v[0:1], off offset:4
1460; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
1461; GFX940-NEXT:    s_waitcnt vmcnt(0)
1462; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s0
1463; GFX940-NEXT:    s_setpc_b64 s[30:31]
1464;
1465; GFX10-LABEL: shuffle_v4f16_6667:
1466; GFX10:       ; %bb.0:
1467; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1468; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
1469; GFX10-NEXT:    s_waitcnt vmcnt(0)
1470; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
1471; GFX10-NEXT:    s_setpc_b64 s[30:31]
1472;
1473; GFX11-LABEL: shuffle_v4f16_6667:
1474; GFX11:       ; %bb.0:
1475; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1476; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
1477; GFX11-NEXT:    s_waitcnt vmcnt(0)
1478; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
1479; GFX11-NEXT:    s_setpc_b64 s[30:31]
1480  %val0 = load <4 x half>, ptr addrspace(1) %arg0
1481  %val1 = load <4 x half>, ptr addrspace(1) %arg1
1482  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
1483  ret <4 x half> %shuffle
1484}
1485
; Narrowing shuffle: two <8 x half> vectors are loaded and a <4 x half> result
; is formed from lanes 0, 1, 0, 1 of %val0 (%val1 is never selected).
; All targets are expected to load one dword from %arg0 and copy it into the
; second result register with v_mov_b32.
1486define <4 x half> @shuffle_v8f16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1487; GFX9-LABEL: shuffle_v8f16_0101:
1488; GFX9:       ; %bb.0:
1489; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1490; GFX9-NEXT:    global_load_dword v0, v[0:1], off
1491; GFX9-NEXT:    s_waitcnt vmcnt(0)
1492; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1493; GFX9-NEXT:    s_setpc_b64 s[30:31]
1494;
1495; GFX10-LABEL: shuffle_v8f16_0101:
1496; GFX10:       ; %bb.0:
1497; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1498; GFX10-NEXT:    global_load_dword v0, v[0:1], off
1499; GFX10-NEXT:    s_waitcnt vmcnt(0)
1500; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1501; GFX10-NEXT:    s_setpc_b64 s[30:31]
1502;
1503; GFX11-LABEL: shuffle_v8f16_0101:
1504; GFX11:       ; %bb.0:
1505; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1506; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
1507; GFX11-NEXT:    s_waitcnt vmcnt(0)
1508; GFX11-NEXT:    v_mov_b32_e32 v1, v0
1509; GFX11-NEXT:    s_setpc_b64 s[30:31]
1510  %val0 = load <8 x half>, ptr addrspace(1) %arg0
1511  %val1 = load <8 x half>, ptr addrspace(1) %arg1
1512  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1513  ret <4 x half> %shuffle
1514}
1515
; Narrowing shuffle that keeps lanes 0..3 of %val0 unchanged (the low half of
; the <8 x half> vector). The expected code is just a 64-bit load from %arg0
; straight into the return registers, with no ALU instruction.
1516define <4 x half> @shuffle_v8f16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1517; GFX9-LABEL: shuffle_v8f16_0123:
1518; GFX9:       ; %bb.0:
1519; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1520; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1521; GFX9-NEXT:    s_waitcnt vmcnt(0)
1522; GFX9-NEXT:    s_setpc_b64 s[30:31]
1523;
1524; GFX10-LABEL: shuffle_v8f16_0123:
1525; GFX10:       ; %bb.0:
1526; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1527; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1528; GFX10-NEXT:    s_waitcnt vmcnt(0)
1529; GFX10-NEXT:    s_setpc_b64 s[30:31]
1530;
1531; GFX11-LABEL: shuffle_v8f16_0123:
1532; GFX11:       ; %bb.0:
1533; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1534; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
1535; GFX11-NEXT:    s_waitcnt vmcnt(0)
1536; GFX11-NEXT:    s_setpc_b64 s[30:31]
1537  %val0 = load <8 x half>, ptr addrspace(1) %arg0
1538  %val1 = load <8 x half>, ptr addrspace(1) %arg1
1539  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1540  ret <4 x half> %shuffle
1541}
1542
; Narrowing shuffle taking one lane pair from each source: mask indices 4, 5
; select lanes 4..5 of %val0 and indices 8, 9 select lanes 0..1 of %val1
; (indices >= 8 address the second <8 x half> operand).
; The expected code is one dword load at offset 8 from %arg0 and one dword load
; at offset 0 from %arg1; gfx9/gfx10 need two extra register copies, gfx11 loads
; directly into the return registers.
1543define <4 x half> @shuffle_v8f16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1544; GFX9-LABEL: shuffle_v8f16_4589:
1545; GFX9:       ; %bb.0:
1546; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1547; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:8
1548; GFX9-NEXT:    global_load_dword v5, v[2:3], off
1549; GFX9-NEXT:    s_waitcnt vmcnt(1)
1550; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1551; GFX9-NEXT:    s_waitcnt vmcnt(0)
1552; GFX9-NEXT:    v_mov_b32_e32 v1, v5
1553; GFX9-NEXT:    s_setpc_b64 s[30:31]
1554;
1555; GFX10-LABEL: shuffle_v8f16_4589:
1556; GFX10:       ; %bb.0:
1557; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1558; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:8
1559; GFX10-NEXT:    global_load_dword v5, v[2:3], off
1560; GFX10-NEXT:    s_waitcnt vmcnt(1)
1561; GFX10-NEXT:    v_mov_b32_e32 v0, v4
1562; GFX10-NEXT:    s_waitcnt vmcnt(0)
1563; GFX10-NEXT:    v_mov_b32_e32 v1, v5
1564; GFX10-NEXT:    s_setpc_b64 s[30:31]
1565;
1566; GFX11-LABEL: shuffle_v8f16_4589:
1567; GFX11:       ; %bb.0:
1568; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1569; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:8
1570; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
1571; GFX11-NEXT:    s_waitcnt vmcnt(0)
1572; GFX11-NEXT:    s_setpc_b64 s[30:31]
1573  %val0 = load <8 x half>, ptr addrspace(1) %arg0
1574  %val1 = load <8 x half>, ptr addrspace(1) %arg1
1575  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
1576  ret <4 x half> %shuffle
1577}
1578
; Narrowing shuffle with the second operand first: mask indices 10, 11 select
; lanes 2..3 of %val1 and indices 2, 3 select lanes 2..3 of %val0.
; The expected code is one dword load at offset 4 from each pointer (%arg1
; first), followed only by register copies into the return registers.
1579define <4 x half> @shuffle_v8f16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1580; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
1581; GFX9:       ; %bb.0:
1582; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1583; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
1584; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
1585; GFX9-NEXT:    s_waitcnt vmcnt(1)
1586; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1587; GFX9-NEXT:    s_waitcnt vmcnt(0)
1588; GFX9-NEXT:    v_mov_b32_e32 v1, v5
1589; GFX9-NEXT:    s_setpc_b64 s[30:31]
1590;
1591; GFX10-LABEL: shuffle_v8f16_10_11_2_3:
1592; GFX10:       ; %bb.0:
1593; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1594; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
1595; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
1596; GFX10-NEXT:    s_waitcnt vmcnt(1)
1597; GFX10-NEXT:    v_mov_b32_e32 v0, v4
1598; GFX10-NEXT:    s_waitcnt vmcnt(0)
1599; GFX10-NEXT:    v_mov_b32_e32 v1, v5
1600; GFX10-NEXT:    s_setpc_b64 s[30:31]
1601;
1602; GFX11-LABEL: shuffle_v8f16_10_11_2_3:
1603; GFX11:       ; %bb.0:
1604; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1605; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
1606; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
1607; GFX11-NEXT:    s_waitcnt vmcnt(1)
1608; GFX11-NEXT:    v_mov_b32_e32 v0, v2
1609; GFX11-NEXT:    s_waitcnt vmcnt(0)
1610; GFX11-NEXT:    s_setpc_b64 s[30:31]
1611  %val0 = load <8 x half>, ptr addrspace(1) %arg0
1612  %val1 = load <8 x half>, ptr addrspace(1) %arg1
1613  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
1614  ret <4 x half> %shuffle
1615}
1616
; Narrowing shuffle whose first lane pair straddles a dword boundary: mask
; indices 13, 14 select lanes 5 and 6 of %val1, indices 2, 3 select lanes 2..3
; of %val0.
; The expected code loads 64 bits at offset 8 from %arg1 and extracts the
; unaligned pair with v_alignbit_b32 (shift 16); the %val0 pair is a plain dword
; load at offset 4 from %arg0.
1617define <4 x half> @shuffle_v8f16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1618; GX900-LABEL: shuffle_v8f16_13_14_2_3:
1619; GX900:       ; %bb.0:
1620; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1621; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
1622; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
1623; GX900-NEXT:    s_waitcnt vmcnt(1)
1624; GX900-NEXT:    v_alignbit_b32 v0, v6, v5, 16
1625; GX900-NEXT:    s_waitcnt vmcnt(0)
1626; GX900-NEXT:    v_mov_b32_e32 v1, v4
1627; GX900-NEXT:    s_setpc_b64 s[30:31]
1628;
1629; GFX940-LABEL: shuffle_v8f16_13_14_2_3:
1630; GFX940:       ; %bb.0:
1631; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1632; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:8
1633; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
1634; GFX940-NEXT:    s_waitcnt vmcnt(1)
1635; GFX940-NEXT:    v_alignbit_b32 v0, v7, v6, 16
1636; GFX940-NEXT:    s_waitcnt vmcnt(0)
1637; GFX940-NEXT:    v_mov_b32_e32 v1, v4
1638; GFX940-NEXT:    s_setpc_b64 s[30:31]
1639;
1640; GFX10-LABEL: shuffle_v8f16_13_14_2_3:
1641; GFX10:       ; %bb.0:
1642; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1643; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
1644; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
1645; GFX10-NEXT:    s_waitcnt vmcnt(1)
1646; GFX10-NEXT:    v_alignbit_b32 v0, v6, v5, 16
1647; GFX10-NEXT:    s_waitcnt vmcnt(0)
1648; GFX10-NEXT:    v_mov_b32_e32 v1, v4
1649; GFX10-NEXT:    s_setpc_b64 s[30:31]
1650;
1651; GFX11-LABEL: shuffle_v8f16_13_14_2_3:
1652; GFX11:       ; %bb.0:
1653; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1654; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:8
1655; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
1656; GFX11-NEXT:    s_waitcnt vmcnt(1)
1657; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
1658; GFX11-NEXT:    s_waitcnt vmcnt(0)
1659; GFX11-NEXT:    s_setpc_b64 s[30:31]
1660  %val0 = load <8 x half>, ptr addrspace(1) %arg0
1661  %val1 = load <8 x half>, ptr addrspace(1) %arg1
1662  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
1663  ret <4 x half> %shuffle
1664}
1665
; Widening shuffle from <3 x half> to <4 x half>: lanes 0, 1, 2 of %val0 are
; kept and lane 2 is repeated in the last position (%val1 is never selected).
; The expected code is a 64-bit load from %arg0 followed by v_perm_b32 with
; selector 0x5040100 applied to the second loaded register.
1666define <4 x half> @shuffle_v3f16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1667; GX900-LABEL: shuffle_v3f16_0122:
1668; GX900:       ; %bb.0:
1669; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1670; GX900-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1671; GX900-NEXT:    s_mov_b32 s4, 0x5040100
1672; GX900-NEXT:    s_waitcnt vmcnt(0)
1673; GX900-NEXT:    v_perm_b32 v1, v1, v1, s4
1674; GX900-NEXT:    s_setpc_b64 s[30:31]
1675;
1676; GFX940-LABEL: shuffle_v3f16_0122:
1677; GFX940:       ; %bb.0:
1678; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1679; GFX940-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1680; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
1681; GFX940-NEXT:    s_waitcnt vmcnt(0)
1682; GFX940-NEXT:    v_perm_b32 v1, v1, v1, s0
1683; GFX940-NEXT:    s_setpc_b64 s[30:31]
1684;
1685; GFX10-LABEL: shuffle_v3f16_0122:
1686; GFX10:       ; %bb.0:
1687; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1688; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1689; GFX10-NEXT:    s_waitcnt vmcnt(0)
1690; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
1691; GFX10-NEXT:    s_setpc_b64 s[30:31]
1692;
1693; GFX11-LABEL: shuffle_v3f16_0122:
1694; GFX11:       ; %bb.0:
1695; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1696; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
1697; GFX11-NEXT:    s_waitcnt vmcnt(0)
1698; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
1699; GFX11-NEXT:    s_setpc_b64 s[30:31]
1700  %val0 = load <3 x half>, ptr addrspace(1) %arg0
1701  %val1 = load <3 x half>, ptr addrspace(1) %arg1
1702  %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1703  ret <4 x half> %shuffle
1704}
1705
; Widening shuffle from <2 x half> to <4 x half>. The mask actually used below
; is <0, 1, 1, 0>: the first result dword is %val0 unchanged and the second is
; %val0 with its two lanes swapped (%val1 is never selected).
; The expected code is one dword load from %arg0 and a v_alignbit_b32 of that
; register with itself by 16 bits to produce the swapped pair.
; NOTE(review): the "0122" in the function name does not match the mask
; <0, 1, 1, 0>; with <2 x half> operands index 2 would address %val1. Confirm
; which one is intended -- changing the mask requires regenerating assertions.
1706define <4 x half> @shuffle_v2f16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1707; GFX9-LABEL: shuffle_v2f16_0122:
1708; GFX9:       ; %bb.0:
1709; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1710; GFX9-NEXT:    global_load_dword v0, v[0:1], off
1711; GFX9-NEXT:    s_waitcnt vmcnt(0)
1712; GFX9-NEXT:    v_alignbit_b32 v1, v0, v0, 16
1713; GFX9-NEXT:    s_setpc_b64 s[30:31]
1714;
1715; GFX10-LABEL: shuffle_v2f16_0122:
1716; GFX10:       ; %bb.0:
1717; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1718; GFX10-NEXT:    global_load_dword v0, v[0:1], off
1719; GFX10-NEXT:    s_waitcnt vmcnt(0)
1720; GFX10-NEXT:    v_alignbit_b32 v1, v0, v0, 16
1721; GFX10-NEXT:    s_setpc_b64 s[30:31]
1722;
1723; GFX11-LABEL: shuffle_v2f16_0122:
1724; GFX11:       ; %bb.0:
1725; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1726; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
1727; GFX11-NEXT:    s_waitcnt vmcnt(0)
1728; GFX11-NEXT:    v_alignbit_b32 v1, v0, v0, 16
1729; GFX11-NEXT:    s_setpc_b64 s[30:31]
1730  %val0 = load <2 x half>, ptr addrspace(1) %arg0
1731  %val1 = load <2 x half>, ptr addrspace(1) %arg1
1732  %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
1733  ret <4 x half> %shuffle
1734}
1735
; Same-width shuffle on <6 x half>: the result is lanes 4..5 of %val0, lanes
; 2..3 of %val0, then lanes 0..1 of %val1 (mask indices 6, 7 address the second
; operand).
; The expected code loads 96 bits from %arg0 and one dword from %arg1, then
; moves the third loaded dword into the first result register and the %arg1
; dword into the third; the middle register is left as loaded. The leading
; v_mov instructions copy the incoming pointer registers out of the way of
; the load destination.
1736define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1737; GX900-LABEL: shuffle_v6f16_452367:
1738; GX900:       ; %bb.0:
1739; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1740; GX900-NEXT:    v_mov_b32_e32 v6, v1
1741; GX900-NEXT:    v_mov_b32_e32 v5, v0
1742; GX900-NEXT:    v_mov_b32_e32 v4, v3
1743; GX900-NEXT:    v_mov_b32_e32 v3, v2
1744; GX900-NEXT:    global_load_dwordx3 v[0:2], v[5:6], off
1745; GX900-NEXT:    global_load_dword v7, v[3:4], off
1746; GX900-NEXT:    s_waitcnt vmcnt(1)
1747; GX900-NEXT:    v_mov_b32_e32 v0, v2
1748; GX900-NEXT:    s_waitcnt vmcnt(0)
1749; GX900-NEXT:    v_mov_b32_e32 v2, v7
1750; GX900-NEXT:    s_setpc_b64 s[30:31]
1751;
1752; GFX940-LABEL: shuffle_v6f16_452367:
1753; GFX940:       ; %bb.0:
1754; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1755; GFX940-NEXT:    v_mov_b32_e32 v7, v1
1756; GFX940-NEXT:    v_mov_b32_e32 v6, v0
1757; GFX940-NEXT:    v_mov_b32_e32 v5, v3
1758; GFX940-NEXT:    v_mov_b32_e32 v4, v2
1759; GFX940-NEXT:    global_load_dwordx3 v[0:2], v[6:7], off
1760; GFX940-NEXT:    global_load_dword v3, v[4:5], off
1761; GFX940-NEXT:    s_waitcnt vmcnt(1)
1762; GFX940-NEXT:    v_mov_b32_e32 v0, v2
1763; GFX940-NEXT:    s_waitcnt vmcnt(0)
1764; GFX940-NEXT:    v_mov_b32_e32 v2, v3
1765; GFX940-NEXT:    s_setpc_b64 s[30:31]
1766;
1767; GFX10-LABEL: shuffle_v6f16_452367:
1768; GFX10:       ; %bb.0:
1769; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1770; GFX10-NEXT:    v_mov_b32_e32 v6, v1
1771; GFX10-NEXT:    v_mov_b32_e32 v5, v0
1772; GFX10-NEXT:    v_mov_b32_e32 v4, v3
1773; GFX10-NEXT:    v_mov_b32_e32 v3, v2
1774; GFX10-NEXT:    global_load_dwordx3 v[0:2], v[5:6], off
1775; GFX10-NEXT:    global_load_dword v7, v[3:4], off
1776; GFX10-NEXT:    s_waitcnt vmcnt(1)
1777; GFX10-NEXT:    v_mov_b32_e32 v0, v2
1778; GFX10-NEXT:    s_waitcnt vmcnt(0)
1779; GFX10-NEXT:    v_mov_b32_e32 v2, v7
1780; GFX10-NEXT:    s_setpc_b64 s[30:31]
1781;
1782; GFX11-LABEL: shuffle_v6f16_452367:
1783; GFX11:       ; %bb.0:
1784; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1785; GFX11-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
1786; GFX11-NEXT:    global_load_b96 v[0:2], v[0:1], off
1787; GFX11-NEXT:    global_load_b32 v3, v[3:4], off
1788; GFX11-NEXT:    s_waitcnt vmcnt(1)
1789; GFX11-NEXT:    v_mov_b32_e32 v0, v2
1790; GFX11-NEXT:    s_waitcnt vmcnt(0)
1791; GFX11-NEXT:    v_mov_b32_e32 v2, v3
1792; GFX11-NEXT:    s_setpc_b64 s[30:31]
1793  %val0 = load <6 x half>, ptr addrspace(1) %arg0
1794  %val1 = load <6 x half>, ptr addrspace(1) %arg1
1795  %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
1796  ret <6 x half> %shuffle
1797}
1798
; Kernel combining shufflevector with packed f16 FMA. Using the work-item x id
; as the element index, it loads one <4 x half> from each of %A, %B and %C, then
; chains four llvm.fma.v2f16 calls. The first operand of each call is a splat
; of one lane of the %A vector (lanes 0, 1, 2, 3 in turn), the second operand is
; the low or high half of the %B vector, and the accumulator starts from the
; matching half of the %C vector. The two <2 x half> results are recombined
; into a <4 x half> and stored back to the %C element.
; On every target the assertions expect exactly four v_pk_fma_f16 instructions
; that use op_sel / op_sel_hi modifiers, with no separate permute instruction
; for the splats.
1799define amdgpu_kernel void @fma_shuffle_v2f16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C)  {
1800; GX900-LABEL: fma_shuffle_v2f16:
1801; GX900:       ; %bb.0: ; %entry
1802; GX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1803; GX900-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1804; GX900-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
1805; GX900-NEXT:    s_waitcnt lgkmcnt(0)
1806; GX900-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
1807; GX900-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
1808; GX900-NEXT:    global_load_dwordx2 v[4:5], v6, s[4:5]
1809; GX900-NEXT:    s_waitcnt vmcnt(0)
1810; GX900-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1811; GX900-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1812; GX900-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1813; GX900-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1814; GX900-NEXT:    global_store_dwordx2 v6, v[0:1], s[4:5]
1815; GX900-NEXT:    s_endpgm
1816;
1817; GFX940-LABEL: fma_shuffle_v2f16:
1818; GFX940:       ; %bb.0: ; %entry
1819; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1820; GFX940-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1821; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1822; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
1823; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1824; GFX940-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
1825; GFX940-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
1826; GFX940-NEXT:    global_load_dwordx2 v[4:5], v6, s[6:7]
1827; GFX940-NEXT:    s_waitcnt vmcnt(0)
1828; GFX940-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1829; GFX940-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1830; GFX940-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1831; GFX940-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1832; GFX940-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7] sc0 sc1
1833; GFX940-NEXT:    s_endpgm
1834;
1835; GFX10-LABEL: fma_shuffle_v2f16:
1836; GFX10:       ; %bb.0: ; %entry
1837; GFX10-NEXT:    s_clause 0x1
1838; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1839; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
1840; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
1841; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1842; GFX10-NEXT:    s_clause 0x2
1843; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
1844; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
1845; GFX10-NEXT:    global_load_dwordx2 v[4:5], v6, s[4:5]
1846; GFX10-NEXT:    s_waitcnt vmcnt(0)
1847; GFX10-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1848; GFX10-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1849; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1850; GFX10-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1851; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[4:5]
1852; GFX10-NEXT:    s_endpgm
1853;
1854; GFX11-LABEL: fma_shuffle_v2f16:
1855; GFX11:       ; %bb.0: ; %entry
1856; GFX11-NEXT:    s_clause 0x1
1857; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1858; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
1859; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1860; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1861; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
1862; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1863; GFX11-NEXT:    s_clause 0x2
1864; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[0:1]
1865; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[2:3]
1866; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[4:5]
1867; GFX11-NEXT:    s_waitcnt vmcnt(0)
1868; GFX11-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
1869; GFX11-NEXT:    v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1]
1870; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1871; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0]
1872; GFX11-NEXT:    v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0]
1873; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[4:5]
1874; GFX11-NEXT:    s_endpgm
1875entry:
; Per-work-item element addresses in %A, %B and %C.
1876  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
1877  %tmp12 = zext i32 %tmp1 to i64
1878  %arrayidx = getelementptr inbounds <4 x half>, ptr addrspace(1) %A, i64 %tmp12
1879  %tmp14 = load <4 x half>, ptr addrspace(1) %arrayidx, align 8
1880  %arrayidx1 = getelementptr inbounds <4 x half>, ptr addrspace(1) %B, i64 %tmp12
1881  %tmp15 = load <4 x half>, ptr addrspace(1) %arrayidx1, align 8
1882  %arrayidx2 = getelementptr inbounds <4 x half>, ptr addrspace(1) %C, i64 %tmp12
1883  %tmp16 = load <4 x half>, ptr addrspace(1) %arrayidx2, align 8
; Low half of the result: fma(splat A[0], B[0..1], C[0..1]) then
; fma(splat A[1], B[2..3], previous).
1884  %tmp17 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> zeroinitializer
1885  %tmp18 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 0, i32 1>
1886  %tmp19 = shufflevector <4 x half> %tmp16, <4 x half> undef, <2 x i32> <i32 0, i32 1>
1887  %tmp20 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp17, <2 x half> %tmp18, <2 x half> %tmp19)
1888  %tmp21 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 1, i32 1>
1889  %tmp22 = shufflevector <4 x half> %tmp15, <4 x half> undef, <2 x i32> <i32 2, i32 3>
1890  %tmp23 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp21, <2 x half> %tmp22, <2 x half> %tmp20)
; Widen the low result and splice in C[2..3] (mask indices 6, 7 address %tmp16).
1891  %tmp24 = shufflevector <2 x half> %tmp23, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1892  %tmp25 = shufflevector <4 x half> %tmp24, <4 x half> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
; High half of the result: fma(splat A[2], B[0..1], C[2..3]) then
; fma(splat A[3], B[2..3], previous).
1893  %tmp26 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 2, i32 2>
1894  %tmp27 = shufflevector <4 x half> %tmp25, <4 x half> undef, <2 x i32> <i32 2, i32 3>
1895  %tmp28 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp26, <2 x half> %tmp18, <2 x half> %tmp27)
1896  %tmp29 = shufflevector <4 x half> %tmp14, <4 x half> undef, <2 x i32> <i32 3, i32 3>
1897  %tmp30 = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %tmp29, <2 x half> %tmp22, <2 x half> %tmp28)
; Recombine both halves and write the <4 x half> back to the %C element.
1898  %tmp31 = shufflevector <2 x half> %tmp30, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1899  %tmp32 = shufflevector <4 x half> %tmp25, <4 x half> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1900  store <4 x half> %tmp32, ptr addrspace(1) %arrayidx2, align 8
1901  ret void
1902}
1903
; Two-source <4 x half> shuffle: lane 0 of %val0 followed by lanes 0, 1, 2 of
; %val1 (mask indices 4..6 address the second operand).
; The expected code loads 64 bits from each pointer, forms the first result
; dword with v_perm_b32 selector 0x5040100 and the second with v_alignbit_b32
; by 16 bits over the two %val1 dwords. The "kill" lines in the gfx900 and
; gfx10 assertions are comment output emitted by llc and are matched verbatim.
1904define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
1905; GX900-LABEL: shuffle_v4f16_0456:
1906; GX900:       ; %bb.0:
1907; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1908; GX900-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
1909; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
1910; GX900-NEXT:    s_mov_b32 s4, 0x5040100
1911; GX900-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
1912; GX900-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
1913; GX900-NEXT:    s_waitcnt vmcnt(0)
1914; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
1915; GX900-NEXT:    v_alignbit_b32 v1, v6, v5, 16
1916; GX900-NEXT:    s_setpc_b64 s[30:31]
1917;
1918; GFX940-LABEL: shuffle_v4f16_0456:
1919; GFX940:       ; %bb.0:
1920; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1921; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
1922; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
1923; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
1924; GFX940-NEXT:    s_waitcnt vmcnt(0)
1925; GFX940-NEXT:    v_perm_b32 v0, v6, v4, s0
1926; GFX940-NEXT:    v_alignbit_b32 v1, v7, v6, 16
1927; GFX940-NEXT:    s_setpc_b64 s[30:31]
1928;
1929; GFX10-LABEL: shuffle_v4f16_0456:
1930; GFX10:       ; %bb.0:
1931; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1932; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
1933; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
1934; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
1935; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
1936; GFX10-NEXT:    s_waitcnt vmcnt(0)
1937; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
1938; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
1939; GFX10-NEXT:    s_setpc_b64 s[30:31]
1940;
1941; GFX11-LABEL: shuffle_v4f16_0456:
1942; GFX11:       ; %bb.0:
1943; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1944; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
1945; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
1946; GFX11-NEXT:    s_waitcnt vmcnt(0)
1947; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
1948; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
1949; GFX11-NEXT:    s_setpc_b64 s[30:31]
1950  %val0 = load <4 x half>, ptr addrspace(1) %arg0
1951  %val1 = load <4 x half>, ptr addrspace(1) %arg1
1952  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
1953  ret <4 x half> %shuffle
1954}
1955
; Kernel that loads <8 x i32> from the addrspace(4) pointer %in, keeps only
; lanes 0..3 via shufflevector, and stores the <4 x i32> to %out.
; Although the IR load is 256 bits wide, the assertions expect only a 128-bit
; scalar load of the data (s_load_dwordx4 / s_load_b128), followed by copies
; into vector registers and a single 128-bit global store.
1956define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ptr addrspace(1) %out)  {
1957; GX900-LABEL: shuffle_scalar_load_v8i32_0123:
1958; GX900:       ; %bb.0:
1959; GX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1960; GX900-NEXT:    v_mov_b32_e32 v4, 0
1961; GX900-NEXT:    s_waitcnt lgkmcnt(0)
1962; GX900-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
1963; GX900-NEXT:    s_waitcnt lgkmcnt(0)
1964; GX900-NEXT:    v_mov_b32_e32 v0, s4
1965; GX900-NEXT:    v_mov_b32_e32 v1, s5
1966; GX900-NEXT:    v_mov_b32_e32 v2, s6
1967; GX900-NEXT:    v_mov_b32_e32 v3, s7
1968; GX900-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
1969; GX900-NEXT:    s_endpgm
1970;
1971; GFX940-LABEL: shuffle_scalar_load_v8i32_0123:
1972; GFX940:       ; %bb.0:
1973; GFX940-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1974; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1975; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1976; GFX940-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
1977; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1978; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
1979; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
1980; GFX940-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
1981; GFX940-NEXT:    s_endpgm
1982;
1983; GFX10-LABEL: shuffle_scalar_load_v8i32_0123:
1984; GFX10:       ; %bb.0:
1985; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1986; GFX10-NEXT:    v_mov_b32_e32 v4, 0
1987; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1988; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
1989; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1990; GFX10-NEXT:    v_mov_b32_e32 v0, s4
1991; GFX10-NEXT:    v_mov_b32_e32 v1, s5
1992; GFX10-NEXT:    v_mov_b32_e32 v2, s6
1993; GFX10-NEXT:    v_mov_b32_e32 v3, s7
1994; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
1995; GFX10-NEXT:    s_endpgm
1996;
1997; GFX11-LABEL: shuffle_scalar_load_v8i32_0123:
1998; GFX11:       ; %bb.0:
1999; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2000; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2001; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
2002; GFX11-NEXT:    v_mov_b32_e32 v4, 0
2003; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2004; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
2005; GFX11-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
2006; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[2:3]
2007; GFX11-NEXT:    s_endpgm
2008  %ld8 = load <8 x i32>, ptr addrspace(4) %in, align 16
2009  %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2010  store <4 x i32> %id, ptr addrspace(1) %out, align 8
2011  ret void
2012}
2013
; Builds a <2 x half> from lane 0 of the vector at %x0 and lane 0 of the vector
; at %x1 (two chained shuffles; mask index 2 addresses the second operand).
; The expected code is two dword loads combined by a single v_perm_b32 with
; selector 0x5040100 (held in an SGPR on gfx900/gfx940, an inline literal on
; gfx10/gfx11).
2014define <2 x half> @low16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
2015; GX900-LABEL: low16bits_v2f16:
2016; GX900:       ; %bb.0: ; %entry
2017; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2018; GX900-NEXT:    global_load_dword v4, v[0:1], off
2019; GX900-NEXT:    global_load_dword v5, v[2:3], off
2020; GX900-NEXT:    s_mov_b32 s4, 0x5040100
2021; GX900-NEXT:    s_waitcnt vmcnt(0)
2022; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
2023; GX900-NEXT:    s_setpc_b64 s[30:31]
2024;
2025; GFX940-LABEL: low16bits_v2f16:
2026; GFX940:       ; %bb.0: ; %entry
2027; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2028; GFX940-NEXT:    global_load_dword v4, v[0:1], off
2029; GFX940-NEXT:    global_load_dword v5, v[2:3], off
2030; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
2031; GFX940-NEXT:    s_waitcnt vmcnt(0)
2032; GFX940-NEXT:    v_perm_b32 v0, v5, v4, s0
2033; GFX940-NEXT:    s_setpc_b64 s[30:31]
2034;
2035; GFX10-LABEL: low16bits_v2f16:
2036; GFX10:       ; %bb.0: ; %entry
2037; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2038; GFX10-NEXT:    global_load_dword v4, v[0:1], off
2039; GFX10-NEXT:    global_load_dword v5, v[2:3], off
2040; GFX10-NEXT:    s_waitcnt vmcnt(0)
2041; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
2042; GFX10-NEXT:    s_setpc_b64 s[30:31]
2043;
2044; GFX11-LABEL: low16bits_v2f16:
2045; GFX11:       ; %bb.0: ; %entry
2046; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2047; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
2048; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
2049; GFX11-NEXT:    s_waitcnt vmcnt(0)
2050; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
2051; GFX11-NEXT:    s_setpc_b64 s[30:31]
2052entry:
2053  %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
2054  %1 = load <2 x half>, ptr addrspace(1) %x1, align 4
2055  %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 0, i32 undef>
2056  %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 2>
2057  ret <2 x half> %vy1.2.vec.insert
2058}
2059
; Builds a <2 x half> from lane 1 of the vector at %x0 and lane 1 of the vector
; at %x1: the first shuffle moves %0's lane 1 into lane 0, the second pairs it
; with lane 1 of %1 (mask index 3).
; The expected code is two dword loads combined by a single v_perm_b32 with
; selector 0x7060302.
2060define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
2061; GX900-LABEL: hi16bits_v2f16:
2062; GX900:       ; %bb.0: ; %entry
2063; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2064; GX900-NEXT:    global_load_dword v4, v[0:1], off
2065; GX900-NEXT:    global_load_dword v5, v[2:3], off
2066; GX900-NEXT:    s_mov_b32 s4, 0x7060302
2067; GX900-NEXT:    s_waitcnt vmcnt(0)
2068; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
2069; GX900-NEXT:    s_setpc_b64 s[30:31]
2070;
2071; GFX940-LABEL: hi16bits_v2f16:
2072; GFX940:       ; %bb.0: ; %entry
2073; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2074; GFX940-NEXT:    global_load_dword v4, v[0:1], off
2075; GFX940-NEXT:    global_load_dword v5, v[2:3], off
2076; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
2077; GFX940-NEXT:    s_waitcnt vmcnt(0)
2078; GFX940-NEXT:    v_perm_b32 v0, v5, v4, s0
2079; GFX940-NEXT:    s_setpc_b64 s[30:31]
2080;
2081; GFX10-LABEL: hi16bits_v2f16:
2082; GFX10:       ; %bb.0: ; %entry
2083; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2084; GFX10-NEXT:    global_load_dword v4, v[0:1], off
2085; GFX10-NEXT:    global_load_dword v5, v[2:3], off
2086; GFX10-NEXT:    s_waitcnt vmcnt(0)
2087; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
2088; GFX10-NEXT:    s_setpc_b64 s[30:31]
2089;
2090; GFX11-LABEL: hi16bits_v2f16:
2091; GFX11:       ; %bb.0: ; %entry
2092; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2093; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
2094; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
2095; GFX11-NEXT:    s_waitcnt vmcnt(0)
2096; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
2097; GFX11-NEXT:    s_setpc_b64 s[30:31]
2098entry:
2099  %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
2100  %1 = load <2 x half>, ptr addrspace(1) %x1, align 4
2101  %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 1, i32 undef>
2102  %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 3>
2103  ret <2 x half> %vy1.2.vec.insert
2104}
2105
; Builds a <2 x half> from lane 0 of the vector at %x0 and lane 1 of the vector
; at %x1, so each lane keeps its original position within the dword.
; The expected code is two dword loads merged with v_bfi_b32 using mask 0xffff
; rather than a v_perm_b32.
2106define <2 x half> @low16hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
2107; GX900-LABEL: low16hi16bits_v2f16:
2108; GX900:       ; %bb.0: ; %entry
2109; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2110; GX900-NEXT:    global_load_dword v4, v[0:1], off
2111; GX900-NEXT:    global_load_dword v5, v[2:3], off
2112; GX900-NEXT:    s_mov_b32 s4, 0xffff
2113; GX900-NEXT:    s_waitcnt vmcnt(0)
2114; GX900-NEXT:    v_bfi_b32 v0, s4, v4, v5
2115; GX900-NEXT:    s_setpc_b64 s[30:31]
2116;
2117; GFX940-LABEL: low16hi16bits_v2f16:
2118; GFX940:       ; %bb.0: ; %entry
2119; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2120; GFX940-NEXT:    global_load_dword v4, v[0:1], off
2121; GFX940-NEXT:    global_load_dword v5, v[2:3], off
2122; GFX940-NEXT:    s_mov_b32 s0, 0xffff
2123; GFX940-NEXT:    s_waitcnt vmcnt(0)
2124; GFX940-NEXT:    v_bfi_b32 v0, s0, v4, v5
2125; GFX940-NEXT:    s_setpc_b64 s[30:31]
2126;
2127; GFX10-LABEL: low16hi16bits_v2f16:
2128; GFX10:       ; %bb.0: ; %entry
2129; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2130; GFX10-NEXT:    global_load_dword v4, v[0:1], off
2131; GFX10-NEXT:    global_load_dword v5, v[2:3], off
2132; GFX10-NEXT:    s_waitcnt vmcnt(0)
2133; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v4, v5
2134; GFX10-NEXT:    s_setpc_b64 s[30:31]
2135;
2136; GFX11-LABEL: low16hi16bits_v2f16:
2137; GFX11:       ; %bb.0: ; %entry
2138; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2139; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
2140; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
2141; GFX11-NEXT:    s_waitcnt vmcnt(0)
2142; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
2143; GFX11-NEXT:    s_setpc_b64 s[30:31]
2144entry:
2145  %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
2146  %1 = load <2 x half>, ptr addrspace(1) %x1, align 4
2147  %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 0, i32 undef>
2148  %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 3>
2149  ret <2 x half> %vy1.2.vec.insert
2150}
2151
; Test: build <x0[1], x1[0]> from two loaded vectors.
; The first shufflevector moves element 1 of the %x0 load into lane 0; the
; second pairs it with element 0 of the %x1 load (mask index 2).
; All run lines expect one v_alignbit_b32 with a shift of 16.
; NOTE(review): the function name says "bf16" but the IR operates on
; <2 x half>, not <2 x bfloat> - confirm whether that is intentional.
2152define <2 x half> @hi16low16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
2153; GFX9-LABEL: hi16low16bits_v2bf16:
2154; GFX9:       ; %bb.0: ; %entry
2155; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2156; GFX9-NEXT:    global_load_dword v4, v[0:1], off
2157; GFX9-NEXT:    global_load_dword v5, v[2:3], off
2158; GFX9-NEXT:    s_waitcnt vmcnt(0)
2159; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
2160; GFX9-NEXT:    s_setpc_b64 s[30:31]
2161;
2162; GFX10-LABEL: hi16low16bits_v2bf16:
2163; GFX10:       ; %bb.0: ; %entry
2164; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2165; GFX10-NEXT:    global_load_dword v4, v[0:1], off
2166; GFX10-NEXT:    global_load_dword v5, v[2:3], off
2167; GFX10-NEXT:    s_waitcnt vmcnt(0)
2168; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
2169; GFX10-NEXT:    s_setpc_b64 s[30:31]
2170;
2171; GFX11-LABEL: hi16low16bits_v2bf16:
2172; GFX11:       ; %bb.0: ; %entry
2173; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2174; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
2175; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
2176; GFX11-NEXT:    s_waitcnt vmcnt(0)
2177; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
2178; GFX11-NEXT:    s_setpc_b64 s[30:31]
2179entry:
2180  %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
2181  %1 = load <2 x half>, ptr addrspace(1) %x1, align 4
2182  %vy1.0.vec.insert = shufflevector <2 x half> %0, <2 x half> poison, <2 x i32> <i32 1, i32 undef>
2183  %vy1.2.vec.insert = shufflevector <2 x half> %vy1.0.vec.insert, <2 x half> %1, <2 x i32> <i32 0, i32 2>
2184  ret <2 x half> %vy1.2.vec.insert
2185}
2186
; Test: build <x0[0], x1[0]> from two loaded <2 x i16> vectors.
; The first shufflevector keeps element 0 of the %x0 load; the second pairs
; it with element 0 of the %x1 load (mask index 2).
; The checks expect a single v_perm_b32 with selector 0x5040100 (held in an
; SGPR on the gfx900/gfx940 runs, a literal operand on gfx1010/gfx1100).
2187define <2 x i16> @i16_low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
2188; GX900-LABEL: i16_low16bits:
2189; GX900:       ; %bb.0: ; %entry
2190; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2191; GX900-NEXT:    global_load_dword v4, v[0:1], off
2192; GX900-NEXT:    global_load_dword v5, v[2:3], off
2193; GX900-NEXT:    s_mov_b32 s4, 0x5040100
2194; GX900-NEXT:    s_waitcnt vmcnt(0)
2195; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
2196; GX900-NEXT:    s_setpc_b64 s[30:31]
2197;
2198; GFX940-LABEL: i16_low16bits:
2199; GFX940:       ; %bb.0: ; %entry
2200; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2201; GFX940-NEXT:    global_load_dword v4, v[0:1], off
2202; GFX940-NEXT:    global_load_dword v5, v[2:3], off
2203; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
2204; GFX940-NEXT:    s_waitcnt vmcnt(0)
2205; GFX940-NEXT:    v_perm_b32 v0, v5, v4, s0
2206; GFX940-NEXT:    s_setpc_b64 s[30:31]
2207;
2208; GFX10-LABEL: i16_low16bits:
2209; GFX10:       ; %bb.0: ; %entry
2210; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2211; GFX10-NEXT:    global_load_dword v4, v[0:1], off
2212; GFX10-NEXT:    global_load_dword v5, v[2:3], off
2213; GFX10-NEXT:    s_waitcnt vmcnt(0)
2214; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
2215; GFX10-NEXT:    s_setpc_b64 s[30:31]
2216;
2217; GFX11-LABEL: i16_low16bits:
2218; GFX11:       ; %bb.0: ; %entry
2219; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2220; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
2221; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
2222; GFX11-NEXT:    s_waitcnt vmcnt(0)
2223; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
2224; GFX11-NEXT:    s_setpc_b64 s[30:31]
2225entry:
2226  %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
2227  %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
2228  %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 0, i32 undef>
2229  %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 2>
2230  ret <2 x i16> %vy1.2.vec.insert
2231}
2232
; Test: build <x0[0], x1[1]> from two loaded <2 x i16> vectors.
; The first shufflevector keeps element 0 of the %x0 load; the second pairs
; it with element 1 of the %x1 load (mask index 3).
; The checks expect a single v_bfi_b32 with a 0xffff mask (held in an SGPR
; on the gfx900/gfx940 runs, a literal operand on gfx1010/gfx1100).
2233define <2 x i16> @i16_low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
2234; GX900-LABEL: i16_low16hi16bits:
2235; GX900:       ; %bb.0: ; %entry
2236; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2237; GX900-NEXT:    global_load_dword v4, v[0:1], off
2238; GX900-NEXT:    global_load_dword v5, v[2:3], off
2239; GX900-NEXT:    s_mov_b32 s4, 0xffff
2240; GX900-NEXT:    s_waitcnt vmcnt(0)
2241; GX900-NEXT:    v_bfi_b32 v0, s4, v4, v5
2242; GX900-NEXT:    s_setpc_b64 s[30:31]
2243;
2244; GFX940-LABEL: i16_low16hi16bits:
2245; GFX940:       ; %bb.0: ; %entry
2246; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2247; GFX940-NEXT:    global_load_dword v4, v[0:1], off
2248; GFX940-NEXT:    global_load_dword v5, v[2:3], off
2249; GFX940-NEXT:    s_mov_b32 s0, 0xffff
2250; GFX940-NEXT:    s_waitcnt vmcnt(0)
2251; GFX940-NEXT:    v_bfi_b32 v0, s0, v4, v5
2252; GFX940-NEXT:    s_setpc_b64 s[30:31]
2253;
2254; GFX10-LABEL: i16_low16hi16bits:
2255; GFX10:       ; %bb.0: ; %entry
2256; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2257; GFX10-NEXT:    global_load_dword v4, v[0:1], off
2258; GFX10-NEXT:    global_load_dword v5, v[2:3], off
2259; GFX10-NEXT:    s_waitcnt vmcnt(0)
2260; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v4, v5
2261; GFX10-NEXT:    s_setpc_b64 s[30:31]
2262;
2263; GFX11-LABEL: i16_low16hi16bits:
2264; GFX11:       ; %bb.0: ; %entry
2265; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2266; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
2267; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
2268; GFX11-NEXT:    s_waitcnt vmcnt(0)
2269; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
2270; GFX11-NEXT:    s_setpc_b64 s[30:31]
2271entry:
2272  %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
2273  %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
2274  %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 0, i32 undef>
2275  %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 3>
2276  ret <2 x i16> %vy1.2.vec.insert
2277}
2278
; Test: build <x0[1], x1[0]> from two loaded <2 x i16> vectors.
; The first shufflevector moves element 1 of the %x0 load into lane 0; the
; second pairs it with element 0 of the %x1 load (mask index 2).
; All run lines expect one v_alignbit_b32 with a shift of 16.
2279define <2 x i16> @i16_hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
2280; GFX9-LABEL: i16_hi16low16bits:
2281; GFX9:       ; %bb.0: ; %entry
2282; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2283; GFX9-NEXT:    global_load_dword v4, v[0:1], off
2284; GFX9-NEXT:    global_load_dword v5, v[2:3], off
2285; GFX9-NEXT:    s_waitcnt vmcnt(0)
2286; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
2287; GFX9-NEXT:    s_setpc_b64 s[30:31]
2288;
2289; GFX10-LABEL: i16_hi16low16bits:
2290; GFX10:       ; %bb.0: ; %entry
2291; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2292; GFX10-NEXT:    global_load_dword v4, v[0:1], off
2293; GFX10-NEXT:    global_load_dword v5, v[2:3], off
2294; GFX10-NEXT:    s_waitcnt vmcnt(0)
2295; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
2296; GFX10-NEXT:    s_setpc_b64 s[30:31]
2297;
2298; GFX11-LABEL: i16_hi16low16bits:
2299; GFX11:       ; %bb.0: ; %entry
2300; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2301; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
2302; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
2303; GFX11-NEXT:    s_waitcnt vmcnt(0)
2304; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
2305; GFX11-NEXT:    s_setpc_b64 s[30:31]
2306entry:
2307  %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
2308  %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
2309  %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 1, i32 undef>
2310  %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 2>
2311  ret <2 x i16> %vy1.2.vec.insert
2312}
2313
; Test: build <x0[1], x1[1]> from two loaded <2 x i16> vectors.
; The first shufflevector moves element 1 of the %x0 load into lane 0; the
; second pairs it with element 1 of the %x1 load (mask index 3).
; The checks expect a single v_perm_b32 with selector 0x7060302 (held in an
; SGPR on the gfx900/gfx940 runs, a literal operand on gfx1010/gfx1100).
2314define <2 x i16> @i16_hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
2315; GX900-LABEL: i16_hi16bits:
2316; GX900:       ; %bb.0: ; %entry
2317; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2318; GX900-NEXT:    global_load_dword v4, v[0:1], off
2319; GX900-NEXT:    global_load_dword v5, v[2:3], off
2320; GX900-NEXT:    s_mov_b32 s4, 0x7060302
2321; GX900-NEXT:    s_waitcnt vmcnt(0)
2322; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
2323; GX900-NEXT:    s_setpc_b64 s[30:31]
2324;
2325; GFX940-LABEL: i16_hi16bits:
2326; GFX940:       ; %bb.0: ; %entry
2327; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2328; GFX940-NEXT:    global_load_dword v4, v[0:1], off
2329; GFX940-NEXT:    global_load_dword v5, v[2:3], off
2330; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
2331; GFX940-NEXT:    s_waitcnt vmcnt(0)
2332; GFX940-NEXT:    v_perm_b32 v0, v5, v4, s0
2333; GFX940-NEXT:    s_setpc_b64 s[30:31]
2334;
2335; GFX10-LABEL: i16_hi16bits:
2336; GFX10:       ; %bb.0: ; %entry
2337; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2338; GFX10-NEXT:    global_load_dword v4, v[0:1], off
2339; GFX10-NEXT:    global_load_dword v5, v[2:3], off
2340; GFX10-NEXT:    s_waitcnt vmcnt(0)
2341; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
2342; GFX10-NEXT:    s_setpc_b64 s[30:31]
2343;
2344; GFX11-LABEL: i16_hi16bits:
2345; GFX11:       ; %bb.0: ; %entry
2346; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2347; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
2348; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
2349; GFX11-NEXT:    s_waitcnt vmcnt(0)
2350; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
2351; GFX11-NEXT:    s_setpc_b64 s[30:31]
2352entry:
2353  %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
2354  %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
2355  %vy1.0.vec.insert = shufflevector <2 x i16> %0, <2 x i16> poison, <2 x i32> <i32 1, i32 undef>
2356  %vy1.2.vec.insert = shufflevector <2 x i16> %vy1.0.vec.insert, <2 x i16> %1, <2 x i32> <i32 0, i32 3>
2357  ret <2 x i16> %vy1.2.vec.insert
2358}
2359
; Test: build <0, x0[1]> from one loaded <2 x i16> vector.
; Two insertelement instructions create an all-zero <2 x i16>; the
; shufflevector takes lane 0 from it and element 1 of the load (mask
; index 3) for lane 1.
; All run lines expect the shuffle to become a v_and_b32 with 0xffff0000.
2360define <2 x i16> @v2i16_hi16bits(ptr addrspace(1) %x0) {
2361; GFX9-LABEL: v2i16_hi16bits:
2362; GFX9:       ; %bb.0: ; %entry
2363; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2364; GFX9-NEXT:    global_load_dword v0, v[0:1], off
2365; GFX9-NEXT:    s_waitcnt vmcnt(0)
2366; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
2367; GFX9-NEXT:    s_setpc_b64 s[30:31]
2368;
2369; GFX10-LABEL: v2i16_hi16bits:
2370; GFX10:       ; %bb.0: ; %entry
2371; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2372; GFX10-NEXT:    global_load_dword v0, v[0:1], off
2373; GFX10-NEXT:    s_waitcnt vmcnt(0)
2374; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
2375; GFX10-NEXT:    s_setpc_b64 s[30:31]
2376;
2377; GFX11-LABEL: v2i16_hi16bits:
2378; GFX11:       ; %bb.0: ; %entry
2379; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2380; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
2381; GFX11-NEXT:    s_waitcnt vmcnt(0)
2382; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
2383; GFX11-NEXT:    s_setpc_b64 s[30:31]
2384entry:
2385  %load0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
2386  %insert1 = insertelement <2 x i16> undef, i16 0, i32 0
2387  %insert2 = insertelement <2 x i16> %insert1, i16 0, i32 1
2388  %vec.ret = shufflevector <2 x i16> %insert2, <2 x i16> %load0, <2 x i32> <i32 0, i32 3>
2389  ret <2 x i16> %vec.ret
2390}
2391
; Test: build <0.0, x0[1]> from one loaded <2 x half> vector.
; Two insertelement instructions create a <2 x half> of 0.0 values; the
; shufflevector takes lane 0 from it and element 1 of the load (mask
; index 3) for lane 1.
; All run lines expect the shuffle to become a v_and_b32 with 0xffff0000.
2392define <2 x half> @v2half_hi16bits(ptr addrspace(1) %x0) {
2393; GFX9-LABEL: v2half_hi16bits:
2394; GFX9:       ; %bb.0: ; %entry
2395; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2396; GFX9-NEXT:    global_load_dword v0, v[0:1], off
2397; GFX9-NEXT:    s_waitcnt vmcnt(0)
2398; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
2399; GFX9-NEXT:    s_setpc_b64 s[30:31]
2400;
2401; GFX10-LABEL: v2half_hi16bits:
2402; GFX10:       ; %bb.0: ; %entry
2403; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2404; GFX10-NEXT:    global_load_dword v0, v[0:1], off
2405; GFX10-NEXT:    s_waitcnt vmcnt(0)
2406; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
2407; GFX10-NEXT:    s_setpc_b64 s[30:31]
2408;
2409; GFX11-LABEL: v2half_hi16bits:
2410; GFX11:       ; %bb.0: ; %entry
2411; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2412; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
2413; GFX11-NEXT:    s_waitcnt vmcnt(0)
2414; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
2415; GFX11-NEXT:    s_setpc_b64 s[30:31]
2416entry:
2417  %load0 = load <2 x half>, ptr addrspace(1) %x0, align 4
2418  %insert1 = insertelement <2 x half> undef, half 0.0, i32 0
2419  %insert2 = insertelement <2 x half> %insert1, half 0.0, i32 1
2420  %vec.ret = shufflevector <2 x half> %insert2, <2 x half> %load0, <2 x i32> <i32 0, i32 3>
2421  ret <2 x half> %vec.ret
2422}
2423
; Test: concatenate two loaded <4 x half> vectors into an <8 x half> with an
; identity shuffle mask (0..7) and store the result to %out.
; The checks expect two 64-bit loads and one 128-bit store with no ALU
; instruction in between. The gfx940 run expects sc0 sc1 on the store, and
; the gfx900/gfx940 runs expect a trailing s_waitcnt vmcnt(0) after it.
2424define void @shuffle_v8f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2425; GX900-LABEL: shuffle_v8f16_concat:
2426; GX900:       ; %bb.0:
2427; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2428; GX900-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
2429; GX900-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
2430; GX900-NEXT:    s_waitcnt vmcnt(0)
2431; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
2432; GX900-NEXT:    s_waitcnt vmcnt(0)
2433; GX900-NEXT:    s_setpc_b64 s[30:31]
2434;
2435; GFX940-LABEL: shuffle_v8f16_concat:
2436; GFX940:       ; %bb.0:
2437; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2438; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
2439; GFX940-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
2440; GFX940-NEXT:    s_waitcnt vmcnt(0)
2441; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
2442; GFX940-NEXT:    s_waitcnt vmcnt(0)
2443; GFX940-NEXT:    s_setpc_b64 s[30:31]
2444;
2445; GFX10-LABEL: shuffle_v8f16_concat:
2446; GFX10:       ; %bb.0:
2447; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2448; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
2449; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
2450; GFX10-NEXT:    s_waitcnt vmcnt(0)
2451; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
2452; GFX10-NEXT:    s_setpc_b64 s[30:31]
2453;
2454; GFX11-LABEL: shuffle_v8f16_concat:
2455; GFX11:       ; %bb.0:
2456; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2457; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
2458; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
2459; GFX11-NEXT:    s_waitcnt vmcnt(0)
2460; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
2461; GFX11-NEXT:    s_setpc_b64 s[30:31]
2462  %val0 = load <4 x half>, ptr addrspace(1) %arg0
2463  %val1 = load <4 x half>, ptr addrspace(1) %arg1
2464  %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2465  store <8 x half> %shuffle, ptr addrspace(1) %out
2466  ret void
2467}
2468
; Test: concatenate two loaded <8 x half> vectors into a <16 x half> with an
; identity shuffle mask (0..15) and store the result to %out.
; The checks expect two 128-bit loads followed by two 128-bit stores (at
; offsets 16 and 0) with no ALU instruction in between. The gfx940 run
; expects sc0 sc1 on each store.
2469define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2470; GX900-LABEL: shuffle_v16f16_concat:
2471; GX900:       ; %bb.0:
2472; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2473; GX900-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
2474; GX900-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
2475; GX900-NEXT:    s_waitcnt vmcnt(1)
2476; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
2477; GX900-NEXT:    s_waitcnt vmcnt(1)
2478; GX900-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
2479; GX900-NEXT:    s_waitcnt vmcnt(0)
2480; GX900-NEXT:    s_setpc_b64 s[30:31]
2481;
2482; GFX940-LABEL: shuffle_v16f16_concat:
2483; GFX940:       ; %bb.0:
2484; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2485; GFX940-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
2486; GFX940-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
2487; GFX940-NEXT:    s_waitcnt vmcnt(1)
2488; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16 sc0 sc1
2489; GFX940-NEXT:    s_waitcnt vmcnt(1)
2490; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off sc0 sc1
2491; GFX940-NEXT:    s_waitcnt vmcnt(0)
2492; GFX940-NEXT:    s_setpc_b64 s[30:31]
2493;
2494; GFX10-LABEL: shuffle_v16f16_concat:
2495; GFX10:       ; %bb.0:
2496; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2497; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
2498; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
2499; GFX10-NEXT:    s_waitcnt vmcnt(1)
2500; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
2501; GFX10-NEXT:    s_waitcnt vmcnt(0)
2502; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
2503; GFX10-NEXT:    s_setpc_b64 s[30:31]
2504;
2505; GFX11-LABEL: shuffle_v16f16_concat:
2506; GFX11:       ; %bb.0:
2507; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2508; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
2509; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
2510; GFX11-NEXT:    s_waitcnt vmcnt(1)
2511; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:16
2512; GFX11-NEXT:    s_waitcnt vmcnt(0)
2513; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
2514; GFX11-NEXT:    s_setpc_b64 s[30:31]
2515  %val0 = load <8 x half>, ptr addrspace(1) %arg0
2516  %val1 = load <8 x half>, ptr addrspace(1) %arg1
2517  %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2518  store <16 x half> %shuffle, ptr addrspace(1) %out
2519  ret void
2520}
2521
; Test: concatenate two loaded <16 x half> vectors into a <32 x half> with
; an identity shuffle mask (0..31) and store the result to %out.
; The checks expect four 128-bit loads followed by four 128-bit stores (at
; offsets 32, 48, 0 and 16) with no ALU instruction in between. The
; gfx1010/gfx1100 runs expect the loads grouped in pairs under s_clause 0x1;
; the gfx940 run expects sc0 sc1 on each store.
2522define void @shuffle_v32f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2523; GX900-LABEL: shuffle_v32f16_concat:
2524; GX900:       ; %bb.0:
2525; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2526; GX900-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
2527; GX900-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
2528; GX900-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
2529; GX900-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
2530; GX900-NEXT:    s_waitcnt vmcnt(3)
2531; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32
2532; GX900-NEXT:    s_waitcnt vmcnt(3)
2533; GX900-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48
2534; GX900-NEXT:    s_waitcnt vmcnt(3)
2535; GX900-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
2536; GX900-NEXT:    s_waitcnt vmcnt(3)
2537; GX900-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
2538; GX900-NEXT:    s_waitcnt vmcnt(0)
2539; GX900-NEXT:    s_setpc_b64 s[30:31]
2540;
2541; GFX940-LABEL: shuffle_v32f16_concat:
2542; GFX940:       ; %bb.0:
2543; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2544; GFX940-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
2545; GFX940-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
2546; GFX940-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
2547; GFX940-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
2548; GFX940-NEXT:    s_waitcnt vmcnt(3)
2549; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32 sc0 sc1
2550; GFX940-NEXT:    s_waitcnt vmcnt(3)
2551; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48 sc0 sc1
2552; GFX940-NEXT:    s_waitcnt vmcnt(3)
2553; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off sc0 sc1
2554; GFX940-NEXT:    s_waitcnt vmcnt(3)
2555; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16 sc0 sc1
2556; GFX940-NEXT:    s_waitcnt vmcnt(0)
2557; GFX940-NEXT:    s_setpc_b64 s[30:31]
2558;
2559; GFX10-LABEL: shuffle_v32f16_concat:
2560; GFX10:       ; %bb.0:
2561; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2562; GFX10-NEXT:    s_clause 0x1
2563; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
2564; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
2565; GFX10-NEXT:    s_clause 0x1
2566; GFX10-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
2567; GFX10-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
2568; GFX10-NEXT:    s_waitcnt vmcnt(3)
2569; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32
2570; GFX10-NEXT:    s_waitcnt vmcnt(2)
2571; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48
2572; GFX10-NEXT:    s_waitcnt vmcnt(1)
2573; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
2574; GFX10-NEXT:    s_waitcnt vmcnt(0)
2575; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
2576; GFX10-NEXT:    s_setpc_b64 s[30:31]
2577;
2578; GFX11-LABEL: shuffle_v32f16_concat:
2579; GFX11:       ; %bb.0:
2580; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2581; GFX11-NEXT:    s_clause 0x1
2582; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
2583; GFX11-NEXT:    global_load_b128 v[10:13], v[2:3], off offset:16
2584; GFX11-NEXT:    s_clause 0x1
2585; GFX11-NEXT:    global_load_b128 v[14:17], v[0:1], off
2586; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:16
2587; GFX11-NEXT:    s_waitcnt vmcnt(3)
2588; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:32
2589; GFX11-NEXT:    s_waitcnt vmcnt(2)
2590; GFX11-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:48
2591; GFX11-NEXT:    s_waitcnt vmcnt(1)
2592; GFX11-NEXT:    global_store_b128 v[4:5], v[14:17], off
2593; GFX11-NEXT:    s_waitcnt vmcnt(0)
2594; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off offset:16
2595; GFX11-NEXT:    s_setpc_b64 s[30:31]
2596  %val0 = load <16 x half>, ptr addrspace(1) %arg0
2597  %val1 = load <16 x half>, ptr addrspace(1) %arg1
2598  %shuffle = shufflevector <16 x half> %val0, <16 x half> %val1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2599  store <32 x half> %shuffle, ptr addrspace(1) %out
2600  ret void
2601}
2602
; Test: concatenate two loaded <4 x i16> vectors into an <8 x i16> with an
; identity shuffle mask (0..7) and store the result to %out.
; The checks expect two 64-bit loads and one 128-bit store with no ALU
; instruction in between. The gfx940 run expects sc0 sc1 on the store, and
; the gfx900/gfx940 runs expect a trailing s_waitcnt vmcnt(0) after it.
2603define void @shuffle_v8i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2604; GX900-LABEL: shuffle_v8i16_concat:
2605; GX900:       ; %bb.0:
2606; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2607; GX900-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
2608; GX900-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
2609; GX900-NEXT:    s_waitcnt vmcnt(0)
2610; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
2611; GX900-NEXT:    s_waitcnt vmcnt(0)
2612; GX900-NEXT:    s_setpc_b64 s[30:31]
2613;
2614; GFX940-LABEL: shuffle_v8i16_concat:
2615; GFX940:       ; %bb.0:
2616; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2617; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
2618; GFX940-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
2619; GFX940-NEXT:    s_waitcnt vmcnt(0)
2620; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
2621; GFX940-NEXT:    s_waitcnt vmcnt(0)
2622; GFX940-NEXT:    s_setpc_b64 s[30:31]
2623;
2624; GFX10-LABEL: shuffle_v8i16_concat:
2625; GFX10:       ; %bb.0:
2626; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2627; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
2628; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
2629; GFX10-NEXT:    s_waitcnt vmcnt(0)
2630; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
2631; GFX10-NEXT:    s_setpc_b64 s[30:31]
2632;
2633; GFX11-LABEL: shuffle_v8i16_concat:
2634; GFX11:       ; %bb.0:
2635; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2636; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
2637; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
2638; GFX11-NEXT:    s_waitcnt vmcnt(0)
2639; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
2640; GFX11-NEXT:    s_setpc_b64 s[30:31]
2641  %val0 = load <4 x i16>, ptr addrspace(1) %arg0
2642  %val1 = load <4 x i16>, ptr addrspace(1) %arg1
2643  %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2644  store <8 x i16> %shuffle, ptr addrspace(1) %out
2645  ret void
2646}
2647
; Test: concatenate two loaded <8 x i16> vectors into a <16 x i16> with an
; identity shuffle mask (0..15) and store the result to %out.
; The checks expect two 128-bit loads followed by two 128-bit stores (at
; offsets 16 and 0) with no ALU instruction in between. The gfx940 run
; expects sc0 sc1 on each store.
2648define void @shuffle_v16i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2649; GX900-LABEL: shuffle_v16i16_concat:
2650; GX900:       ; %bb.0:
2651; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2652; GX900-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
2653; GX900-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
2654; GX900-NEXT:    s_waitcnt vmcnt(1)
2655; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
2656; GX900-NEXT:    s_waitcnt vmcnt(1)
2657; GX900-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
2658; GX900-NEXT:    s_waitcnt vmcnt(0)
2659; GX900-NEXT:    s_setpc_b64 s[30:31]
2660;
2661; GFX940-LABEL: shuffle_v16i16_concat:
2662; GFX940:       ; %bb.0:
2663; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2664; GFX940-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
2665; GFX940-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
2666; GFX940-NEXT:    s_waitcnt vmcnt(1)
2667; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16 sc0 sc1
2668; GFX940-NEXT:    s_waitcnt vmcnt(1)
2669; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off sc0 sc1
2670; GFX940-NEXT:    s_waitcnt vmcnt(0)
2671; GFX940-NEXT:    s_setpc_b64 s[30:31]
2672;
2673; GFX10-LABEL: shuffle_v16i16_concat:
2674; GFX10:       ; %bb.0:
2675; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2676; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
2677; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
2678; GFX10-NEXT:    s_waitcnt vmcnt(1)
2679; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
2680; GFX10-NEXT:    s_waitcnt vmcnt(0)
2681; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
2682; GFX10-NEXT:    s_setpc_b64 s[30:31]
2683;
2684; GFX11-LABEL: shuffle_v16i16_concat:
2685; GFX11:       ; %bb.0:
2686; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2687; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
2688; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
2689; GFX11-NEXT:    s_waitcnt vmcnt(1)
2690; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:16
2691; GFX11-NEXT:    s_waitcnt vmcnt(0)
2692; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
2693; GFX11-NEXT:    s_setpc_b64 s[30:31]
2694  %val0 = load <8 x i16>, ptr addrspace(1) %arg0
2695  %val1 = load <8 x i16>, ptr addrspace(1) %arg1
2696  %shuffle = shufflevector <8 x i16> %val0, <8 x i16> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2697  store <16 x i16> %shuffle, ptr addrspace(1) %out
2698  ret void
2699}
2700
; Test: concatenate two loaded <16 x i16> vectors into a <32 x i16> with an
; identity shuffle mask (0..31) and store the result to %out.
; The checks expect four 128-bit loads followed by four 128-bit stores (at
; offsets 32, 48, 0 and 16) with no ALU instruction in between. The
; gfx1010/gfx1100 runs expect the loads grouped in pairs under s_clause 0x1;
; the gfx940 run expects sc0 sc1 on each store.
2701define void @shuffle_v32i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2702; GX900-LABEL: shuffle_v32i16_concat:
2703; GX900:       ; %bb.0:
2704; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2705; GX900-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
2706; GX900-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
2707; GX900-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
2708; GX900-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
2709; GX900-NEXT:    s_waitcnt vmcnt(3)
2710; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32
2711; GX900-NEXT:    s_waitcnt vmcnt(3)
2712; GX900-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48
2713; GX900-NEXT:    s_waitcnt vmcnt(3)
2714; GX900-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
2715; GX900-NEXT:    s_waitcnt vmcnt(3)
2716; GX900-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
2717; GX900-NEXT:    s_waitcnt vmcnt(0)
2718; GX900-NEXT:    s_setpc_b64 s[30:31]
2719;
2720; GFX940-LABEL: shuffle_v32i16_concat:
2721; GFX940:       ; %bb.0:
2722; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2723; GFX940-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
2724; GFX940-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
2725; GFX940-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
2726; GFX940-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
2727; GFX940-NEXT:    s_waitcnt vmcnt(3)
2728; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32 sc0 sc1
2729; GFX940-NEXT:    s_waitcnt vmcnt(3)
2730; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48 sc0 sc1
2731; GFX940-NEXT:    s_waitcnt vmcnt(3)
2732; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off sc0 sc1
2733; GFX940-NEXT:    s_waitcnt vmcnt(3)
2734; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16 sc0 sc1
2735; GFX940-NEXT:    s_waitcnt vmcnt(0)
2736; GFX940-NEXT:    s_setpc_b64 s[30:31]
2737;
2738; GFX10-LABEL: shuffle_v32i16_concat:
2739; GFX10:       ; %bb.0:
2740; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2741; GFX10-NEXT:    s_clause 0x1
2742; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
2743; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
2744; GFX10-NEXT:    s_clause 0x1
2745; GFX10-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
2746; GFX10-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
2747; GFX10-NEXT:    s_waitcnt vmcnt(3)
2748; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32
2749; GFX10-NEXT:    s_waitcnt vmcnt(2)
2750; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48
2751; GFX10-NEXT:    s_waitcnt vmcnt(1)
2752; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
2753; GFX10-NEXT:    s_waitcnt vmcnt(0)
2754; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
2755; GFX10-NEXT:    s_setpc_b64 s[30:31]
2756;
2757; GFX11-LABEL: shuffle_v32i16_concat:
2758; GFX11:       ; %bb.0:
2759; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2760; GFX11-NEXT:    s_clause 0x1
2761; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
2762; GFX11-NEXT:    global_load_b128 v[10:13], v[2:3], off offset:16
2763; GFX11-NEXT:    s_clause 0x1
2764; GFX11-NEXT:    global_load_b128 v[14:17], v[0:1], off
2765; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:16
2766; GFX11-NEXT:    s_waitcnt vmcnt(3)
2767; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:32
2768; GFX11-NEXT:    s_waitcnt vmcnt(2)
2769; GFX11-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:48
2770; GFX11-NEXT:    s_waitcnt vmcnt(1)
2771; GFX11-NEXT:    global_store_b128 v[4:5], v[14:17], off
2772; GFX11-NEXT:    s_waitcnt vmcnt(0)
2773; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off offset:16
2774; GFX11-NEXT:    s_setpc_b64 s[30:31]
2775  %val0 = load <16 x i16>, ptr addrspace(1) %arg0
2776  %val1 = load <16 x i16>, ptr addrspace(1) %arg1
2777  %shuffle = shufflevector <16 x i16> %val0, <16 x i16> %val1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2778  store <32 x i16> %shuffle, ptr addrspace(1) %out
2779  ret void
2780}
2781
; Test: concatenate two loaded <2 x i8> vectors into a <4 x i8> with an
; identity shuffle mask (0..3) and store the result to %out.
; The gfx900, gfx1010 and gfx1100 runs expect a 16-bit load followed by a
; d16_hi 16-bit load into the same register, so no ALU merge is needed
; (gfx900 additionally expects an s_nop 0 between the two loads).
; The gfx940 run expects two ushort loads merged by a v_perm_b32 with
; selector 0x5040100, and sc0 sc1 on the store.
2782define void @shuffle_v4i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2783; GX900-LABEL: shuffle_v4i8_concat:
2784; GX900:       ; %bb.0:
2785; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2786; GX900-NEXT:    global_load_ushort v0, v[0:1], off
2787; GX900-NEXT:    s_nop 0
2788; GX900-NEXT:    global_load_short_d16_hi v0, v[2:3], off
2789; GX900-NEXT:    s_waitcnt vmcnt(0)
2790; GX900-NEXT:    global_store_dword v[4:5], v0, off
2791; GX900-NEXT:    s_waitcnt vmcnt(0)
2792; GX900-NEXT:    s_setpc_b64 s[30:31]
2793;
2794; GFX940-LABEL: shuffle_v4i8_concat:
2795; GFX940:       ; %bb.0:
2796; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2797; GFX940-NEXT:    global_load_ushort v6, v[0:1], off
2798; GFX940-NEXT:    global_load_ushort v7, v[2:3], off
2799; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
2800; GFX940-NEXT:    s_waitcnt vmcnt(0)
2801; GFX940-NEXT:    v_perm_b32 v0, v7, v6, s0
2802; GFX940-NEXT:    global_store_dword v[4:5], v0, off sc0 sc1
2803; GFX940-NEXT:    s_waitcnt vmcnt(0)
2804; GFX940-NEXT:    s_setpc_b64 s[30:31]
2805;
2806; GFX10-LABEL: shuffle_v4i8_concat:
2807; GFX10:       ; %bb.0:
2808; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2809; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
2810; GFX10-NEXT:    global_load_short_d16_hi v0, v[2:3], off
2811; GFX10-NEXT:    s_waitcnt vmcnt(0)
2812; GFX10-NEXT:    global_store_dword v[4:5], v0, off
2813; GFX10-NEXT:    s_setpc_b64 s[30:31]
2814;
2815; GFX11-LABEL: shuffle_v4i8_concat:
2816; GFX11:       ; %bb.0:
2817; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2818; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
2819; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[2:3], off
2820; GFX11-NEXT:    s_waitcnt vmcnt(0)
2821; GFX11-NEXT:    global_store_b32 v[4:5], v0, off
2822; GFX11-NEXT:    s_setpc_b64 s[30:31]
2823  %val0 = load <2 x i8>, ptr addrspace(1) %arg0
2824  %val1 = load <2 x i8>, ptr addrspace(1) %arg1
2825  %shuffle = shufflevector <2 x i8> %val0, <2 x i8> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2826  store <4 x i8> %shuffle, ptr addrspace(1) %out
2827  ret void
2828}
2829
; Concatenation test: loads a <4 x i8> from each of %arg0 and %arg1, joins
; them with the identity shuffle mask <0..7> (lanes 0-3 from %val0, lanes 4-7
; from %val1) and stores the resulting <8 x i8> to %out.
; The autogenerated checks below currently expect two dword loads into
; adjacent registers followed by a single 64-bit store, with no permute.
2830define void @shuffle_v8i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2831; GX900-LABEL: shuffle_v8i8_concat:
2832; GX900:       ; %bb.0:
2833; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2834; GX900-NEXT:    global_load_dword v6, v[0:1], off
2835; GX900-NEXT:    global_load_dword v7, v[2:3], off
2836; GX900-NEXT:    s_waitcnt vmcnt(0)
2837; GX900-NEXT:    global_store_dwordx2 v[4:5], v[6:7], off
2838; GX900-NEXT:    s_waitcnt vmcnt(0)
2839; GX900-NEXT:    s_setpc_b64 s[30:31]
2840;
2841; GFX940-LABEL: shuffle_v8i8_concat:
2842; GFX940:       ; %bb.0:
2843; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2844; GFX940-NEXT:    global_load_dword v6, v[0:1], off
2845; GFX940-NEXT:    global_load_dword v7, v[2:3], off
2846; GFX940-NEXT:    s_waitcnt vmcnt(0)
2847; GFX940-NEXT:    global_store_dwordx2 v[4:5], v[6:7], off sc0 sc1
2848; GFX940-NEXT:    s_waitcnt vmcnt(0)
2849; GFX940-NEXT:    s_setpc_b64 s[30:31]
2850;
2851; GFX10-LABEL: shuffle_v8i8_concat:
2852; GFX10:       ; %bb.0:
2853; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2854; GFX10-NEXT:    global_load_dword v6, v[0:1], off
2855; GFX10-NEXT:    global_load_dword v7, v[2:3], off
2856; GFX10-NEXT:    s_waitcnt vmcnt(0)
2857; GFX10-NEXT:    global_store_dwordx2 v[4:5], v[6:7], off
2858; GFX10-NEXT:    s_setpc_b64 s[30:31]
2859;
2860; GFX11-LABEL: shuffle_v8i8_concat:
2861; GFX11:       ; %bb.0:
2862; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2863; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
2864; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
2865; GFX11-NEXT:    s_waitcnt vmcnt(0)
2866; GFX11-NEXT:    global_store_b64 v[4:5], v[0:1], off
2867; GFX11-NEXT:    s_setpc_b64 s[30:31]
2868  %val0 = load <4 x i8>, ptr addrspace(1) %arg0
2869  %val1 = load <4 x i8>, ptr addrspace(1) %arg1
2870  %shuffle = shufflevector <4 x i8> %val0, <4 x i8> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2871  store <8 x i8> %shuffle, ptr addrspace(1) %out
2872  ret void
2873}
2874
; Concatenation test: loads an <8 x i8> from each of %arg0 and %arg1, joins
; them with the identity shuffle mask <0..15> (lanes 0-7 from %val0, lanes
; 8-15 from %val1) and stores the resulting <16 x i8> to %out.
; The autogenerated checks below currently expect two 64-bit loads into
; adjacent register pairs followed by a single 128-bit store.
2875define void @shuffle_v16i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2876; GX900-LABEL: shuffle_v16i8_concat:
2877; GX900:       ; %bb.0:
2878; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2879; GX900-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
2880; GX900-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
2881; GX900-NEXT:    s_waitcnt vmcnt(0)
2882; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
2883; GX900-NEXT:    s_waitcnt vmcnt(0)
2884; GX900-NEXT:    s_setpc_b64 s[30:31]
2885;
2886; GFX940-LABEL: shuffle_v16i8_concat:
2887; GFX940:       ; %bb.0:
2888; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2889; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
2890; GFX940-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
2891; GFX940-NEXT:    s_waitcnt vmcnt(0)
2892; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
2893; GFX940-NEXT:    s_waitcnt vmcnt(0)
2894; GFX940-NEXT:    s_setpc_b64 s[30:31]
2895;
2896; GFX10-LABEL: shuffle_v16i8_concat:
2897; GFX10:       ; %bb.0:
2898; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2899; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
2900; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
2901; GFX10-NEXT:    s_waitcnt vmcnt(0)
2902; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
2903; GFX10-NEXT:    s_setpc_b64 s[30:31]
2904;
2905; GFX11-LABEL: shuffle_v16i8_concat:
2906; GFX11:       ; %bb.0:
2907; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2908; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
2909; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
2910; GFX11-NEXT:    s_waitcnt vmcnt(0)
2911; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
2912; GFX11-NEXT:    s_setpc_b64 s[30:31]
2913  %val0 = load <8 x i8>, ptr addrspace(1) %arg0
2914  %val1 = load <8 x i8>, ptr addrspace(1) %arg1
2915  %shuffle = shufflevector <8 x i8> %val0, <8 x i8> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2916  store <16 x i8> %shuffle, ptr addrspace(1) %out
2917  ret void
2918}
2919
; Concatenation test: loads a <16 x i8> from each of %arg0 and %arg1, joins
; them with the identity shuffle mask <0..31> (lanes 0-15 from %val0, lanes
; 16-31 from %val1) and stores the resulting <32 x i8> to %out.
; The autogenerated checks below currently expect one 128-bit load per input
; and two 128-bit stores: %val1's data at offset 16, %val0's data at offset 0.
2920define void @shuffle_v32i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2921; GX900-LABEL: shuffle_v32i8_concat:
2922; GX900:       ; %bb.0:
2923; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2924; GX900-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
2925; GX900-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
2926; GX900-NEXT:    s_waitcnt vmcnt(1)
2927; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
2928; GX900-NEXT:    s_waitcnt vmcnt(1)
2929; GX900-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
2930; GX900-NEXT:    s_waitcnt vmcnt(0)
2931; GX900-NEXT:    s_setpc_b64 s[30:31]
2932;
2933; GFX940-LABEL: shuffle_v32i8_concat:
2934; GFX940:       ; %bb.0:
2935; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2936; GFX940-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
2937; GFX940-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
2938; GFX940-NEXT:    s_waitcnt vmcnt(1)
2939; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16 sc0 sc1
2940; GFX940-NEXT:    s_waitcnt vmcnt(1)
2941; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off sc0 sc1
2942; GFX940-NEXT:    s_waitcnt vmcnt(0)
2943; GFX940-NEXT:    s_setpc_b64 s[30:31]
2944;
2945; GFX10-LABEL: shuffle_v32i8_concat:
2946; GFX10:       ; %bb.0:
2947; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2948; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
2949; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
2950; GFX10-NEXT:    s_waitcnt vmcnt(1)
2951; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
2952; GFX10-NEXT:    s_waitcnt vmcnt(0)
2953; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
2954; GFX10-NEXT:    s_setpc_b64 s[30:31]
2955;
2956; GFX11-LABEL: shuffle_v32i8_concat:
2957; GFX11:       ; %bb.0:
2958; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2959; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
2960; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
2961; GFX11-NEXT:    s_waitcnt vmcnt(1)
2962; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:16
2963; GFX11-NEXT:    s_waitcnt vmcnt(0)
2964; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
2965; GFX11-NEXT:    s_setpc_b64 s[30:31]
2966  %val0 = load <16 x i8>, ptr addrspace(1) %arg0
2967  %val1 = load <16 x i8>, ptr addrspace(1) %arg1
2968  %shuffle = shufflevector <16 x i8> %val0, <16 x i8> %val1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2969  store <32 x i8> %shuffle, ptr addrspace(1) %out
2970  ret void
2971}
2972
; Concatenation test: loads a <2 x i32> from each of %arg0 and %arg1, joins
; them with the identity shuffle mask <0, 1, 2, 3> (lanes 0-1 from %val0,
; lanes 2-3 from %val1) and stores the resulting <4 x i32> to %out.
; The autogenerated checks below currently expect two 64-bit loads into
; adjacent register pairs followed by a single 128-bit store.
2973define void @shuffle_v4i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
2974; GX900-LABEL: shuffle_v4i32_concat:
2975; GX900:       ; %bb.0:
2976; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2977; GX900-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
2978; GX900-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
2979; GX900-NEXT:    s_waitcnt vmcnt(0)
2980; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
2981; GX900-NEXT:    s_waitcnt vmcnt(0)
2982; GX900-NEXT:    s_setpc_b64 s[30:31]
2983;
2984; GFX940-LABEL: shuffle_v4i32_concat:
2985; GFX940:       ; %bb.0:
2986; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2987; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
2988; GFX940-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
2989; GFX940-NEXT:    s_waitcnt vmcnt(0)
2990; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
2991; GFX940-NEXT:    s_waitcnt vmcnt(0)
2992; GFX940-NEXT:    s_setpc_b64 s[30:31]
2993;
2994; GFX10-LABEL: shuffle_v4i32_concat:
2995; GFX10:       ; %bb.0:
2996; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2997; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
2998; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
2999; GFX10-NEXT:    s_waitcnt vmcnt(0)
3000; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
3001; GFX10-NEXT:    s_setpc_b64 s[30:31]
3002;
3003; GFX11-LABEL: shuffle_v4i32_concat:
3004; GFX11:       ; %bb.0:
3005; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3006; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
3007; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
3008; GFX11-NEXT:    s_waitcnt vmcnt(0)
3009; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
3010; GFX11-NEXT:    s_setpc_b64 s[30:31]
3011  %val0 = load <2 x i32>, ptr addrspace(1) %arg0
3012  %val1 = load <2 x i32>, ptr addrspace(1) %arg1
3013  %shuffle = shufflevector <2 x i32> %val0, <2 x i32> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3014  store <4 x i32> %shuffle, ptr addrspace(1) %out
3015  ret void
3016}
3017
; Concatenation test: loads a <4 x i32> from each of %arg0 and %arg1, joins
; them with the identity shuffle mask <0..7> (lanes 0-3 from %val0, lanes 4-7
; from %val1) and stores the resulting <8 x i32> to %out.
; The autogenerated checks below currently expect one 128-bit load per input
; and two 128-bit stores: %val1's data at offset 16, %val0's data at offset 0.
3018define void @shuffle_v8i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
3019; GX900-LABEL: shuffle_v8i32_concat:
3020; GX900:       ; %bb.0:
3021; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3022; GX900-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
3023; GX900-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
3024; GX900-NEXT:    s_waitcnt vmcnt(1)
3025; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
3026; GX900-NEXT:    s_waitcnt vmcnt(1)
3027; GX900-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
3028; GX900-NEXT:    s_waitcnt vmcnt(0)
3029; GX900-NEXT:    s_setpc_b64 s[30:31]
3030;
3031; GFX940-LABEL: shuffle_v8i32_concat:
3032; GFX940:       ; %bb.0:
3033; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3034; GFX940-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
3035; GFX940-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
3036; GFX940-NEXT:    s_waitcnt vmcnt(1)
3037; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16 sc0 sc1
3038; GFX940-NEXT:    s_waitcnt vmcnt(1)
3039; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off sc0 sc1
3040; GFX940-NEXT:    s_waitcnt vmcnt(0)
3041; GFX940-NEXT:    s_setpc_b64 s[30:31]
3042;
3043; GFX10-LABEL: shuffle_v8i32_concat:
3044; GFX10:       ; %bb.0:
3045; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3046; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
3047; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
3048; GFX10-NEXT:    s_waitcnt vmcnt(1)
3049; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
3050; GFX10-NEXT:    s_waitcnt vmcnt(0)
3051; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
3052; GFX10-NEXT:    s_setpc_b64 s[30:31]
3053;
3054; GFX11-LABEL: shuffle_v8i32_concat:
3055; GFX11:       ; %bb.0:
3056; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3057; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
3058; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
3059; GFX11-NEXT:    s_waitcnt vmcnt(1)
3060; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:16
3061; GFX11-NEXT:    s_waitcnt vmcnt(0)
3062; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
3063; GFX11-NEXT:    s_setpc_b64 s[30:31]
3064  %val0 = load <4 x i32>, ptr addrspace(1) %arg0
3065  %val1 = load <4 x i32>, ptr addrspace(1) %arg1
3066  %shuffle = shufflevector <4 x i32> %val0, <4 x i32> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3067  store <8 x i32> %shuffle, ptr addrspace(1) %out
3068  ret void
3069}
3070
; Concatenation test: loads an <8 x i32> from each of %arg0 and %arg1, joins
; them with the identity shuffle mask <0..15> (lanes 0-7 from %val0, lanes
; 8-15 from %val1) and stores the resulting <16 x i32> to %out.
; The autogenerated checks below currently expect two 128-bit loads per input
; and four 128-bit stores: %val1's halves at offsets 32 and 48, %val0's halves
; at offsets 0 and 16.
3071define void @shuffle_v16i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
3072; GX900-LABEL: shuffle_v16i32_concat:
3073; GX900:       ; %bb.0:
3074; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3075; GX900-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
3076; GX900-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
3077; GX900-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
3078; GX900-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
3079; GX900-NEXT:    s_waitcnt vmcnt(3)
3080; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32
3081; GX900-NEXT:    s_waitcnt vmcnt(3)
3082; GX900-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48
3083; GX900-NEXT:    s_waitcnt vmcnt(3)
3084; GX900-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
3085; GX900-NEXT:    s_waitcnt vmcnt(3)
3086; GX900-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
3087; GX900-NEXT:    s_waitcnt vmcnt(0)
3088; GX900-NEXT:    s_setpc_b64 s[30:31]
3089;
3090; GFX940-LABEL: shuffle_v16i32_concat:
3091; GFX940:       ; %bb.0:
3092; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3093; GFX940-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
3094; GFX940-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
3095; GFX940-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
3096; GFX940-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
3097; GFX940-NEXT:    s_waitcnt vmcnt(3)
3098; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32 sc0 sc1
3099; GFX940-NEXT:    s_waitcnt vmcnt(3)
3100; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48 sc0 sc1
3101; GFX940-NEXT:    s_waitcnt vmcnt(3)
3102; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off sc0 sc1
3103; GFX940-NEXT:    s_waitcnt vmcnt(3)
3104; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16 sc0 sc1
3105; GFX940-NEXT:    s_waitcnt vmcnt(0)
3106; GFX940-NEXT:    s_setpc_b64 s[30:31]
3107;
3108; GFX10-LABEL: shuffle_v16i32_concat:
3109; GFX10:       ; %bb.0:
3110; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3111; GFX10-NEXT:    s_clause 0x1
3112; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
3113; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
3114; GFX10-NEXT:    s_clause 0x1
3115; GFX10-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
3116; GFX10-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
3117; GFX10-NEXT:    s_waitcnt vmcnt(3)
3118; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32
3119; GFX10-NEXT:    s_waitcnt vmcnt(2)
3120; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48
3121; GFX10-NEXT:    s_waitcnt vmcnt(1)
3122; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
3123; GFX10-NEXT:    s_waitcnt vmcnt(0)
3124; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
3125; GFX10-NEXT:    s_setpc_b64 s[30:31]
3126;
3127; GFX11-LABEL: shuffle_v16i32_concat:
3128; GFX11:       ; %bb.0:
3129; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3130; GFX11-NEXT:    s_clause 0x1
3131; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
3132; GFX11-NEXT:    global_load_b128 v[10:13], v[2:3], off offset:16
3133; GFX11-NEXT:    s_clause 0x1
3134; GFX11-NEXT:    global_load_b128 v[14:17], v[0:1], off
3135; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:16
3136; GFX11-NEXT:    s_waitcnt vmcnt(3)
3137; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:32
3138; GFX11-NEXT:    s_waitcnt vmcnt(2)
3139; GFX11-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:48
3140; GFX11-NEXT:    s_waitcnt vmcnt(1)
3141; GFX11-NEXT:    global_store_b128 v[4:5], v[14:17], off
3142; GFX11-NEXT:    s_waitcnt vmcnt(0)
3143; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off offset:16
3144; GFX11-NEXT:    s_setpc_b64 s[30:31]
3145  %val0 = load <8 x i32>, ptr addrspace(1) %arg0
3146  %val1 = load <8 x i32>, ptr addrspace(1) %arg1
3147  %shuffle = shufflevector <8 x i32> %val0, <8 x i32> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3148  store <16 x i32> %shuffle, ptr addrspace(1) %out
3149  ret void
3150}
3151
; bfloat shuffle with mask <2, 3, undef, undef>: result lanes 0-1 take
; elements 2 and 3 of %val0; lanes 2-3 are undefined. %val1 is loaded in the
; IR but no mask index refers to it.
; The autogenerated checks below currently expect a single dword load from
; %arg0 at byte offset 4 and no further shuffling.
3152define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3153; GFX9-LABEL: shuffle_v4bf16_23uu:
3154; GFX9:       ; %bb.0:
3155; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3156; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
3157; GFX9-NEXT:    s_waitcnt vmcnt(0)
3158; GFX9-NEXT:    s_setpc_b64 s[30:31]
3159;
3160; GFX10-LABEL: shuffle_v4bf16_23uu:
3161; GFX10:       ; %bb.0:
3162; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3163; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
3164; GFX10-NEXT:    s_waitcnt vmcnt(0)
3165; GFX10-NEXT:    s_setpc_b64 s[30:31]
3166;
3167; GFX11-LABEL: shuffle_v4bf16_23uu:
3168; GFX11:       ; %bb.0:
3169; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3170; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
3171; GFX11-NEXT:    s_waitcnt vmcnt(0)
3172; GFX11-NEXT:    s_setpc_b64 s[30:31]
3173  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3174  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3175  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
3176  ret <4 x bfloat> %shuffle
3177}
3178
; bfloat shuffle with mask <2, 3, 4, undef>: result lanes 0-1 take elements 2
; and 3 of %val0, lane 2 takes element 0 of %val1 (mask index 4), and lane 3
; is undefined.
; The autogenerated checks below currently expect a dword load from %arg0 at
; byte offset 4 plus a 64-bit load from %arg1, with the low dword of the
; latter returned in v1.
3179define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3180; GX900-LABEL: shuffle_v4bf16_234u:
3181; GX900:       ; %bb.0:
3182; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3183; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
3184; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
3185; GX900-NEXT:    s_waitcnt vmcnt(1)
3186; GX900-NEXT:    v_mov_b32_e32 v0, v6
3187; GX900-NEXT:    s_waitcnt vmcnt(0)
3188; GX900-NEXT:    v_mov_b32_e32 v1, v4
3189; GX900-NEXT:    s_setpc_b64 s[30:31]
3190;
3191; GFX940-LABEL: shuffle_v4bf16_234u:
3192; GFX940:       ; %bb.0:
3193; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3194; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
3195; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
3196; GFX940-NEXT:    s_waitcnt vmcnt(1)
3197; GFX940-NEXT:    v_mov_b32_e32 v0, v4
3198; GFX940-NEXT:    s_waitcnt vmcnt(0)
3199; GFX940-NEXT:    v_mov_b32_e32 v1, v6
3200; GFX940-NEXT:    s_setpc_b64 s[30:31]
3201;
3202; GFX10-LABEL: shuffle_v4bf16_234u:
3203; GFX10:       ; %bb.0:
3204; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3205; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
3206; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
3207; GFX10-NEXT:    s_waitcnt vmcnt(1)
3208; GFX10-NEXT:    v_mov_b32_e32 v0, v6
3209; GFX10-NEXT:    s_waitcnt vmcnt(0)
3210; GFX10-NEXT:    v_mov_b32_e32 v1, v4
3211; GFX10-NEXT:    s_setpc_b64 s[30:31]
3212;
3213; GFX11-LABEL: shuffle_v4bf16_234u:
3214; GFX11:       ; %bb.0:
3215; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3216; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
3217; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
3218; GFX11-NEXT:    s_waitcnt vmcnt(0)
3219; GFX11-NEXT:    s_setpc_b64 s[30:31]
3220  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3221  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3222  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
3223  ret <4 x bfloat> %shuffle
3224}
3225
; bfloat shuffle with mask <undef, 1, undef, 3>: result lanes 1 and 3 take
; elements 1 and 3 of %val0 (already in position); lanes 0 and 2 are
; undefined. %val1 is loaded in the IR but no mask index refers to it.
; The autogenerated checks below currently expect a plain 64-bit load of
; %arg0 with no permute.
3226define <4 x bfloat> @shuffle_v4bf16_u1u3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3227; GFX9-LABEL: shuffle_v4bf16_u1u3:
3228; GFX9:       ; %bb.0:
3229; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3230; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
3231; GFX9-NEXT:    s_waitcnt vmcnt(0)
3232; GFX9-NEXT:    s_setpc_b64 s[30:31]
3233;
3234; GFX10-LABEL: shuffle_v4bf16_u1u3:
3235; GFX10:       ; %bb.0:
3236; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3237; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
3238; GFX10-NEXT:    s_waitcnt vmcnt(0)
3239; GFX10-NEXT:    s_setpc_b64 s[30:31]
3240;
3241; GFX11-LABEL: shuffle_v4bf16_u1u3:
3242; GFX11:       ; %bb.0:
3243; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3244; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
3245; GFX11-NEXT:    s_waitcnt vmcnt(0)
3246; GFX11-NEXT:    s_setpc_b64 s[30:31]
3247  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3248  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3249  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
3250  ret <4 x bfloat> %shuffle
3251}
3252
; bfloat shuffle with mask <undef, 3, undef, 1>: result lane 1 takes element 3
; of %val0 and lane 3 takes element 1 of %val0; lanes 0 and 2 are undefined.
; %val1 is loaded in the IR but no mask index refers to it.
; The autogenerated checks below currently expect a 64-bit load of %arg0 whose
; two dwords are returned in swapped order.
3253define <4 x bfloat> @shuffle_v4bf16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3254; GX900-LABEL: shuffle_v4bf16_u3u1:
3255; GX900:       ; %bb.0:
3256; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3257; GX900-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
3258; GX900-NEXT:    s_waitcnt vmcnt(0)
3259; GX900-NEXT:    v_mov_b32_e32 v0, v2
3260; GX900-NEXT:    s_setpc_b64 s[30:31]
3261;
3262; GFX940-LABEL: shuffle_v4bf16_u3u1:
3263; GFX940:       ; %bb.0:
3264; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3265; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
3266; GFX940-NEXT:    s_waitcnt vmcnt(0)
3267; GFX940-NEXT:    v_mov_b32_e32 v0, v3
3268; GFX940-NEXT:    v_mov_b32_e32 v1, v2
3269; GFX940-NEXT:    s_setpc_b64 s[30:31]
3270;
3271; GFX10-LABEL: shuffle_v4bf16_u3u1:
3272; GFX10:       ; %bb.0:
3273; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3274; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
3275; GFX10-NEXT:    s_waitcnt vmcnt(0)
3276; GFX10-NEXT:    v_mov_b32_e32 v0, v2
3277; GFX10-NEXT:    s_setpc_b64 s[30:31]
3278;
3279; GFX11-LABEL: shuffle_v4bf16_u3u1:
3280; GFX11:       ; %bb.0:
3281; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3282; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
3283; GFX11-NEXT:    s_waitcnt vmcnt(0)
3284; GFX11-NEXT:    v_mov_b32_e32 v0, v2
3285; GFX11-NEXT:    s_setpc_b64 s[30:31]
3286  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3287  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3288  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
3289  ret <4 x bfloat> %shuffle
3290}
3291
; bfloat shuffle with mask <undef, 3, undef, undef>: only result lane 1 is
; defined, taking element 3 of %val0. %val1 is loaded in the IR but no mask
; index refers to it.
; The autogenerated checks below currently expect a single dword load from
; %arg0 at byte offset 4 and no further shuffling.
3292define <4 x bfloat> @shuffle_v4bf16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3293; GFX9-LABEL: shuffle_v4bf16_u3uu:
3294; GFX9:       ; %bb.0:
3295; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3296; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
3297; GFX9-NEXT:    s_waitcnt vmcnt(0)
3298; GFX9-NEXT:    s_setpc_b64 s[30:31]
3299;
3300; GFX10-LABEL: shuffle_v4bf16_u3uu:
3301; GFX10:       ; %bb.0:
3302; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3303; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
3304; GFX10-NEXT:    s_waitcnt vmcnt(0)
3305; GFX10-NEXT:    s_setpc_b64 s[30:31]
3306;
3307; GFX11-LABEL: shuffle_v4bf16_u3uu:
3308; GFX11:       ; %bb.0:
3309; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3310; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
3311; GFX11-NEXT:    s_waitcnt vmcnt(0)
3312; GFX11-NEXT:    s_setpc_b64 s[30:31]
3313  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3314  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3315  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
3316  ret <4 x bfloat> %shuffle
3317}
3318
; bfloat shuffle with mask <3, undef, 6, undef>: result lane 0 takes element 3
; of %val0 and lane 2 takes element 2 of %val1 (mask index 6); lanes 1 and 3
; are undefined.
; The autogenerated checks below currently expect a dword load at byte offset
; 4 from each input, a v_alignbit_b32 by 16 on the %arg0 dword, and the %arg1
; dword returned unchanged in v1.
3319define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3320; GX900-LABEL: shuffle_v4bf16_3u6u:
3321; GX900:       ; %bb.0:
3322; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3323; GX900-NEXT:    global_load_dword v5, v[0:1], off offset:4
3324; GX900-NEXT:    global_load_dword v4, v[2:3], off offset:4
3325; GX900-NEXT:    s_waitcnt vmcnt(1)
3326; GX900-NEXT:    v_alignbit_b32 v0, s4, v5, 16
3327; GX900-NEXT:    s_waitcnt vmcnt(0)
3328; GX900-NEXT:    v_mov_b32_e32 v1, v4
3329; GX900-NEXT:    s_setpc_b64 s[30:31]
3330;
3331; GFX940-LABEL: shuffle_v4bf16_3u6u:
3332; GFX940:       ; %bb.0:
3333; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3334; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:4
3335; GFX940-NEXT:    global_load_dword v4, v[2:3], off offset:4
3336; GFX940-NEXT:    s_waitcnt vmcnt(1)
3337; GFX940-NEXT:    v_alignbit_b32 v0, s0, v5, 16
3338; GFX940-NEXT:    s_waitcnt vmcnt(0)
3339; GFX940-NEXT:    v_mov_b32_e32 v1, v4
3340; GFX940-NEXT:    s_setpc_b64 s[30:31]
3341;
3342; GFX10-LABEL: shuffle_v4bf16_3u6u:
3343; GFX10:       ; %bb.0:
3344; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3345; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
3346; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
3347; GFX10-NEXT:    s_waitcnt vmcnt(1)
3348; GFX10-NEXT:    v_alignbit_b32 v0, s4, v5, 16
3349; GFX10-NEXT:    s_waitcnt vmcnt(0)
3350; GFX10-NEXT:    v_mov_b32_e32 v1, v4
3351; GFX10-NEXT:    s_setpc_b64 s[30:31]
3352;
3353; GFX11-LABEL: shuffle_v4bf16_3u6u:
3354; GFX11:       ; %bb.0:
3355; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3356; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
3357; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
3358; GFX11-NEXT:    s_waitcnt vmcnt(1)
3359; GFX11-NEXT:    v_alignbit_b32 v0, s0, v0, 16
3360; GFX11-NEXT:    s_waitcnt vmcnt(0)
3361; GFX11-NEXT:    s_setpc_b64 s[30:31]
3362  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3363  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3364  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
3365  ret <4 x bfloat> %shuffle
3366}
3367
; bfloat shuffle with mask <3, undef, undef, 7>: result lane 0 takes element 3
; of %val0 and lane 3 takes element 3 of %val1 (mask index 7); lanes 1 and 2
; are undefined.
; The autogenerated checks below currently expect a dword load at byte offset
; 4 from each input, a v_alignbit_b32 by 16 on the %arg0 dword, and the %arg1
; dword returned unchanged in v1.
3368define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3369; GX900-LABEL: shuffle_v4bf16_3uu7:
3370; GX900:       ; %bb.0:
3371; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3372; GX900-NEXT:    global_load_dword v5, v[0:1], off offset:4
3373; GX900-NEXT:    global_load_dword v4, v[2:3], off offset:4
3374; GX900-NEXT:    s_waitcnt vmcnt(1)
3375; GX900-NEXT:    v_alignbit_b32 v0, s4, v5, 16
3376; GX900-NEXT:    s_waitcnt vmcnt(0)
3377; GX900-NEXT:    v_mov_b32_e32 v1, v4
3378; GX900-NEXT:    s_setpc_b64 s[30:31]
3379;
3380; GFX940-LABEL: shuffle_v4bf16_3uu7:
3381; GFX940:       ; %bb.0:
3382; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3383; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:4
3384; GFX940-NEXT:    global_load_dword v4, v[2:3], off offset:4
3385; GFX940-NEXT:    s_waitcnt vmcnt(1)
3386; GFX940-NEXT:    v_alignbit_b32 v0, s0, v5, 16
3387; GFX940-NEXT:    s_waitcnt vmcnt(0)
3388; GFX940-NEXT:    v_mov_b32_e32 v1, v4
3389; GFX940-NEXT:    s_setpc_b64 s[30:31]
3390;
3391; GFX10-LABEL: shuffle_v4bf16_3uu7:
3392; GFX10:       ; %bb.0:
3393; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3394; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
3395; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
3396; GFX10-NEXT:    s_waitcnt vmcnt(1)
3397; GFX10-NEXT:    v_alignbit_b32 v0, s4, v5, 16
3398; GFX10-NEXT:    s_waitcnt vmcnt(0)
3399; GFX10-NEXT:    v_mov_b32_e32 v1, v4
3400; GFX10-NEXT:    s_setpc_b64 s[30:31]
3401;
3402; GFX11-LABEL: shuffle_v4bf16_3uu7:
3403; GFX11:       ; %bb.0:
3404; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3405; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
3406; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
3407; GFX11-NEXT:    s_waitcnt vmcnt(1)
3408; GFX11-NEXT:    v_alignbit_b32 v0, s0, v0, 16
3409; GFX11-NEXT:    s_waitcnt vmcnt(0)
3410; GFX11-NEXT:    s_setpc_b64 s[30:31]
3411  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3412  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3413  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
3414  ret <4 x bfloat> %shuffle
3415}
3416
; bfloat shuffle with mask <3, 5, undef, 5>: result lane 0 takes element 3 of
; %val0, lanes 1 and 3 both take element 1 of %val1 (mask index 5), and lane 2
; is undefined.
; The autogenerated checks below currently expect a dword load from %arg0 at
; byte offset 4, a dword load from %arg1 at offset 0, and a v_perm_b32 with
; selector 0x7060302 producing the low result dword.
3417define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3418; GX900-LABEL: shuffle_v4bf16_35u5:
3419; GX900:       ; %bb.0:
3420; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3421; GX900-NEXT:    global_load_dword v5, v[0:1], off offset:4
3422; GX900-NEXT:    global_load_dword v4, v[2:3], off
3423; GX900-NEXT:    s_mov_b32 s4, 0x7060302
3424; GX900-NEXT:    s_waitcnt vmcnt(0)
3425; GX900-NEXT:    v_perm_b32 v0, v4, v5, s4
3426; GX900-NEXT:    v_mov_b32_e32 v1, v4
3427; GX900-NEXT:    s_setpc_b64 s[30:31]
3428;
3429; GFX940-LABEL: shuffle_v4bf16_35u5:
3430; GFX940:       ; %bb.0:
3431; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3432; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:4
3433; GFX940-NEXT:    global_load_dword v4, v[2:3], off
3434; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
3435; GFX940-NEXT:    s_waitcnt vmcnt(0)
3436; GFX940-NEXT:    v_perm_b32 v0, v4, v5, s0
3437; GFX940-NEXT:    v_mov_b32_e32 v1, v4
3438; GFX940-NEXT:    s_setpc_b64 s[30:31]
3439;
3440; GFX10-LABEL: shuffle_v4bf16_35u5:
3441; GFX10:       ; %bb.0:
3442; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3443; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
3444; GFX10-NEXT:    global_load_dword v4, v[2:3], off
3445; GFX10-NEXT:    s_waitcnt vmcnt(0)
3446; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x7060302
3447; GFX10-NEXT:    v_mov_b32_e32 v1, v4
3448; GFX10-NEXT:    s_setpc_b64 s[30:31]
3449;
3450; GFX11-LABEL: shuffle_v4bf16_35u5:
3451; GFX11:       ; %bb.0:
3452; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3453; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
3454; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
3455; GFX11-NEXT:    s_waitcnt vmcnt(0)
3456; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
3457; GFX11-NEXT:    s_setpc_b64 s[30:31]
3458  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3459  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3460  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
3461  ret <4 x bfloat> %shuffle
3462}
3463
; bfloat shuffle with mask <3, 5, 7, undef>: result lane 0 takes element 3 of
; %val0, lane 1 takes element 1 of %val1 (mask index 5), lane 2 takes element
; 3 of %val1 (mask index 7), and lane 3 is undefined.
; The autogenerated checks below currently expect a 64-bit load of %arg1, a
; dword load from %arg0 at byte offset 4, a v_alignbit_b32 by 16 for the high
; result dword and a v_perm_b32 with selector 0x7060302 for the low one.
3464define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3465; GX900-LABEL: shuffle_v4bf16_357u:
3466; GX900:       ; %bb.0:
3467; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3468; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
3469; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
3470; GX900-NEXT:    s_mov_b32 s4, 0x7060302
3471; GX900-NEXT:    s_waitcnt vmcnt(1)
3472; GX900-NEXT:    v_alignbit_b32 v1, s4, v5, 16
3473; GX900-NEXT:    s_waitcnt vmcnt(0)
3474; GX900-NEXT:    v_perm_b32 v0, v4, v6, s4
3475; GX900-NEXT:    s_setpc_b64 s[30:31]
3476;
3477; GFX940-LABEL: shuffle_v4bf16_357u:
3478; GFX940:       ; %bb.0:
3479; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3480; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
3481; GFX940-NEXT:    global_load_dword v6, v[0:1], off offset:4
3482; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
3483; GFX940-NEXT:    s_waitcnt vmcnt(1)
3484; GFX940-NEXT:    v_alignbit_b32 v1, s0, v5, 16
3485; GFX940-NEXT:    s_waitcnt vmcnt(0)
3486; GFX940-NEXT:    v_perm_b32 v0, v4, v6, s0
3487; GFX940-NEXT:    s_setpc_b64 s[30:31]
3488;
3489; GFX10-LABEL: shuffle_v4bf16_357u:
3490; GFX10:       ; %bb.0:
3491; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3492; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
3493; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
3494; GFX10-NEXT:    s_waitcnt vmcnt(1)
3495; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
3496; GFX10-NEXT:    s_waitcnt vmcnt(0)
3497; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
3498; GFX10-NEXT:    s_setpc_b64 s[30:31]
3499;
3500; GFX11-LABEL: shuffle_v4bf16_357u:
3501; GFX11:       ; %bb.0:
3502; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3503; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
3504; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
3505; GFX11-NEXT:    s_waitcnt vmcnt(1)
3506; GFX11-NEXT:    v_alignbit_b32 v1, s0, v3, 16
3507; GFX11-NEXT:    s_waitcnt vmcnt(0)
3508; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
3509; GFX11-NEXT:    s_setpc_b64 s[30:31]
3510  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3511  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3512  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
3513  ret <4 x bfloat> %shuffle
3514}
3515
; Shuffle mask <0, 1, 0, 1>: the low two elements of %val0 are repeated in both
; halves of the result; no element of %val1 is selected.
3516define <4 x bfloat> @shuffle_v4bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3517; GFX9-LABEL: shuffle_v4bf16_0101:
3518; GFX9:       ; %bb.0:
3519; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3520; GFX9-NEXT:    global_load_dword v0, v[0:1], off
3521; GFX9-NEXT:    s_waitcnt vmcnt(0)
3522; GFX9-NEXT:    v_mov_b32_e32 v1, v0
3523; GFX9-NEXT:    s_setpc_b64 s[30:31]
3524;
3525; GFX10-LABEL: shuffle_v4bf16_0101:
3526; GFX10:       ; %bb.0:
3527; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3528; GFX10-NEXT:    global_load_dword v0, v[0:1], off
3529; GFX10-NEXT:    s_waitcnt vmcnt(0)
3530; GFX10-NEXT:    v_mov_b32_e32 v1, v0
3531; GFX10-NEXT:    s_setpc_b64 s[30:31]
3532;
3533; GFX11-LABEL: shuffle_v4bf16_0101:
3534; GFX11:       ; %bb.0:
3535; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3536; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
3537; GFX11-NEXT:    s_waitcnt vmcnt(0)
3538; GFX11-NEXT:    v_mov_b32_e32 v1, v0
3539; GFX11-NEXT:    s_setpc_b64 s[30:31]
3540  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3541  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3542  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
3543  ret <4 x bfloat> %shuffle
3544}
3545
; Shuffle mask <0, 1, 2, 3>: identity shuffle of %val0; the expected code is a
; single 64-bit load from %arg0 with no lane movement.
3546define <4 x bfloat> @shuffle_v4bf16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3547; GFX9-LABEL: shuffle_v4bf16_0123:
3548; GFX9:       ; %bb.0:
3549; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3550; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
3551; GFX9-NEXT:    s_waitcnt vmcnt(0)
3552; GFX9-NEXT:    s_setpc_b64 s[30:31]
3553;
3554; GFX10-LABEL: shuffle_v4bf16_0123:
3555; GFX10:       ; %bb.0:
3556; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3557; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
3558; GFX10-NEXT:    s_waitcnt vmcnt(0)
3559; GFX10-NEXT:    s_setpc_b64 s[30:31]
3560;
3561; GFX11-LABEL: shuffle_v4bf16_0123:
3562; GFX11:       ; %bb.0:
3563; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3564; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
3565; GFX11-NEXT:    s_waitcnt vmcnt(0)
3566; GFX11-NEXT:    s_setpc_b64 s[30:31]
3567  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3568  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3569  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3570  ret <4 x bfloat> %shuffle
3571}
3572
; Shuffle mask <0, 1, 4, 5>: low two elements of %val0 followed by the low two
; elements of %val1.
3573define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3574; GFX9-LABEL: shuffle_v4bf16_0145:
3575; GFX9:       ; %bb.0:
3576; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3577; GFX9-NEXT:    global_load_dword v4, v[0:1], off
3578; GFX9-NEXT:    global_load_dword v5, v[2:3], off
3579; GFX9-NEXT:    s_waitcnt vmcnt(1)
3580; GFX9-NEXT:    v_mov_b32_e32 v0, v4
3581; GFX9-NEXT:    s_waitcnt vmcnt(0)
3582; GFX9-NEXT:    v_mov_b32_e32 v1, v5
3583; GFX9-NEXT:    s_setpc_b64 s[30:31]
3584;
3585; GFX10-LABEL: shuffle_v4bf16_0145:
3586; GFX10:       ; %bb.0:
3587; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3588; GFX10-NEXT:    global_load_dword v4, v[0:1], off
3589; GFX10-NEXT:    global_load_dword v5, v[2:3], off
3590; GFX10-NEXT:    s_waitcnt vmcnt(1)
3591; GFX10-NEXT:    v_mov_b32_e32 v0, v4
3592; GFX10-NEXT:    s_waitcnt vmcnt(0)
3593; GFX10-NEXT:    v_mov_b32_e32 v1, v5
3594; GFX10-NEXT:    s_setpc_b64 s[30:31]
3595;
3596; GFX11-LABEL: shuffle_v4bf16_0145:
3597; GFX11:       ; %bb.0:
3598; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3599; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
3600; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
3601; GFX11-NEXT:    s_waitcnt vmcnt(0)
3602; GFX11-NEXT:    s_setpc_b64 s[30:31]
3603  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3604  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3605  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
3606  ret <4 x bfloat> %shuffle
3607}
3608
; Shuffle mask <0, 1, 6, 7>: low two elements of %val0 followed by the high two
; elements of %val1.
3609define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3610; GFX9-LABEL: shuffle_v4bf16_0167:
3611; GFX9:       ; %bb.0:
3612; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3613; GFX9-NEXT:    global_load_dword v4, v[0:1], off
3614; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
3615; GFX9-NEXT:    s_waitcnt vmcnt(1)
3616; GFX9-NEXT:    v_mov_b32_e32 v0, v4
3617; GFX9-NEXT:    s_waitcnt vmcnt(0)
3618; GFX9-NEXT:    v_mov_b32_e32 v1, v5
3619; GFX9-NEXT:    s_setpc_b64 s[30:31]
3620;
3621; GFX10-LABEL: shuffle_v4bf16_0167:
3622; GFX10:       ; %bb.0:
3623; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3624; GFX10-NEXT:    global_load_dword v4, v[0:1], off
3625; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
3626; GFX10-NEXT:    s_waitcnt vmcnt(1)
3627; GFX10-NEXT:    v_mov_b32_e32 v0, v4
3628; GFX10-NEXT:    s_waitcnt vmcnt(0)
3629; GFX10-NEXT:    v_mov_b32_e32 v1, v5
3630; GFX10-NEXT:    s_setpc_b64 s[30:31]
3631;
3632; GFX11-LABEL: shuffle_v4bf16_0167:
3633; GFX11:       ; %bb.0:
3634; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3635; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
3636; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
3637; GFX11-NEXT:    s_waitcnt vmcnt(0)
3638; GFX11-NEXT:    s_setpc_b64 s[30:31]
3639  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3640  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3641  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
3642  ret <4 x bfloat> %shuffle
3643}
3644
; Shuffle mask <2, 3, 0, 1>: swaps the two halves of %val0; no element of %val1
; is selected.
3645define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3646; GX900-LABEL: shuffle_v4bf16_2301:
3647; GX900:       ; %bb.0:
3648; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3649; GX900-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
3650; GX900-NEXT:    s_waitcnt vmcnt(0)
3651; GX900-NEXT:    v_mov_b32_e32 v0, v2
3652; GX900-NEXT:    s_setpc_b64 s[30:31]
3653;
3654; GFX940-LABEL: shuffle_v4bf16_2301:
3655; GFX940:       ; %bb.0:
3656; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3657; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
3658; GFX940-NEXT:    s_waitcnt vmcnt(0)
3659; GFX940-NEXT:    v_mov_b32_e32 v0, v3
3660; GFX940-NEXT:    v_mov_b32_e32 v1, v2
3661; GFX940-NEXT:    s_setpc_b64 s[30:31]
3662;
3663; GFX10-LABEL: shuffle_v4bf16_2301:
3664; GFX10:       ; %bb.0:
3665; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3666; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
3667; GFX10-NEXT:    s_waitcnt vmcnt(0)
3668; GFX10-NEXT:    v_mov_b32_e32 v0, v2
3669; GFX10-NEXT:    s_setpc_b64 s[30:31]
3670;
3671; GFX11-LABEL: shuffle_v4bf16_2301:
3672; GFX11:       ; %bb.0:
3673; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3674; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
3675; GFX11-NEXT:    s_waitcnt vmcnt(0)
3676; GFX11-NEXT:    v_mov_b32_e32 v0, v2
3677; GFX11-NEXT:    s_setpc_b64 s[30:31]
3678  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3679  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3680  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
3681  ret <4 x bfloat> %shuffle
3682}
3683
; Shuffle mask <2, 3, 2, 3>: the high two elements of %val0 are repeated in both
; halves of the result; no element of %val1 is selected.
3684define <4 x bfloat> @shuffle_v4bf16_2323(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3685; GFX9-LABEL: shuffle_v4bf16_2323:
3686; GFX9:       ; %bb.0:
3687; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3688; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
3689; GFX9-NEXT:    s_waitcnt vmcnt(0)
3690; GFX9-NEXT:    v_mov_b32_e32 v1, v0
3691; GFX9-NEXT:    s_setpc_b64 s[30:31]
3692;
3693; GFX10-LABEL: shuffle_v4bf16_2323:
3694; GFX10:       ; %bb.0:
3695; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3696; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
3697; GFX10-NEXT:    s_waitcnt vmcnt(0)
3698; GFX10-NEXT:    v_mov_b32_e32 v1, v0
3699; GFX10-NEXT:    s_setpc_b64 s[30:31]
3700;
3701; GFX11-LABEL: shuffle_v4bf16_2323:
3702; GFX11:       ; %bb.0:
3703; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3704; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
3705; GFX11-NEXT:    s_waitcnt vmcnt(0)
3706; GFX11-NEXT:    v_mov_b32_e32 v1, v0
3707; GFX11-NEXT:    s_setpc_b64 s[30:31]
3708  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3709  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3710  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
3711  ret <4 x bfloat> %shuffle
3712}
3713
; Shuffle mask <2, 3, 4, 5>: high two elements of %val0 followed by the low two
; elements of %val1.
3714define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3715; GFX9-LABEL: shuffle_v4bf16_2345:
3716; GFX9:       ; %bb.0:
3717; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3718; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
3719; GFX9-NEXT:    global_load_dword v5, v[2:3], off
3720; GFX9-NEXT:    s_waitcnt vmcnt(1)
3721; GFX9-NEXT:    v_mov_b32_e32 v0, v4
3722; GFX9-NEXT:    s_waitcnt vmcnt(0)
3723; GFX9-NEXT:    v_mov_b32_e32 v1, v5
3724; GFX9-NEXT:    s_setpc_b64 s[30:31]
3725;
3726; GFX10-LABEL: shuffle_v4bf16_2345:
3727; GFX10:       ; %bb.0:
3728; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3729; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
3730; GFX10-NEXT:    global_load_dword v5, v[2:3], off
3731; GFX10-NEXT:    s_waitcnt vmcnt(1)
3732; GFX10-NEXT:    v_mov_b32_e32 v0, v4
3733; GFX10-NEXT:    s_waitcnt vmcnt(0)
3734; GFX10-NEXT:    v_mov_b32_e32 v1, v5
3735; GFX10-NEXT:    s_setpc_b64 s[30:31]
3736;
3737; GFX11-LABEL: shuffle_v4bf16_2345:
3738; GFX11:       ; %bb.0:
3739; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3740; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
3741; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
3742; GFX11-NEXT:    s_waitcnt vmcnt(0)
3743; GFX11-NEXT:    s_setpc_b64 s[30:31]
3744  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3745  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3746  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
3747  ret <4 x bfloat> %shuffle
3748}
3749
; Shuffle mask <2, 3, 6, 7>: high two elements of %val0 followed by the high two
; elements of %val1.
3750define <4 x bfloat> @shuffle_v4bf16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3751; GFX9-LABEL: shuffle_v4bf16_2367:
3752; GFX9:       ; %bb.0:
3753; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3754; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
3755; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
3756; GFX9-NEXT:    s_waitcnt vmcnt(1)
3757; GFX9-NEXT:    v_mov_b32_e32 v0, v4
3758; GFX9-NEXT:    s_waitcnt vmcnt(0)
3759; GFX9-NEXT:    v_mov_b32_e32 v1, v5
3760; GFX9-NEXT:    s_setpc_b64 s[30:31]
3761;
3762; GFX10-LABEL: shuffle_v4bf16_2367:
3763; GFX10:       ; %bb.0:
3764; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3765; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
3766; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
3767; GFX10-NEXT:    s_waitcnt vmcnt(1)
3768; GFX10-NEXT:    v_mov_b32_e32 v0, v4
3769; GFX10-NEXT:    s_waitcnt vmcnt(0)
3770; GFX10-NEXT:    v_mov_b32_e32 v1, v5
3771; GFX10-NEXT:    s_setpc_b64 s[30:31]
3772;
3773; GFX11-LABEL: shuffle_v4bf16_2367:
3774; GFX11:       ; %bb.0:
3775; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3776; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
3777; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
3778; GFX11-NEXT:    s_waitcnt vmcnt(0)
3779; GFX11-NEXT:    s_setpc_b64 s[30:31]
3780  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3781  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3782  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
3783  ret <4 x bfloat> %shuffle
3784}
3785
; Shuffle mask <4, 5, 0, 1>: low two elements of %val1 followed by the low two
; elements of %val0.
3786define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3787; GFX9-LABEL: shuffle_v4bf16_4501:
3788; GFX9:       ; %bb.0:
3789; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3790; GFX9-NEXT:    global_load_dword v4, v[2:3], off
3791; GFX9-NEXT:    global_load_dword v5, v[0:1], off
3792; GFX9-NEXT:    s_waitcnt vmcnt(1)
3793; GFX9-NEXT:    v_mov_b32_e32 v0, v4
3794; GFX9-NEXT:    s_waitcnt vmcnt(0)
3795; GFX9-NEXT:    v_mov_b32_e32 v1, v5
3796; GFX9-NEXT:    s_setpc_b64 s[30:31]
3797;
3798; GFX10-LABEL: shuffle_v4bf16_4501:
3799; GFX10:       ; %bb.0:
3800; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3801; GFX10-NEXT:    global_load_dword v4, v[2:3], off
3802; GFX10-NEXT:    global_load_dword v5, v[0:1], off
3803; GFX10-NEXT:    s_waitcnt vmcnt(1)
3804; GFX10-NEXT:    v_mov_b32_e32 v0, v4
3805; GFX10-NEXT:    s_waitcnt vmcnt(0)
3806; GFX10-NEXT:    v_mov_b32_e32 v1, v5
3807; GFX10-NEXT:    s_setpc_b64 s[30:31]
3808;
3809; GFX11-LABEL: shuffle_v4bf16_4501:
3810; GFX11:       ; %bb.0:
3811; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3812; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
3813; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
3814; GFX11-NEXT:    s_waitcnt vmcnt(1)
3815; GFX11-NEXT:    v_mov_b32_e32 v0, v2
3816; GFX11-NEXT:    s_waitcnt vmcnt(0)
3817; GFX11-NEXT:    s_setpc_b64 s[30:31]
3818  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3819  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3820  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
3821  ret <4 x bfloat> %shuffle
3822}
3823
; Shuffle mask <4, 5, 2, 3>: low two elements of %val1 followed by the high two
; elements of %val0.
3824define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3825; GFX9-LABEL: shuffle_v4bf16_4523:
3826; GFX9:       ; %bb.0:
3827; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3828; GFX9-NEXT:    global_load_dword v4, v[2:3], off
3829; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
3830; GFX9-NEXT:    s_waitcnt vmcnt(1)
3831; GFX9-NEXT:    v_mov_b32_e32 v0, v4
3832; GFX9-NEXT:    s_waitcnt vmcnt(0)
3833; GFX9-NEXT:    v_mov_b32_e32 v1, v5
3834; GFX9-NEXT:    s_setpc_b64 s[30:31]
3835;
3836; GFX10-LABEL: shuffle_v4bf16_4523:
3837; GFX10:       ; %bb.0:
3838; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3839; GFX10-NEXT:    global_load_dword v4, v[2:3], off
3840; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
3841; GFX10-NEXT:    s_waitcnt vmcnt(1)
3842; GFX10-NEXT:    v_mov_b32_e32 v0, v4
3843; GFX10-NEXT:    s_waitcnt vmcnt(0)
3844; GFX10-NEXT:    v_mov_b32_e32 v1, v5
3845; GFX10-NEXT:    s_setpc_b64 s[30:31]
3846;
3847; GFX11-LABEL: shuffle_v4bf16_4523:
3848; GFX11:       ; %bb.0:
3849; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3850; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
3851; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
3852; GFX11-NEXT:    s_waitcnt vmcnt(1)
3853; GFX11-NEXT:    v_mov_b32_e32 v0, v2
3854; GFX11-NEXT:    s_waitcnt vmcnt(0)
3855; GFX11-NEXT:    s_setpc_b64 s[30:31]
3856  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3857  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3858  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
3859  ret <4 x bfloat> %shuffle
3860}
3861
; Shuffle mask <4, 5, 4, 5>: the low two elements of %val1 are repeated in both
; halves of the result; no element of %val0 is selected.
3862define <4 x bfloat> @shuffle_v4bf16_4545(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3863; GFX9-LABEL: shuffle_v4bf16_4545:
3864; GFX9:       ; %bb.0:
3865; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3866; GFX9-NEXT:    global_load_dword v0, v[2:3], off
3867; GFX9-NEXT:    s_waitcnt vmcnt(0)
3868; GFX9-NEXT:    v_mov_b32_e32 v1, v0
3869; GFX9-NEXT:    s_setpc_b64 s[30:31]
3870;
3871; GFX10-LABEL: shuffle_v4bf16_4545:
3872; GFX10:       ; %bb.0:
3873; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3874; GFX10-NEXT:    global_load_dword v0, v[2:3], off
3875; GFX10-NEXT:    s_waitcnt vmcnt(0)
3876; GFX10-NEXT:    v_mov_b32_e32 v1, v0
3877; GFX10-NEXT:    s_setpc_b64 s[30:31]
3878;
3879; GFX11-LABEL: shuffle_v4bf16_4545:
3880; GFX11:       ; %bb.0:
3881; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3882; GFX11-NEXT:    global_load_b32 v0, v[2:3], off
3883; GFX11-NEXT:    s_waitcnt vmcnt(0)
3884; GFX11-NEXT:    v_mov_b32_e32 v1, v0
3885; GFX11-NEXT:    s_setpc_b64 s[30:31]
3886  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3887  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3888  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
3889  ret <4 x bfloat> %shuffle
3890}
3891
; Shuffle mask <4, 5, 6, 7>: identity shuffle of %val1; the expected code is a
; single 64-bit load from %arg1 with no lane movement.
3892define <4 x bfloat> @shuffle_v4bf16_4567(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3893; GFX9-LABEL: shuffle_v4bf16_4567:
3894; GFX9:       ; %bb.0:
3895; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3896; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
3897; GFX9-NEXT:    s_waitcnt vmcnt(0)
3898; GFX9-NEXT:    s_setpc_b64 s[30:31]
3899;
3900; GFX10-LABEL: shuffle_v4bf16_4567:
3901; GFX10:       ; %bb.0:
3902; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3903; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
3904; GFX10-NEXT:    s_waitcnt vmcnt(0)
3905; GFX10-NEXT:    s_setpc_b64 s[30:31]
3906;
3907; GFX11-LABEL: shuffle_v4bf16_4567:
3908; GFX11:       ; %bb.0:
3909; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3910; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
3911; GFX11-NEXT:    s_waitcnt vmcnt(0)
3912; GFX11-NEXT:    s_setpc_b64 s[30:31]
3913  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3914  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3915  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3916  ret <4 x bfloat> %shuffle
3917}
3918
; Shuffle mask <6, 7, 0, 1>: high two elements of %val1 followed by the low two
; elements of %val0.
3919define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3920; GFX9-LABEL: shuffle_v4bf16_6701:
3921; GFX9:       ; %bb.0:
3922; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3923; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
3924; GFX9-NEXT:    global_load_dword v5, v[0:1], off
3925; GFX9-NEXT:    s_waitcnt vmcnt(1)
3926; GFX9-NEXT:    v_mov_b32_e32 v0, v4
3927; GFX9-NEXT:    s_waitcnt vmcnt(0)
3928; GFX9-NEXT:    v_mov_b32_e32 v1, v5
3929; GFX9-NEXT:    s_setpc_b64 s[30:31]
3930;
3931; GFX10-LABEL: shuffle_v4bf16_6701:
3932; GFX10:       ; %bb.0:
3933; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3934; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
3935; GFX10-NEXT:    global_load_dword v5, v[0:1], off
3936; GFX10-NEXT:    s_waitcnt vmcnt(1)
3937; GFX10-NEXT:    v_mov_b32_e32 v0, v4
3938; GFX10-NEXT:    s_waitcnt vmcnt(0)
3939; GFX10-NEXT:    v_mov_b32_e32 v1, v5
3940; GFX10-NEXT:    s_setpc_b64 s[30:31]
3941;
3942; GFX11-LABEL: shuffle_v4bf16_6701:
3943; GFX11:       ; %bb.0:
3944; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3945; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
3946; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
3947; GFX11-NEXT:    s_waitcnt vmcnt(1)
3948; GFX11-NEXT:    v_mov_b32_e32 v0, v2
3949; GFX11-NEXT:    s_waitcnt vmcnt(0)
3950; GFX11-NEXT:    s_setpc_b64 s[30:31]
3951  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3952  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3953  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
3954  ret <4 x bfloat> %shuffle
3955}
3956
; Shuffle mask <6, 7, 2, 3>: high two elements of %val1 followed by the high two
; elements of %val0.
3957define <4 x bfloat> @shuffle_v4bf16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3958; GFX9-LABEL: shuffle_v4bf16_6723:
3959; GFX9:       ; %bb.0:
3960; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3961; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
3962; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
3963; GFX9-NEXT:    s_waitcnt vmcnt(1)
3964; GFX9-NEXT:    v_mov_b32_e32 v0, v4
3965; GFX9-NEXT:    s_waitcnt vmcnt(0)
3966; GFX9-NEXT:    v_mov_b32_e32 v1, v5
3967; GFX9-NEXT:    s_setpc_b64 s[30:31]
3968;
3969; GFX10-LABEL: shuffle_v4bf16_6723:
3970; GFX10:       ; %bb.0:
3971; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3972; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
3973; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
3974; GFX10-NEXT:    s_waitcnt vmcnt(1)
3975; GFX10-NEXT:    v_mov_b32_e32 v0, v4
3976; GFX10-NEXT:    s_waitcnt vmcnt(0)
3977; GFX10-NEXT:    v_mov_b32_e32 v1, v5
3978; GFX10-NEXT:    s_setpc_b64 s[30:31]
3979;
3980; GFX11-LABEL: shuffle_v4bf16_6723:
3981; GFX11:       ; %bb.0:
3982; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3983; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
3984; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
3985; GFX11-NEXT:    s_waitcnt vmcnt(1)
3986; GFX11-NEXT:    v_mov_b32_e32 v0, v2
3987; GFX11-NEXT:    s_waitcnt vmcnt(0)
3988; GFX11-NEXT:    s_setpc_b64 s[30:31]
3989  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
3990  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
3991  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
3992  ret <4 x bfloat> %shuffle
3993}
3994
; Shuffle mask <6, 7, 4, 5>: swaps the two halves of %val1; no element of %val0
; is selected.
3995define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
3996; GX900-LABEL: shuffle_v4bf16_6745:
3997; GX900:       ; %bb.0:
3998; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3999; GX900-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
4000; GX900-NEXT:    s_waitcnt vmcnt(0)
4001; GX900-NEXT:    v_mov_b32_e32 v0, v2
4002; GX900-NEXT:    s_setpc_b64 s[30:31]
4003;
4004; GFX940-LABEL: shuffle_v4bf16_6745:
4005; GFX940:       ; %bb.0:
4006; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4007; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
4008; GFX940-NEXT:    s_waitcnt vmcnt(0)
4009; GFX940-NEXT:    v_mov_b32_e32 v0, v3
4010; GFX940-NEXT:    v_mov_b32_e32 v1, v2
4011; GFX940-NEXT:    s_setpc_b64 s[30:31]
4012;
4013; GFX10-LABEL: shuffle_v4bf16_6745:
4014; GFX10:       ; %bb.0:
4015; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4016; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
4017; GFX10-NEXT:    s_waitcnt vmcnt(0)
4018; GFX10-NEXT:    v_mov_b32_e32 v0, v2
4019; GFX10-NEXT:    s_setpc_b64 s[30:31]
4020;
4021; GFX11-LABEL: shuffle_v4bf16_6745:
4022; GFX11:       ; %bb.0:
4023; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4024; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
4025; GFX11-NEXT:    s_waitcnt vmcnt(0)
4026; GFX11-NEXT:    v_mov_b32_e32 v0, v2
4027; GFX11-NEXT:    s_setpc_b64 s[30:31]
4028  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
4029  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
4030  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
4031  ret <4 x bfloat> %shuffle
4032}
4033
; Shuffle mask <6, 7, 6, 7>: the high two elements of %val1 are repeated in both
; halves of the result; no element of %val0 is selected.
4034define <4 x bfloat> @shuffle_v4bf16_6767(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4035; GFX9-LABEL: shuffle_v4bf16_6767:
4036; GFX9:       ; %bb.0:
4037; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4038; GFX9-NEXT:    global_load_dword v0, v[2:3], off offset:4
4039; GFX9-NEXT:    s_waitcnt vmcnt(0)
4040; GFX9-NEXT:    v_mov_b32_e32 v1, v0
4041; GFX9-NEXT:    s_setpc_b64 s[30:31]
4042;
4043; GFX10-LABEL: shuffle_v4bf16_6767:
4044; GFX10:       ; %bb.0:
4045; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4046; GFX10-NEXT:    global_load_dword v0, v[2:3], off offset:4
4047; GFX10-NEXT:    s_waitcnt vmcnt(0)
4048; GFX10-NEXT:    v_mov_b32_e32 v1, v0
4049; GFX10-NEXT:    s_setpc_b64 s[30:31]
4050;
4051; GFX11-LABEL: shuffle_v4bf16_6767:
4052; GFX11:       ; %bb.0:
4053; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4054; GFX11-NEXT:    global_load_b32 v0, v[2:3], off offset:4
4055; GFX11-NEXT:    s_waitcnt vmcnt(0)
4056; GFX11-NEXT:    v_mov_b32_e32 v1, v0
4057; GFX11-NEXT:    s_setpc_b64 s[30:31]
4058  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
4059  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
4060  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
4061  ret <4 x bfloat> %shuffle
4062}
4063
; Shuffle mask <2, 3, 5, 6>: high two elements of %val0 followed by elements 1
; and 2 of %val1. The second pair straddles the two dwords of %val1, which is
; why the expected code combines them with v_alignbit_b32 by 16 bits.
4064define <4 x bfloat> @shuffle_v4bf16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4065; GX900-LABEL: shuffle_v4bf16_2356:
4066; GX900:       ; %bb.0:
4067; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4068; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
4069; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
4070; GX900-NEXT:    s_waitcnt vmcnt(1)
4071; GX900-NEXT:    v_alignbit_b32 v1, v6, v5, 16
4072; GX900-NEXT:    s_waitcnt vmcnt(0)
4073; GX900-NEXT:    v_mov_b32_e32 v0, v4
4074; GX900-NEXT:    s_setpc_b64 s[30:31]
4075;
4076; GFX940-LABEL: shuffle_v4bf16_2356:
4077; GFX940:       ; %bb.0:
4078; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4079; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
4080; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
4081; GFX940-NEXT:    s_waitcnt vmcnt(1)
4082; GFX940-NEXT:    v_alignbit_b32 v1, v7, v6, 16
4083; GFX940-NEXT:    s_waitcnt vmcnt(0)
4084; GFX940-NEXT:    v_mov_b32_e32 v0, v4
4085; GFX940-NEXT:    s_setpc_b64 s[30:31]
4086;
4087; GFX10-LABEL: shuffle_v4bf16_2356:
4088; GFX10:       ; %bb.0:
4089; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4090; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
4091; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
4092; GFX10-NEXT:    s_waitcnt vmcnt(1)
4093; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
4094; GFX10-NEXT:    s_waitcnt vmcnt(0)
4095; GFX10-NEXT:    v_mov_b32_e32 v0, v4
4096; GFX10-NEXT:    s_setpc_b64 s[30:31]
4097;
4098; GFX11-LABEL: shuffle_v4bf16_2356:
4099; GFX11:       ; %bb.0:
4100; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4101; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
4102; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
4103; GFX11-NEXT:    s_waitcnt vmcnt(1)
4104; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
4105; GFX11-NEXT:    s_waitcnt vmcnt(0)
4106; GFX11-NEXT:    s_setpc_b64 s[30:31]
4107  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
4108  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
4109  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
4110  ret <4 x bfloat> %shuffle
4111}
4112
; Shuffle mask <5, 6, 2, 3>: elements 1 and 2 of %val1 followed by the high two
; elements of %val0. The first pair straddles the two dwords of %val1, which is
; why the expected code combines them with v_alignbit_b32 by 16 bits.
4113define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4114; GX900-LABEL: shuffle_v4bf16_5623:
4115; GX900:       ; %bb.0:
4116; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4117; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
4118; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
4119; GX900-NEXT:    s_waitcnt vmcnt(1)
4120; GX900-NEXT:    v_alignbit_b32 v0, v6, v5, 16
4121; GX900-NEXT:    s_waitcnt vmcnt(0)
4122; GX900-NEXT:    v_mov_b32_e32 v1, v4
4123; GX900-NEXT:    s_setpc_b64 s[30:31]
4124;
4125; GFX940-LABEL: shuffle_v4bf16_5623:
4126; GFX940:       ; %bb.0:
4127; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4128; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
4129; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
4130; GFX940-NEXT:    s_waitcnt vmcnt(1)
4131; GFX940-NEXT:    v_alignbit_b32 v0, v7, v6, 16
4132; GFX940-NEXT:    s_waitcnt vmcnt(0)
4133; GFX940-NEXT:    v_mov_b32_e32 v1, v4
4134; GFX940-NEXT:    s_setpc_b64 s[30:31]
4135;
4136; GFX10-LABEL: shuffle_v4bf16_5623:
4137; GFX10:       ; %bb.0:
4138; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4139; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
4140; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
4141; GFX10-NEXT:    s_waitcnt vmcnt(1)
4142; GFX10-NEXT:    v_alignbit_b32 v0, v6, v5, 16
4143; GFX10-NEXT:    s_waitcnt vmcnt(0)
4144; GFX10-NEXT:    v_mov_b32_e32 v1, v4
4145; GFX10-NEXT:    s_setpc_b64 s[30:31]
4146;
4147; GFX11-LABEL: shuffle_v4bf16_5623:
4148; GFX11:       ; %bb.0:
4149; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4150; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
4151; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
4152; GFX11-NEXT:    s_waitcnt vmcnt(1)
4153; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
4154; GFX11-NEXT:    s_waitcnt vmcnt(0)
4155; GFX11-NEXT:    s_setpc_b64 s[30:31]
4156  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
4157  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
4158  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
4159  ret <4 x bfloat> %shuffle
4160}
4161
; Shuffle mask <3, 4, 5, 6>: element 3 of %val0 followed by elements 0, 1 and 2
; of %val1, i.e. four consecutive elements of the concatenated inputs starting
; at index 3. Both result dwords are expected to come from v_alignbit_b32.
4162define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4163; GFX9-LABEL: shuffle_v4bf16_3456:
4164; GFX9:       ; %bb.0:
4165; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4166; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
4167; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
4168; GFX9-NEXT:    s_waitcnt vmcnt(1)
4169; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
4170; GFX9-NEXT:    s_waitcnt vmcnt(0)
4171; GFX9-NEXT:    v_alignbit_b32 v0, v4, v6, 16
4172; GFX9-NEXT:    s_setpc_b64 s[30:31]
4173;
4174; GFX10-LABEL: shuffle_v4bf16_3456:
4175; GFX10:       ; %bb.0:
4176; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4177; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
4178; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
4179; GFX10-NEXT:    s_waitcnt vmcnt(1)
4180; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
4181; GFX10-NEXT:    s_waitcnt vmcnt(0)
4182; GFX10-NEXT:    v_alignbit_b32 v0, v4, v6, 16
4183; GFX10-NEXT:    s_setpc_b64 s[30:31]
4184;
4185; GFX11-LABEL: shuffle_v4bf16_3456:
4186; GFX11:       ; %bb.0:
4187; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4188; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
4189; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
4190; GFX11-NEXT:    s_waitcnt vmcnt(1)
4191; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
4192; GFX11-NEXT:    s_waitcnt vmcnt(0)
4193; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
4194; GFX11-NEXT:    s_setpc_b64 s[30:31]
4195  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
4196  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
4197  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
4198  ret <4 x bfloat> %shuffle
4199}
4200
; Shuffle mask <5, 6, 3, 4>: elements 1 and 2 of %val1, then element 3 of %val0
; and element 0 of %val1. Neither result dword is dword-aligned in its source,
; so both are expected to come from v_alignbit_b32.
4201define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4202; GFX9-LABEL: shuffle_v4bf16_5634:
4203; GFX9:       ; %bb.0:
4204; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4205; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
4206; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
4207; GFX9-NEXT:    s_waitcnt vmcnt(1)
4208; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
4209; GFX9-NEXT:    s_waitcnt vmcnt(0)
4210; GFX9-NEXT:    v_alignbit_b32 v1, v4, v6, 16
4211; GFX9-NEXT:    s_setpc_b64 s[30:31]
4212;
4213; GFX10-LABEL: shuffle_v4bf16_5634:
4214; GFX10:       ; %bb.0:
4215; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4216; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
4217; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
4218; GFX10-NEXT:    s_waitcnt vmcnt(1)
4219; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
4220; GFX10-NEXT:    s_waitcnt vmcnt(0)
4221; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
4222; GFX10-NEXT:    s_setpc_b64 s[30:31]
4223;
4224; GFX11-LABEL: shuffle_v4bf16_5634:
4225; GFX11:       ; %bb.0:
4226; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4227; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
4228; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
4229; GFX11-NEXT:    s_waitcnt vmcnt(1)
4230; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
4231; GFX11-NEXT:    s_waitcnt vmcnt(0)
4232; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
4233; GFX11-NEXT:    s_setpc_b64 s[30:31]
4234  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
4235  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
4236  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
4237  ret <4 x bfloat> %shuffle
4238}
4239
; Two-input shuffle <5, 7, 3, 4>: picks val1[1], val1[3], val0[3], val1[0].
; The first result dword gathers the high halves of both %val1 dwords, which
; the expected code does with v_perm_b32 and byte selector 0x7060302; the
; second dword is formed with v_alignbit_b32 by 16. The gfx900 and gfx940
; expectations place the selector in an SGPR first (s4 and s0 respectively),
; while the gfx10 and gfx11 expectations pass it as a literal operand.
4240define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4241; GX900-LABEL: shuffle_v4bf16_5734:
4242; GX900:       ; %bb.0:
4243; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4244; GX900-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
4245; GX900-NEXT:    global_load_dword v6, v[0:1], off offset:4
4246; GX900-NEXT:    s_mov_b32 s4, 0x7060302
4247; GX900-NEXT:    s_waitcnt vmcnt(1)
4248; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
4249; GX900-NEXT:    s_waitcnt vmcnt(0)
4250; GX900-NEXT:    v_alignbit_b32 v1, v4, v6, 16
4251; GX900-NEXT:    s_setpc_b64 s[30:31]
4252;
4253; GFX940-LABEL: shuffle_v4bf16_5734:
4254; GFX940:       ; %bb.0:
4255; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4256; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
4257; GFX940-NEXT:    global_load_dword v6, v[0:1], off offset:4
4258; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
4259; GFX940-NEXT:    s_waitcnt vmcnt(1)
4260; GFX940-NEXT:    v_perm_b32 v0, v5, v4, s0
4261; GFX940-NEXT:    s_waitcnt vmcnt(0)
4262; GFX940-NEXT:    v_alignbit_b32 v1, v4, v6, 16
4263; GFX940-NEXT:    s_setpc_b64 s[30:31]
4264;
4265; GFX10-LABEL: shuffle_v4bf16_5734:
4266; GFX10:       ; %bb.0:
4267; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4268; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
4269; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
4270; GFX10-NEXT:    s_waitcnt vmcnt(1)
4271; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
4272; GFX10-NEXT:    s_waitcnt vmcnt(0)
4273; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
4274; GFX10-NEXT:    s_setpc_b64 s[30:31]
4275;
4276; GFX11-LABEL: shuffle_v4bf16_5734:
4277; GFX11:       ; %bb.0:
4278; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4279; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
4280; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
4281; GFX11-NEXT:    s_waitcnt vmcnt(1)
4282; GFX11-NEXT:    v_perm_b32 v0, v3, v2, 0x7060302
4283; GFX11-NEXT:    s_waitcnt vmcnt(0)
4284; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
4285; GFX11-NEXT:    s_setpc_b64 s[30:31]
4286  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
4287  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
4288  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
4289  ret <4 x bfloat> %shuffle
4290}
4291
; Splat of val0[0] into all four lanes (zeroinitializer mask). %val1 is loaded
; in the IR but never selected, and the expected code contains no load through
; v[2:3]. The expected sequence duplicates the low half of the first dword with
; v_perm_b32 (selector 0x5040100) and copies the result into v1. Note that the
; expectations still load both dwords of %val0 even though only the first is
; consumed.
4292define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4293; GX900-LABEL: shuffle_v4bf16_0000:
4294; GX900:       ; %bb.0:
4295; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4296; GX900-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
4297; GX900-NEXT:    s_mov_b32 s4, 0x5040100
4298; GX900-NEXT:    s_waitcnt vmcnt(0)
4299; GX900-NEXT:    v_perm_b32 v0, v0, v0, s4
4300; GX900-NEXT:    v_mov_b32_e32 v1, v0
4301; GX900-NEXT:    s_setpc_b64 s[30:31]
4302;
4303; GFX940-LABEL: shuffle_v4bf16_0000:
4304; GFX940:       ; %bb.0:
4305; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4306; GFX940-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
4307; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
4308; GFX940-NEXT:    s_waitcnt vmcnt(0)
4309; GFX940-NEXT:    v_perm_b32 v0, v0, v0, s0
4310; GFX940-NEXT:    v_mov_b32_e32 v1, v0
4311; GFX940-NEXT:    s_setpc_b64 s[30:31]
4312;
4313; GFX10-LABEL: shuffle_v4bf16_0000:
4314; GFX10:       ; %bb.0:
4315; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4316; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
4317; GFX10-NEXT:    s_waitcnt vmcnt(0)
4318; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
4319; GFX10-NEXT:    v_mov_b32_e32 v1, v0
4320; GFX10-NEXT:    s_setpc_b64 s[30:31]
4321;
4322; GFX11-LABEL: shuffle_v4bf16_0000:
4323; GFX11:       ; %bb.0:
4324; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4325; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
4326; GFX11-NEXT:    s_waitcnt vmcnt(0)
4327; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
4328; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4329; GFX11-NEXT:    v_mov_b32_e32 v1, v0
4330; GFX11-NEXT:    s_setpc_b64 s[30:31]
4331  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
4332  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
4333  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> zeroinitializer
4334  ret <4 x bfloat> %shuffle
4335}
4336
; Shuffle <1, 0, 1, 0>: the first two elements of %val0 swapped, repeated in
; both result dwords. %val1 is not selected by the mask. The expected code for
; every target swaps the halves of the first dword with
; v_alignbit_b32 v0, v0, v0, 16 and then copies the result into v1.
4337define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4338; GFX9-LABEL: shuffle_v4bf16_1010:
4339; GFX9:       ; %bb.0:
4340; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4341; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
4342; GFX9-NEXT:    s_waitcnt vmcnt(0)
4343; GFX9-NEXT:    v_alignbit_b32 v0, v0, v0, 16
4344; GFX9-NEXT:    v_mov_b32_e32 v1, v0
4345; GFX9-NEXT:    s_setpc_b64 s[30:31]
4346;
4347; GFX10-LABEL: shuffle_v4bf16_1010:
4348; GFX10:       ; %bb.0:
4349; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4350; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
4351; GFX10-NEXT:    s_waitcnt vmcnt(0)
4352; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
4353; GFX10-NEXT:    v_mov_b32_e32 v1, v0
4354; GFX10-NEXT:    s_setpc_b64 s[30:31]
4355;
4356; GFX11-LABEL: shuffle_v4bf16_1010:
4357; GFX11:       ; %bb.0:
4358; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4359; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
4360; GFX11-NEXT:    s_waitcnt vmcnt(0)
4361; GFX11-NEXT:    v_alignbit_b32 v0, v0, v0, 16
4362; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4363; GFX11-NEXT:    v_mov_b32_e32 v1, v0
4364; GFX11-NEXT:    s_setpc_b64 s[30:31]
4365  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
4366  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
4367  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
4368  ret <4 x bfloat> %shuffle
4369}
4370
; Shuffle <1, 1, 0, 0>: val0[1] duplicated in the first result dword and
; val0[0] duplicated in the second. %val1 is not selected by the mask. The
; expected code derives both result dwords from the first loaded dword with two
; v_perm_b32 instructions: selector 0x7060302 replicates the high half and
; selector 0x5040100 replicates the low half.
4371define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4372; GX900-LABEL: shuffle_v4bf16_1100:
4373; GX900:       ; %bb.0:
4374; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4375; GX900-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
4376; GX900-NEXT:    s_mov_b32 s4, 0x7060302
4377; GX900-NEXT:    s_mov_b32 s5, 0x5040100
4378; GX900-NEXT:    s_waitcnt vmcnt(0)
4379; GX900-NEXT:    v_perm_b32 v0, v1, v1, s4
4380; GX900-NEXT:    v_perm_b32 v1, v1, v1, s5
4381; GX900-NEXT:    s_setpc_b64 s[30:31]
4382;
4383; GFX940-LABEL: shuffle_v4bf16_1100:
4384; GFX940:       ; %bb.0:
4385; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4386; GFX940-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
4387; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
4388; GFX940-NEXT:    s_mov_b32 s1, 0x5040100
4389; GFX940-NEXT:    s_waitcnt vmcnt(0)
4390; GFX940-NEXT:    v_perm_b32 v0, v2, v2, s0
4391; GFX940-NEXT:    v_perm_b32 v1, v2, v2, s1
4392; GFX940-NEXT:    s_setpc_b64 s[30:31]
4393;
4394; GFX10-LABEL: shuffle_v4bf16_1100:
4395; GFX10:       ; %bb.0:
4396; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4397; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
4398; GFX10-NEXT:    s_waitcnt vmcnt(0)
4399; GFX10-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
4400; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
4401; GFX10-NEXT:    s_setpc_b64 s[30:31]
4402;
4403; GFX11-LABEL: shuffle_v4bf16_1100:
4404; GFX11:       ; %bb.0:
4405; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4406; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
4407; GFX11-NEXT:    s_waitcnt vmcnt(0)
4408; GFX11-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
4409; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
4410; GFX11-NEXT:    s_setpc_b64 s[30:31]
4411  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
4412  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
4413  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
4414  ret <4 x bfloat> %shuffle
4415}
4416
; Two-input shuffle <6, 1, 6, 1>: the pair (val1[2], val0[1]) repeated in both
; result dwords. The expected code loads the first dword of %val0 and the
; second dword of %val1 (offset 4), merges them with v_bfi_b32 using the mask
; 0xffff (low 16 bits from the %val1 dword, high 16 bits from the %val0 dword),
; and copies the merged dword into v1.
4417define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4418; GX900-LABEL: shuffle_v4bf16_6161:
4419; GX900:       ; %bb.0:
4420; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4421; GX900-NEXT:    global_load_dword v4, v[0:1], off
4422; GX900-NEXT:    global_load_dword v5, v[2:3], off offset:4
4423; GX900-NEXT:    s_mov_b32 s4, 0xffff
4424; GX900-NEXT:    s_waitcnt vmcnt(0)
4425; GX900-NEXT:    v_bfi_b32 v0, s4, v5, v4
4426; GX900-NEXT:    v_mov_b32_e32 v1, v0
4427; GX900-NEXT:    s_setpc_b64 s[30:31]
4428;
4429; GFX940-LABEL: shuffle_v4bf16_6161:
4430; GFX940:       ; %bb.0:
4431; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4432; GFX940-NEXT:    global_load_dword v4, v[0:1], off
4433; GFX940-NEXT:    global_load_dword v5, v[2:3], off offset:4
4434; GFX940-NEXT:    s_mov_b32 s0, 0xffff
4435; GFX940-NEXT:    s_waitcnt vmcnt(0)
4436; GFX940-NEXT:    v_bfi_b32 v0, s0, v5, v4
4437; GFX940-NEXT:    v_mov_b32_e32 v1, v0
4438; GFX940-NEXT:    s_setpc_b64 s[30:31]
4439;
4440; GFX10-LABEL: shuffle_v4bf16_6161:
4441; GFX10:       ; %bb.0:
4442; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4443; GFX10-NEXT:    global_load_dword v4, v[0:1], off
4444; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
4445; GFX10-NEXT:    s_waitcnt vmcnt(0)
4446; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v5, v4
4447; GFX10-NEXT:    v_mov_b32_e32 v1, v0
4448; GFX10-NEXT:    s_setpc_b64 s[30:31]
4449;
4450; GFX11-LABEL: shuffle_v4bf16_6161:
4451; GFX11:       ; %bb.0:
4452; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4453; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
4454; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
4455; GFX11-NEXT:    s_waitcnt vmcnt(0)
4456; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
4457; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4458; GFX11-NEXT:    v_mov_b32_e32 v1, v0
4459; GFX11-NEXT:    s_setpc_b64 s[30:31]
4460  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
4461  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
4462  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
4463  ret <4 x bfloat> %shuffle
4464}
4465
; Shuffle <2, 3, 3, 3>: the first result dword is the second dword of %val0
; unchanged, and the second result dword holds val0[3] twice. %val1 is not
; selected by the mask. The expected code loads only the dword at offset 4,
; leaves it in v0, and builds v1 from its high half with v_perm_b32 and
; selector 0x7060302.
4466define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4467; GX900-LABEL: shuffle_v4bf16_2333:
4468; GX900:       ; %bb.0:
4469; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4470; GX900-NEXT:    global_load_dword v0, v[0:1], off offset:4
4471; GX900-NEXT:    s_mov_b32 s4, 0x7060302
4472; GX900-NEXT:    s_waitcnt vmcnt(0)
4473; GX900-NEXT:    v_perm_b32 v1, v0, v0, s4
4474; GX900-NEXT:    s_setpc_b64 s[30:31]
4475;
4476; GFX940-LABEL: shuffle_v4bf16_2333:
4477; GFX940:       ; %bb.0:
4478; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4479; GFX940-NEXT:    global_load_dword v0, v[0:1], off offset:4
4480; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
4481; GFX940-NEXT:    s_waitcnt vmcnt(0)
4482; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s0
4483; GFX940-NEXT:    s_setpc_b64 s[30:31]
4484;
4485; GFX10-LABEL: shuffle_v4bf16_2333:
4486; GFX10:       ; %bb.0:
4487; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4488; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
4489; GFX10-NEXT:    s_waitcnt vmcnt(0)
4490; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
4491; GFX10-NEXT:    s_setpc_b64 s[30:31]
4492;
4493; GFX11-LABEL: shuffle_v4bf16_2333:
4494; GFX11:       ; %bb.0:
4495; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4496; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
4497; GFX11-NEXT:    s_waitcnt vmcnt(0)
4498; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
4499; GFX11-NEXT:    s_setpc_b64 s[30:31]
4500  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
4501  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
4502  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
4503  ret <4 x bfloat> %shuffle
4504}
4505
; Shuffle <6, 6, 6, 7>: every selected element comes from %val1 (mask indices
; 4..7), namely val1[2] three times followed by val1[3]; %val0 is not selected.
; The second result dword is therefore the second dword of %val1 unchanged, and
; the first result dword holds val1[2] twice. The expected code loads only the
; dword at offset 4 through the %arg1 pointer into v1 and builds v0 from its
; low half with v_perm_b32 and selector 0x5040100.
4506define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4507; GX900-LABEL: shuffle_v4bf16_6667:
4508; GX900:       ; %bb.0:
4509; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4510; GX900-NEXT:    global_load_dword v1, v[2:3], off offset:4
4511; GX900-NEXT:    s_mov_b32 s4, 0x5040100
4512; GX900-NEXT:    s_waitcnt vmcnt(0)
4513; GX900-NEXT:    v_perm_b32 v0, v1, v1, s4
4514; GX900-NEXT:    s_setpc_b64 s[30:31]
4515;
4516; GFX940-LABEL: shuffle_v4bf16_6667:
4517; GFX940:       ; %bb.0:
4518; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4519; GFX940-NEXT:    global_load_dword v1, v[2:3], off offset:4
4520; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
4521; GFX940-NEXT:    s_waitcnt vmcnt(0)
4522; GFX940-NEXT:    v_perm_b32 v0, v1, v1, s0
4523; GFX940-NEXT:    s_setpc_b64 s[30:31]
4524;
4525; GFX10-LABEL: shuffle_v4bf16_6667:
4526; GFX10:       ; %bb.0:
4527; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4528; GFX10-NEXT:    global_load_dword v1, v[2:3], off offset:4
4529; GFX10-NEXT:    s_waitcnt vmcnt(0)
4530; GFX10-NEXT:    v_perm_b32 v0, v1, v1, 0x5040100
4531; GFX10-NEXT:    s_setpc_b64 s[30:31]
4532;
4533; GFX11-LABEL: shuffle_v4bf16_6667:
4534; GFX11:       ; %bb.0:
4535; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4536; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
4537; GFX11-NEXT:    s_waitcnt vmcnt(0)
4538; GFX11-NEXT:    v_perm_b32 v0, v1, v1, 0x5040100
4539; GFX11-NEXT:    s_setpc_b64 s[30:31]
4540  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
4541  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
4542  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 6, i32 6, i32 7>
4543  ret <4 x bfloat> %shuffle
4544}
4545
; Narrowing shuffle from <8 x bfloat> inputs to a 4-element result with mask
; <0, 1, 0, 1>: the first dword of %val0 repeated twice. %val1 is not selected.
; The expected code for every target is a single dword load followed by a copy
; of v0 into v1.
4546define <4 x bfloat> @shuffle_v8bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4547; GFX9-LABEL: shuffle_v8bf16_0101:
4548; GFX9:       ; %bb.0:
4549; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4550; GFX9-NEXT:    global_load_dword v0, v[0:1], off
4551; GFX9-NEXT:    s_waitcnt vmcnt(0)
4552; GFX9-NEXT:    v_mov_b32_e32 v1, v0
4553; GFX9-NEXT:    s_setpc_b64 s[30:31]
4554;
4555; GFX10-LABEL: shuffle_v8bf16_0101:
4556; GFX10:       ; %bb.0:
4557; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4558; GFX10-NEXT:    global_load_dword v0, v[0:1], off
4559; GFX10-NEXT:    s_waitcnt vmcnt(0)
4560; GFX10-NEXT:    v_mov_b32_e32 v1, v0
4561; GFX10-NEXT:    s_setpc_b64 s[30:31]
4562;
4563; GFX11-LABEL: shuffle_v8bf16_0101:
4564; GFX11:       ; %bb.0:
4565; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4566; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
4567; GFX11-NEXT:    s_waitcnt vmcnt(0)
4568; GFX11-NEXT:    v_mov_b32_e32 v1, v0
4569; GFX11-NEXT:    s_setpc_b64 s[30:31]
4570  %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
4571  %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
4572  %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
4573  ret <4 x bfloat> %shuffle
4574}
4575
; Narrowing shuffle from <8 x bfloat> inputs with mask <0, 1, 2, 3>: the first
; four elements of %val0 in order. %val1 is not selected. The expected code for
; every target is just a two-dword load straight into the return registers,
; with no ALU instructions.
4576define <4 x bfloat> @shuffle_v8bf16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4577; GFX9-LABEL: shuffle_v8bf16_0123:
4578; GFX9:       ; %bb.0:
4579; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4580; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
4581; GFX9-NEXT:    s_waitcnt vmcnt(0)
4582; GFX9-NEXT:    s_setpc_b64 s[30:31]
4583;
4584; GFX10-LABEL: shuffle_v8bf16_0123:
4585; GFX10:       ; %bb.0:
4586; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4587; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
4588; GFX10-NEXT:    s_waitcnt vmcnt(0)
4589; GFX10-NEXT:    s_setpc_b64 s[30:31]
4590;
4591; GFX11-LABEL: shuffle_v8bf16_0123:
4592; GFX11:       ; %bb.0:
4593; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4594; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
4595; GFX11-NEXT:    s_waitcnt vmcnt(0)
4596; GFX11-NEXT:    s_setpc_b64 s[30:31]
4597  %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
4598  %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
4599  %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4600  ret <4 x bfloat> %shuffle
4601}
4602
; Two-input narrowing shuffle from <8 x bfloat> inputs with mask <4, 5, 8, 9>:
; val0[4..5] followed by val1[0..1] (mask indices 8 and up refer to %val1).
; Both pairs are dword aligned, so the expected code is two single-dword loads
; (offset 8 from %arg0, offset 0 from %arg1). The gfx9 and gfx10 expectations
; load into v4/v5 and copy into v0/v1; the gfx11 expectations load directly
; into v0/v1.
4603define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4604; GFX9-LABEL: shuffle_v8bf16_4589:
4605; GFX9:       ; %bb.0:
4606; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4607; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:8
4608; GFX9-NEXT:    global_load_dword v5, v[2:3], off
4609; GFX9-NEXT:    s_waitcnt vmcnt(1)
4610; GFX9-NEXT:    v_mov_b32_e32 v0, v4
4611; GFX9-NEXT:    s_waitcnt vmcnt(0)
4612; GFX9-NEXT:    v_mov_b32_e32 v1, v5
4613; GFX9-NEXT:    s_setpc_b64 s[30:31]
4614;
4615; GFX10-LABEL: shuffle_v8bf16_4589:
4616; GFX10:       ; %bb.0:
4617; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4618; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:8
4619; GFX10-NEXT:    global_load_dword v5, v[2:3], off
4620; GFX10-NEXT:    s_waitcnt vmcnt(1)
4621; GFX10-NEXT:    v_mov_b32_e32 v0, v4
4622; GFX10-NEXT:    s_waitcnt vmcnt(0)
4623; GFX10-NEXT:    v_mov_b32_e32 v1, v5
4624; GFX10-NEXT:    s_setpc_b64 s[30:31]
4625;
4626; GFX11-LABEL: shuffle_v8bf16_4589:
4627; GFX11:       ; %bb.0:
4628; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4629; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:8
4630; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
4631; GFX11-NEXT:    s_waitcnt vmcnt(0)
4632; GFX11-NEXT:    s_setpc_b64 s[30:31]
4633  %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
4634  %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
4635  %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
4636  ret <4 x bfloat> %shuffle
4637}
4638
; Two-input narrowing shuffle from <8 x bfloat> inputs with mask
; <10, 11, 2, 3>: val1[2..3] followed by val0[2..3]. Both pairs are dword
; aligned, so the expected code is two single-dword loads at offset 4, the
; %arg1 one first. The only ALU work expected is register copies into the
; return registers (two on gfx9 and gfx10, one on gfx11).
4639define <4 x bfloat> @shuffle_v8bf16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4640; GFX9-LABEL: shuffle_v8bf16_10_11_2_3:
4641; GFX9:       ; %bb.0:
4642; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4643; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
4644; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
4645; GFX9-NEXT:    s_waitcnt vmcnt(1)
4646; GFX9-NEXT:    v_mov_b32_e32 v0, v4
4647; GFX9-NEXT:    s_waitcnt vmcnt(0)
4648; GFX9-NEXT:    v_mov_b32_e32 v1, v5
4649; GFX9-NEXT:    s_setpc_b64 s[30:31]
4650;
4651; GFX10-LABEL: shuffle_v8bf16_10_11_2_3:
4652; GFX10:       ; %bb.0:
4653; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4654; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
4655; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
4656; GFX10-NEXT:    s_waitcnt vmcnt(1)
4657; GFX10-NEXT:    v_mov_b32_e32 v0, v4
4658; GFX10-NEXT:    s_waitcnt vmcnt(0)
4659; GFX10-NEXT:    v_mov_b32_e32 v1, v5
4660; GFX10-NEXT:    s_setpc_b64 s[30:31]
4661;
4662; GFX11-LABEL: shuffle_v8bf16_10_11_2_3:
4663; GFX11:       ; %bb.0:
4664; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4665; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
4666; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
4667; GFX11-NEXT:    s_waitcnt vmcnt(1)
4668; GFX11-NEXT:    v_mov_b32_e32 v0, v2
4669; GFX11-NEXT:    s_waitcnt vmcnt(0)
4670; GFX11-NEXT:    s_setpc_b64 s[30:31]
4671  %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
4672  %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
4673  %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
4674  ret <4 x bfloat> %shuffle
4675}
4676
; Two-input narrowing shuffle from <8 x bfloat> inputs with mask
; <13, 14, 2, 3>: val1[5], val1[6] followed by val0[2..3]. The first pair
; straddles a dword boundary, so the expected code loads two dwords of %val1
; at offset 8 and combines them with v_alignbit_b32 by 16; the second pair is
; dword aligned and comes from a single dword load at offset 4 of %arg0.
4677define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4678; GX900-LABEL: shuffle_v8bf16_13_14_2_3:
4679; GX900:       ; %bb.0:
4680; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4681; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
4682; GX900-NEXT:    global_load_dword v4, v[0:1], off offset:4
4683; GX900-NEXT:    s_waitcnt vmcnt(1)
4684; GX900-NEXT:    v_alignbit_b32 v0, v6, v5, 16
4685; GX900-NEXT:    s_waitcnt vmcnt(0)
4686; GX900-NEXT:    v_mov_b32_e32 v1, v4
4687; GX900-NEXT:    s_setpc_b64 s[30:31]
4688;
4689; GFX940-LABEL: shuffle_v8bf16_13_14_2_3:
4690; GFX940:       ; %bb.0:
4691; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4692; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off offset:8
4693; GFX940-NEXT:    global_load_dword v4, v[0:1], off offset:4
4694; GFX940-NEXT:    s_waitcnt vmcnt(1)
4695; GFX940-NEXT:    v_alignbit_b32 v0, v7, v6, 16
4696; GFX940-NEXT:    s_waitcnt vmcnt(0)
4697; GFX940-NEXT:    v_mov_b32_e32 v1, v4
4698; GFX940-NEXT:    s_setpc_b64 s[30:31]
4699;
4700; GFX10-LABEL: shuffle_v8bf16_13_14_2_3:
4701; GFX10:       ; %bb.0:
4702; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4703; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
4704; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
4705; GFX10-NEXT:    s_waitcnt vmcnt(1)
4706; GFX10-NEXT:    v_alignbit_b32 v0, v6, v5, 16
4707; GFX10-NEXT:    s_waitcnt vmcnt(0)
4708; GFX10-NEXT:    v_mov_b32_e32 v1, v4
4709; GFX10-NEXT:    s_setpc_b64 s[30:31]
4710;
4711; GFX11-LABEL: shuffle_v8bf16_13_14_2_3:
4712; GFX11:       ; %bb.0:
4713; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4714; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:8
4715; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
4716; GFX11-NEXT:    s_waitcnt vmcnt(1)
4717; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
4718; GFX11-NEXT:    s_waitcnt vmcnt(0)
4719; GFX11-NEXT:    s_setpc_b64 s[30:31]
4720  %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
4721  %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
4722  %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
4723  ret <4 x bfloat> %shuffle
4724}
4725
; Widening shuffle from <3 x bfloat> inputs to a 4-element result with mask
; <0, 1, 2, 2>: the three elements of %val0 with the last one repeated. %val1
; is not selected (it would start at mask index 3). The expected code loads two
; dwords, keeps the first as loaded, and duplicates the low half of the second
; with v_perm_b32 and selector 0x5040100.
4726define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4727; GX900-LABEL: shuffle_v3bf16_0122:
4728; GX900:       ; %bb.0:
4729; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4730; GX900-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
4731; GX900-NEXT:    s_mov_b32 s4, 0x5040100
4732; GX900-NEXT:    s_waitcnt vmcnt(0)
4733; GX900-NEXT:    v_perm_b32 v1, v1, v1, s4
4734; GX900-NEXT:    s_setpc_b64 s[30:31]
4735;
4736; GFX940-LABEL: shuffle_v3bf16_0122:
4737; GFX940:       ; %bb.0:
4738; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4739; GFX940-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
4740; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
4741; GFX940-NEXT:    s_waitcnt vmcnt(0)
4742; GFX940-NEXT:    v_perm_b32 v1, v1, v1, s0
4743; GFX940-NEXT:    s_setpc_b64 s[30:31]
4744;
4745; GFX10-LABEL: shuffle_v3bf16_0122:
4746; GFX10:       ; %bb.0:
4747; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4748; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
4749; GFX10-NEXT:    s_waitcnt vmcnt(0)
4750; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
4751; GFX10-NEXT:    s_setpc_b64 s[30:31]
4752;
4753; GFX11-LABEL: shuffle_v3bf16_0122:
4754; GFX11:       ; %bb.0:
4755; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4756; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
4757; GFX11-NEXT:    s_waitcnt vmcnt(0)
4758; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
4759; GFX11-NEXT:    s_setpc_b64 s[30:31]
4760  %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0
4761  %val1 = load <3 x bfloat>, ptr addrspace(1) %arg1
4762  %shuffle = shufflevector <3 x bfloat> %val0, <3 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
4763  ret <4 x bfloat> %shuffle
4764}
4765
; Widening shuffle from <2 x bfloat> inputs to a 4-element result. As written,
; the mask is <0, 1, 1, 0>: %val0 unchanged, then %val0 with its two elements
; swapped; %val1 is not selected. The expected code matches that mask: one
; dword load kept in v0, and v1 formed by swapping its halves with
; v_alignbit_b32 v1, v0, v0, 16.
; NOTE(review): the mask <0, 1, 1, 0> does not match the "0122" in the function
; name (index 2 would select the first element of %val1) -- confirm which one
; was intended; changing the mask requires regenerating the assertions.
4766define <4 x bfloat> @shuffle_v2bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4767; GFX9-LABEL: shuffle_v2bf16_0122:
4768; GFX9:       ; %bb.0:
4769; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4770; GFX9-NEXT:    global_load_dword v0, v[0:1], off
4771; GFX9-NEXT:    s_waitcnt vmcnt(0)
4772; GFX9-NEXT:    v_alignbit_b32 v1, v0, v0, 16
4773; GFX9-NEXT:    s_setpc_b64 s[30:31]
4774;
4775; GFX10-LABEL: shuffle_v2bf16_0122:
4776; GFX10:       ; %bb.0:
4777; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4778; GFX10-NEXT:    global_load_dword v0, v[0:1], off
4779; GFX10-NEXT:    s_waitcnt vmcnt(0)
4780; GFX10-NEXT:    v_alignbit_b32 v1, v0, v0, 16
4781; GFX10-NEXT:    s_setpc_b64 s[30:31]
4782;
4783; GFX11-LABEL: shuffle_v2bf16_0122:
4784; GFX11:       ; %bb.0:
4785; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4786; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
4787; GFX11-NEXT:    s_waitcnt vmcnt(0)
4788; GFX11-NEXT:    v_alignbit_b32 v1, v0, v0, 16
4789; GFX11-NEXT:    s_setpc_b64 s[30:31]
4790  %val0 = load <2 x bfloat>, ptr addrspace(1) %arg0
4791  %val1 = load <2 x bfloat>, ptr addrspace(1) %arg1
4792  %shuffle = shufflevector <2 x bfloat> %val0, <2 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
4793  ret <4 x bfloat> %shuffle
4794}
4795
; Two-input shuffle on <6 x bfloat> with mask <4, 5, 2, 3, 6, 7>:
; val0[4..5], val0[2..3], then val1[0..1] (mask indices 6 and up refer to
; %val1). Every pair is dword aligned, so the expected code needs only loads
; and register copies: a three-dword load of %val0, a one-dword load of %val1,
; then v0 takes the third %val0 dword, v1 keeps the second, and v2 takes the
; %val1 dword. The expectations also copy the incoming pointer registers out
; of the way before the loads (both pointers on gfx9 and gfx10, only the
; %arg1 pointer on gfx11).
4796define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
4797; GX900-LABEL: shuffle_v6bf16_452367:
4798; GX900:       ; %bb.0:
4799; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4800; GX900-NEXT:    v_mov_b32_e32 v6, v1
4801; GX900-NEXT:    v_mov_b32_e32 v5, v0
4802; GX900-NEXT:    v_mov_b32_e32 v4, v3
4803; GX900-NEXT:    v_mov_b32_e32 v3, v2
4804; GX900-NEXT:    global_load_dwordx3 v[0:2], v[5:6], off
4805; GX900-NEXT:    global_load_dword v7, v[3:4], off
4806; GX900-NEXT:    s_waitcnt vmcnt(1)
4807; GX900-NEXT:    v_mov_b32_e32 v0, v2
4808; GX900-NEXT:    s_waitcnt vmcnt(0)
4809; GX900-NEXT:    v_mov_b32_e32 v2, v7
4810; GX900-NEXT:    s_setpc_b64 s[30:31]
4811;
4812; GFX940-LABEL: shuffle_v6bf16_452367:
4813; GFX940:       ; %bb.0:
4814; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4815; GFX940-NEXT:    v_mov_b32_e32 v7, v1
4816; GFX940-NEXT:    v_mov_b32_e32 v6, v0
4817; GFX940-NEXT:    v_mov_b32_e32 v5, v3
4818; GFX940-NEXT:    v_mov_b32_e32 v4, v2
4819; GFX940-NEXT:    global_load_dwordx3 v[0:2], v[6:7], off
4820; GFX940-NEXT:    global_load_dword v3, v[4:5], off
4821; GFX940-NEXT:    s_waitcnt vmcnt(1)
4822; GFX940-NEXT:    v_mov_b32_e32 v0, v2
4823; GFX940-NEXT:    s_waitcnt vmcnt(0)
4824; GFX940-NEXT:    v_mov_b32_e32 v2, v3
4825; GFX940-NEXT:    s_setpc_b64 s[30:31]
4826;
4827; GFX10-LABEL: shuffle_v6bf16_452367:
4828; GFX10:       ; %bb.0:
4829; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4830; GFX10-NEXT:    v_mov_b32_e32 v6, v1
4831; GFX10-NEXT:    v_mov_b32_e32 v5, v0
4832; GFX10-NEXT:    v_mov_b32_e32 v4, v3
4833; GFX10-NEXT:    v_mov_b32_e32 v3, v2
4834; GFX10-NEXT:    global_load_dwordx3 v[0:2], v[5:6], off
4835; GFX10-NEXT:    global_load_dword v7, v[3:4], off
4836; GFX10-NEXT:    s_waitcnt vmcnt(1)
4837; GFX10-NEXT:    v_mov_b32_e32 v0, v2
4838; GFX10-NEXT:    s_waitcnt vmcnt(0)
4839; GFX10-NEXT:    v_mov_b32_e32 v2, v7
4840; GFX10-NEXT:    s_setpc_b64 s[30:31]
4841;
4842; GFX11-LABEL: shuffle_v6bf16_452367:
4843; GFX11:       ; %bb.0:
4844; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4845; GFX11-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
4846; GFX11-NEXT:    global_load_b96 v[0:2], v[0:1], off
4847; GFX11-NEXT:    global_load_b32 v3, v[3:4], off
4848; GFX11-NEXT:    s_waitcnt vmcnt(1)
4849; GFX11-NEXT:    v_mov_b32_e32 v0, v2
4850; GFX11-NEXT:    s_waitcnt vmcnt(0)
4851; GFX11-NEXT:    v_mov_b32_e32 v2, v3
4852; GFX11-NEXT:    s_setpc_b64 s[30:31]
4853  %val0 = load <6 x bfloat>, ptr addrspace(1) %arg0
4854  %val1 = load <6 x bfloat>, ptr addrspace(1) %arg1
4855  %shuffle = shufflevector <6 x bfloat> %val0, <6 x bfloat> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
4856  ret <6 x bfloat> %shuffle
4857}
4858
4859define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C)  {
4860; GX900-LABEL: fma_shuffle_v2bf16:
4861; GX900:       ; %bb.0: ; %entry
4862; GX900-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x10
4863; GX900-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
4864; GX900-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
4865; GX900-NEXT:    s_movk_i32 s2, 0x7fff
4866; GX900-NEXT:    s_mov_b32 s3, 0x7060302
4867; GX900-NEXT:    s_waitcnt lgkmcnt(0)
4868; GX900-NEXT:    global_load_dwordx2 v[1:2], v0, s[0:1]
4869; GX900-NEXT:    global_load_dwordx2 v[3:4], v0, s[4:5]
4870; GX900-NEXT:    global_load_dwordx2 v[5:6], v0, s[6:7]
4871; GX900-NEXT:    s_waitcnt vmcnt(2)
4872; GX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
4873; GX900-NEXT:    s_waitcnt vmcnt(1)
4874; GX900-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
4875; GX900-NEXT:    s_waitcnt vmcnt(0)
4876; GX900-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
4877; GX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4878; GX900-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
4879; GX900-NEXT:    v_and_b32_e32 v11, 0xffff0000, v2
4880; GX900-NEXT:    v_lshlrev_b32_e32 v12, 16, v4
4881; GX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
4882; GX900-NEXT:    v_fma_f32 v7, v8, v9, v7
4883; GX900-NEXT:    v_fma_f32 v1, v8, v5, v1
4884; GX900-NEXT:    v_fma_f32 v2, v12, v5, v2
4885; GX900-NEXT:    v_bfe_u32 v5, v7, 16, 1
4886; GX900-NEXT:    v_fma_f32 v8, v12, v9, v11
4887; GX900-NEXT:    v_or_b32_e32 v9, 0x400000, v7
4888; GX900-NEXT:    v_bfe_u32 v11, v1, 16, 1
4889; GX900-NEXT:    v_add3_u32 v5, v5, v7, s2
4890; GX900-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
4891; GX900-NEXT:    v_or_b32_e32 v12, 0x400000, v1
4892; GX900-NEXT:    v_bfe_u32 v13, v8, 16, 1
4893; GX900-NEXT:    v_add3_u32 v11, v11, v1, s2
4894; GX900-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
4895; GX900-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
4896; GX900-NEXT:    v_or_b32_e32 v14, 0x400000, v8
4897; GX900-NEXT:    v_bfe_u32 v15, v2, 16, 1
4898; GX900-NEXT:    v_add3_u32 v13, v13, v8, s2
4899; GX900-NEXT:    v_cndmask_b32_e32 v1, v11, v12, vcc
4900; GX900-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
4901; GX900-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
4902; GX900-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
4903; GX900-NEXT:    v_or_b32_e32 v16, 0x400000, v2
4904; GX900-NEXT:    v_add3_u32 v15, v15, v2, s2
4905; GX900-NEXT:    v_cndmask_b32_e32 v7, v13, v14, vcc
4906; GX900-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
4907; GX900-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
4908; GX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
4909; GX900-NEXT:    v_cndmask_b32_e32 v2, v15, v16, vcc
4910; GX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
4911; GX900-NEXT:    v_fma_f32 v1, v3, v10, v1
4912; GX900-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
4913; GX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
4914; GX900-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
4915; GX900-NEXT:    v_fma_f32 v3, v3, v6, v5
4916; GX900-NEXT:    v_bfe_u32 v5, v1, 16, 1
4917; GX900-NEXT:    v_fma_f32 v2, v4, v10, v2
4918; GX900-NEXT:    v_fma_f32 v4, v4, v6, v7
4919; GX900-NEXT:    v_or_b32_e32 v6, 0x400000, v1
4920; GX900-NEXT:    v_bfe_u32 v7, v3, 16, 1
4921; GX900-NEXT:    v_add3_u32 v5, v5, v1, s2
4922; GX900-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
4923; GX900-NEXT:    v_or_b32_e32 v8, 0x400000, v3
4924; GX900-NEXT:    v_bfe_u32 v9, v2, 16, 1
4925; GX900-NEXT:    v_add3_u32 v7, v7, v3, s2
4926; GX900-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
4927; GX900-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
4928; GX900-NEXT:    v_or_b32_e32 v10, 0x400000, v2
4929; GX900-NEXT:    v_bfe_u32 v11, v4, 16, 1
4930; GX900-NEXT:    v_add3_u32 v9, v9, v2, s2
4931; GX900-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
4932; GX900-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
4933; GX900-NEXT:    v_or_b32_e32 v12, 0x400000, v4
4934; GX900-NEXT:    v_add3_u32 v11, v11, v4, s2
4935; GX900-NEXT:    v_cndmask_b32_e32 v2, v9, v10, vcc
4936; GX900-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
4937; GX900-NEXT:    v_cndmask_b32_e32 v4, v11, v12, vcc
4938; GX900-NEXT:    v_perm_b32 v2, v4, v2, s3
4939; GX900-NEXT:    v_perm_b32 v1, v3, v1, s3
4940; GX900-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
4941; GX900-NEXT:    s_endpgm
4942;
4943; GFX940-LABEL: fma_shuffle_v2bf16:
4944; GFX940:       ; %bb.0: ; %entry
4945; GFX940-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
4946; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
4947; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
4948; GFX940-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
4949; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
4950; GFX940-NEXT:    s_mov_b32 s3, 0x7060302
4951; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
4952; GFX940-NEXT:    global_load_dwordx2 v[0:1], v6, s[8:9]
4953; GFX940-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
4954; GFX940-NEXT:    global_load_dwordx2 v[4:5], v6, s[10:11]
4955; GFX940-NEXT:    s_waitcnt vmcnt(2)
4956; GFX940-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
4957; GFX940-NEXT:    s_waitcnt vmcnt(1)
4958; GFX940-NEXT:    v_and_b32_e32 v8, 0xffff0000, v2
4959; GFX940-NEXT:    s_waitcnt vmcnt(0)
4960; GFX940-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
4961; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
4962; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
4963; GFX940-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
4964; GFX940-NEXT:    v_and_b32_e32 v12, 0xffff0000, v3
4965; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
4966; GFX940-NEXT:    v_fmac_f32_e32 v8, v7, v9
4967; GFX940-NEXT:    v_fmac_f32_e32 v2, v7, v4
4968; GFX940-NEXT:    v_fmac_f32_e32 v3, v11, v4
4969; GFX940-NEXT:    v_bfe_u32 v4, v8, 16, 1
4970; GFX940-NEXT:    v_fmac_f32_e32 v12, v11, v9
4971; GFX940-NEXT:    v_or_b32_e32 v7, 0x400000, v8
4972; GFX940-NEXT:    v_bfe_u32 v9, v2, 16, 1
4973; GFX940-NEXT:    v_add3_u32 v4, v4, v8, s2
4974; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
4975; GFX940-NEXT:    v_or_b32_e32 v11, 0x400000, v2
4976; GFX940-NEXT:    v_bfe_u32 v13, v12, 16, 1
4977; GFX940-NEXT:    v_add3_u32 v9, v9, v2, s2
4978; GFX940-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
4979; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
4980; GFX940-NEXT:    v_or_b32_e32 v14, 0x400000, v12
4981; GFX940-NEXT:    v_bfe_u32 v15, v3, 16, 1
4982; GFX940-NEXT:    v_add3_u32 v13, v13, v12, s2
4983; GFX940-NEXT:    v_cndmask_b32_e32 v2, v9, v11, vcc
4984; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
4985; GFX940-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
4986; GFX940-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
4987; GFX940-NEXT:    v_or_b32_e32 v16, 0x400000, v3
4988; GFX940-NEXT:    v_add3_u32 v15, v15, v3, s2
4989; GFX940-NEXT:    v_cndmask_b32_e32 v7, v13, v14, vcc
4990; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
4991; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
4992; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
4993; GFX940-NEXT:    v_cndmask_b32_e32 v3, v15, v16, vcc
4994; GFX940-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
4995; GFX940-NEXT:    v_fmac_f32_e32 v2, v0, v10
4996; GFX940-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
4997; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
4998; GFX940-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
4999; GFX940-NEXT:    v_fmac_f32_e32 v4, v0, v5
5000; GFX940-NEXT:    v_bfe_u32 v0, v2, 16, 1
5001; GFX940-NEXT:    v_fmac_f32_e32 v3, v1, v10
5002; GFX940-NEXT:    v_fmac_f32_e32 v7, v1, v5
5003; GFX940-NEXT:    v_or_b32_e32 v1, 0x400000, v2
5004; GFX940-NEXT:    v_bfe_u32 v5, v4, 16, 1
5005; GFX940-NEXT:    v_add3_u32 v0, v0, v2, s2
5006; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
5007; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v4
5008; GFX940-NEXT:    v_bfe_u32 v9, v3, 16, 1
5009; GFX940-NEXT:    v_add3_u32 v5, v5, v4, s2
5010; GFX940-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
5011; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
5012; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v3
5013; GFX940-NEXT:    v_bfe_u32 v11, v7, 16, 1
5014; GFX940-NEXT:    v_add3_u32 v9, v9, v3, s2
5015; GFX940-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc
5016; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
5017; GFX940-NEXT:    v_or_b32_e32 v12, 0x400000, v7
5018; GFX940-NEXT:    v_add3_u32 v11, v11, v7, s2
5019; GFX940-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
5020; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
5021; GFX940-NEXT:    v_perm_b32 v0, v2, v0, s3
5022; GFX940-NEXT:    s_nop 0
5023; GFX940-NEXT:    v_cndmask_b32_e32 v3, v11, v12, vcc
5024; GFX940-NEXT:    v_perm_b32 v1, v3, v1, s3
5025; GFX940-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1
5026; GFX940-NEXT:    s_endpgm
5027;
5028; GFX10-LABEL: fma_shuffle_v2bf16:
5029; GFX10:       ; %bb.0: ; %entry
5030; GFX10-NEXT:    s_clause 0x1
5031; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x10
5032; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
5033; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
5034; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
5035; GFX10-NEXT:    s_clause 0x2
5036; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
5037; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[4:5]
5038; GFX10-NEXT:    global_load_dwordx2 v[4:5], v6, s[6:7]
5039; GFX10-NEXT:    s_waitcnt vmcnt(2)
5040; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
5041; GFX10-NEXT:    s_waitcnt vmcnt(1)
5042; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
5043; GFX10-NEXT:    s_waitcnt vmcnt(0)
5044; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
5045; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
5046; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
5047; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
5048; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
5049; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
5050; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v9
5051; GFX10-NEXT:    v_fmac_f32_e32 v0, v8, v4
5052; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
5053; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v9
5054; GFX10-NEXT:    v_fmac_f32_e32 v1, v12, v4
5055; GFX10-NEXT:    v_bfe_u32 v4, v7, 16, 1
5056; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v7
5057; GFX10-NEXT:    v_bfe_u32 v9, v0, 16, 1
5058; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
5059; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v0
5060; GFX10-NEXT:    v_add3_u32 v4, v4, v7, 0x7fff
5061; GFX10-NEXT:    v_bfe_u32 v15, v1, 16, 1
5062; GFX10-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
5063; GFX10-NEXT:    v_bfe_u32 v13, v11, 16, 1
5064; GFX10-NEXT:    v_or_b32_e32 v16, 0x400000, v1
5065; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
5066; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
5067; GFX10-NEXT:    v_add3_u32 v15, v15, v1, 0x7fff
5068; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
5069; GFX10-NEXT:    v_or_b32_e32 v14, 0x400000, v11
5070; GFX10-NEXT:    v_add3_u32 v13, v13, v11, 0x7fff
5071; GFX10-NEXT:    v_cndmask_b32_e32 v0, v9, v12, vcc_lo
5072; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
5073; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
5074; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
5075; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
5076; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
5077; GFX10-NEXT:    v_cndmask_b32_e32 v1, v15, v16, vcc_lo
5078; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
5079; GFX10-NEXT:    v_fmac_f32_e32 v4, v2, v5
5080; GFX10-NEXT:    v_fmac_f32_e32 v0, v2, v10
5081; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
5082; GFX10-NEXT:    v_cndmask_b32_e32 v7, v13, v14, vcc_lo
5083; GFX10-NEXT:    v_or_b32_e32 v8, 0x400000, v4
5084; GFX10-NEXT:    v_bfe_u32 v2, v0, 16, 1
5085; GFX10-NEXT:    v_fmac_f32_e32 v1, v3, v10
5086; GFX10-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
5087; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
5088; GFX10-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
5089; GFX10-NEXT:    v_bfe_u32 v9, v1, 16, 1
5090; GFX10-NEXT:    v_fmac_f32_e32 v7, v3, v5
5091; GFX10-NEXT:    v_or_b32_e32 v3, 0x400000, v0
5092; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v1
5093; GFX10-NEXT:    v_bfe_u32 v5, v4, 16, 1
5094; GFX10-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
5095; GFX10-NEXT:    v_bfe_u32 v11, v7, 16, 1
5096; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
5097; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
5098; GFX10-NEXT:    v_or_b32_e32 v12, 0x400000, v7
5099; GFX10-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
5100; GFX10-NEXT:    v_add3_u32 v11, v11, v7, 0x7fff
5101; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc_lo
5102; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
5103; GFX10-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc_lo
5104; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
5105; GFX10-NEXT:    v_perm_b32 v1, v2, v1, 0x7060302
5106; GFX10-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc_lo
5107; GFX10-NEXT:    v_perm_b32 v0, v3, v0, 0x7060302
5108; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
5109; GFX10-NEXT:    s_endpgm
5110;
5111; GFX11-LABEL: fma_shuffle_v2bf16:
5112; GFX11:       ; %bb.0: ; %entry
5113; GFX11-NEXT:    s_clause 0x1
5114; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x10
5115; GFX11-NEXT:    s_load_b128 s[4:7], s[4:5], 0x0
5116; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
5117; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
5118; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
5119; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
5120; GFX11-NEXT:    s_clause 0x2
5121; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[0:1]
5122; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[4:5]
5123; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[6:7]
5124; GFX11-NEXT:    s_waitcnt vmcnt(0)
5125; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
5126; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
5127; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
5128; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
5129; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
5130; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
5131; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
5132; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
5133; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5134; GFX11-NEXT:    v_dual_fmac_f32 v1, v12, v4 :: v_dual_lshlrev_b32 v8, 16, v2
5135; GFX11-NEXT:    v_bfe_u32 v15, v1, 16, 1
5136; GFX11-NEXT:    v_or_b32_e32 v16, 0x400000, v1
5137; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
5138; GFX11-NEXT:    v_add3_u32 v15, v15, v1, 0x7fff
5139; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
5140; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
5141; GFX11-NEXT:    v_dual_fmac_f32 v7, v8, v9 :: v_dual_lshlrev_b32 v0, 16, v0
5142; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
5143; GFX11-NEXT:    v_fmac_f32_e32 v0, v8, v4
5144; GFX11-NEXT:    v_bfe_u32 v4, v7, 16, 1
5145; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v7
5146; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
5147; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
5148; GFX11-NEXT:    v_add3_u32 v4, v4, v7, 0x7fff
5149; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
5150; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
5151; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
5152; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
5153; GFX11-NEXT:    v_fmac_f32_e32 v4, v2, v5
5154; GFX11-NEXT:    v_fmac_f32_e32 v11, v12, v9
5155; GFX11-NEXT:    v_bfe_u32 v9, v0, 16, 1
5156; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v0
5157; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
5158; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v4
5159; GFX11-NEXT:    v_bfe_u32 v13, v11, 16, 1
5160; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
5161; GFX11-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
5162; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v11
5163; GFX11-NEXT:    v_add3_u32 v13, v13, v11, 0x7fff
5164; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
5165; GFX11-NEXT:    v_cndmask_b32_e32 v0, v9, v12, vcc_lo
5166; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
5167; GFX11-NEXT:    v_cndmask_b32_e32 v1, v15, v16, vcc_lo
5168; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
5169; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
5170; GFX11-NEXT:    v_cndmask_b32_e32 v7, v13, v14, vcc_lo
5171; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
5172; GFX11-NEXT:    v_fmac_f32_e32 v1, v3, v10
5173; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
5174; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
5175; GFX11-NEXT:    v_bfe_u32 v9, v1, 16, 1
5176; GFX11-NEXT:    v_fmac_f32_e32 v7, v3, v5
5177; GFX11-NEXT:    v_bfe_u32 v5, v4, 16, 1
5178; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
5179; GFX11-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
5180; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
5181; GFX11-NEXT:    v_bfe_u32 v11, v7, 16, 1
5182; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
5183; GFX11-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
5184; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
5185; GFX11-NEXT:    v_fmac_f32_e32 v0, v2, v10
5186; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v1
5187; GFX11-NEXT:    v_add3_u32 v11, v11, v7, 0x7fff
5188; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
5189; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v0
5190; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
5191; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
5192; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
5193; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
5194; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
5195; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc_lo
5196; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
5197; GFX11-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc_lo
5198; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
5199; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
5200; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x7060302
5201; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc_lo
5202; GFX11-NEXT:    v_perm_b32 v0, v3, v0, 0x7060302
5203; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
5204; GFX11-NEXT:    s_endpgm
5205entry:
5206  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
5207  %tmp12 = zext i32 %tmp1 to i64
5208  %arrayidx = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %A, i64 %tmp12
5209  %tmp14 = load <4 x bfloat>, ptr addrspace(1) %arrayidx, align 8
5210  %arrayidx1 = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %B, i64 %tmp12
5211  %tmp15 = load <4 x bfloat>, ptr addrspace(1) %arrayidx1, align 8
5212  %arrayidx2 = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %C, i64 %tmp12
5213  %tmp16 = load <4 x bfloat>, ptr addrspace(1) %arrayidx2, align 8
5214  %tmp17 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> zeroinitializer
5215  %tmp18 = shufflevector <4 x bfloat> %tmp15, <4 x bfloat> undef, <2 x i32> <i32 0, i32 1>
5216  %tmp19 = shufflevector <4 x bfloat> %tmp16, <4 x bfloat> undef, <2 x i32> <i32 0, i32 1>
5217  %tmp20 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp17, <2 x bfloat> %tmp18, <2 x bfloat> %tmp19)
5218  %tmp21 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 1, i32 1>
5219  %tmp22 = shufflevector <4 x bfloat> %tmp15, <4 x bfloat> undef, <2 x i32> <i32 2, i32 3>
5220  %tmp23 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp21, <2 x bfloat> %tmp22, <2 x bfloat> %tmp20)
5221  %tmp24 = shufflevector <2 x bfloat> %tmp23, <2 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
5222  %tmp25 = shufflevector <4 x bfloat> %tmp24, <4 x bfloat> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
5223  %tmp26 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 2, i32 2>
5224  %tmp27 = shufflevector <4 x bfloat> %tmp25, <4 x bfloat> undef, <2 x i32> <i32 2, i32 3>
5225  %tmp28 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp26, <2 x bfloat> %tmp18, <2 x bfloat> %tmp27)
5226  %tmp29 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 3, i32 3>
5227  %tmp30 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp29, <2 x bfloat> %tmp22, <2 x bfloat> %tmp28)
5228  %tmp31 = shufflevector <2 x bfloat> %tmp30, <2 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
5229  %tmp32 = shufflevector <4 x bfloat> %tmp25, <4 x bfloat> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
5230  store <4 x bfloat> %tmp32, ptr addrspace(1) %arrayidx2, align 8
5231  ret void
5232}
5233
; Two-input shuffle of loaded <4 x bfloat> vectors with mask <0, 4, 5, 6>:
; result lane 0 is %val0 element 0 and result lanes 1-3 are %val1 elements 0-2
; (mask indices 4 and up select from the second operand).
; The assertions below expect the first result dword to be formed with
; v_perm_b32 (selector 0x5040100) and the second with v_alignbit_b32 by 16.
5234define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
5235; GX900-LABEL: shuffle_v4bf16_0456:
5236; GX900:       ; %bb.0:
5237; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5238; GX900-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
5239; GX900-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
5240; GX900-NEXT:    s_mov_b32 s4, 0x5040100
5241; GX900-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
5242; GX900-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
5243; GX900-NEXT:    s_waitcnt vmcnt(0)
5244; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
5245; GX900-NEXT:    v_alignbit_b32 v1, v6, v5, 16
5246; GX900-NEXT:    s_setpc_b64 s[30:31]
5247;
5248; GFX940-LABEL: shuffle_v4bf16_0456:
5249; GFX940:       ; %bb.0:
5250; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5251; GFX940-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
5252; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[2:3], off
5253; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
5254; GFX940-NEXT:    s_waitcnt vmcnt(0)
5255; GFX940-NEXT:    v_perm_b32 v0, v6, v4, s0
5256; GFX940-NEXT:    v_alignbit_b32 v1, v7, v6, 16
5257; GFX940-NEXT:    s_setpc_b64 s[30:31]
5258;
5259; GFX10-LABEL: shuffle_v4bf16_0456:
5260; GFX10:       ; %bb.0:
5261; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5262; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
5263; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
5264; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
5265; GFX10-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
5266; GFX10-NEXT:    s_waitcnt vmcnt(0)
5267; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
5268; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
5269; GFX10-NEXT:    s_setpc_b64 s[30:31]
5270;
5271; GFX11-LABEL: shuffle_v4bf16_0456:
5272; GFX11:       ; %bb.0:
5273; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5274; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
5275; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
5276; GFX11-NEXT:    s_waitcnt vmcnt(0)
5277; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
5278; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
5279; GFX11-NEXT:    s_setpc_b64 s[30:31]
5280  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
5281  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
5282  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
5283  ret <4 x bfloat> %shuffle
5284}
5285
; Returns <%0[0], %1[0]>: element 0 of each of the two loaded <2 x bfloat>
; values, built with two chained shuffles.
; The assertions below expect a single v_perm_b32 with selector 0x5040100
; combining the two loaded dwords.
5286define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
5287; GX900-LABEL: low16bits:
5288; GX900:       ; %bb.0: ; %entry
5289; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5290; GX900-NEXT:    global_load_dword v4, v[0:1], off
5291; GX900-NEXT:    global_load_dword v5, v[2:3], off
5292; GX900-NEXT:    s_mov_b32 s4, 0x5040100
5293; GX900-NEXT:    s_waitcnt vmcnt(0)
5294; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
5295; GX900-NEXT:    s_setpc_b64 s[30:31]
5296;
5297; GFX940-LABEL: low16bits:
5298; GFX940:       ; %bb.0: ; %entry
5299; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5300; GFX940-NEXT:    global_load_dword v4, v[0:1], off
5301; GFX940-NEXT:    global_load_dword v5, v[2:3], off
5302; GFX940-NEXT:    s_mov_b32 s0, 0x5040100
5303; GFX940-NEXT:    s_waitcnt vmcnt(0)
5304; GFX940-NEXT:    v_perm_b32 v0, v5, v4, s0
5305; GFX940-NEXT:    s_setpc_b64 s[30:31]
5306;
5307; GFX10-LABEL: low16bits:
5308; GFX10:       ; %bb.0: ; %entry
5309; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5310; GFX10-NEXT:    global_load_dword v4, v[0:1], off
5311; GFX10-NEXT:    global_load_dword v5, v[2:3], off
5312; GFX10-NEXT:    s_waitcnt vmcnt(0)
5313; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
5314; GFX10-NEXT:    s_setpc_b64 s[30:31]
5315;
5316; GFX11-LABEL: low16bits:
5317; GFX11:       ; %bb.0: ; %entry
5318; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5319; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
5320; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
5321; GFX11-NEXT:    s_waitcnt vmcnt(0)
5322; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
5323; GFX11-NEXT:    s_setpc_b64 s[30:31]
5324entry:
5325  %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
5326  %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
; Lane 1 of the intermediate vector is never selected by the second shuffle
; (its mask reads index 0 and index 2 only), so that mask entry is poison.
5327  %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 0, i32 poison>
; Mask index 2 selects element 0 of the second operand (%1).
5328  %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 2>
5329  ret <2 x bfloat> %vy1.2.vec.insert
5330}
5331
; Returns <%0[1], %1[1]>: element 1 of each of the two loaded <2 x bfloat>
; values, built with two chained shuffles.
; The assertions below expect a single v_perm_b32 with selector 0x7060302
; combining the two loaded dwords.
5332define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
5333; GX900-LABEL: hi16bits_v2bf16:
5334; GX900:       ; %bb.0: ; %entry
5335; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5336; GX900-NEXT:    global_load_dword v4, v[0:1], off
5337; GX900-NEXT:    global_load_dword v5, v[2:3], off
5338; GX900-NEXT:    s_mov_b32 s4, 0x7060302
5339; GX900-NEXT:    s_waitcnt vmcnt(0)
5340; GX900-NEXT:    v_perm_b32 v0, v5, v4, s4
5341; GX900-NEXT:    s_setpc_b64 s[30:31]
5342;
5343; GFX940-LABEL: hi16bits_v2bf16:
5344; GFX940:       ; %bb.0: ; %entry
5345; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5346; GFX940-NEXT:    global_load_dword v4, v[0:1], off
5347; GFX940-NEXT:    global_load_dword v5, v[2:3], off
5348; GFX940-NEXT:    s_mov_b32 s0, 0x7060302
5349; GFX940-NEXT:    s_waitcnt vmcnt(0)
5350; GFX940-NEXT:    v_perm_b32 v0, v5, v4, s0
5351; GFX940-NEXT:    s_setpc_b64 s[30:31]
5352;
5353; GFX10-LABEL: hi16bits_v2bf16:
5354; GFX10:       ; %bb.0: ; %entry
5355; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5356; GFX10-NEXT:    global_load_dword v4, v[0:1], off
5357; GFX10-NEXT:    global_load_dword v5, v[2:3], off
5358; GFX10-NEXT:    s_waitcnt vmcnt(0)
5359; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
5360; GFX10-NEXT:    s_setpc_b64 s[30:31]
5361;
5362; GFX11-LABEL: hi16bits_v2bf16:
5363; GFX11:       ; %bb.0: ; %entry
5364; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5365; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
5366; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
5367; GFX11-NEXT:    s_waitcnt vmcnt(0)
5368; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
5369; GFX11-NEXT:    s_setpc_b64 s[30:31]
5370entry:
5371  %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
5372  %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
; Moves %0 element 1 into lane 0. Lane 1 of the intermediate vector is never
; selected by the second shuffle (its mask reads index 0 and index 3 only),
; so that mask entry is poison.
5373  %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 poison>
; Mask index 3 selects element 1 of the second operand (%1).
5374  %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 3>
5375  ret <2 x bfloat> %vy1.2.vec.insert
5376}
5377
; Returns <%0[0], %1[1]>: element 0 of the first loaded <2 x bfloat> and
; element 1 of the second, built with two chained shuffles.
; The assertions below expect a single v_bfi_b32 with bit mask 0xffff that
; merges the two loaded dwords.
5378define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
5379; GX900-LABEL: low16hi16bits_v2bf16:
5380; GX900:       ; %bb.0: ; %entry
5381; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5382; GX900-NEXT:    global_load_dword v4, v[2:3], off
5383; GX900-NEXT:    global_load_dword v5, v[0:1], off
5384; GX900-NEXT:    s_mov_b32 s4, 0xffff
5385; GX900-NEXT:    s_waitcnt vmcnt(0)
5386; GX900-NEXT:    v_bfi_b32 v0, s4, v5, v4
5387; GX900-NEXT:    s_setpc_b64 s[30:31]
5388;
5389; GFX940-LABEL: low16hi16bits_v2bf16:
5390; GFX940:       ; %bb.0: ; %entry
5391; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5392; GFX940-NEXT:    global_load_dword v4, v[2:3], off
5393; GFX940-NEXT:    global_load_dword v5, v[0:1], off
5394; GFX940-NEXT:    s_mov_b32 s0, 0xffff
5395; GFX940-NEXT:    s_waitcnt vmcnt(0)
5396; GFX940-NEXT:    v_bfi_b32 v0, s0, v5, v4
5397; GFX940-NEXT:    s_setpc_b64 s[30:31]
5398;
5399; GFX10-LABEL: low16hi16bits_v2bf16:
5400; GFX10:       ; %bb.0: ; %entry
5401; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5402; GFX10-NEXT:    global_load_dword v4, v[2:3], off
5403; GFX10-NEXT:    global_load_dword v5, v[0:1], off
5404; GFX10-NEXT:    s_waitcnt vmcnt(0)
5405; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v5, v4
5406; GFX10-NEXT:    s_setpc_b64 s[30:31]
5407;
5408; GFX11-LABEL: low16hi16bits_v2bf16:
5409; GFX11:       ; %bb.0: ; %entry
5410; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5411; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
5412; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
5413; GFX11-NEXT:    s_waitcnt vmcnt(0)
5414; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
5415; GFX11-NEXT:    s_setpc_b64 s[30:31]
5416entry:
5417  %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
5418  %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
; Lane 1 of the intermediate vector is never selected by the second shuffle
; (its mask reads index 0 and index 3 only), so that mask entry is poison.
5419  %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 0, i32 poison>
; Mask index 3 selects element 1 of the second operand (%1).
5420  %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 3>
5421  ret <2 x bfloat> %vy1.2.vec.insert
5422}
5423
; Returns <%0[1], %1[0]>: element 1 of the first loaded <2 x bfloat> and
; element 0 of the second, built with two chained shuffles.
; The assertions below expect a single v_alignbit_b32 with shift amount 16
; on every tested subtarget (the two gfx9 runs share one assertion block).
5424define <2 x bfloat> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
5425; GFX9-LABEL: hi16low16bits:
5426; GFX9:       ; %bb.0: ; %entry
5427; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5428; GFX9-NEXT:    global_load_dword v4, v[0:1], off
5429; GFX9-NEXT:    global_load_dword v5, v[2:3], off
5430; GFX9-NEXT:    s_waitcnt vmcnt(0)
5431; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
5432; GFX9-NEXT:    s_setpc_b64 s[30:31]
5433;
5434; GFX10-LABEL: hi16low16bits:
5435; GFX10:       ; %bb.0: ; %entry
5436; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5437; GFX10-NEXT:    global_load_dword v4, v[0:1], off
5438; GFX10-NEXT:    global_load_dword v5, v[2:3], off
5439; GFX10-NEXT:    s_waitcnt vmcnt(0)
5440; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
5441; GFX10-NEXT:    s_setpc_b64 s[30:31]
5442;
5443; GFX11-LABEL: hi16low16bits:
5444; GFX11:       ; %bb.0: ; %entry
5445; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5446; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
5447; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
5448; GFX11-NEXT:    s_waitcnt vmcnt(0)
5449; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
5450; GFX11-NEXT:    s_setpc_b64 s[30:31]
5451entry:
5452  %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
5453  %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
; Moves %0 element 1 into lane 0. Lane 1 of the intermediate vector is never
; selected by the second shuffle (its mask reads index 0 and index 2 only),
; so that mask entry is poison.
5454  %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 poison>
; Mask index 2 selects element 0 of the second operand (%1).
5455  %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 2>
5456  ret <2 x bfloat> %vy1.2.vec.insert
5457}
5458
; Returns <0.0, %load0[1]>: lane 0 is a constant bfloat zero and lane 1 is
; element 1 of the loaded <2 x bfloat>.
; The assertions below expect this to be a single v_and_b32 of the loaded
; dword with 0xffff0000 on every tested subtarget (the two gfx9 runs share one
; assertion block).
5459define <2 x bfloat> @v2bfloat_hi16bits(ptr addrspace(1) %x0) {
5460; GFX9-LABEL: v2bfloat_hi16bits:
5461; GFX9:       ; %bb.0: ; %entry
5462; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5463; GFX9-NEXT:    global_load_dword v0, v[0:1], off
5464; GFX9-NEXT:    s_waitcnt vmcnt(0)
5465; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
5466; GFX9-NEXT:    s_setpc_b64 s[30:31]
5467;
5468; GFX10-LABEL: v2bfloat_hi16bits:
5469; GFX10:       ; %bb.0: ; %entry
5470; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5471; GFX10-NEXT:    global_load_dword v0, v[0:1], off
5472; GFX10-NEXT:    s_waitcnt vmcnt(0)
5473; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
5474; GFX10-NEXT:    s_setpc_b64 s[30:31]
5475;
5476; GFX11-LABEL: v2bfloat_hi16bits:
5477; GFX11:       ; %bb.0: ; %entry
5478; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5479; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
5480; GFX11-NEXT:    s_waitcnt vmcnt(0)
5481; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
5482; GFX11-NEXT:    s_setpc_b64 s[30:31]
5483entry:
5484  %load0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
; Both lanes of the base vector are overwritten by the two inserts below, so
; the starting value is irrelevant and poison is used as the placeholder.
5485  %insert1 = insertelement <2 x bfloat> poison, bfloat 0.0, i32 0
5486  %insert2 = insertelement <2 x bfloat> %insert1, bfloat 0.0, i32 1
; Mask index 0 keeps the zero from %insert2; index 3 selects %load0 element 1.
5487  %vec.ret = shufflevector <2 x bfloat> %insert2, <2 x bfloat> %load0, <2 x i32> <i32 0, i32 3>
5488  ret <2 x bfloat> %vec.ret
5489}
5490
; Concatenation expressed as a shuffle: the identity mask <0..7> over two
; loaded <4 x bfloat> vectors yields %val0 followed by %val1 as one
; <8 x bfloat>, which is stored to %out.
; The assertions below expect no shuffle instructions at all: the two 64-bit
; loads land in adjacent register pairs and are written with one 128-bit store.
5491define void @shuffle_v8bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
5492; GX900-LABEL: shuffle_v8bf16_concat:
5493; GX900:       ; %bb.0:
5494; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5495; GX900-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
5496; GX900-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
5497; GX900-NEXT:    s_waitcnt vmcnt(0)
5498; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
5499; GX900-NEXT:    s_waitcnt vmcnt(0)
5500; GX900-NEXT:    s_setpc_b64 s[30:31]
5501;
5502; GFX940-LABEL: shuffle_v8bf16_concat:
5503; GFX940:       ; %bb.0:
5504; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5505; GFX940-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
5506; GFX940-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
5507; GFX940-NEXT:    s_waitcnt vmcnt(0)
5508; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
5509; GFX940-NEXT:    s_waitcnt vmcnt(0)
5510; GFX940-NEXT:    s_setpc_b64 s[30:31]
5511;
5512; GFX10-LABEL: shuffle_v8bf16_concat:
5513; GFX10:       ; %bb.0:
5514; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5515; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
5516; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
5517; GFX10-NEXT:    s_waitcnt vmcnt(0)
5518; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
5519; GFX10-NEXT:    s_setpc_b64 s[30:31]
5520;
5521; GFX11-LABEL: shuffle_v8bf16_concat:
5522; GFX11:       ; %bb.0:
5523; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5524; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
5525; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
5526; GFX11-NEXT:    s_waitcnt vmcnt(0)
5527; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
5528; GFX11-NEXT:    s_setpc_b64 s[30:31]
5529  %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
5530  %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
5531  %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
5532  store <8 x bfloat> %shuffle, ptr addrspace(1) %out
5533  ret void
5534}
5535
; Concatenation expressed as a shuffle: the identity mask <0..15> over two
; loaded <8 x bfloat> vectors yields %val0 followed by %val1 as one
; <16 x bfloat>, which is stored to %out.
; The assertions below expect no shuffle instructions: two 128-bit loads feed
; two 128-bit stores, with the data loaded from %arg1 written at offset 16.
5536define void @shuffle_v16bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
5537; GX900-LABEL: shuffle_v16bf16_concat:
5538; GX900:       ; %bb.0:
5539; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5540; GX900-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
5541; GX900-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
5542; GX900-NEXT:    s_waitcnt vmcnt(1)
5543; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
5544; GX900-NEXT:    s_waitcnt vmcnt(1)
5545; GX900-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
5546; GX900-NEXT:    s_waitcnt vmcnt(0)
5547; GX900-NEXT:    s_setpc_b64 s[30:31]
5548;
5549; GFX940-LABEL: shuffle_v16bf16_concat:
5550; GFX940:       ; %bb.0:
5551; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5552; GFX940-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
5553; GFX940-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
5554; GFX940-NEXT:    s_waitcnt vmcnt(1)
5555; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16 sc0 sc1
5556; GFX940-NEXT:    s_waitcnt vmcnt(1)
5557; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off sc0 sc1
5558; GFX940-NEXT:    s_waitcnt vmcnt(0)
5559; GFX940-NEXT:    s_setpc_b64 s[30:31]
5560;
5561; GFX10-LABEL: shuffle_v16bf16_concat:
5562; GFX10:       ; %bb.0:
5563; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5564; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
5565; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
5566; GFX10-NEXT:    s_waitcnt vmcnt(1)
5567; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
5568; GFX10-NEXT:    s_waitcnt vmcnt(0)
5569; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
5570; GFX10-NEXT:    s_setpc_b64 s[30:31]
5571;
5572; GFX11-LABEL: shuffle_v16bf16_concat:
5573; GFX11:       ; %bb.0:
5574; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5575; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
5576; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
5577; GFX11-NEXT:    s_waitcnt vmcnt(1)
5578; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:16
5579; GFX11-NEXT:    s_waitcnt vmcnt(0)
5580; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
5581; GFX11-NEXT:    s_setpc_b64 s[30:31]
5582  %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
5583  %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
5584  %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
5585  store <16 x bfloat> %shuffle, ptr addrspace(1) %out
5586  ret void
5587}
5588
; Concatenation test: loads a <16 x bfloat> from %arg0 and another from %arg1,
; joins them with a shufflevector whose mask is the identity sequence 0..31,
; and stores the resulting <32 x bfloat> to %out.
; For every target the expected output is four 128-bit global loads followed
; by four 128-bit global stores at offsets 32, 48, 0 and 16 (in that order),
; with only s_waitcnt instructions between the loads and the stores. The
; gfx1010 and gfx1100 checks also expect an s_clause 0x1 ahead of each pair of
; loads, and the gfx940 checks expect the sc0 sc1 modifiers on each store.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
5589define void @shuffle_v32bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
5590; GX900-LABEL: shuffle_v32bf16_concat:
5591; GX900:       ; %bb.0:
5592; GX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5593; GX900-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
5594; GX900-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
5595; GX900-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
5596; GX900-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
5597; GX900-NEXT:    s_waitcnt vmcnt(3)
5598; GX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32
5599; GX900-NEXT:    s_waitcnt vmcnt(3)
5600; GX900-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48
5601; GX900-NEXT:    s_waitcnt vmcnt(3)
5602; GX900-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
5603; GX900-NEXT:    s_waitcnt vmcnt(3)
5604; GX900-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
5605; GX900-NEXT:    s_waitcnt vmcnt(0)
5606; GX900-NEXT:    s_setpc_b64 s[30:31]
5607;
5608; GFX940-LABEL: shuffle_v32bf16_concat:
5609; GFX940:       ; %bb.0:
5610; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5611; GFX940-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
5612; GFX940-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
5613; GFX940-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
5614; GFX940-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
5615; GFX940-NEXT:    s_waitcnt vmcnt(3)
5616; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32 sc0 sc1
5617; GFX940-NEXT:    s_waitcnt vmcnt(3)
5618; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48 sc0 sc1
5619; GFX940-NEXT:    s_waitcnt vmcnt(3)
5620; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off sc0 sc1
5621; GFX940-NEXT:    s_waitcnt vmcnt(3)
5622; GFX940-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16 sc0 sc1
5623; GFX940-NEXT:    s_waitcnt vmcnt(0)
5624; GFX940-NEXT:    s_setpc_b64 s[30:31]
5625;
5626; GFX10-LABEL: shuffle_v32bf16_concat:
5627; GFX10:       ; %bb.0:
5628; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5629; GFX10-NEXT:    s_clause 0x1
5630; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
5631; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
5632; GFX10-NEXT:    s_clause 0x1
5633; GFX10-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
5634; GFX10-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off offset:16
5635; GFX10-NEXT:    s_waitcnt vmcnt(3)
5636; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:32
5637; GFX10-NEXT:    s_waitcnt vmcnt(2)
5638; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off offset:48
5639; GFX10-NEXT:    s_waitcnt vmcnt(1)
5640; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
5641; GFX10-NEXT:    s_waitcnt vmcnt(0)
5642; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
5643; GFX10-NEXT:    s_setpc_b64 s[30:31]
5644;
5645; GFX11-LABEL: shuffle_v32bf16_concat:
5646; GFX11:       ; %bb.0:
5647; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5648; GFX11-NEXT:    s_clause 0x1
5649; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
5650; GFX11-NEXT:    global_load_b128 v[10:13], v[2:3], off offset:16
5651; GFX11-NEXT:    s_clause 0x1
5652; GFX11-NEXT:    global_load_b128 v[14:17], v[0:1], off
5653; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:16
5654; GFX11-NEXT:    s_waitcnt vmcnt(3)
5655; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:32
5656; GFX11-NEXT:    s_waitcnt vmcnt(2)
5657; GFX11-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:48
5658; GFX11-NEXT:    s_waitcnt vmcnt(1)
5659; GFX11-NEXT:    global_store_b128 v[4:5], v[14:17], off
5660; GFX11-NEXT:    s_waitcnt vmcnt(0)
5661; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off offset:16
5662; GFX11-NEXT:    s_setpc_b64 s[30:31]
5663  %val0 = load <16 x bfloat>, ptr addrspace(1) %arg0
5664  %val1 = load <16 x bfloat>, ptr addrspace(1) %arg1
  ; Mask indices 0-15 select the elements of %val0 and 16-31 those of %val1,
  ; in order, so the result is %val0 followed by %val1.
5665  %shuffle = shufflevector <16 x bfloat> %val0, <16 x bfloat> %val1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
5666  store <32 x bfloat> %shuffle, ptr addrspace(1) %out
5667  ret void
5668}
5669
; Intrinsic declarations: packed fused multiply-add for <2 x half> and
; <2 x bfloat>, and the workitem-id-x query. None of them is called in the
; concat tests directly above; presumably they are used by tests earlier in
; the file (TODO confirm). All three carry attribute group #0.
5670declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
5671declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>) #0
5672declare i32 @llvm.amdgcn.workitem.id.x() #0
5673
; Attribute group #0, referenced by each of the intrinsic declarations above.
5674attributes #0 = { nounwind readnone speculatable }
5675