xref: /llvm-project/llvm/test/CodeGen/AMDGPU/shufflevector.v2bf16.v4bf16.ll (revision 585858aeb6247b3892218edb9d353c63f1c33186)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s
5
6
; Shuffle mask is entirely poison, so neither result lane is defined.
; All three RUN configurations share one expectation here: no asm def and no
; store are emitted, only the entry s_waitcnt and the return.
7define void @v_shuffle_v2bf16_v4bf16__u_u(ptr addrspace(1) inreg %ptr) {
8; GFX9-LABEL: v_shuffle_v2bf16_v4bf16__u_u:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX9-NEXT:    s_setpc_b64 s[30:31]
12  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
13  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> poison
14  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
15  ret void
16}
17
; Mask <0, poison>: lane 0 takes element 0 of %vec0, lane 1 is poison.
; Expected code stores v0 (first register of the asm-defined v[0:1] pair)
; directly, with no ALU instruction between the asm def and the store.
; The gfx940 checks differ only in the store: base s[0:1] plus sc0 sc1,
; where gfx900/gfx90a use s[16:17].
18define void @v_shuffle_v2bf16_v4bf16__0_u(ptr addrspace(1) inreg %ptr) {
19; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_u:
20; GFX900:       ; %bb.0:
21; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22; GFX900-NEXT:    v_mov_b32_e32 v2, 0
23; GFX900-NEXT:    ;;#ASMSTART
24; GFX900-NEXT:    ; def v[0:1]
25; GFX900-NEXT:    ;;#ASMEND
26; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
27; GFX900-NEXT:    s_waitcnt vmcnt(0)
28; GFX900-NEXT:    s_setpc_b64 s[30:31]
29;
30; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_u:
31; GFX90A:       ; %bb.0:
32; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
34; GFX90A-NEXT:    ;;#ASMSTART
35; GFX90A-NEXT:    ; def v[0:1]
36; GFX90A-NEXT:    ;;#ASMEND
37; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
38; GFX90A-NEXT:    s_waitcnt vmcnt(0)
39; GFX90A-NEXT:    s_setpc_b64 s[30:31]
40;
41; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_u:
42; GFX940:       ; %bb.0:
43; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
44; GFX940-NEXT:    v_mov_b32_e32 v2, 0
45; GFX940-NEXT:    ;;#ASMSTART
46; GFX940-NEXT:    ; def v[0:1]
47; GFX940-NEXT:    ;;#ASMEND
48; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
49; GFX940-NEXT:    s_waitcnt vmcnt(0)
50; GFX940-NEXT:    s_setpc_b64 s[30:31]
51  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
52  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 0, i32 poison>
53  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
54  ret void
55}
56
; Mask <1, poison>: lane 0 takes element 1 of %vec0, lane 1 is poison.
; Expected code is a single v_alignbit_b32 by 16 with v0 as the low source.
; The high source (s4, or s0 on gfx940) is never written in the checked
; sequence; presumably it only feeds the poison lane.
57define void @v_shuffle_v2bf16_v4bf16__1_u(ptr addrspace(1) inreg %ptr) {
58; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_u:
59; GFX900:       ; %bb.0:
60; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61; GFX900-NEXT:    ;;#ASMSTART
62; GFX900-NEXT:    ; def v[0:1]
63; GFX900-NEXT:    ;;#ASMEND
64; GFX900-NEXT:    v_mov_b32_e32 v2, 0
65; GFX900-NEXT:    v_alignbit_b32 v0, s4, v0, 16
66; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
67; GFX900-NEXT:    s_waitcnt vmcnt(0)
68; GFX900-NEXT:    s_setpc_b64 s[30:31]
69;
70; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_u:
71; GFX90A:       ; %bb.0:
72; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73; GFX90A-NEXT:    ;;#ASMSTART
74; GFX90A-NEXT:    ; def v[0:1]
75; GFX90A-NEXT:    ;;#ASMEND
76; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
77; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v0, 16
78; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
79; GFX90A-NEXT:    s_waitcnt vmcnt(0)
80; GFX90A-NEXT:    s_setpc_b64 s[30:31]
81;
82; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_u:
83; GFX940:       ; %bb.0:
84; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
85; GFX940-NEXT:    ;;#ASMSTART
86; GFX940-NEXT:    ; def v[0:1]
87; GFX940-NEXT:    ;;#ASMEND
88; GFX940-NEXT:    v_mov_b32_e32 v2, 0
89; GFX940-NEXT:    v_alignbit_b32 v0, s0, v0, 16
90; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
91; GFX940-NEXT:    s_waitcnt vmcnt(0)
92; GFX940-NEXT:    s_setpc_b64 s[30:31]
93  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
94  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 1, i32 poison>
95  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
96  ret void
97}
98
; Mask <2, poison>: lane 0 takes element 2 of %vec0, lane 1 is poison.
; Expected code stores v1 (second register of the asm-defined v[0:1] pair)
; directly, with no ALU instruction between the asm def and the store.
99define void @v_shuffle_v2bf16_v4bf16__2_u(ptr addrspace(1) inreg %ptr) {
100; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_u:
101; GFX900:       ; %bb.0:
102; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
103; GFX900-NEXT:    v_mov_b32_e32 v2, 0
104; GFX900-NEXT:    ;;#ASMSTART
105; GFX900-NEXT:    ; def v[0:1]
106; GFX900-NEXT:    ;;#ASMEND
107; GFX900-NEXT:    global_store_dword v2, v1, s[16:17]
108; GFX900-NEXT:    s_waitcnt vmcnt(0)
109; GFX900-NEXT:    s_setpc_b64 s[30:31]
110;
111; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_u:
112; GFX90A:       ; %bb.0:
113; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
115; GFX90A-NEXT:    ;;#ASMSTART
116; GFX90A-NEXT:    ; def v[0:1]
117; GFX90A-NEXT:    ;;#ASMEND
118; GFX90A-NEXT:    global_store_dword v2, v1, s[16:17]
119; GFX90A-NEXT:    s_waitcnt vmcnt(0)
120; GFX90A-NEXT:    s_setpc_b64 s[30:31]
121;
122; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_u:
123; GFX940:       ; %bb.0:
124; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125; GFX940-NEXT:    v_mov_b32_e32 v2, 0
126; GFX940-NEXT:    ;;#ASMSTART
127; GFX940-NEXT:    ; def v[0:1]
128; GFX940-NEXT:    ;;#ASMEND
129; GFX940-NEXT:    global_store_dword v2, v1, s[0:1] sc0 sc1
130; GFX940-NEXT:    s_waitcnt vmcnt(0)
131; GFX940-NEXT:    s_setpc_b64 s[30:31]
132  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
133  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 2, i32 poison>
134  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
135  ret void
136}
137
; Mask <3, poison>: lane 0 takes element 3 of %vec0, lane 1 is poison.
; Expected code is a single v_alignbit_b32 by 16 with v1 as the low source.
; The high source (s4, or s0 on gfx940) is never written in the checked
; sequence; presumably it only feeds the poison lane.
138define void @v_shuffle_v2bf16_v4bf16__3_u(ptr addrspace(1) inreg %ptr) {
139; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_u:
140; GFX900:       ; %bb.0:
141; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
142; GFX900-NEXT:    ;;#ASMSTART
143; GFX900-NEXT:    ; def v[0:1]
144; GFX900-NEXT:    ;;#ASMEND
145; GFX900-NEXT:    v_mov_b32_e32 v2, 0
146; GFX900-NEXT:    v_alignbit_b32 v0, s4, v1, 16
147; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
148; GFX900-NEXT:    s_waitcnt vmcnt(0)
149; GFX900-NEXT:    s_setpc_b64 s[30:31]
150;
151; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_u:
152; GFX90A:       ; %bb.0:
153; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154; GFX90A-NEXT:    ;;#ASMSTART
155; GFX90A-NEXT:    ; def v[0:1]
156; GFX90A-NEXT:    ;;#ASMEND
157; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
158; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v1, 16
159; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
160; GFX90A-NEXT:    s_waitcnt vmcnt(0)
161; GFX90A-NEXT:    s_setpc_b64 s[30:31]
162;
163; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_u:
164; GFX940:       ; %bb.0:
165; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
166; GFX940-NEXT:    ;;#ASMSTART
167; GFX940-NEXT:    ; def v[0:1]
168; GFX940-NEXT:    ;;#ASMEND
169; GFX940-NEXT:    v_mov_b32_e32 v2, 0
170; GFX940-NEXT:    v_alignbit_b32 v0, s0, v1, 16
171; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
172; GFX940-NEXT:    s_waitcnt vmcnt(0)
173; GFX940-NEXT:    s_setpc_b64 s[30:31]
174  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
175  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 3, i32 poison>
176  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
177  ret void
178}
179
; Mask <4, poison>: index 4 selects element 0 of the second operand, which is
; poison in this test, so both result lanes are undefined.
; All three RUN configurations share one expectation here: no asm def and no
; store are emitted, only the entry s_waitcnt and the return.
180define void @v_shuffle_v2bf16_v4bf16__4_u(ptr addrspace(1) inreg %ptr) {
181; GFX9-LABEL: v_shuffle_v2bf16_v4bf16__4_u:
182; GFX9:       ; %bb.0:
183; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
184; GFX9-NEXT:    s_setpc_b64 s[30:31]
185  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
186  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 4, i32 poison>
187  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
188  ret void
189}
190
; Mask <5, poison>: lane 0 takes element 1 of %vec1 (index 5 counts past the
; four elements of %vec0); lane 1 is poison.
; The IR has two asm defs but only one appears in the expected output;
; presumably the def of the unreferenced %vec0 is dropped.
; Expected code is a single v_alignbit_b32 by 16 with v0 as the low source.
191define void @v_shuffle_v2bf16_v4bf16__5_u(ptr addrspace(1) inreg %ptr) {
192; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_u:
193; GFX900:       ; %bb.0:
194; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
195; GFX900-NEXT:    ;;#ASMSTART
196; GFX900-NEXT:    ; def v[0:1]
197; GFX900-NEXT:    ;;#ASMEND
198; GFX900-NEXT:    v_mov_b32_e32 v2, 0
199; GFX900-NEXT:    v_alignbit_b32 v0, s4, v0, 16
200; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
201; GFX900-NEXT:    s_waitcnt vmcnt(0)
202; GFX900-NEXT:    s_setpc_b64 s[30:31]
203;
204; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_u:
205; GFX90A:       ; %bb.0:
206; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207; GFX90A-NEXT:    ;;#ASMSTART
208; GFX90A-NEXT:    ; def v[0:1]
209; GFX90A-NEXT:    ;;#ASMEND
210; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
211; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v0, 16
212; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
213; GFX90A-NEXT:    s_waitcnt vmcnt(0)
214; GFX90A-NEXT:    s_setpc_b64 s[30:31]
215;
216; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_u:
217; GFX940:       ; %bb.0:
218; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219; GFX940-NEXT:    ;;#ASMSTART
220; GFX940-NEXT:    ; def v[0:1]
221; GFX940-NEXT:    ;;#ASMEND
222; GFX940-NEXT:    v_mov_b32_e32 v2, 0
223; GFX940-NEXT:    v_alignbit_b32 v0, s0, v0, 16
224; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
225; GFX940-NEXT:    s_waitcnt vmcnt(0)
226; GFX940-NEXT:    s_setpc_b64 s[30:31]
227  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
228  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
229  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 poison>
230  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
231  ret void
232}
233
; Mask <6, poison>: lane 0 takes element 2 of %vec1; lane 1 is poison.
; The IR has two asm defs but only one appears in the expected output;
; presumably the def of the unreferenced %vec0 is dropped.
; Expected code stores v1 directly, with no ALU instruction before the store.
234define void @v_shuffle_v2bf16_v4bf16__6_u(ptr addrspace(1) inreg %ptr) {
235; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_u:
236; GFX900:       ; %bb.0:
237; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
238; GFX900-NEXT:    v_mov_b32_e32 v2, 0
239; GFX900-NEXT:    ;;#ASMSTART
240; GFX900-NEXT:    ; def v[0:1]
241; GFX900-NEXT:    ;;#ASMEND
242; GFX900-NEXT:    global_store_dword v2, v1, s[16:17]
243; GFX900-NEXT:    s_waitcnt vmcnt(0)
244; GFX900-NEXT:    s_setpc_b64 s[30:31]
245;
246; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_u:
247; GFX90A:       ; %bb.0:
248; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
250; GFX90A-NEXT:    ;;#ASMSTART
251; GFX90A-NEXT:    ; def v[0:1]
252; GFX90A-NEXT:    ;;#ASMEND
253; GFX90A-NEXT:    global_store_dword v2, v1, s[16:17]
254; GFX90A-NEXT:    s_waitcnt vmcnt(0)
255; GFX90A-NEXT:    s_setpc_b64 s[30:31]
256;
257; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_u:
258; GFX940:       ; %bb.0:
259; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
260; GFX940-NEXT:    v_mov_b32_e32 v2, 0
261; GFX940-NEXT:    ;;#ASMSTART
262; GFX940-NEXT:    ; def v[0:1]
263; GFX940-NEXT:    ;;#ASMEND
264; GFX940-NEXT:    global_store_dword v2, v1, s[0:1] sc0 sc1
265; GFX940-NEXT:    s_waitcnt vmcnt(0)
266; GFX940-NEXT:    s_setpc_b64 s[30:31]
267  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
268  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
269  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 poison>
270  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
271  ret void
272}
273
; Mask <7, poison>: lane 0 takes element 3 of %vec1; lane 1 is poison.
; The IR has two asm defs but only one appears in the expected output;
; presumably the def of the unreferenced %vec0 is dropped.
; Expected code is a single v_alignbit_b32 by 16 with v1 as the low source.
274define void @v_shuffle_v2bf16_v4bf16__7_u(ptr addrspace(1) inreg %ptr) {
275; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_u:
276; GFX900:       ; %bb.0:
277; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
278; GFX900-NEXT:    ;;#ASMSTART
279; GFX900-NEXT:    ; def v[0:1]
280; GFX900-NEXT:    ;;#ASMEND
281; GFX900-NEXT:    v_mov_b32_e32 v2, 0
282; GFX900-NEXT:    v_alignbit_b32 v0, s4, v1, 16
283; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
284; GFX900-NEXT:    s_waitcnt vmcnt(0)
285; GFX900-NEXT:    s_setpc_b64 s[30:31]
286;
287; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_u:
288; GFX90A:       ; %bb.0:
289; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
290; GFX90A-NEXT:    ;;#ASMSTART
291; GFX90A-NEXT:    ; def v[0:1]
292; GFX90A-NEXT:    ;;#ASMEND
293; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
294; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v1, 16
295; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
296; GFX90A-NEXT:    s_waitcnt vmcnt(0)
297; GFX90A-NEXT:    s_setpc_b64 s[30:31]
298;
299; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_u:
300; GFX940:       ; %bb.0:
301; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
302; GFX940-NEXT:    ;;#ASMSTART
303; GFX940-NEXT:    ; def v[0:1]
304; GFX940-NEXT:    ;;#ASMEND
305; GFX940-NEXT:    v_mov_b32_e32 v2, 0
306; GFX940-NEXT:    v_alignbit_b32 v0, s0, v1, 16
307; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
308; GFX940-NEXT:    s_waitcnt vmcnt(0)
309; GFX940-NEXT:    s_setpc_b64 s[30:31]
310  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
311  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
312  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 0>
313  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
314  ret void
315}
316
; Mask <7, 0>: lane 0 takes element 3 of %vec1, lane 1 takes element 0 of
; %vec0, so both asm defs are live and both appear in the expected output.
; Expected code combines them with one v_alignbit_b32 by 16 (high source v0,
; low source the second register of the second def).
; Register assignment differs per target: the second def is v[1:2] on gfx900
; but v[2:3] on gfx90a/gfx940. gfx940 also expects an s_nop 0 between the
; second asm def and the v_alignbit_b32.
317define void @v_shuffle_v2bf16_v4bf16__7_0(ptr addrspace(1) inreg %ptr) {
318; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_0:
319; GFX900:       ; %bb.0:
320; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
321; GFX900-NEXT:    ;;#ASMSTART
322; GFX900-NEXT:    ; def v[0:1]
323; GFX900-NEXT:    ;;#ASMEND
324; GFX900-NEXT:    v_mov_b32_e32 v3, 0
325; GFX900-NEXT:    ;;#ASMSTART
326; GFX900-NEXT:    ; def v[1:2]
327; GFX900-NEXT:    ;;#ASMEND
328; GFX900-NEXT:    v_alignbit_b32 v0, v0, v2, 16
329; GFX900-NEXT:    global_store_dword v3, v0, s[16:17]
330; GFX900-NEXT:    s_waitcnt vmcnt(0)
331; GFX900-NEXT:    s_setpc_b64 s[30:31]
332;
333; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_0:
334; GFX90A:       ; %bb.0:
335; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
336; GFX90A-NEXT:    ;;#ASMSTART
337; GFX90A-NEXT:    ; def v[0:1]
338; GFX90A-NEXT:    ;;#ASMEND
339; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
340; GFX90A-NEXT:    ;;#ASMSTART
341; GFX90A-NEXT:    ; def v[2:3]
342; GFX90A-NEXT:    ;;#ASMEND
343; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v3, 16
344; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
345; GFX90A-NEXT:    s_waitcnt vmcnt(0)
346; GFX90A-NEXT:    s_setpc_b64 s[30:31]
347;
348; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_0:
349; GFX940:       ; %bb.0:
350; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
351; GFX940-NEXT:    ;;#ASMSTART
352; GFX940-NEXT:    ; def v[0:1]
353; GFX940-NEXT:    ;;#ASMEND
354; GFX940-NEXT:    v_mov_b32_e32 v4, 0
355; GFX940-NEXT:    ;;#ASMSTART
356; GFX940-NEXT:    ; def v[2:3]
357; GFX940-NEXT:    ;;#ASMEND
358; GFX940-NEXT:    s_nop 0
359; GFX940-NEXT:    v_alignbit_b32 v0, v0, v3, 16
360; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
361; GFX940-NEXT:    s_waitcnt vmcnt(0)
362; GFX940-NEXT:    s_setpc_b64 s[30:31]
363  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
364  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
365  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 0>
366  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
367  ret void
368}
369
; Mask <7, 1>: lane 0 takes element 3 of %vec1, lane 1 takes element 1 of
; %vec0, so both asm defs are live and both appear in the expected output.
; Expected code combines them with one v_perm_b32 whose selector 0x7060302 is
; materialised into an SGPR (s4, or s2 on gfx940).
; Register assignment differs per target: the second def is v[1:2] on gfx900
; but v[2:3] on gfx90a/gfx940. gfx940 also expects an s_nop 0 between the
; second asm def and the v_perm_b32.
370define void @v_shuffle_v2bf16_v4bf16__7_1(ptr addrspace(1) inreg %ptr) {
371; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_1:
372; GFX900:       ; %bb.0:
373; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
374; GFX900-NEXT:    ;;#ASMSTART
375; GFX900-NEXT:    ; def v[0:1]
376; GFX900-NEXT:    ;;#ASMEND
377; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
378; GFX900-NEXT:    v_mov_b32_e32 v3, 0
379; GFX900-NEXT:    ;;#ASMSTART
380; GFX900-NEXT:    ; def v[1:2]
381; GFX900-NEXT:    ;;#ASMEND
382; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
383; GFX900-NEXT:    global_store_dword v3, v0, s[16:17]
384; GFX900-NEXT:    s_waitcnt vmcnt(0)
385; GFX900-NEXT:    s_setpc_b64 s[30:31]
386;
387; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_1:
388; GFX90A:       ; %bb.0:
389; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
390; GFX90A-NEXT:    ;;#ASMSTART
391; GFX90A-NEXT:    ; def v[0:1]
392; GFX90A-NEXT:    ;;#ASMEND
393; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
394; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
395; GFX90A-NEXT:    ;;#ASMSTART
396; GFX90A-NEXT:    ; def v[2:3]
397; GFX90A-NEXT:    ;;#ASMEND
398; GFX90A-NEXT:    v_perm_b32 v0, v0, v3, s4
399; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
400; GFX90A-NEXT:    s_waitcnt vmcnt(0)
401; GFX90A-NEXT:    s_setpc_b64 s[30:31]
402;
403; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_1:
404; GFX940:       ; %bb.0:
405; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
406; GFX940-NEXT:    ;;#ASMSTART
407; GFX940-NEXT:    ; def v[0:1]
408; GFX940-NEXT:    ;;#ASMEND
409; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
410; GFX940-NEXT:    v_mov_b32_e32 v4, 0
411; GFX940-NEXT:    ;;#ASMSTART
412; GFX940-NEXT:    ; def v[2:3]
413; GFX940-NEXT:    ;;#ASMEND
414; GFX940-NEXT:    s_nop 0
415; GFX940-NEXT:    v_perm_b32 v0, v0, v3, s2
416; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
417; GFX940-NEXT:    s_waitcnt vmcnt(0)
418; GFX940-NEXT:    s_setpc_b64 s[30:31]
419  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
420  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
421  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 1>
422  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
423  ret void
424}
425
; Mask <7, 2>: lane 0 takes element 3 of %vec1, lane 1 takes element 2 of
; %vec0, so both asm defs are live and both appear in the expected output.
; Expected code combines them with one v_alignbit_b32 by 16 (high source v1,
; low source v3). Here all three targets place the second def in v[2:3].
; gfx940 also expects an s_nop 0 between the second asm def and the
; v_alignbit_b32.
426define void @v_shuffle_v2bf16_v4bf16__7_2(ptr addrspace(1) inreg %ptr) {
427; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_2:
428; GFX900:       ; %bb.0:
429; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
430; GFX900-NEXT:    ;;#ASMSTART
431; GFX900-NEXT:    ; def v[0:1]
432; GFX900-NEXT:    ;;#ASMEND
433; GFX900-NEXT:    v_mov_b32_e32 v4, 0
434; GFX900-NEXT:    ;;#ASMSTART
435; GFX900-NEXT:    ; def v[2:3]
436; GFX900-NEXT:    ;;#ASMEND
437; GFX900-NEXT:    v_alignbit_b32 v0, v1, v3, 16
438; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
439; GFX900-NEXT:    s_waitcnt vmcnt(0)
440; GFX900-NEXT:    s_setpc_b64 s[30:31]
441;
442; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_2:
443; GFX90A:       ; %bb.0:
444; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
445; GFX90A-NEXT:    ;;#ASMSTART
446; GFX90A-NEXT:    ; def v[0:1]
447; GFX90A-NEXT:    ;;#ASMEND
448; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
449; GFX90A-NEXT:    ;;#ASMSTART
450; GFX90A-NEXT:    ; def v[2:3]
451; GFX90A-NEXT:    ;;#ASMEND
452; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v3, 16
453; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
454; GFX90A-NEXT:    s_waitcnt vmcnt(0)
455; GFX90A-NEXT:    s_setpc_b64 s[30:31]
456;
457; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_2:
458; GFX940:       ; %bb.0:
459; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
460; GFX940-NEXT:    ;;#ASMSTART
461; GFX940-NEXT:    ; def v[0:1]
462; GFX940-NEXT:    ;;#ASMEND
463; GFX940-NEXT:    v_mov_b32_e32 v4, 0
464; GFX940-NEXT:    ;;#ASMSTART
465; GFX940-NEXT:    ; def v[2:3]
466; GFX940-NEXT:    ;;#ASMEND
467; GFX940-NEXT:    s_nop 0
468; GFX940-NEXT:    v_alignbit_b32 v0, v1, v3, 16
469; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
470; GFX940-NEXT:    s_waitcnt vmcnt(0)
471; GFX940-NEXT:    s_setpc_b64 s[30:31]
472  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
473  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
474  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 2>
475  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
476  ret void
477}
478
; Mask <7, 3>: lane 0 takes element 3 of %vec1, lane 1 takes element 3 of
; %vec0, so both asm defs are live and both appear in the expected output.
; Expected code combines v1 and v3 with one v_perm_b32 whose selector
; 0x7060302 is materialised into an SGPR (s4, or s2 on gfx940).
; gfx940 also expects an s_nop 0 between the second asm def and the
; v_perm_b32.
479define void @v_shuffle_v2bf16_v4bf16__7_3(ptr addrspace(1) inreg %ptr) {
480; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_3:
481; GFX900:       ; %bb.0:
482; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
483; GFX900-NEXT:    ;;#ASMSTART
484; GFX900-NEXT:    ; def v[0:1]
485; GFX900-NEXT:    ;;#ASMEND
486; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
487; GFX900-NEXT:    v_mov_b32_e32 v4, 0
488; GFX900-NEXT:    ;;#ASMSTART
489; GFX900-NEXT:    ; def v[2:3]
490; GFX900-NEXT:    ;;#ASMEND
491; GFX900-NEXT:    v_perm_b32 v0, v1, v3, s4
492; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
493; GFX900-NEXT:    s_waitcnt vmcnt(0)
494; GFX900-NEXT:    s_setpc_b64 s[30:31]
495;
496; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_3:
497; GFX90A:       ; %bb.0:
498; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
499; GFX90A-NEXT:    ;;#ASMSTART
500; GFX90A-NEXT:    ; def v[0:1]
501; GFX90A-NEXT:    ;;#ASMEND
502; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
503; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
504; GFX90A-NEXT:    ;;#ASMSTART
505; GFX90A-NEXT:    ; def v[2:3]
506; GFX90A-NEXT:    ;;#ASMEND
507; GFX90A-NEXT:    v_perm_b32 v0, v1, v3, s4
508; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
509; GFX90A-NEXT:    s_waitcnt vmcnt(0)
510; GFX90A-NEXT:    s_setpc_b64 s[30:31]
511;
512; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_3:
513; GFX940:       ; %bb.0:
514; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
515; GFX940-NEXT:    ;;#ASMSTART
516; GFX940-NEXT:    ; def v[0:1]
517; GFX940-NEXT:    ;;#ASMEND
518; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
519; GFX940-NEXT:    v_mov_b32_e32 v4, 0
520; GFX940-NEXT:    ;;#ASMSTART
521; GFX940-NEXT:    ; def v[2:3]
522; GFX940-NEXT:    ;;#ASMEND
523; GFX940-NEXT:    s_nop 0
524; GFX940-NEXT:    v_perm_b32 v0, v1, v3, s2
525; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
526; GFX940-NEXT:    s_waitcnt vmcnt(0)
527; GFX940-NEXT:    s_setpc_b64 s[30:31]
528  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
529  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
530  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 3>
531  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
532  ret void
533}
534
; Mask <7, 4>: lane 0 takes element 3 of %vec1, lane 1 takes element 0 of
; %vec1; %vec0 is not referenced by the mask.
; Only one asm def appears in the expected output; presumably the def of the
; unreferenced %vec0 is dropped.
; Expected code is one v_alignbit_b32 by 16 with v0 as high and v1 as low source.
535define void @v_shuffle_v2bf16_v4bf16__7_4(ptr addrspace(1) inreg %ptr) {
536; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_4:
537; GFX900:       ; %bb.0:
538; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
539; GFX900-NEXT:    ;;#ASMSTART
540; GFX900-NEXT:    ; def v[0:1]
541; GFX900-NEXT:    ;;#ASMEND
542; GFX900-NEXT:    v_mov_b32_e32 v2, 0
543; GFX900-NEXT:    v_alignbit_b32 v0, v0, v1, 16
544; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
545; GFX900-NEXT:    s_waitcnt vmcnt(0)
546; GFX900-NEXT:    s_setpc_b64 s[30:31]
547;
548; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_4:
549; GFX90A:       ; %bb.0:
550; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
551; GFX90A-NEXT:    ;;#ASMSTART
552; GFX90A-NEXT:    ; def v[0:1]
553; GFX90A-NEXT:    ;;#ASMEND
554; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
555; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v1, 16
556; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
557; GFX90A-NEXT:    s_waitcnt vmcnt(0)
558; GFX90A-NEXT:    s_setpc_b64 s[30:31]
559;
560; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_4:
561; GFX940:       ; %bb.0:
562; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
563; GFX940-NEXT:    ;;#ASMSTART
564; GFX940-NEXT:    ; def v[0:1]
565; GFX940-NEXT:    ;;#ASMEND
566; GFX940-NEXT:    v_mov_b32_e32 v2, 0
567; GFX940-NEXT:    v_alignbit_b32 v0, v0, v1, 16
568; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
569; GFX940-NEXT:    s_waitcnt vmcnt(0)
570; GFX940-NEXT:    s_setpc_b64 s[30:31]
571  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
572  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
573  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 4>
574  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
575  ret void
576}
577
; Mask <7, 5>: lane 0 takes element 3 of %vec1, lane 1 takes element 1 of
; %vec1; %vec0 is not referenced by the mask.
; Only one asm def appears in the expected output; presumably the def of the
; unreferenced %vec0 is dropped.
; Expected code is one v_perm_b32 on v0 and v1 whose selector 0x7060302 is
; materialised into an SGPR (s4, or s2 on gfx940).
578define void @v_shuffle_v2bf16_v4bf16__7_5(ptr addrspace(1) inreg %ptr) {
579; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_5:
580; GFX900:       ; %bb.0:
581; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
582; GFX900-NEXT:    ;;#ASMSTART
583; GFX900-NEXT:    ; def v[0:1]
584; GFX900-NEXT:    ;;#ASMEND
585; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
586; GFX900-NEXT:    v_mov_b32_e32 v2, 0
587; GFX900-NEXT:    v_perm_b32 v0, v0, v1, s4
588; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
589; GFX900-NEXT:    s_waitcnt vmcnt(0)
590; GFX900-NEXT:    s_setpc_b64 s[30:31]
591;
592; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_5:
593; GFX90A:       ; %bb.0:
594; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
595; GFX90A-NEXT:    ;;#ASMSTART
596; GFX90A-NEXT:    ; def v[0:1]
597; GFX90A-NEXT:    ;;#ASMEND
598; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
599; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
600; GFX90A-NEXT:    v_perm_b32 v0, v0, v1, s4
601; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
602; GFX90A-NEXT:    s_waitcnt vmcnt(0)
603; GFX90A-NEXT:    s_setpc_b64 s[30:31]
604;
605; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_5:
606; GFX940:       ; %bb.0:
607; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
608; GFX940-NEXT:    ;;#ASMSTART
609; GFX940-NEXT:    ; def v[0:1]
610; GFX940-NEXT:    ;;#ASMEND
611; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
612; GFX940-NEXT:    v_mov_b32_e32 v2, 0
613; GFX940-NEXT:    v_perm_b32 v0, v0, v1, s2
614; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
615; GFX940-NEXT:    s_waitcnt vmcnt(0)
616; GFX940-NEXT:    s_setpc_b64 s[30:31]
617  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
618  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
619  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 5>
620  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
621  ret void
622}
623
; Mask <7, 6>: lane 0 takes element 3 of %vec1, lane 1 takes element 2 of
; %vec1; %vec0 is not referenced by the mask.
; Only one asm def appears in the expected output; presumably the def of the
; unreferenced %vec0 is dropped.
; Expected code is one v_alignbit_b32 by 16 with v1 as both sources.
624define void @v_shuffle_v2bf16_v4bf16__7_6(ptr addrspace(1) inreg %ptr) {
625; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_6:
626; GFX900:       ; %bb.0:
627; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628; GFX900-NEXT:    ;;#ASMSTART
629; GFX900-NEXT:    ; def v[0:1]
630; GFX900-NEXT:    ;;#ASMEND
631; GFX900-NEXT:    v_mov_b32_e32 v2, 0
632; GFX900-NEXT:    v_alignbit_b32 v0, v1, v1, 16
633; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
634; GFX900-NEXT:    s_waitcnt vmcnt(0)
635; GFX900-NEXT:    s_setpc_b64 s[30:31]
636;
637; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_6:
638; GFX90A:       ; %bb.0:
639; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
640; GFX90A-NEXT:    ;;#ASMSTART
641; GFX90A-NEXT:    ; def v[0:1]
642; GFX90A-NEXT:    ;;#ASMEND
643; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
644; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v1, 16
645; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
646; GFX90A-NEXT:    s_waitcnt vmcnt(0)
647; GFX90A-NEXT:    s_setpc_b64 s[30:31]
648;
649; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_6:
650; GFX940:       ; %bb.0:
651; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
652; GFX940-NEXT:    ;;#ASMSTART
653; GFX940-NEXT:    ; def v[0:1]
654; GFX940-NEXT:    ;;#ASMEND
655; GFX940-NEXT:    v_mov_b32_e32 v2, 0
656; GFX940-NEXT:    v_alignbit_b32 v0, v1, v1, 16
657; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
658; GFX940-NEXT:    s_waitcnt vmcnt(0)
659; GFX940-NEXT:    s_setpc_b64 s[30:31]
660  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
661  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
662  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 6>
663  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
664  ret void
665}
666
; Mask <7, 7>: both lanes take element 3 of %vec1; %vec0 is not referenced
; by the mask.
; Only one asm def appears in the expected output; presumably the def of the
; unreferenced %vec0 is dropped.
; Expected code is one v_perm_b32 with v1 as both sources and selector
; 0x7060302 materialised into an SGPR (s4, or s2 on gfx940).
667define void @v_shuffle_v2bf16_v4bf16__7_7(ptr addrspace(1) inreg %ptr) {
668; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__7_7:
669; GFX900:       ; %bb.0:
670; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
671; GFX900-NEXT:    ;;#ASMSTART
672; GFX900-NEXT:    ; def v[0:1]
673; GFX900-NEXT:    ;;#ASMEND
674; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
675; GFX900-NEXT:    v_mov_b32_e32 v2, 0
676; GFX900-NEXT:    v_perm_b32 v0, v1, v1, s4
677; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
678; GFX900-NEXT:    s_waitcnt vmcnt(0)
679; GFX900-NEXT:    s_setpc_b64 s[30:31]
680;
681; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__7_7:
682; GFX90A:       ; %bb.0:
683; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
684; GFX90A-NEXT:    ;;#ASMSTART
685; GFX90A-NEXT:    ; def v[0:1]
686; GFX90A-NEXT:    ;;#ASMEND
687; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
688; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
689; GFX90A-NEXT:    v_perm_b32 v0, v1, v1, s4
690; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
691; GFX90A-NEXT:    s_waitcnt vmcnt(0)
692; GFX90A-NEXT:    s_setpc_b64 s[30:31]
693;
694; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__7_7:
695; GFX940:       ; %bb.0:
696; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
697; GFX940-NEXT:    ;;#ASMSTART
698; GFX940-NEXT:    ; def v[0:1]
699; GFX940-NEXT:    ;;#ASMEND
700; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
701; GFX940-NEXT:    v_mov_b32_e32 v2, 0
702; GFX940-NEXT:    v_perm_b32 v0, v1, v1, s2
703; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
704; GFX940-NEXT:    s_waitcnt vmcnt(0)
705; GFX940-NEXT:    s_setpc_b64 s[30:31]
706  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
707  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
708  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 7>
709  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
710  ret void
711}
712
; Mask <poison, 0>: lane 0 is poison, lane 1 takes element 0 of %vec0.
; Expected code is a single v_lshlrev_b32 of v0 by 16 before the store;
; no second source register is needed for the poison lane.
713define void @v_shuffle_v2bf16_v4bf16__u_0(ptr addrspace(1) inreg %ptr) {
714; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__u_0:
715; GFX900:       ; %bb.0:
716; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
717; GFX900-NEXT:    ;;#ASMSTART
718; GFX900-NEXT:    ; def v[0:1]
719; GFX900-NEXT:    ;;#ASMEND
720; GFX900-NEXT:    v_mov_b32_e32 v2, 0
721; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
722; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
723; GFX900-NEXT:    s_waitcnt vmcnt(0)
724; GFX900-NEXT:    s_setpc_b64 s[30:31]
725;
726; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__u_0:
727; GFX90A:       ; %bb.0:
728; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
729; GFX90A-NEXT:    ;;#ASMSTART
730; GFX90A-NEXT:    ; def v[0:1]
731; GFX90A-NEXT:    ;;#ASMEND
732; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
733; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
734; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
735; GFX90A-NEXT:    s_waitcnt vmcnt(0)
736; GFX90A-NEXT:    s_setpc_b64 s[30:31]
737;
738; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__u_0:
739; GFX940:       ; %bb.0:
740; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
741; GFX940-NEXT:    ;;#ASMSTART
742; GFX940-NEXT:    ; def v[0:1]
743; GFX940-NEXT:    ;;#ASMEND
744; GFX940-NEXT:    v_mov_b32_e32 v2, 0
745; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
746; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
747; GFX940-NEXT:    s_waitcnt vmcnt(0)
748; GFX940-NEXT:    s_setpc_b64 s[30:31]
749  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
750  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 poison, i32 0>
751  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
752  ret void
753}
754
; Mask zeroinitializer (<0, 0>): both lanes take element 0 of %vec0.
; Expected code is one v_perm_b32 with v0 as both sources and selector
; 0x5040100 materialised into an SGPR (s4, or s2 on gfx940).
755define void @v_shuffle_v2bf16_v4bf16__0_0(ptr addrspace(1) inreg %ptr) {
756; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_0:
757; GFX900:       ; %bb.0:
758; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
759; GFX900-NEXT:    ;;#ASMSTART
760; GFX900-NEXT:    ; def v[0:1]
761; GFX900-NEXT:    ;;#ASMEND
762; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
763; GFX900-NEXT:    v_mov_b32_e32 v2, 0
764; GFX900-NEXT:    v_perm_b32 v0, v0, v0, s4
765; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
766; GFX900-NEXT:    s_waitcnt vmcnt(0)
767; GFX900-NEXT:    s_setpc_b64 s[30:31]
768;
769; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_0:
770; GFX90A:       ; %bb.0:
771; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
772; GFX90A-NEXT:    ;;#ASMSTART
773; GFX90A-NEXT:    ; def v[0:1]
774; GFX90A-NEXT:    ;;#ASMEND
775; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
776; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
777; GFX90A-NEXT:    v_perm_b32 v0, v0, v0, s4
778; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
779; GFX90A-NEXT:    s_waitcnt vmcnt(0)
780; GFX90A-NEXT:    s_setpc_b64 s[30:31]
781;
782; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_0:
783; GFX940:       ; %bb.0:
784; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
785; GFX940-NEXT:    ;;#ASMSTART
786; GFX940-NEXT:    ; def v[0:1]
787; GFX940-NEXT:    ;;#ASMEND
788; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
789; GFX940-NEXT:    v_mov_b32_e32 v2, 0
790; GFX940-NEXT:    v_perm_b32 v0, v0, v0, s2
791; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
792; GFX940-NEXT:    s_waitcnt vmcnt(0)
793; GFX940-NEXT:    s_setpc_b64 s[30:31]
794  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
795  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> zeroinitializer
796  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
797  ret void
798}
799
800define void @v_shuffle_v2bf16_v4bf16__1_0(ptr addrspace(1) inreg %ptr) { ; Swap of the low pair: result is <%vec0[1], %vec0[0]>.
801; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_0:
802; GFX900:       ; %bb.0:
803; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
804; GFX900-NEXT:    ;;#ASMSTART
805; GFX900-NEXT:    ; def v[0:1]
806; GFX900-NEXT:    ;;#ASMEND
807; GFX900-NEXT:    v_mov_b32_e32 v2, 0
808; GFX900-NEXT:    v_alignbit_b32 v0, v0, v0, 16
809; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
810; GFX900-NEXT:    s_waitcnt vmcnt(0)
811; GFX900-NEXT:    s_setpc_b64 s[30:31]
812;
813; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_0:
814; GFX90A:       ; %bb.0:
815; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
816; GFX90A-NEXT:    ;;#ASMSTART
817; GFX90A-NEXT:    ; def v[0:1]
818; GFX90A-NEXT:    ;;#ASMEND
819; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
820; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v0, 16
821; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
822; GFX90A-NEXT:    s_waitcnt vmcnt(0)
823; GFX90A-NEXT:    s_setpc_b64 s[30:31]
824;
825; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_0:
826; GFX940:       ; %bb.0:
827; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
828; GFX940-NEXT:    ;;#ASMSTART
829; GFX940-NEXT:    ; def v[0:1]
830; GFX940-NEXT:    ;;#ASMEND
831; GFX940-NEXT:    v_mov_b32_e32 v2, 0
832; GFX940-NEXT:    v_alignbit_b32 v0, v0, v0, 16
833; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
834; GFX940-NEXT:    s_waitcnt vmcnt(0)
835; GFX940-NEXT:    s_setpc_b64 s[30:31]
836  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; opaque source vector produced by inline asm ("=v" output)
837  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 1, i32 0> ; both indices fall in the same 32-bit half of %vec0
838  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
839  ret void
840}
841
842define void @v_shuffle_v2bf16_v4bf16__2_0(ptr addrspace(1) inreg %ptr) { ; Result is <%vec0[2], %vec0[0]>.
843; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_0:
844; GFX900:       ; %bb.0:
845; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
846; GFX900-NEXT:    ;;#ASMSTART
847; GFX900-NEXT:    ; def v[0:1]
848; GFX900-NEXT:    ;;#ASMEND
849; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
850; GFX900-NEXT:    v_mov_b32_e32 v2, 0
851; GFX900-NEXT:    v_perm_b32 v0, v0, v1, s4
852; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
853; GFX900-NEXT:    s_waitcnt vmcnt(0)
854; GFX900-NEXT:    s_setpc_b64 s[30:31]
855;
856; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_0:
857; GFX90A:       ; %bb.0:
858; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
859; GFX90A-NEXT:    ;;#ASMSTART
860; GFX90A-NEXT:    ; def v[0:1]
861; GFX90A-NEXT:    ;;#ASMEND
862; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
863; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
864; GFX90A-NEXT:    v_perm_b32 v0, v0, v1, s4
865; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
866; GFX90A-NEXT:    s_waitcnt vmcnt(0)
867; GFX90A-NEXT:    s_setpc_b64 s[30:31]
868;
869; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_0:
870; GFX940:       ; %bb.0:
871; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
872; GFX940-NEXT:    ;;#ASMSTART
873; GFX940-NEXT:    ; def v[0:1]
874; GFX940-NEXT:    ;;#ASMEND
875; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
876; GFX940-NEXT:    v_mov_b32_e32 v2, 0
877; GFX940-NEXT:    v_perm_b32 v0, v0, v1, s2
878; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
879; GFX940-NEXT:    s_waitcnt vmcnt(0)
880; GFX940-NEXT:    s_setpc_b64 s[30:31]
881  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; opaque source vector produced by inline asm ("=v" output)
882  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 2, i32 0> ; picks the low element of each 32-bit half of %vec0
883  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
884  ret void
885}
886
887define void @v_shuffle_v2bf16_v4bf16__3_0(ptr addrspace(1) inreg %ptr) { ; Result is <%vec0[3], %vec0[0]>.
888; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_0:
889; GFX900:       ; %bb.0:
890; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
891; GFX900-NEXT:    ;;#ASMSTART
892; GFX900-NEXT:    ; def v[0:1]
893; GFX900-NEXT:    ;;#ASMEND
894; GFX900-NEXT:    v_mov_b32_e32 v2, 0
895; GFX900-NEXT:    v_alignbit_b32 v0, v0, v1, 16
896; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
897; GFX900-NEXT:    s_waitcnt vmcnt(0)
898; GFX900-NEXT:    s_setpc_b64 s[30:31]
899;
900; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_0:
901; GFX90A:       ; %bb.0:
902; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
903; GFX90A-NEXT:    ;;#ASMSTART
904; GFX90A-NEXT:    ; def v[0:1]
905; GFX90A-NEXT:    ;;#ASMEND
906; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
907; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v1, 16
908; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
909; GFX90A-NEXT:    s_waitcnt vmcnt(0)
910; GFX90A-NEXT:    s_setpc_b64 s[30:31]
911;
912; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_0:
913; GFX940:       ; %bb.0:
914; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
915; GFX940-NEXT:    ;;#ASMSTART
916; GFX940-NEXT:    ; def v[0:1]
917; GFX940-NEXT:    ;;#ASMEND
918; GFX940-NEXT:    v_mov_b32_e32 v2, 0
919; GFX940-NEXT:    v_alignbit_b32 v0, v0, v1, 16
920; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
921; GFX940-NEXT:    s_waitcnt vmcnt(0)
922; GFX940-NEXT:    s_setpc_b64 s[30:31]
923  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; opaque source vector produced by inline asm ("=v" output)
924  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 3, i32 0> ; last element of %vec0 paired with its first
925  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
926  ret void
927}
928
929define void @v_shuffle_v2bf16_v4bf16__4_0(ptr addrspace(1) inreg %ptr) { ; Lane 0 selects from the poison operand; lane 1 is %vec0[0].
930; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__4_0:
931; GFX900:       ; %bb.0:
932; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
933; GFX900-NEXT:    ;;#ASMSTART
934; GFX900-NEXT:    ; def v[0:1]
935; GFX900-NEXT:    ;;#ASMEND
936; GFX900-NEXT:    v_mov_b32_e32 v2, 0
937; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
938; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
939; GFX900-NEXT:    s_waitcnt vmcnt(0)
940; GFX900-NEXT:    s_setpc_b64 s[30:31]
941;
942; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__4_0:
943; GFX90A:       ; %bb.0:
944; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
945; GFX90A-NEXT:    ;;#ASMSTART
946; GFX90A-NEXT:    ; def v[0:1]
947; GFX90A-NEXT:    ;;#ASMEND
948; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
949; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
950; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
951; GFX90A-NEXT:    s_waitcnt vmcnt(0)
952; GFX90A-NEXT:    s_setpc_b64 s[30:31]
953;
954; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__4_0:
955; GFX940:       ; %bb.0:
956; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
957; GFX940-NEXT:    ;;#ASMSTART
958; GFX940-NEXT:    ; def v[0:1]
959; GFX940-NEXT:    ;;#ASMEND
960; GFX940-NEXT:    v_mov_b32_e32 v2, 0
961; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
962; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
963; GFX940-NEXT:    s_waitcnt vmcnt(0)
964; GFX940-NEXT:    s_setpc_b64 s[30:31]
965  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; opaque source vector produced by inline asm ("=v" output)
966  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 4, i32 0> ; index 4 is element 0 of the second (poison) operand, so only lane 1 is defined
967  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
968  ret void
969}
970
971define void @v_shuffle_v2bf16_v4bf16__5_0(ptr addrspace(1) inreg %ptr) { ; Two-source case: result is <%vec1[1], %vec0[0]>.
972; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_0:
973; GFX900:       ; %bb.0:
974; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
975; GFX900-NEXT:    ;;#ASMSTART
976; GFX900-NEXT:    ; def v[0:1]
977; GFX900-NEXT:    ;;#ASMEND
978; GFX900-NEXT:    v_mov_b32_e32 v3, 0
979; GFX900-NEXT:    ;;#ASMSTART
980; GFX900-NEXT:    ; def v[1:2]
981; GFX900-NEXT:    ;;#ASMEND
982; GFX900-NEXT:    v_alignbit_b32 v0, v0, v1, 16
983; GFX900-NEXT:    global_store_dword v3, v0, s[16:17]
984; GFX900-NEXT:    s_waitcnt vmcnt(0)
985; GFX900-NEXT:    s_setpc_b64 s[30:31]
986;
987; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_0:
988; GFX90A:       ; %bb.0:
989; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
990; GFX90A-NEXT:    ;;#ASMSTART
991; GFX90A-NEXT:    ; def v[0:1]
992; GFX90A-NEXT:    ;;#ASMEND
993; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
994; GFX90A-NEXT:    ;;#ASMSTART
995; GFX90A-NEXT:    ; def v[2:3]
996; GFX90A-NEXT:    ;;#ASMEND
997; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v2, 16
998; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
999; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1000; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1001;
1002; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_0:
1003; GFX940:       ; %bb.0:
1004; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1005; GFX940-NEXT:    ;;#ASMSTART
1006; GFX940-NEXT:    ; def v[0:1]
1007; GFX940-NEXT:    ;;#ASMEND
1008; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1009; GFX940-NEXT:    ;;#ASMSTART
1010; GFX940-NEXT:    ; def v[2:3]
1011; GFX940-NEXT:    ;;#ASMEND
1012; GFX940-NEXT:    s_nop 0
1013; GFX940-NEXT:    v_alignbit_b32 v0, v0, v2, 16
1014; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
1015; GFX940-NEXT:    s_waitcnt vmcnt(0)
1016; GFX940-NEXT:    s_setpc_b64 s[30:31]
1017  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; first opaque source vector produced by inline asm ("=v" output)
1018  %vec1 = call <4 x bfloat> asm "; def $0", "=v"() ; second opaque source vector, defined by its own asm block
1019  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 0> ; indices 4..7 address %vec1, so index 5 is %vec1[1]
1020  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1021  ret void
1022}
1023
1024define void @v_shuffle_v2bf16_v4bf16__6_0(ptr addrspace(1) inreg %ptr) { ; Two-source case: result is <%vec1[2], %vec0[0]>.
1025; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_0:
1026; GFX900:       ; %bb.0:
1027; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1028; GFX900-NEXT:    ;;#ASMSTART
1029; GFX900-NEXT:    ; def v[0:1]
1030; GFX900-NEXT:    ;;#ASMEND
1031; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1032; GFX900-NEXT:    v_mov_b32_e32 v3, 0
1033; GFX900-NEXT:    ;;#ASMSTART
1034; GFX900-NEXT:    ; def v[1:2]
1035; GFX900-NEXT:    ;;#ASMEND
1036; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
1037; GFX900-NEXT:    global_store_dword v3, v0, s[16:17]
1038; GFX900-NEXT:    s_waitcnt vmcnt(0)
1039; GFX900-NEXT:    s_setpc_b64 s[30:31]
1040;
1041; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_0:
1042; GFX90A:       ; %bb.0:
1043; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1044; GFX90A-NEXT:    ;;#ASMSTART
1045; GFX90A-NEXT:    ; def v[0:1]
1046; GFX90A-NEXT:    ;;#ASMEND
1047; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1048; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1049; GFX90A-NEXT:    ;;#ASMSTART
1050; GFX90A-NEXT:    ; def v[2:3]
1051; GFX90A-NEXT:    ;;#ASMEND
1052; GFX90A-NEXT:    v_perm_b32 v0, v0, v3, s4
1053; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
1054; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1055; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1056;
1057; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_0:
1058; GFX940:       ; %bb.0:
1059; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1060; GFX940-NEXT:    ;;#ASMSTART
1061; GFX940-NEXT:    ; def v[0:1]
1062; GFX940-NEXT:    ;;#ASMEND
1063; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1064; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1065; GFX940-NEXT:    ;;#ASMSTART
1066; GFX940-NEXT:    ; def v[2:3]
1067; GFX940-NEXT:    ;;#ASMEND
1068; GFX940-NEXT:    s_nop 0
1069; GFX940-NEXT:    v_perm_b32 v0, v0, v3, s2
1070; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
1071; GFX940-NEXT:    s_waitcnt vmcnt(0)
1072; GFX940-NEXT:    s_setpc_b64 s[30:31]
1073  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; first opaque source vector produced by inline asm ("=v" output)
1074  %vec1 = call <4 x bfloat> asm "; def $0", "=v"() ; second opaque source vector, defined by its own asm block
1075  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 0> ; indices 4..7 address %vec1, so index 6 is %vec1[2]
1076  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1077  ret void
1078}
1079
1080define void @v_shuffle_v2bf16_v4bf16__u_1(ptr addrspace(1) inreg %ptr) { ; Lane 0 is poison; lane 1 is %vec0[1].
1081; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__u_1:
1082; GFX900:       ; %bb.0:
1083; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1084; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1085; GFX900-NEXT:    ;;#ASMSTART
1086; GFX900-NEXT:    ; def v[0:1]
1087; GFX900-NEXT:    ;;#ASMEND
1088; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
1089; GFX900-NEXT:    s_waitcnt vmcnt(0)
1090; GFX900-NEXT:    s_setpc_b64 s[30:31]
1091;
1092; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__u_1:
1093; GFX90A:       ; %bb.0:
1094; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1095; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1096; GFX90A-NEXT:    ;;#ASMSTART
1097; GFX90A-NEXT:    ; def v[0:1]
1098; GFX90A-NEXT:    ;;#ASMEND
1099; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
1100; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1101; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1102;
1103; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__u_1:
1104; GFX940:       ; %bb.0:
1105; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1106; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1107; GFX940-NEXT:    ;;#ASMSTART
1108; GFX940-NEXT:    ; def v[0:1]
1109; GFX940-NEXT:    ;;#ASMEND
1110; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
1111; GFX940-NEXT:    s_waitcnt vmcnt(0)
1112; GFX940-NEXT:    s_setpc_b64 s[30:31]
1113  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; opaque source vector produced by inline asm ("=v" output)
1114  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 poison, i32 1> ; element 1 already sits in lane 1; the expected output stores the first source register unchanged
1115  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1116  ret void
1117}
1118
1119define void @v_shuffle_v2bf16_v4bf16__0_1(ptr addrspace(1) inreg %ptr) { ; Extract of the low pair: result is <%vec0[0], %vec0[1]>.
1120; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_1:
1121; GFX900:       ; %bb.0:
1122; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1123; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1124; GFX900-NEXT:    ;;#ASMSTART
1125; GFX900-NEXT:    ; def v[0:1]
1126; GFX900-NEXT:    ;;#ASMEND
1127; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
1128; GFX900-NEXT:    s_waitcnt vmcnt(0)
1129; GFX900-NEXT:    s_setpc_b64 s[30:31]
1130;
1131; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_1:
1132; GFX90A:       ; %bb.0:
1133; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1134; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1135; GFX90A-NEXT:    ;;#ASMSTART
1136; GFX90A-NEXT:    ; def v[0:1]
1137; GFX90A-NEXT:    ;;#ASMEND
1138; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
1139; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1140; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1141;
1142; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_1:
1143; GFX940:       ; %bb.0:
1144; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1145; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1146; GFX940-NEXT:    ;;#ASMSTART
1147; GFX940-NEXT:    ; def v[0:1]
1148; GFX940-NEXT:    ;;#ASMEND
1149; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
1150; GFX940-NEXT:    s_waitcnt vmcnt(0)
1151; GFX940-NEXT:    s_setpc_b64 s[30:31]
1152  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; opaque source vector produced by inline asm ("=v" output)
1153  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 0, i32 1> ; in-order low half; the expected output stores the first source register with no shuffle instruction
1154  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1155  ret void
1156}
1157
1158define void @v_shuffle_v2bf16_v4bf16__1_1(ptr addrspace(1) inreg %ptr) { ; Splat case: both result lanes take element 1 of %vec0.
1159; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_1:
1160; GFX900:       ; %bb.0:
1161; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1162; GFX900-NEXT:    ;;#ASMSTART
1163; GFX900-NEXT:    ; def v[0:1]
1164; GFX900-NEXT:    ;;#ASMEND
1165; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
1166; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1167; GFX900-NEXT:    v_perm_b32 v0, v0, v0, s4
1168; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
1169; GFX900-NEXT:    s_waitcnt vmcnt(0)
1170; GFX900-NEXT:    s_setpc_b64 s[30:31]
1171;
1172; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_1:
1173; GFX90A:       ; %bb.0:
1174; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1175; GFX90A-NEXT:    ;;#ASMSTART
1176; GFX90A-NEXT:    ; def v[0:1]
1177; GFX90A-NEXT:    ;;#ASMEND
1178; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
1179; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1180; GFX90A-NEXT:    v_perm_b32 v0, v0, v0, s4
1181; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
1182; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1183; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1184;
1185; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_1:
1186; GFX940:       ; %bb.0:
1187; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1188; GFX940-NEXT:    ;;#ASMSTART
1189; GFX940-NEXT:    ; def v[0:1]
1190; GFX940-NEXT:    ;;#ASMEND
1191; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
1192; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1193; GFX940-NEXT:    v_perm_b32 v0, v0, v0, s2
1194; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
1195; GFX940-NEXT:    s_waitcnt vmcnt(0)
1196; GFX940-NEXT:    s_setpc_b64 s[30:31]
1197  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; opaque source vector produced by inline asm ("=v" output)
1198  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 1, i32 1> ; duplicates the upper 16-bit element of the first 32-bit half
1199  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1200  ret void
1201}
1202
1203define void @v_shuffle_v2bf16_v4bf16__2_1(ptr addrspace(1) inreg %ptr) { ; Result is <%vec0[2], %vec0[1]>.
1204; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_1:
1205; GFX900:       ; %bb.0:
1206; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1207; GFX900-NEXT:    ;;#ASMSTART
1208; GFX900-NEXT:    ; def v[0:1]
1209; GFX900-NEXT:    ;;#ASMEND
1210; GFX900-NEXT:    s_mov_b32 s4, 0xffff
1211; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1212; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v0
1213; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
1214; GFX900-NEXT:    s_waitcnt vmcnt(0)
1215; GFX900-NEXT:    s_setpc_b64 s[30:31]
1216;
1217; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_1:
1218; GFX90A:       ; %bb.0:
1219; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1220; GFX90A-NEXT:    ;;#ASMSTART
1221; GFX90A-NEXT:    ; def v[0:1]
1222; GFX90A-NEXT:    ;;#ASMEND
1223; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
1224; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1225; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v0
1226; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
1227; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1228; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1229;
1230; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_1:
1231; GFX940:       ; %bb.0:
1232; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1233; GFX940-NEXT:    ;;#ASMSTART
1234; GFX940-NEXT:    ; def v[0:1]
1235; GFX940-NEXT:    ;;#ASMEND
1236; GFX940-NEXT:    s_mov_b32 s2, 0xffff
1237; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1238; GFX940-NEXT:    v_bfi_b32 v0, s2, v1, v0
1239; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
1240; GFX940-NEXT:    s_waitcnt vmcnt(0)
1241; GFX940-NEXT:    s_setpc_b64 s[30:31]
1242  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; opaque source vector produced by inline asm ("=v" output)
1243  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 2, i32 1> ; both elements keep their 16-bit position within a dword, hence the expected 0xffff bitfield insert
1244  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1245  ret void
1246}
1247
1248define void @v_shuffle_v2bf16_v4bf16__3_1(ptr addrspace(1) inreg %ptr) { ; Result is <%vec0[3], %vec0[1]>.
1249; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_1:
1250; GFX900:       ; %bb.0:
1251; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1252; GFX900-NEXT:    ;;#ASMSTART
1253; GFX900-NEXT:    ; def v[0:1]
1254; GFX900-NEXT:    ;;#ASMEND
1255; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
1256; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1257; GFX900-NEXT:    v_perm_b32 v0, v0, v1, s4
1258; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
1259; GFX900-NEXT:    s_waitcnt vmcnt(0)
1260; GFX900-NEXT:    s_setpc_b64 s[30:31]
1261;
1262; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_1:
1263; GFX90A:       ; %bb.0:
1264; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1265; GFX90A-NEXT:    ;;#ASMSTART
1266; GFX90A-NEXT:    ; def v[0:1]
1267; GFX90A-NEXT:    ;;#ASMEND
1268; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
1269; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1270; GFX90A-NEXT:    v_perm_b32 v0, v0, v1, s4
1271; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
1272; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1273; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1274;
1275; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_1:
1276; GFX940:       ; %bb.0:
1277; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1278; GFX940-NEXT:    ;;#ASMSTART
1279; GFX940-NEXT:    ; def v[0:1]
1280; GFX940-NEXT:    ;;#ASMEND
1281; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
1282; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1283; GFX940-NEXT:    v_perm_b32 v0, v0, v1, s2
1284; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
1285; GFX940-NEXT:    s_waitcnt vmcnt(0)
1286; GFX940-NEXT:    s_setpc_b64 s[30:31]
1287  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; opaque source vector produced by inline asm ("=v" output)
1288  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 3, i32 1> ; picks the upper element of each 32-bit half of %vec0
1289  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1290  ret void
1291}
1292
1293define void @v_shuffle_v2bf16_v4bf16__4_1(ptr addrspace(1) inreg %ptr) { ; Lane 0 selects from the poison operand; lane 1 is %vec0[1].
1294; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__4_1:
1295; GFX900:       ; %bb.0:
1296; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1297; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1298; GFX900-NEXT:    ;;#ASMSTART
1299; GFX900-NEXT:    ; def v[0:1]
1300; GFX900-NEXT:    ;;#ASMEND
1301; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
1302; GFX900-NEXT:    s_waitcnt vmcnt(0)
1303; GFX900-NEXT:    s_setpc_b64 s[30:31]
1304;
1305; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__4_1:
1306; GFX90A:       ; %bb.0:
1307; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1308; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1309; GFX90A-NEXT:    ;;#ASMSTART
1310; GFX90A-NEXT:    ; def v[0:1]
1311; GFX90A-NEXT:    ;;#ASMEND
1312; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
1313; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1314; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1315;
1316; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__4_1:
1317; GFX940:       ; %bb.0:
1318; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1319; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1320; GFX940-NEXT:    ;;#ASMSTART
1321; GFX940-NEXT:    ; def v[0:1]
1322; GFX940-NEXT:    ;;#ASMEND
1323; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
1324; GFX940-NEXT:    s_waitcnt vmcnt(0)
1325; GFX940-NEXT:    s_setpc_b64 s[30:31]
1326  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; opaque source vector produced by inline asm ("=v" output)
1327  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 4, i32 1> ; index 4 is element 0 of the second (poison) operand, so only lane 1 is defined
1328  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1329  ret void
1330}
1331
1332define void @v_shuffle_v2bf16_v4bf16__5_1(ptr addrspace(1) inreg %ptr) { ; Two-source case: result is <%vec1[1], %vec0[1]>.
1333; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_1:
1334; GFX900:       ; %bb.0:
1335; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1336; GFX900-NEXT:    ;;#ASMSTART
1337; GFX900-NEXT:    ; def v[0:1]
1338; GFX900-NEXT:    ;;#ASMEND
1339; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
1340; GFX900-NEXT:    v_mov_b32_e32 v3, 0
1341; GFX900-NEXT:    ;;#ASMSTART
1342; GFX900-NEXT:    ; def v[1:2]
1343; GFX900-NEXT:    ;;#ASMEND
1344; GFX900-NEXT:    v_perm_b32 v0, v0, v1, s4
1345; GFX900-NEXT:    global_store_dword v3, v0, s[16:17]
1346; GFX900-NEXT:    s_waitcnt vmcnt(0)
1347; GFX900-NEXT:    s_setpc_b64 s[30:31]
1348;
1349; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_1:
1350; GFX90A:       ; %bb.0:
1351; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1352; GFX90A-NEXT:    ;;#ASMSTART
1353; GFX90A-NEXT:    ; def v[0:1]
1354; GFX90A-NEXT:    ;;#ASMEND
1355; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
1356; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1357; GFX90A-NEXT:    ;;#ASMSTART
1358; GFX90A-NEXT:    ; def v[2:3]
1359; GFX90A-NEXT:    ;;#ASMEND
1360; GFX90A-NEXT:    v_perm_b32 v0, v0, v2, s4
1361; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
1362; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1363; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1364;
1365; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_1:
1366; GFX940:       ; %bb.0:
1367; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1368; GFX940-NEXT:    ;;#ASMSTART
1369; GFX940-NEXT:    ; def v[0:1]
1370; GFX940-NEXT:    ;;#ASMEND
1371; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
1372; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1373; GFX940-NEXT:    ;;#ASMSTART
1374; GFX940-NEXT:    ; def v[2:3]
1375; GFX940-NEXT:    ;;#ASMEND
1376; GFX940-NEXT:    s_nop 0
1377; GFX940-NEXT:    v_perm_b32 v0, v0, v2, s2
1378; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
1379; GFX940-NEXT:    s_waitcnt vmcnt(0)
1380; GFX940-NEXT:    s_setpc_b64 s[30:31]
1381  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; first opaque source vector produced by inline asm ("=v" output)
1382  %vec1 = call <4 x bfloat> asm "; def $0", "=v"() ; second opaque source vector, defined by its own asm block
1383  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 1> ; indices 4..7 address %vec1, so index 5 is %vec1[1]
1384  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1385  ret void
1386}
1387
1388define void @v_shuffle_v2bf16_v4bf16__6_1(ptr addrspace(1) inreg %ptr) { ; Two-source case: result is <%vec1[2], %vec0[1]>.
1389; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_1:
1390; GFX900:       ; %bb.0:
1391; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1392; GFX900-NEXT:    ;;#ASMSTART
1393; GFX900-NEXT:    ; def v[0:1]
1394; GFX900-NEXT:    ;;#ASMEND
1395; GFX900-NEXT:    s_mov_b32 s4, 0xffff
1396; GFX900-NEXT:    v_mov_b32_e32 v3, 0
1397; GFX900-NEXT:    ;;#ASMSTART
1398; GFX900-NEXT:    ; def v[1:2]
1399; GFX900-NEXT:    ;;#ASMEND
1400; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v0
1401; GFX900-NEXT:    global_store_dword v3, v0, s[16:17]
1402; GFX900-NEXT:    s_waitcnt vmcnt(0)
1403; GFX900-NEXT:    s_setpc_b64 s[30:31]
1404;
1405; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_1:
1406; GFX90A:       ; %bb.0:
1407; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1408; GFX90A-NEXT:    ;;#ASMSTART
1409; GFX90A-NEXT:    ; def v[0:1]
1410; GFX90A-NEXT:    ;;#ASMEND
1411; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
1412; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1413; GFX90A-NEXT:    ;;#ASMSTART
1414; GFX90A-NEXT:    ; def v[2:3]
1415; GFX90A-NEXT:    ;;#ASMEND
1416; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v0
1417; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
1418; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1419; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1420;
1421; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_1:
1422; GFX940:       ; %bb.0:
1423; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1424; GFX940-NEXT:    ;;#ASMSTART
1425; GFX940-NEXT:    ; def v[0:1]
1426; GFX940-NEXT:    ;;#ASMEND
1427; GFX940-NEXT:    s_mov_b32 s2, 0xffff
1428; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1429; GFX940-NEXT:    ;;#ASMSTART
1430; GFX940-NEXT:    ; def v[2:3]
1431; GFX940-NEXT:    ;;#ASMEND
1432; GFX940-NEXT:    s_nop 0
1433; GFX940-NEXT:    v_bfi_b32 v0, s2, v3, v0
1434; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
1435; GFX940-NEXT:    s_waitcnt vmcnt(0)
1436; GFX940-NEXT:    s_setpc_b64 s[30:31]
1437  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; first opaque source vector produced by inline asm ("=v" output)
1438  %vec1 = call <4 x bfloat> asm "; def $0", "=v"() ; second opaque source vector, defined by its own asm block
1439  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 1> ; indices 4..7 address %vec1, so index 6 is %vec1[2]
1440  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1441  ret void
1442}
1443
1444define void @v_shuffle_v2bf16_v4bf16__u_2(ptr addrspace(1) inreg %ptr) { ; Lane 0 is poison; lane 1 is %vec0[2].
1445; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__u_2:
1446; GFX900:       ; %bb.0:
1447; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1448; GFX900-NEXT:    ;;#ASMSTART
1449; GFX900-NEXT:    ; def v[0:1]
1450; GFX900-NEXT:    ;;#ASMEND
1451; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1452; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
1453; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
1454; GFX900-NEXT:    s_waitcnt vmcnt(0)
1455; GFX900-NEXT:    s_setpc_b64 s[30:31]
1456;
1457; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__u_2:
1458; GFX90A:       ; %bb.0:
1459; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1460; GFX90A-NEXT:    ;;#ASMSTART
1461; GFX90A-NEXT:    ; def v[0:1]
1462; GFX90A-NEXT:    ;;#ASMEND
1463; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1464; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
1465; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
1466; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1467; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1468;
1469; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__u_2:
1470; GFX940:       ; %bb.0:
1471; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1472; GFX940-NEXT:    ;;#ASMSTART
1473; GFX940-NEXT:    ; def v[0:1]
1474; GFX940-NEXT:    ;;#ASMEND
1475; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1476; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
1477; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
1478; GFX940-NEXT:    s_waitcnt vmcnt(0)
1479; GFX940-NEXT:    s_setpc_b64 s[30:31]
1480  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; opaque source vector produced by inline asm ("=v" output)
1481  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 poison, i32 2> ; element 2 must move from the low to the high 16 bits, hence the expected left shift by 16
1482  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1483  ret void
1484}
1485
1486define void @v_shuffle_v2bf16_v4bf16__0_2(ptr addrspace(1) inreg %ptr) { ; Result is <%vec0[0], %vec0[2]>.
1487; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_2:
1488; GFX900:       ; %bb.0:
1489; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1490; GFX900-NEXT:    ;;#ASMSTART
1491; GFX900-NEXT:    ; def v[0:1]
1492; GFX900-NEXT:    ;;#ASMEND
1493; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1494; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1495; GFX900-NEXT:    v_perm_b32 v0, v1, v0, s4
1496; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
1497; GFX900-NEXT:    s_waitcnt vmcnt(0)
1498; GFX900-NEXT:    s_setpc_b64 s[30:31]
1499;
1500; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_2:
1501; GFX90A:       ; %bb.0:
1502; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1503; GFX90A-NEXT:    ;;#ASMSTART
1504; GFX90A-NEXT:    ; def v[0:1]
1505; GFX90A-NEXT:    ;;#ASMEND
1506; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1507; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1508; GFX90A-NEXT:    v_perm_b32 v0, v1, v0, s4
1509; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
1510; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1511; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1512;
1513; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_2:
1514; GFX940:       ; %bb.0:
1515; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1516; GFX940-NEXT:    ;;#ASMSTART
1517; GFX940-NEXT:    ; def v[0:1]
1518; GFX940-NEXT:    ;;#ASMEND
1519; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1520; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1521; GFX940-NEXT:    v_perm_b32 v0, v1, v0, s2
1522; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
1523; GFX940-NEXT:    s_waitcnt vmcnt(0)
1524; GFX940-NEXT:    s_setpc_b64 s[30:31]
1525  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; opaque source vector produced by inline asm ("=v" output)
1526  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 0, i32 2> ; picks the low element of each 32-bit half, in source order
1527  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1528  ret void
1529}
1530
1531define void @v_shuffle_v2bf16_v4bf16__1_2(ptr addrspace(1) inreg %ptr) { ; Extract of the middle pair: result is <%vec0[1], %vec0[2]>.
1532; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_2:
1533; GFX900:       ; %bb.0:
1534; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1535; GFX900-NEXT:    ;;#ASMSTART
1536; GFX900-NEXT:    ; def v[0:1]
1537; GFX900-NEXT:    ;;#ASMEND
1538; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1539; GFX900-NEXT:    v_alignbit_b32 v0, v1, v0, 16
1540; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
1541; GFX900-NEXT:    s_waitcnt vmcnt(0)
1542; GFX900-NEXT:    s_setpc_b64 s[30:31]
1543;
1544; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_2:
1545; GFX90A:       ; %bb.0:
1546; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1547; GFX90A-NEXT:    ;;#ASMSTART
1548; GFX90A-NEXT:    ; def v[0:1]
1549; GFX90A-NEXT:    ;;#ASMEND
1550; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1551; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v0, 16
1552; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
1553; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1554; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1555;
1556; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_2:
1557; GFX940:       ; %bb.0:
1558; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1559; GFX940-NEXT:    ;;#ASMSTART
1560; GFX940-NEXT:    ; def v[0:1]
1561; GFX940-NEXT:    ;;#ASMEND
1562; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1563; GFX940-NEXT:    v_alignbit_b32 v0, v1, v0, 16
1564; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
1565; GFX940-NEXT:    s_waitcnt vmcnt(0)
1566; GFX940-NEXT:    s_setpc_b64 s[30:31]
1567  %vec0 = call <4 x bfloat> asm "; def $0", "=v"() ; opaque source vector produced by inline asm ("=v" output)
1568  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 1, i32 2> ; contiguous pair straddling the two 32-bit halves of %vec0
1569  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1570  ret void
1571}
1572
; Shuffle mask <2, 2>: both result lanes are %vec0[2] (a splat of element 2).
; The checks expect "v_perm_b32 v0, v1, v1, <sgpr>" with the selector constant
; 0x5040100 first moved into an SGPR (s4 on gfx900/gfx90a, s2 on gfx940).
1573define void @v_shuffle_v2bf16_v4bf16__2_2(ptr addrspace(1) inreg %ptr) {
1574; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_2:
1575; GFX900:       ; %bb.0:
1576; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1577; GFX900-NEXT:    ;;#ASMSTART
1578; GFX900-NEXT:    ; def v[0:1]
1579; GFX900-NEXT:    ;;#ASMEND
1580; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1581; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1582; GFX900-NEXT:    v_perm_b32 v0, v1, v1, s4
1583; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
1584; GFX900-NEXT:    s_waitcnt vmcnt(0)
1585; GFX900-NEXT:    s_setpc_b64 s[30:31]
1586;
1587; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_2:
1588; GFX90A:       ; %bb.0:
1589; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1590; GFX90A-NEXT:    ;;#ASMSTART
1591; GFX90A-NEXT:    ; def v[0:1]
1592; GFX90A-NEXT:    ;;#ASMEND
1593; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1594; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1595; GFX90A-NEXT:    v_perm_b32 v0, v1, v1, s4
1596; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
1597; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1598; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1599;
1600; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_2:
1601; GFX940:       ; %bb.0:
1602; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1603; GFX940-NEXT:    ;;#ASMSTART
1604; GFX940-NEXT:    ; def v[0:1]
1605; GFX940-NEXT:    ;;#ASMEND
1606; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1607; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1608; GFX940-NEXT:    v_perm_b32 v0, v1, v1, s2
1609; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
1610; GFX940-NEXT:    s_waitcnt vmcnt(0)
1611; GFX940-NEXT:    s_setpc_b64 s[30:31]
1612  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1613  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 2, i32 2>
1614  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1615  ret void
1616}
1617
; Shuffle mask <3, 2>: lane 0 = %vec0[3], lane 1 = %vec0[2], i.e. the two
; elements held in v1 are swapped. The checks expect this as a single
; "v_alignbit_b32 v0, v1, v1, 16" (both sources are v1) on every run line.
1618define void @v_shuffle_v2bf16_v4bf16__3_2(ptr addrspace(1) inreg %ptr) {
1619; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_2:
1620; GFX900:       ; %bb.0:
1621; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1622; GFX900-NEXT:    ;;#ASMSTART
1623; GFX900-NEXT:    ; def v[0:1]
1624; GFX900-NEXT:    ;;#ASMEND
1625; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1626; GFX900-NEXT:    v_alignbit_b32 v0, v1, v1, 16
1627; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
1628; GFX900-NEXT:    s_waitcnt vmcnt(0)
1629; GFX900-NEXT:    s_setpc_b64 s[30:31]
1630;
1631; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_2:
1632; GFX90A:       ; %bb.0:
1633; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1634; GFX90A-NEXT:    ;;#ASMSTART
1635; GFX90A-NEXT:    ; def v[0:1]
1636; GFX90A-NEXT:    ;;#ASMEND
1637; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1638; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v1, 16
1639; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
1640; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1641; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1642;
1643; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_2:
1644; GFX940:       ; %bb.0:
1645; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1646; GFX940-NEXT:    ;;#ASMSTART
1647; GFX940-NEXT:    ; def v[0:1]
1648; GFX940-NEXT:    ;;#ASMEND
1649; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1650; GFX940-NEXT:    v_alignbit_b32 v0, v1, v1, 16
1651; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
1652; GFX940-NEXT:    s_waitcnt vmcnt(0)
1653; GFX940-NEXT:    s_setpc_b64 s[30:31]
1654  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1655  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 3, i32 2>
1656  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1657  ret void
1658}
1659
; Shuffle mask <4, 2>: index 4 selects element 0 of the second operand, which
; is poison in this test, so only lane 1 (= %vec0[2]) is meaningful. The checks
; expect a single "v_lshlrev_b32_e32 v0, 16, v1" feeding the dword store.
1660define void @v_shuffle_v2bf16_v4bf16__4_2(ptr addrspace(1) inreg %ptr) {
1661; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__4_2:
1662; GFX900:       ; %bb.0:
1663; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1664; GFX900-NEXT:    ;;#ASMSTART
1665; GFX900-NEXT:    ; def v[0:1]
1666; GFX900-NEXT:    ;;#ASMEND
1667; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1668; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
1669; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
1670; GFX900-NEXT:    s_waitcnt vmcnt(0)
1671; GFX900-NEXT:    s_setpc_b64 s[30:31]
1672;
1673; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__4_2:
1674; GFX90A:       ; %bb.0:
1675; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1676; GFX90A-NEXT:    ;;#ASMSTART
1677; GFX90A-NEXT:    ; def v[0:1]
1678; GFX90A-NEXT:    ;;#ASMEND
1679; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1680; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
1681; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
1682; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1683; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1684;
1685; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__4_2:
1686; GFX940:       ; %bb.0:
1687; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1688; GFX940-NEXT:    ;;#ASMSTART
1689; GFX940-NEXT:    ; def v[0:1]
1690; GFX940-NEXT:    ;;#ASMEND
1691; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1692; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
1693; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
1694; GFX940-NEXT:    s_waitcnt vmcnt(0)
1695; GFX940-NEXT:    s_setpc_b64 s[30:31]
1696  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1697  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 4, i32 2>
1698  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1699  ret void
1700}
1701
; Shuffle mask <5, 2> across two inline-asm vectors: lane 0 = %vec1[1],
; lane 1 = %vec0[2]. The expected output has two asm defs (v[0:1] and v[2:3])
; combined by "v_alignbit_b32 v0, v1, v2, 16". The gfx940 checks additionally
; expect an "s_nop 0" between the second asm block and the alignbit.
1702define void @v_shuffle_v2bf16_v4bf16__5_2(ptr addrspace(1) inreg %ptr) {
1703; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_2:
1704; GFX900:       ; %bb.0:
1705; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1706; GFX900-NEXT:    ;;#ASMSTART
1707; GFX900-NEXT:    ; def v[0:1]
1708; GFX900-NEXT:    ;;#ASMEND
1709; GFX900-NEXT:    v_mov_b32_e32 v4, 0
1710; GFX900-NEXT:    ;;#ASMSTART
1711; GFX900-NEXT:    ; def v[2:3]
1712; GFX900-NEXT:    ;;#ASMEND
1713; GFX900-NEXT:    v_alignbit_b32 v0, v1, v2, 16
1714; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
1715; GFX900-NEXT:    s_waitcnt vmcnt(0)
1716; GFX900-NEXT:    s_setpc_b64 s[30:31]
1717;
1718; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_2:
1719; GFX90A:       ; %bb.0:
1720; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1721; GFX90A-NEXT:    ;;#ASMSTART
1722; GFX90A-NEXT:    ; def v[0:1]
1723; GFX90A-NEXT:    ;;#ASMEND
1724; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1725; GFX90A-NEXT:    ;;#ASMSTART
1726; GFX90A-NEXT:    ; def v[2:3]
1727; GFX90A-NEXT:    ;;#ASMEND
1728; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v2, 16
1729; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
1730; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1731; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1732;
1733; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_2:
1734; GFX940:       ; %bb.0:
1735; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1736; GFX940-NEXT:    ;;#ASMSTART
1737; GFX940-NEXT:    ; def v[0:1]
1738; GFX940-NEXT:    ;;#ASMEND
1739; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1740; GFX940-NEXT:    ;;#ASMSTART
1741; GFX940-NEXT:    ; def v[2:3]
1742; GFX940-NEXT:    ;;#ASMEND
1743; GFX940-NEXT:    s_nop 0
1744; GFX940-NEXT:    v_alignbit_b32 v0, v1, v2, 16
1745; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
1746; GFX940-NEXT:    s_waitcnt vmcnt(0)
1747; GFX940-NEXT:    s_setpc_b64 s[30:31]
1748  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1749  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
1750  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 2>
1751  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1752  ret void
1753}
1754
; Shuffle mask <6, 2> across two inline-asm vectors: lane 0 = %vec1[2],
; lane 1 = %vec0[2]. The checks expect "v_perm_b32 v0, v1, v3, <sgpr>" with the
; selector constant 0x5040100 in an SGPR (s4 on gfx900/gfx90a, s2 on gfx940);
; the gfx940 checks also expect an "s_nop 0" right before the v_perm_b32.
1755define void @v_shuffle_v2bf16_v4bf16__6_2(ptr addrspace(1) inreg %ptr) {
1756; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_2:
1757; GFX900:       ; %bb.0:
1758; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1759; GFX900-NEXT:    ;;#ASMSTART
1760; GFX900-NEXT:    ; def v[0:1]
1761; GFX900-NEXT:    ;;#ASMEND
1762; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1763; GFX900-NEXT:    v_mov_b32_e32 v4, 0
1764; GFX900-NEXT:    ;;#ASMSTART
1765; GFX900-NEXT:    ; def v[2:3]
1766; GFX900-NEXT:    ;;#ASMEND
1767; GFX900-NEXT:    v_perm_b32 v0, v1, v3, s4
1768; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
1769; GFX900-NEXT:    s_waitcnt vmcnt(0)
1770; GFX900-NEXT:    s_setpc_b64 s[30:31]
1771;
1772; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_2:
1773; GFX90A:       ; %bb.0:
1774; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1775; GFX90A-NEXT:    ;;#ASMSTART
1776; GFX90A-NEXT:    ; def v[0:1]
1777; GFX90A-NEXT:    ;;#ASMEND
1778; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1779; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1780; GFX90A-NEXT:    ;;#ASMSTART
1781; GFX90A-NEXT:    ; def v[2:3]
1782; GFX90A-NEXT:    ;;#ASMEND
1783; GFX90A-NEXT:    v_perm_b32 v0, v1, v3, s4
1784; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
1785; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1786; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1787;
1788; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_2:
1789; GFX940:       ; %bb.0:
1790; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1791; GFX940-NEXT:    ;;#ASMSTART
1792; GFX940-NEXT:    ; def v[0:1]
1793; GFX940-NEXT:    ;;#ASMEND
1794; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1795; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1796; GFX940-NEXT:    ;;#ASMSTART
1797; GFX940-NEXT:    ; def v[2:3]
1798; GFX940-NEXT:    ;;#ASMEND
1799; GFX940-NEXT:    s_nop 0
1800; GFX940-NEXT:    v_perm_b32 v0, v1, v3, s2
1801; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
1802; GFX940-NEXT:    s_waitcnt vmcnt(0)
1803; GFX940-NEXT:    s_setpc_b64 s[30:31]
1804  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1805  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
1806  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 2>
1807  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1808  ret void
1809}
1810
; Shuffle mask <poison, 3>: lane 0 is unconstrained, lane 1 = %vec0[3].
; The checks expect no shuffle instruction at all: v1 from the asm def is
; stored directly (v2 only holds the zero offset operand of the store).
1811define void @v_shuffle_v2bf16_v4bf16__u_3(ptr addrspace(1) inreg %ptr) {
1812; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__u_3:
1813; GFX900:       ; %bb.0:
1814; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1815; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1816; GFX900-NEXT:    ;;#ASMSTART
1817; GFX900-NEXT:    ; def v[0:1]
1818; GFX900-NEXT:    ;;#ASMEND
1819; GFX900-NEXT:    global_store_dword v2, v1, s[16:17]
1820; GFX900-NEXT:    s_waitcnt vmcnt(0)
1821; GFX900-NEXT:    s_setpc_b64 s[30:31]
1822;
1823; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__u_3:
1824; GFX90A:       ; %bb.0:
1825; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1826; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1827; GFX90A-NEXT:    ;;#ASMSTART
1828; GFX90A-NEXT:    ; def v[0:1]
1829; GFX90A-NEXT:    ;;#ASMEND
1830; GFX90A-NEXT:    global_store_dword v2, v1, s[16:17]
1831; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1832; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1833;
1834; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__u_3:
1835; GFX940:       ; %bb.0:
1836; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1837; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1838; GFX940-NEXT:    ;;#ASMSTART
1839; GFX940-NEXT:    ; def v[0:1]
1840; GFX940-NEXT:    ;;#ASMEND
1841; GFX940-NEXT:    global_store_dword v2, v1, s[0:1] sc0 sc1
1842; GFX940-NEXT:    s_waitcnt vmcnt(0)
1843; GFX940-NEXT:    s_setpc_b64 s[30:31]
1844  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1845  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 poison, i32 3>
1846  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1847  ret void
1848}
1849
; Shuffle mask <0, 3>: lane 0 = %vec0[0], lane 1 = %vec0[3]. The checks expect
; "v_bfi_b32 v0, <sgpr>, v0, v1" with the mask constant 0xffff first moved
; into an SGPR (s4 on gfx900/gfx90a, s2 on gfx940).
1850define void @v_shuffle_v2bf16_v4bf16__0_3(ptr addrspace(1) inreg %ptr) {
1851; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_3:
1852; GFX900:       ; %bb.0:
1853; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1854; GFX900-NEXT:    ;;#ASMSTART
1855; GFX900-NEXT:    ; def v[0:1]
1856; GFX900-NEXT:    ;;#ASMEND
1857; GFX900-NEXT:    s_mov_b32 s4, 0xffff
1858; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1859; GFX900-NEXT:    v_bfi_b32 v0, s4, v0, v1
1860; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
1861; GFX900-NEXT:    s_waitcnt vmcnt(0)
1862; GFX900-NEXT:    s_setpc_b64 s[30:31]
1863;
1864; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_3:
1865; GFX90A:       ; %bb.0:
1866; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1867; GFX90A-NEXT:    ;;#ASMSTART
1868; GFX90A-NEXT:    ; def v[0:1]
1869; GFX90A-NEXT:    ;;#ASMEND
1870; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
1871; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1872; GFX90A-NEXT:    v_bfi_b32 v0, s4, v0, v1
1873; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
1874; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1875; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1876;
1877; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_3:
1878; GFX940:       ; %bb.0:
1879; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1880; GFX940-NEXT:    ;;#ASMSTART
1881; GFX940-NEXT:    ; def v[0:1]
1882; GFX940-NEXT:    ;;#ASMEND
1883; GFX940-NEXT:    s_mov_b32 s2, 0xffff
1884; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1885; GFX940-NEXT:    v_bfi_b32 v0, s2, v0, v1
1886; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
1887; GFX940-NEXT:    s_waitcnt vmcnt(0)
1888; GFX940-NEXT:    s_setpc_b64 s[30:31]
1889  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1890  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 0, i32 3>
1891  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1892  ret void
1893}
1894
; Shuffle mask <1, 3>: lane 0 = %vec0[1], lane 1 = %vec0[3] (the odd elements).
; The checks expect "v_perm_b32 v0, v1, v0, <sgpr>" with the selector constant
; 0x7060302 first moved into an SGPR (s4 on gfx900/gfx90a, s2 on gfx940).
1895define void @v_shuffle_v2bf16_v4bf16__1_3(ptr addrspace(1) inreg %ptr) {
1896; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_3:
1897; GFX900:       ; %bb.0:
1898; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1899; GFX900-NEXT:    ;;#ASMSTART
1900; GFX900-NEXT:    ; def v[0:1]
1901; GFX900-NEXT:    ;;#ASMEND
1902; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
1903; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1904; GFX900-NEXT:    v_perm_b32 v0, v1, v0, s4
1905; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
1906; GFX900-NEXT:    s_waitcnt vmcnt(0)
1907; GFX900-NEXT:    s_setpc_b64 s[30:31]
1908;
1909; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_3:
1910; GFX90A:       ; %bb.0:
1911; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1912; GFX90A-NEXT:    ;;#ASMSTART
1913; GFX90A-NEXT:    ; def v[0:1]
1914; GFX90A-NEXT:    ;;#ASMEND
1915; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
1916; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1917; GFX90A-NEXT:    v_perm_b32 v0, v1, v0, s4
1918; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
1919; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1920; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1921;
1922; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_3:
1923; GFX940:       ; %bb.0:
1924; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1925; GFX940-NEXT:    ;;#ASMSTART
1926; GFX940-NEXT:    ; def v[0:1]
1927; GFX940-NEXT:    ;;#ASMEND
1928; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
1929; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1930; GFX940-NEXT:    v_perm_b32 v0, v1, v0, s2
1931; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
1932; GFX940-NEXT:    s_waitcnt vmcnt(0)
1933; GFX940-NEXT:    s_setpc_b64 s[30:31]
1934  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1935  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 1, i32 3>
1936  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1937  ret void
1938}
1939
; Shuffle mask <2, 3>: lane 0 = %vec0[2], lane 1 = %vec0[3], i.e. the upper
; half of the source vector in order. The checks expect no shuffle instruction:
; v1 from the asm def is stored directly.
1940define void @v_shuffle_v2bf16_v4bf16__2_3(ptr addrspace(1) inreg %ptr) {
1941; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_3:
1942; GFX900:       ; %bb.0:
1943; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1944; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1945; GFX900-NEXT:    ;;#ASMSTART
1946; GFX900-NEXT:    ; def v[0:1]
1947; GFX900-NEXT:    ;;#ASMEND
1948; GFX900-NEXT:    global_store_dword v2, v1, s[16:17]
1949; GFX900-NEXT:    s_waitcnt vmcnt(0)
1950; GFX900-NEXT:    s_setpc_b64 s[30:31]
1951;
1952; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_3:
1953; GFX90A:       ; %bb.0:
1954; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1955; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1956; GFX90A-NEXT:    ;;#ASMSTART
1957; GFX90A-NEXT:    ; def v[0:1]
1958; GFX90A-NEXT:    ;;#ASMEND
1959; GFX90A-NEXT:    global_store_dword v2, v1, s[16:17]
1960; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1961; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1962;
1963; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_3:
1964; GFX940:       ; %bb.0:
1965; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1966; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1967; GFX940-NEXT:    ;;#ASMSTART
1968; GFX940-NEXT:    ; def v[0:1]
1969; GFX940-NEXT:    ;;#ASMEND
1970; GFX940-NEXT:    global_store_dword v2, v1, s[0:1] sc0 sc1
1971; GFX940-NEXT:    s_waitcnt vmcnt(0)
1972; GFX940-NEXT:    s_setpc_b64 s[30:31]
1973  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1974  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 2, i32 3>
1975  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
1976  ret void
1977}
1978
; Shuffle mask <3, 3>: both result lanes are %vec0[3] (a splat of element 3).
; The checks expect "v_perm_b32 v0, v1, v1, <sgpr>" with the selector constant
; 0x7060302 first moved into an SGPR (s4 on gfx900/gfx90a, s2 on gfx940).
1979define void @v_shuffle_v2bf16_v4bf16__3_3(ptr addrspace(1) inreg %ptr) {
1980; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_3:
1981; GFX900:       ; %bb.0:
1982; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1983; GFX900-NEXT:    ;;#ASMSTART
1984; GFX900-NEXT:    ; def v[0:1]
1985; GFX900-NEXT:    ;;#ASMEND
1986; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
1987; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1988; GFX900-NEXT:    v_perm_b32 v0, v1, v1, s4
1989; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
1990; GFX900-NEXT:    s_waitcnt vmcnt(0)
1991; GFX900-NEXT:    s_setpc_b64 s[30:31]
1992;
1993; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_3:
1994; GFX90A:       ; %bb.0:
1995; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1996; GFX90A-NEXT:    ;;#ASMSTART
1997; GFX90A-NEXT:    ; def v[0:1]
1998; GFX90A-NEXT:    ;;#ASMEND
1999; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2000; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2001; GFX90A-NEXT:    v_perm_b32 v0, v1, v1, s4
2002; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
2003; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2004; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2005;
2006; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_3:
2007; GFX940:       ; %bb.0:
2008; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2009; GFX940-NEXT:    ;;#ASMSTART
2010; GFX940-NEXT:    ; def v[0:1]
2011; GFX940-NEXT:    ;;#ASMEND
2012; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2013; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2014; GFX940-NEXT:    v_perm_b32 v0, v1, v1, s2
2015; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
2016; GFX940-NEXT:    s_waitcnt vmcnt(0)
2017; GFX940-NEXT:    s_setpc_b64 s[30:31]
2018  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2019  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 3, i32 3>
2020  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2021  ret void
2022}
2023
; Shuffle mask <4, 3>: index 4 selects element 0 of the second operand, which
; is poison in this test, so only lane 1 (= %vec0[3]) is meaningful. The checks
; expect no shuffle instruction: v1 from the asm def is stored directly.
2024define void @v_shuffle_v2bf16_v4bf16__4_3(ptr addrspace(1) inreg %ptr) {
2025; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__4_3:
2026; GFX900:       ; %bb.0:
2027; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2028; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2029; GFX900-NEXT:    ;;#ASMSTART
2030; GFX900-NEXT:    ; def v[0:1]
2031; GFX900-NEXT:    ;;#ASMEND
2032; GFX900-NEXT:    global_store_dword v2, v1, s[16:17]
2033; GFX900-NEXT:    s_waitcnt vmcnt(0)
2034; GFX900-NEXT:    s_setpc_b64 s[30:31]
2035;
2036; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__4_3:
2037; GFX90A:       ; %bb.0:
2038; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2039; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2040; GFX90A-NEXT:    ;;#ASMSTART
2041; GFX90A-NEXT:    ; def v[0:1]
2042; GFX90A-NEXT:    ;;#ASMEND
2043; GFX90A-NEXT:    global_store_dword v2, v1, s[16:17]
2044; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2045; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2046;
2047; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__4_3:
2048; GFX940:       ; %bb.0:
2049; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2050; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2051; GFX940-NEXT:    ;;#ASMSTART
2052; GFX940-NEXT:    ; def v[0:1]
2053; GFX940-NEXT:    ;;#ASMEND
2054; GFX940-NEXT:    global_store_dword v2, v1, s[0:1] sc0 sc1
2055; GFX940-NEXT:    s_waitcnt vmcnt(0)
2056; GFX940-NEXT:    s_setpc_b64 s[30:31]
2057  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2058  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 4, i32 3>
2059  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2060  ret void
2061}
2062
; Shuffle mask <5, 3> across two inline-asm vectors: lane 0 = %vec1[1],
; lane 1 = %vec0[3]. The checks expect "v_perm_b32 v0, v1, v2, <sgpr>" with the
; selector constant 0x7060302 in an SGPR (s4 on gfx900/gfx90a, s2 on gfx940);
; the gfx940 checks also expect an "s_nop 0" right before the v_perm_b32.
2063define void @v_shuffle_v2bf16_v4bf16__5_3(ptr addrspace(1) inreg %ptr) {
2064; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_3:
2065; GFX900:       ; %bb.0:
2066; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2067; GFX900-NEXT:    ;;#ASMSTART
2068; GFX900-NEXT:    ; def v[0:1]
2069; GFX900-NEXT:    ;;#ASMEND
2070; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2071; GFX900-NEXT:    v_mov_b32_e32 v4, 0
2072; GFX900-NEXT:    ;;#ASMSTART
2073; GFX900-NEXT:    ; def v[2:3]
2074; GFX900-NEXT:    ;;#ASMEND
2075; GFX900-NEXT:    v_perm_b32 v0, v1, v2, s4
2076; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
2077; GFX900-NEXT:    s_waitcnt vmcnt(0)
2078; GFX900-NEXT:    s_setpc_b64 s[30:31]
2079;
2080; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_3:
2081; GFX90A:       ; %bb.0:
2082; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2083; GFX90A-NEXT:    ;;#ASMSTART
2084; GFX90A-NEXT:    ; def v[0:1]
2085; GFX90A-NEXT:    ;;#ASMEND
2086; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2087; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2088; GFX90A-NEXT:    ;;#ASMSTART
2089; GFX90A-NEXT:    ; def v[2:3]
2090; GFX90A-NEXT:    ;;#ASMEND
2091; GFX90A-NEXT:    v_perm_b32 v0, v1, v2, s4
2092; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
2093; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2094; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2095;
2096; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_3:
2097; GFX940:       ; %bb.0:
2098; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2099; GFX940-NEXT:    ;;#ASMSTART
2100; GFX940-NEXT:    ; def v[0:1]
2101; GFX940-NEXT:    ;;#ASMEND
2102; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2103; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2104; GFX940-NEXT:    ;;#ASMSTART
2105; GFX940-NEXT:    ; def v[2:3]
2106; GFX940-NEXT:    ;;#ASMEND
2107; GFX940-NEXT:    s_nop 0
2108; GFX940-NEXT:    v_perm_b32 v0, v1, v2, s2
2109; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
2110; GFX940-NEXT:    s_waitcnt vmcnt(0)
2111; GFX940-NEXT:    s_setpc_b64 s[30:31]
2112  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2113  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2114  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 3>
2115  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2116  ret void
2117}
2118
; Shuffle mask <6, 3> across two inline-asm vectors: lane 0 = %vec1[2],
; lane 1 = %vec0[3]. The checks expect "v_bfi_b32 v0, <sgpr>, v3, v1" with the
; mask constant 0xffff in an SGPR (s4 on gfx900/gfx90a, s2 on gfx940); the
; gfx940 checks also expect an "s_nop 0" right before the v_bfi_b32.
2119define void @v_shuffle_v2bf16_v4bf16__6_3(ptr addrspace(1) inreg %ptr) {
2120; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_3:
2121; GFX900:       ; %bb.0:
2122; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2123; GFX900-NEXT:    ;;#ASMSTART
2124; GFX900-NEXT:    ; def v[0:1]
2125; GFX900-NEXT:    ;;#ASMEND
2126; GFX900-NEXT:    s_mov_b32 s4, 0xffff
2127; GFX900-NEXT:    v_mov_b32_e32 v4, 0
2128; GFX900-NEXT:    ;;#ASMSTART
2129; GFX900-NEXT:    ; def v[2:3]
2130; GFX900-NEXT:    ;;#ASMEND
2131; GFX900-NEXT:    v_bfi_b32 v0, s4, v3, v1
2132; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
2133; GFX900-NEXT:    s_waitcnt vmcnt(0)
2134; GFX900-NEXT:    s_setpc_b64 s[30:31]
2135;
2136; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_3:
2137; GFX90A:       ; %bb.0:
2138; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2139; GFX90A-NEXT:    ;;#ASMSTART
2140; GFX90A-NEXT:    ; def v[0:1]
2141; GFX90A-NEXT:    ;;#ASMEND
2142; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
2143; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2144; GFX90A-NEXT:    ;;#ASMSTART
2145; GFX90A-NEXT:    ; def v[2:3]
2146; GFX90A-NEXT:    ;;#ASMEND
2147; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v1
2148; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
2149; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2150; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2151;
2152; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_3:
2153; GFX940:       ; %bb.0:
2154; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2155; GFX940-NEXT:    ;;#ASMSTART
2156; GFX940-NEXT:    ; def v[0:1]
2157; GFX940-NEXT:    ;;#ASMEND
2158; GFX940-NEXT:    s_mov_b32 s2, 0xffff
2159; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2160; GFX940-NEXT:    ;;#ASMSTART
2161; GFX940-NEXT:    ; def v[2:3]
2162; GFX940-NEXT:    ;;#ASMEND
2163; GFX940-NEXT:    s_nop 0
2164; GFX940-NEXT:    v_bfi_b32 v0, s2, v3, v1
2165; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
2166; GFX940-NEXT:    s_waitcnt vmcnt(0)
2167; GFX940-NEXT:    s_setpc_b64 s[30:31]
2168  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2169  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2170  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 3>
2171  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2172  ret void
2173}
2174
; Shuffle mask <poison, 4>: lane 0 is unconstrained and index 4 selects
; element 0 of the poison second operand, so no lane carries a defined value.
; All run lines share one check block that expects only the entry s_waitcnt
; and the return; neither the inline asm nor the store appears in the output.
2175define void @v_shuffle_v2bf16_v4bf16__u_4(ptr addrspace(1) inreg %ptr) {
2176; GFX9-LABEL: v_shuffle_v2bf16_v4bf16__u_4:
2177; GFX9:       ; %bb.0:
2178; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2179; GFX9-NEXT:    s_setpc_b64 s[30:31]
2180  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2181  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 poison, i32 4>
2182  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2183  ret void
2184}
2185
; Shuffle mask <0, 4>: lane 0 = %vec0[0]; index 4 selects element 0 of the
; second operand, which is poison in this test. The checks expect no shuffle
; instruction: v0 from the asm def is stored directly.
2186define void @v_shuffle_v2bf16_v4bf16__0_4(ptr addrspace(1) inreg %ptr) {
2187; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_4:
2188; GFX900:       ; %bb.0:
2189; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2190; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2191; GFX900-NEXT:    ;;#ASMSTART
2192; GFX900-NEXT:    ; def v[0:1]
2193; GFX900-NEXT:    ;;#ASMEND
2194; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
2195; GFX900-NEXT:    s_waitcnt vmcnt(0)
2196; GFX900-NEXT:    s_setpc_b64 s[30:31]
2197;
2198; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_4:
2199; GFX90A:       ; %bb.0:
2200; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2201; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2202; GFX90A-NEXT:    ;;#ASMSTART
2203; GFX90A-NEXT:    ; def v[0:1]
2204; GFX90A-NEXT:    ;;#ASMEND
2205; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
2206; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2207; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2208;
2209; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_4:
2210; GFX940:       ; %bb.0:
2211; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2212; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2213; GFX940-NEXT:    ;;#ASMSTART
2214; GFX940-NEXT:    ; def v[0:1]
2215; GFX940-NEXT:    ;;#ASMEND
2216; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
2217; GFX940-NEXT:    s_waitcnt vmcnt(0)
2218; GFX940-NEXT:    s_setpc_b64 s[30:31]
2219  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2220  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 0, i32 4>
2221  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2222  ret void
2223}
2224
; Shuffle mask <1, 4>: lane 0 = %vec0[1]; index 4 selects element 0 of the
; second operand, which is poison in this test. The checks expect
; "v_alignbit_b32 v0, <sgpr>, v0, 16", where the SGPR source (s4 on
; gfx900/gfx90a, s0 on gfx940) is never written in the expected output.
; NOTE(review): that SGPR presumably stands in for the poison lane - confirm.
2225define void @v_shuffle_v2bf16_v4bf16__1_4(ptr addrspace(1) inreg %ptr) {
2226; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_4:
2227; GFX900:       ; %bb.0:
2228; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2229; GFX900-NEXT:    ;;#ASMSTART
2230; GFX900-NEXT:    ; def v[0:1]
2231; GFX900-NEXT:    ;;#ASMEND
2232; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2233; GFX900-NEXT:    v_alignbit_b32 v0, s4, v0, 16
2234; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
2235; GFX900-NEXT:    s_waitcnt vmcnt(0)
2236; GFX900-NEXT:    s_setpc_b64 s[30:31]
2237;
2238; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_4:
2239; GFX90A:       ; %bb.0:
2240; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2241; GFX90A-NEXT:    ;;#ASMSTART
2242; GFX90A-NEXT:    ; def v[0:1]
2243; GFX90A-NEXT:    ;;#ASMEND
2244; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2245; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v0, 16
2246; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
2247; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2248; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2249;
2250; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_4:
2251; GFX940:       ; %bb.0:
2252; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2253; GFX940-NEXT:    ;;#ASMSTART
2254; GFX940-NEXT:    ; def v[0:1]
2255; GFX940-NEXT:    ;;#ASMEND
2256; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2257; GFX940-NEXT:    v_alignbit_b32 v0, s0, v0, 16
2258; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
2259; GFX940-NEXT:    s_waitcnt vmcnt(0)
2260; GFX940-NEXT:    s_setpc_b64 s[30:31]
2261  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2262  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 1, i32 4>
2263  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2264  ret void
2265}
2266
; Shuffle mask <2, 4>: lane 0 = %vec0[2]; index 4 selects element 0 of the
; second operand, which is poison in this test. The checks expect no shuffle
; instruction: v1 from the asm def is stored directly.
2267define void @v_shuffle_v2bf16_v4bf16__2_4(ptr addrspace(1) inreg %ptr) {
2268; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_4:
2269; GFX900:       ; %bb.0:
2270; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2271; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2272; GFX900-NEXT:    ;;#ASMSTART
2273; GFX900-NEXT:    ; def v[0:1]
2274; GFX900-NEXT:    ;;#ASMEND
2275; GFX900-NEXT:    global_store_dword v2, v1, s[16:17]
2276; GFX900-NEXT:    s_waitcnt vmcnt(0)
2277; GFX900-NEXT:    s_setpc_b64 s[30:31]
2278;
2279; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_4:
2280; GFX90A:       ; %bb.0:
2281; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2282; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2283; GFX90A-NEXT:    ;;#ASMSTART
2284; GFX90A-NEXT:    ; def v[0:1]
2285; GFX90A-NEXT:    ;;#ASMEND
2286; GFX90A-NEXT:    global_store_dword v2, v1, s[16:17]
2287; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2288; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2289;
2290; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_4:
2291; GFX940:       ; %bb.0:
2292; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2293; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2294; GFX940-NEXT:    ;;#ASMSTART
2295; GFX940-NEXT:    ; def v[0:1]
2296; GFX940-NEXT:    ;;#ASMEND
2297; GFX940-NEXT:    global_store_dword v2, v1, s[0:1] sc0 sc1
2298; GFX940-NEXT:    s_waitcnt vmcnt(0)
2299; GFX940-NEXT:    s_setpc_b64 s[30:31]
2300  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2301  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 2, i32 4>
2302  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2303  ret void
2304}
2305
; Shuffle mask <3, 4>: lane 0 = %vec0[3]; index 4 selects element 0 of the
; second operand, which is poison in this test. The checks expect
; "v_alignbit_b32 v0, <sgpr>, v1, 16", where the SGPR source (s4 on
; gfx900/gfx90a, s0 on gfx940) is never written in the expected output.
; NOTE(review): that SGPR presumably stands in for the poison lane - confirm.
2306define void @v_shuffle_v2bf16_v4bf16__3_4(ptr addrspace(1) inreg %ptr) {
2307; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_4:
2308; GFX900:       ; %bb.0:
2309; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2310; GFX900-NEXT:    ;;#ASMSTART
2311; GFX900-NEXT:    ; def v[0:1]
2312; GFX900-NEXT:    ;;#ASMEND
2313; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2314; GFX900-NEXT:    v_alignbit_b32 v0, s4, v1, 16
2315; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
2316; GFX900-NEXT:    s_waitcnt vmcnt(0)
2317; GFX900-NEXT:    s_setpc_b64 s[30:31]
2318;
2319; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_4:
2320; GFX90A:       ; %bb.0:
2321; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2322; GFX90A-NEXT:    ;;#ASMSTART
2323; GFX90A-NEXT:    ; def v[0:1]
2324; GFX90A-NEXT:    ;;#ASMEND
2325; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2326; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v1, 16
2327; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
2328; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2329; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2330;
2331; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_4:
2332; GFX940:       ; %bb.0:
2333; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2334; GFX940-NEXT:    ;;#ASMSTART
2335; GFX940-NEXT:    ; def v[0:1]
2336; GFX940-NEXT:    ;;#ASMEND
2337; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2338; GFX940-NEXT:    v_alignbit_b32 v0, s0, v1, 16
2339; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
2340; GFX940-NEXT:    s_waitcnt vmcnt(0)
2341; GFX940-NEXT:    s_setpc_b64 s[30:31]
2342  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2343  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 3, i32 4>
2344  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2345  ret void
2346}
2347
; Mask <4, 4> with a poison second operand: both lanes index the poison
; operand, so no lane of the result is constrained. The expected output,
; shared by all three run lines through the common prefix, is only the
; entry waitcnt followed by the return; no asm def marker and no store
; instruction appear.
2348define void @v_shuffle_v2bf16_v4bf16__4_4(ptr addrspace(1) inreg %ptr) {
2349; GFX9-LABEL: v_shuffle_v2bf16_v4bf16__4_4:
2350; GFX9:       ; %bb.0:
2351; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2352; GFX9-NEXT:    s_setpc_b64 s[30:31]
2353  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2354  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 4, i32 4>
2355  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2356  ret void
2357}
2358
; Mask <5, 4>: lane 0 takes element 1 and lane 1 takes element 0 of %vec1
; (the low pair of the second operand, swapped). %vec0 is not referenced by
; the mask, and the expected output contains a single asm def marker. The
; swap is expected as v_alignbit_b32 with v0 as both sources and a shift
; of 16.
2359define void @v_shuffle_v2bf16_v4bf16__5_4(ptr addrspace(1) inreg %ptr) {
2360; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_4:
2361; GFX900:       ; %bb.0:
2362; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2363; GFX900-NEXT:    ;;#ASMSTART
2364; GFX900-NEXT:    ; def v[0:1]
2365; GFX900-NEXT:    ;;#ASMEND
2366; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2367; GFX900-NEXT:    v_alignbit_b32 v0, v0, v0, 16
2368; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
2369; GFX900-NEXT:    s_waitcnt vmcnt(0)
2370; GFX900-NEXT:    s_setpc_b64 s[30:31]
2371;
2372; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_4:
2373; GFX90A:       ; %bb.0:
2374; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2375; GFX90A-NEXT:    ;;#ASMSTART
2376; GFX90A-NEXT:    ; def v[0:1]
2377; GFX90A-NEXT:    ;;#ASMEND
2378; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2379; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v0, 16
2380; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
2381; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2382; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2383;
2384; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_4:
2385; GFX940:       ; %bb.0:
2386; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2387; GFX940-NEXT:    ;;#ASMSTART
2388; GFX940-NEXT:    ; def v[0:1]
2389; GFX940-NEXT:    ;;#ASMEND
2390; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2391; GFX940-NEXT:    v_alignbit_b32 v0, v0, v0, 16
2392; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
2393; GFX940-NEXT:    s_waitcnt vmcnt(0)
2394; GFX940-NEXT:    s_setpc_b64 s[30:31]
2395  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2396  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2397  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 4>
2398  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2399  ret void
2400}
2401
; Mask <6, 4>: lane 0 takes element 2 and lane 1 takes element 0 of %vec1.
; %vec0 is not referenced by the mask, and the expected output contains a
; single asm def marker. The two halves are expected to be combined by
; v_perm_b32 on v0 and v1 with the selector constant 0x5040100.
2402define void @v_shuffle_v2bf16_v4bf16__6_4(ptr addrspace(1) inreg %ptr) {
2403; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_4:
2404; GFX900:       ; %bb.0:
2405; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2406; GFX900-NEXT:    ;;#ASMSTART
2407; GFX900-NEXT:    ; def v[0:1]
2408; GFX900-NEXT:    ;;#ASMEND
2409; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
2410; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2411; GFX900-NEXT:    v_perm_b32 v0, v0, v1, s4
2412; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
2413; GFX900-NEXT:    s_waitcnt vmcnt(0)
2414; GFX900-NEXT:    s_setpc_b64 s[30:31]
2415;
2416; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_4:
2417; GFX90A:       ; %bb.0:
2418; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2419; GFX90A-NEXT:    ;;#ASMSTART
2420; GFX90A-NEXT:    ; def v[0:1]
2421; GFX90A-NEXT:    ;;#ASMEND
2422; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
2423; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2424; GFX90A-NEXT:    v_perm_b32 v0, v0, v1, s4
2425; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
2426; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2427; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2428;
2429; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_4:
2430; GFX940:       ; %bb.0:
2431; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2432; GFX940-NEXT:    ;;#ASMSTART
2433; GFX940-NEXT:    ; def v[0:1]
2434; GFX940-NEXT:    ;;#ASMEND
2435; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
2436; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2437; GFX940-NEXT:    v_perm_b32 v0, v0, v1, s2
2438; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
2439; GFX940-NEXT:    s_waitcnt vmcnt(0)
2440; GFX940-NEXT:    s_setpc_b64 s[30:31]
2441  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2442  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2443  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 4>
2444  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2445  ret void
2446}
2447
; Mask <poison, 5>: lane 0 is unconstrained and lane 1 takes element 1 of
; %vec1. %vec0 is not referenced by the mask, and the expected output
; contains a single asm def marker. No shuffle instruction is expected:
; v0 of the defined pair is stored unmodified.
2448define void @v_shuffle_v2bf16_v4bf16__u_5(ptr addrspace(1) inreg %ptr) {
2449; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__u_5:
2450; GFX900:       ; %bb.0:
2451; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2452; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2453; GFX900-NEXT:    ;;#ASMSTART
2454; GFX900-NEXT:    ; def v[0:1]
2455; GFX900-NEXT:    ;;#ASMEND
2456; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
2457; GFX900-NEXT:    s_waitcnt vmcnt(0)
2458; GFX900-NEXT:    s_setpc_b64 s[30:31]
2459;
2460; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__u_5:
2461; GFX90A:       ; %bb.0:
2462; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2463; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2464; GFX90A-NEXT:    ;;#ASMSTART
2465; GFX90A-NEXT:    ; def v[0:1]
2466; GFX90A-NEXT:    ;;#ASMEND
2467; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
2468; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2469; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2470;
2471; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__u_5:
2472; GFX940:       ; %bb.0:
2473; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2474; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2475; GFX940-NEXT:    ;;#ASMSTART
2476; GFX940-NEXT:    ; def v[0:1]
2477; GFX940-NEXT:    ;;#ASMEND
2478; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
2479; GFX940-NEXT:    s_waitcnt vmcnt(0)
2480; GFX940-NEXT:    s_setpc_b64 s[30:31]
2481  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2482  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2483  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 poison, i32 5>
2484  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2485  ret void
2486}
2487
; Mask <0, 5>: lane 0 takes element 0 of %vec0 and lane 1 takes element 1
; of %vec1, so both inline-asm definitions are live and two def markers are
; expected. The halves are merged with v_bfi_b32 using the mask 0xffff.
; The gfx900 expectations place the second definition in v[1:2], whereas
; the other two targets use v[2:3]; gfx940 additionally expects an s_nop
; before the merge.
2488define void @v_shuffle_v2bf16_v4bf16__0_5(ptr addrspace(1) inreg %ptr) {
2489; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_5:
2490; GFX900:       ; %bb.0:
2491; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2492; GFX900-NEXT:    ;;#ASMSTART
2493; GFX900-NEXT:    ; def v[0:1]
2494; GFX900-NEXT:    ;;#ASMEND
2495; GFX900-NEXT:    s_mov_b32 s4, 0xffff
2496; GFX900-NEXT:    v_mov_b32_e32 v3, 0
2497; GFX900-NEXT:    ;;#ASMSTART
2498; GFX900-NEXT:    ; def v[1:2]
2499; GFX900-NEXT:    ;;#ASMEND
2500; GFX900-NEXT:    v_bfi_b32 v0, s4, v0, v1
2501; GFX900-NEXT:    global_store_dword v3, v0, s[16:17]
2502; GFX900-NEXT:    s_waitcnt vmcnt(0)
2503; GFX900-NEXT:    s_setpc_b64 s[30:31]
2504;
2505; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_5:
2506; GFX90A:       ; %bb.0:
2507; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2508; GFX90A-NEXT:    ;;#ASMSTART
2509; GFX90A-NEXT:    ; def v[0:1]
2510; GFX90A-NEXT:    ;;#ASMEND
2511; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
2512; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2513; GFX90A-NEXT:    ;;#ASMSTART
2514; GFX90A-NEXT:    ; def v[2:3]
2515; GFX90A-NEXT:    ;;#ASMEND
2516; GFX90A-NEXT:    v_bfi_b32 v0, s4, v0, v2
2517; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
2518; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2519; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2520;
2521; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_5:
2522; GFX940:       ; %bb.0:
2523; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2524; GFX940-NEXT:    ;;#ASMSTART
2525; GFX940-NEXT:    ; def v[0:1]
2526; GFX940-NEXT:    ;;#ASMEND
2527; GFX940-NEXT:    s_mov_b32 s2, 0xffff
2528; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2529; GFX940-NEXT:    ;;#ASMSTART
2530; GFX940-NEXT:    ; def v[2:3]
2531; GFX940-NEXT:    ;;#ASMEND
2532; GFX940-NEXT:    s_nop 0
2533; GFX940-NEXT:    v_bfi_b32 v0, s2, v0, v2
2534; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
2535; GFX940-NEXT:    s_waitcnt vmcnt(0)
2536; GFX940-NEXT:    s_setpc_b64 s[30:31]
2537  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2538  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2539  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 0, i32 5>
2540  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2541  ret void
2542}
2543
; Mask <1, 5>: lane 0 takes element 1 of %vec0 and lane 1 takes element 1
; of %vec1, so both inline-asm definitions are live and two def markers are
; expected. The halves are merged by v_perm_b32 with the selector constant
; 0x7060302. The gfx900 expectations place the second definition in
; v[1:2], whereas the other two targets use v[2:3]; gfx940 additionally
; expects an s_nop before the merge.
2544define void @v_shuffle_v2bf16_v4bf16__1_5(ptr addrspace(1) inreg %ptr) {
2545; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_5:
2546; GFX900:       ; %bb.0:
2547; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2548; GFX900-NEXT:    ;;#ASMSTART
2549; GFX900-NEXT:    ; def v[0:1]
2550; GFX900-NEXT:    ;;#ASMEND
2551; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2552; GFX900-NEXT:    v_mov_b32_e32 v3, 0
2553; GFX900-NEXT:    ;;#ASMSTART
2554; GFX900-NEXT:    ; def v[1:2]
2555; GFX900-NEXT:    ;;#ASMEND
2556; GFX900-NEXT:    v_perm_b32 v0, v1, v0, s4
2557; GFX900-NEXT:    global_store_dword v3, v0, s[16:17]
2558; GFX900-NEXT:    s_waitcnt vmcnt(0)
2559; GFX900-NEXT:    s_setpc_b64 s[30:31]
2560;
2561; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_5:
2562; GFX90A:       ; %bb.0:
2563; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2564; GFX90A-NEXT:    ;;#ASMSTART
2565; GFX90A-NEXT:    ; def v[0:1]
2566; GFX90A-NEXT:    ;;#ASMEND
2567; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2568; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2569; GFX90A-NEXT:    ;;#ASMSTART
2570; GFX90A-NEXT:    ; def v[2:3]
2571; GFX90A-NEXT:    ;;#ASMEND
2572; GFX90A-NEXT:    v_perm_b32 v0, v2, v0, s4
2573; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
2574; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2575; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2576;
2577; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_5:
2578; GFX940:       ; %bb.0:
2579; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2580; GFX940-NEXT:    ;;#ASMSTART
2581; GFX940-NEXT:    ; def v[0:1]
2582; GFX940-NEXT:    ;;#ASMEND
2583; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2584; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2585; GFX940-NEXT:    ;;#ASMSTART
2586; GFX940-NEXT:    ; def v[2:3]
2587; GFX940-NEXT:    ;;#ASMEND
2588; GFX940-NEXT:    s_nop 0
2589; GFX940-NEXT:    v_perm_b32 v0, v2, v0, s2
2590; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
2591; GFX940-NEXT:    s_waitcnt vmcnt(0)
2592; GFX940-NEXT:    s_setpc_b64 s[30:31]
2593  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2594  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2595  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 1, i32 5>
2596  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2597  ret void
2598}
2599
; Mask <2, 5>: lane 0 takes element 2 of %vec0 and lane 1 takes element 1
; of %vec1, so both inline-asm definitions are live and two def markers
; (v[0:1] and v[2:3]) are expected. The halves are merged with v_bfi_b32
; on v1 and v2 using the mask 0xffff; gfx940 additionally expects an s_nop
; before the merge.
2600define void @v_shuffle_v2bf16_v4bf16__2_5(ptr addrspace(1) inreg %ptr) {
2601; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_5:
2602; GFX900:       ; %bb.0:
2603; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2604; GFX900-NEXT:    ;;#ASMSTART
2605; GFX900-NEXT:    ; def v[0:1]
2606; GFX900-NEXT:    ;;#ASMEND
2607; GFX900-NEXT:    s_mov_b32 s4, 0xffff
2608; GFX900-NEXT:    v_mov_b32_e32 v4, 0
2609; GFX900-NEXT:    ;;#ASMSTART
2610; GFX900-NEXT:    ; def v[2:3]
2611; GFX900-NEXT:    ;;#ASMEND
2612; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v2
2613; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
2614; GFX900-NEXT:    s_waitcnt vmcnt(0)
2615; GFX900-NEXT:    s_setpc_b64 s[30:31]
2616;
2617; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_5:
2618; GFX90A:       ; %bb.0:
2619; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2620; GFX90A-NEXT:    ;;#ASMSTART
2621; GFX90A-NEXT:    ; def v[0:1]
2622; GFX90A-NEXT:    ;;#ASMEND
2623; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
2624; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2625; GFX90A-NEXT:    ;;#ASMSTART
2626; GFX90A-NEXT:    ; def v[2:3]
2627; GFX90A-NEXT:    ;;#ASMEND
2628; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v2
2629; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
2630; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2631; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2632;
2633; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_5:
2634; GFX940:       ; %bb.0:
2635; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2636; GFX940-NEXT:    ;;#ASMSTART
2637; GFX940-NEXT:    ; def v[0:1]
2638; GFX940-NEXT:    ;;#ASMEND
2639; GFX940-NEXT:    s_mov_b32 s2, 0xffff
2640; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2641; GFX940-NEXT:    ;;#ASMSTART
2642; GFX940-NEXT:    ; def v[2:3]
2643; GFX940-NEXT:    ;;#ASMEND
2644; GFX940-NEXT:    s_nop 0
2645; GFX940-NEXT:    v_bfi_b32 v0, s2, v1, v2
2646; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
2647; GFX940-NEXT:    s_waitcnt vmcnt(0)
2648; GFX940-NEXT:    s_setpc_b64 s[30:31]
2649  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2650  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2651  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 2, i32 5>
2652  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2653  ret void
2654}
2655
; Mask <3, 5>: lane 0 takes element 3 of %vec0 and lane 1 takes element 1
; of %vec1, so both inline-asm definitions are live and two def markers
; (v[0:1] and v[2:3]) are expected. The halves are merged by v_perm_b32 on
; v2 and v1 with the selector constant 0x7060302; gfx940 additionally
; expects an s_nop before the merge.
2656define void @v_shuffle_v2bf16_v4bf16__3_5(ptr addrspace(1) inreg %ptr) {
2657; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_5:
2658; GFX900:       ; %bb.0:
2659; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2660; GFX900-NEXT:    ;;#ASMSTART
2661; GFX900-NEXT:    ; def v[0:1]
2662; GFX900-NEXT:    ;;#ASMEND
2663; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2664; GFX900-NEXT:    v_mov_b32_e32 v4, 0
2665; GFX900-NEXT:    ;;#ASMSTART
2666; GFX900-NEXT:    ; def v[2:3]
2667; GFX900-NEXT:    ;;#ASMEND
2668; GFX900-NEXT:    v_perm_b32 v0, v2, v1, s4
2669; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
2670; GFX900-NEXT:    s_waitcnt vmcnt(0)
2671; GFX900-NEXT:    s_setpc_b64 s[30:31]
2672;
2673; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_5:
2674; GFX90A:       ; %bb.0:
2675; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2676; GFX90A-NEXT:    ;;#ASMSTART
2677; GFX90A-NEXT:    ; def v[0:1]
2678; GFX90A-NEXT:    ;;#ASMEND
2679; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2680; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2681; GFX90A-NEXT:    ;;#ASMSTART
2682; GFX90A-NEXT:    ; def v[2:3]
2683; GFX90A-NEXT:    ;;#ASMEND
2684; GFX90A-NEXT:    v_perm_b32 v0, v2, v1, s4
2685; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
2686; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2687; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2688;
2689; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_5:
2690; GFX940:       ; %bb.0:
2691; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2692; GFX940-NEXT:    ;;#ASMSTART
2693; GFX940-NEXT:    ; def v[0:1]
2694; GFX940-NEXT:    ;;#ASMEND
2695; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2696; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2697; GFX940-NEXT:    ;;#ASMSTART
2698; GFX940-NEXT:    ; def v[2:3]
2699; GFX940-NEXT:    ;;#ASMEND
2700; GFX940-NEXT:    s_nop 0
2701; GFX940-NEXT:    v_perm_b32 v0, v2, v1, s2
2702; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
2703; GFX940-NEXT:    s_waitcnt vmcnt(0)
2704; GFX940-NEXT:    s_setpc_b64 s[30:31]
2705  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2706  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2707  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 3, i32 5>
2708  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2709  ret void
2710}
2711
; Mask <4, 5>: lanes 0 and 1 take elements 0 and 1 of %vec1, in order.
; %vec0 is not referenced by the mask, and the expected output contains a
; single asm def marker. No shuffle instruction is expected: v0 of the
; defined pair is stored unmodified.
2712define void @v_shuffle_v2bf16_v4bf16__4_5(ptr addrspace(1) inreg %ptr) {
2713; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__4_5:
2714; GFX900:       ; %bb.0:
2715; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2716; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2717; GFX900-NEXT:    ;;#ASMSTART
2718; GFX900-NEXT:    ; def v[0:1]
2719; GFX900-NEXT:    ;;#ASMEND
2720; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
2721; GFX900-NEXT:    s_waitcnt vmcnt(0)
2722; GFX900-NEXT:    s_setpc_b64 s[30:31]
2723;
2724; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__4_5:
2725; GFX90A:       ; %bb.0:
2726; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2727; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2728; GFX90A-NEXT:    ;;#ASMSTART
2729; GFX90A-NEXT:    ; def v[0:1]
2730; GFX90A-NEXT:    ;;#ASMEND
2731; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
2732; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2733; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2734;
2735; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__4_5:
2736; GFX940:       ; %bb.0:
2737; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2738; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2739; GFX940-NEXT:    ;;#ASMSTART
2740; GFX940-NEXT:    ; def v[0:1]
2741; GFX940-NEXT:    ;;#ASMEND
2742; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
2743; GFX940-NEXT:    s_waitcnt vmcnt(0)
2744; GFX940-NEXT:    s_setpc_b64 s[30:31]
2745  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2746  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2747  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 4, i32 5>
2748  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2749  ret void
2750}
2751
; Mask <5, 5>: both lanes take element 1 of %vec1 (a splat of that
; element). %vec0 is not referenced by the mask, and the expected output
; contains a single asm def marker. The splat is expected as v_perm_b32
; with v0 as both sources and the selector constant 0x7060302.
2752define void @v_shuffle_v2bf16_v4bf16__5_5(ptr addrspace(1) inreg %ptr) {
2753; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_5:
2754; GFX900:       ; %bb.0:
2755; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2756; GFX900-NEXT:    ;;#ASMSTART
2757; GFX900-NEXT:    ; def v[0:1]
2758; GFX900-NEXT:    ;;#ASMEND
2759; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2760; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2761; GFX900-NEXT:    v_perm_b32 v0, v0, v0, s4
2762; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
2763; GFX900-NEXT:    s_waitcnt vmcnt(0)
2764; GFX900-NEXT:    s_setpc_b64 s[30:31]
2765;
2766; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_5:
2767; GFX90A:       ; %bb.0:
2768; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2769; GFX90A-NEXT:    ;;#ASMSTART
2770; GFX90A-NEXT:    ; def v[0:1]
2771; GFX90A-NEXT:    ;;#ASMEND
2772; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2773; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2774; GFX90A-NEXT:    v_perm_b32 v0, v0, v0, s4
2775; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
2776; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2777; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2778;
2779; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_5:
2780; GFX940:       ; %bb.0:
2781; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2782; GFX940-NEXT:    ;;#ASMSTART
2783; GFX940-NEXT:    ; def v[0:1]
2784; GFX940-NEXT:    ;;#ASMEND
2785; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2786; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2787; GFX940-NEXT:    v_perm_b32 v0, v0, v0, s2
2788; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
2789; GFX940-NEXT:    s_waitcnt vmcnt(0)
2790; GFX940-NEXT:    s_setpc_b64 s[30:31]
2791  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2792  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2793  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 5>
2794  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2795  ret void
2796}
2797
; Mask <6, 5>: lane 0 takes element 2 and lane 1 takes element 1 of %vec1.
; %vec0 is not referenced by the mask, and the expected output contains a
; single asm def marker. The halves are merged with v_bfi_b32 on v1 and v0
; using the mask 0xffff.
2798define void @v_shuffle_v2bf16_v4bf16__6_5(ptr addrspace(1) inreg %ptr) {
2799; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_5:
2800; GFX900:       ; %bb.0:
2801; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2802; GFX900-NEXT:    ;;#ASMSTART
2803; GFX900-NEXT:    ; def v[0:1]
2804; GFX900-NEXT:    ;;#ASMEND
2805; GFX900-NEXT:    s_mov_b32 s4, 0xffff
2806; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2807; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v0
2808; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
2809; GFX900-NEXT:    s_waitcnt vmcnt(0)
2810; GFX900-NEXT:    s_setpc_b64 s[30:31]
2811;
2812; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_5:
2813; GFX90A:       ; %bb.0:
2814; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2815; GFX90A-NEXT:    ;;#ASMSTART
2816; GFX90A-NEXT:    ; def v[0:1]
2817; GFX90A-NEXT:    ;;#ASMEND
2818; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
2819; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2820; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v0
2821; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
2822; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2823; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2824;
2825; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_5:
2826; GFX940:       ; %bb.0:
2827; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2828; GFX940-NEXT:    ;;#ASMSTART
2829; GFX940-NEXT:    ; def v[0:1]
2830; GFX940-NEXT:    ;;#ASMEND
2831; GFX940-NEXT:    s_mov_b32 s2, 0xffff
2832; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2833; GFX940-NEXT:    v_bfi_b32 v0, s2, v1, v0
2834; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
2835; GFX940-NEXT:    s_waitcnt vmcnt(0)
2836; GFX940-NEXT:    s_setpc_b64 s[30:31]
2837  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2838  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2839  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 5>
2840  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2841  ret void
2842}
2843
; Mask <poison, 6>: lane 0 is unconstrained and lane 1 takes element 2 of
; %vec1. %vec0 is not referenced by the mask, and the expected output
; contains a single asm def marker. The value is expected to be moved into
; the upper half with v_lshlrev_b32 of v1 by 16.
2844define void @v_shuffle_v2bf16_v4bf16__u_6(ptr addrspace(1) inreg %ptr) {
2845; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__u_6:
2846; GFX900:       ; %bb.0:
2847; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2848; GFX900-NEXT:    ;;#ASMSTART
2849; GFX900-NEXT:    ; def v[0:1]
2850; GFX900-NEXT:    ;;#ASMEND
2851; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2852; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
2853; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
2854; GFX900-NEXT:    s_waitcnt vmcnt(0)
2855; GFX900-NEXT:    s_setpc_b64 s[30:31]
2856;
2857; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__u_6:
2858; GFX90A:       ; %bb.0:
2859; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2860; GFX90A-NEXT:    ;;#ASMSTART
2861; GFX90A-NEXT:    ; def v[0:1]
2862; GFX90A-NEXT:    ;;#ASMEND
2863; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2864; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
2865; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
2866; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2867; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2868;
2869; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__u_6:
2870; GFX940:       ; %bb.0:
2871; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2872; GFX940-NEXT:    ;;#ASMSTART
2873; GFX940-NEXT:    ; def v[0:1]
2874; GFX940-NEXT:    ;;#ASMEND
2875; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2876; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
2877; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
2878; GFX940-NEXT:    s_waitcnt vmcnt(0)
2879; GFX940-NEXT:    s_setpc_b64 s[30:31]
2880  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2881  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2882  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 poison, i32 6>
2883  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2884  ret void
2885}
2886
; Mask <0, 6>: lane 0 takes element 0 of %vec0 and lane 1 takes element 2
; of %vec1, so both inline-asm definitions are live and two def markers are
; expected. The halves are merged by v_perm_b32 with the selector constant
; 0x5040100. The gfx900 expectations place the second definition in
; v[1:2], whereas the other two targets use v[2:3]; gfx940 additionally
; expects an s_nop before the merge.
2887define void @v_shuffle_v2bf16_v4bf16__0_6(ptr addrspace(1) inreg %ptr) {
2888; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_6:
2889; GFX900:       ; %bb.0:
2890; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2891; GFX900-NEXT:    ;;#ASMSTART
2892; GFX900-NEXT:    ; def v[0:1]
2893; GFX900-NEXT:    ;;#ASMEND
2894; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
2895; GFX900-NEXT:    v_mov_b32_e32 v3, 0
2896; GFX900-NEXT:    ;;#ASMSTART
2897; GFX900-NEXT:    ; def v[1:2]
2898; GFX900-NEXT:    ;;#ASMEND
2899; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
2900; GFX900-NEXT:    global_store_dword v3, v0, s[16:17]
2901; GFX900-NEXT:    s_waitcnt vmcnt(0)
2902; GFX900-NEXT:    s_setpc_b64 s[30:31]
2903;
2904; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_6:
2905; GFX90A:       ; %bb.0:
2906; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2907; GFX90A-NEXT:    ;;#ASMSTART
2908; GFX90A-NEXT:    ; def v[0:1]
2909; GFX90A-NEXT:    ;;#ASMEND
2910; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
2911; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2912; GFX90A-NEXT:    ;;#ASMSTART
2913; GFX90A-NEXT:    ; def v[2:3]
2914; GFX90A-NEXT:    ;;#ASMEND
2915; GFX90A-NEXT:    v_perm_b32 v0, v3, v0, s4
2916; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
2917; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2918; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2919;
2920; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_6:
2921; GFX940:       ; %bb.0:
2922; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2923; GFX940-NEXT:    ;;#ASMSTART
2924; GFX940-NEXT:    ; def v[0:1]
2925; GFX940-NEXT:    ;;#ASMEND
2926; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
2927; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2928; GFX940-NEXT:    ;;#ASMSTART
2929; GFX940-NEXT:    ; def v[2:3]
2930; GFX940-NEXT:    ;;#ASMEND
2931; GFX940-NEXT:    s_nop 0
2932; GFX940-NEXT:    v_perm_b32 v0, v3, v0, s2
2933; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
2934; GFX940-NEXT:    s_waitcnt vmcnt(0)
2935; GFX940-NEXT:    s_setpc_b64 s[30:31]
2936  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2937  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2938  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 0, i32 6>
2939  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2940  ret void
2941}
2942
; Mask <1, 6>: lane 0 takes element 1 of %vec0 and lane 1 takes element 2
; of %vec1, so both inline-asm definitions are live and two def markers are
; expected. The halves are merged by v_alignbit_b32 with a shift of 16.
; The gfx900 expectations place the second definition in v[1:2], whereas
; the other two targets use v[2:3]; gfx940 additionally expects an s_nop
; before the merge.
2943define void @v_shuffle_v2bf16_v4bf16__1_6(ptr addrspace(1) inreg %ptr) {
2944; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_6:
2945; GFX900:       ; %bb.0:
2946; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2947; GFX900-NEXT:    ;;#ASMSTART
2948; GFX900-NEXT:    ; def v[0:1]
2949; GFX900-NEXT:    ;;#ASMEND
2950; GFX900-NEXT:    v_mov_b32_e32 v3, 0
2951; GFX900-NEXT:    ;;#ASMSTART
2952; GFX900-NEXT:    ; def v[1:2]
2953; GFX900-NEXT:    ;;#ASMEND
2954; GFX900-NEXT:    v_alignbit_b32 v0, v2, v0, 16
2955; GFX900-NEXT:    global_store_dword v3, v0, s[16:17]
2956; GFX900-NEXT:    s_waitcnt vmcnt(0)
2957; GFX900-NEXT:    s_setpc_b64 s[30:31]
2958;
2959; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_6:
2960; GFX90A:       ; %bb.0:
2961; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2962; GFX90A-NEXT:    ;;#ASMSTART
2963; GFX90A-NEXT:    ; def v[0:1]
2964; GFX90A-NEXT:    ;;#ASMEND
2965; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2966; GFX90A-NEXT:    ;;#ASMSTART
2967; GFX90A-NEXT:    ; def v[2:3]
2968; GFX90A-NEXT:    ;;#ASMEND
2969; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v0, 16
2970; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
2971; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2972; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2973;
2974; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_6:
2975; GFX940:       ; %bb.0:
2976; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2977; GFX940-NEXT:    ;;#ASMSTART
2978; GFX940-NEXT:    ; def v[0:1]
2979; GFX940-NEXT:    ;;#ASMEND
2980; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2981; GFX940-NEXT:    ;;#ASMSTART
2982; GFX940-NEXT:    ; def v[2:3]
2983; GFX940-NEXT:    ;;#ASMEND
2984; GFX940-NEXT:    s_nop 0
2985; GFX940-NEXT:    v_alignbit_b32 v0, v3, v0, 16
2986; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
2987; GFX940-NEXT:    s_waitcnt vmcnt(0)
2988; GFX940-NEXT:    s_setpc_b64 s[30:31]
2989  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2990  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2991  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 1, i32 6>
2992  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
2993  ret void
2994}
2995
; Mask <2, 6>: lane 0 takes element 2 of %vec0 and lane 1 takes element 2
; of %vec1, so both inline-asm definitions are live and two def markers
; (v[0:1] and v[2:3]) are expected. The halves are merged by v_perm_b32 on
; v3 and v1 with the selector constant 0x5040100; gfx940 additionally
; expects an s_nop before the merge.
2996define void @v_shuffle_v2bf16_v4bf16__2_6(ptr addrspace(1) inreg %ptr) {
2997; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_6:
2998; GFX900:       ; %bb.0:
2999; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3000; GFX900-NEXT:    ;;#ASMSTART
3001; GFX900-NEXT:    ; def v[0:1]
3002; GFX900-NEXT:    ;;#ASMEND
3003; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3004; GFX900-NEXT:    v_mov_b32_e32 v4, 0
3005; GFX900-NEXT:    ;;#ASMSTART
3006; GFX900-NEXT:    ; def v[2:3]
3007; GFX900-NEXT:    ;;#ASMEND
3008; GFX900-NEXT:    v_perm_b32 v0, v3, v1, s4
3009; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
3010; GFX900-NEXT:    s_waitcnt vmcnt(0)
3011; GFX900-NEXT:    s_setpc_b64 s[30:31]
3012;
3013; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_6:
3014; GFX90A:       ; %bb.0:
3015; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3016; GFX90A-NEXT:    ;;#ASMSTART
3017; GFX90A-NEXT:    ; def v[0:1]
3018; GFX90A-NEXT:    ;;#ASMEND
3019; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3020; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3021; GFX90A-NEXT:    ;;#ASMSTART
3022; GFX90A-NEXT:    ; def v[2:3]
3023; GFX90A-NEXT:    ;;#ASMEND
3024; GFX90A-NEXT:    v_perm_b32 v0, v3, v1, s4
3025; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
3026; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3027; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3028;
3029; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_6:
3030; GFX940:       ; %bb.0:
3031; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3032; GFX940-NEXT:    ;;#ASMSTART
3033; GFX940-NEXT:    ; def v[0:1]
3034; GFX940-NEXT:    ;;#ASMEND
3035; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3036; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3037; GFX940-NEXT:    ;;#ASMSTART
3038; GFX940-NEXT:    ; def v[2:3]
3039; GFX940-NEXT:    ;;#ASMEND
3040; GFX940-NEXT:    s_nop 0
3041; GFX940-NEXT:    v_perm_b32 v0, v3, v1, s2
3042; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
3043; GFX940-NEXT:    s_waitcnt vmcnt(0)
3044; GFX940-NEXT:    s_setpc_b64 s[30:31]
3045  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3046  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3047  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 2, i32 6>
3048  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
3049  ret void
3050}
3051
; Mask <3, 6>: lane 0 takes element 3 of %vec0 and lane 1 takes element 2
; of %vec1, so both inline-asm definitions are live and two def markers
; (v[0:1] and v[2:3]) are expected. The halves are merged by
; v_alignbit_b32 on v3 and v1 with a shift of 16; gfx940 additionally
; expects an s_nop before the merge.
3052define void @v_shuffle_v2bf16_v4bf16__3_6(ptr addrspace(1) inreg %ptr) {
3053; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_6:
3054; GFX900:       ; %bb.0:
3055; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3056; GFX900-NEXT:    ;;#ASMSTART
3057; GFX900-NEXT:    ; def v[0:1]
3058; GFX900-NEXT:    ;;#ASMEND
3059; GFX900-NEXT:    v_mov_b32_e32 v4, 0
3060; GFX900-NEXT:    ;;#ASMSTART
3061; GFX900-NEXT:    ; def v[2:3]
3062; GFX900-NEXT:    ;;#ASMEND
3063; GFX900-NEXT:    v_alignbit_b32 v0, v3, v1, 16
3064; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
3065; GFX900-NEXT:    s_waitcnt vmcnt(0)
3066; GFX900-NEXT:    s_setpc_b64 s[30:31]
3067;
3068; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_6:
3069; GFX90A:       ; %bb.0:
3070; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3071; GFX90A-NEXT:    ;;#ASMSTART
3072; GFX90A-NEXT:    ; def v[0:1]
3073; GFX90A-NEXT:    ;;#ASMEND
3074; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3075; GFX90A-NEXT:    ;;#ASMSTART
3076; GFX90A-NEXT:    ; def v[2:3]
3077; GFX90A-NEXT:    ;;#ASMEND
3078; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v1, 16
3079; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
3080; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3081; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3082;
3083; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_6:
3084; GFX940:       ; %bb.0:
3085; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3086; GFX940-NEXT:    ;;#ASMSTART
3087; GFX940-NEXT:    ; def v[0:1]
3088; GFX940-NEXT:    ;;#ASMEND
3089; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3090; GFX940-NEXT:    ;;#ASMSTART
3091; GFX940-NEXT:    ; def v[2:3]
3092; GFX940-NEXT:    ;;#ASMEND
3093; GFX940-NEXT:    s_nop 0
3094; GFX940-NEXT:    v_alignbit_b32 v0, v3, v1, 16
3095; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
3096; GFX940-NEXT:    s_waitcnt vmcnt(0)
3097; GFX940-NEXT:    s_setpc_b64 s[30:31]
3098  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3099  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3100  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 3, i32 6>
3101  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
3102  ret void
3103}
3104
; Shuffle mask <4, 6>: both result lanes are taken from %vec1 (its elements 0
; and 2); %vec0 is never selected. The expected output has a single inline-asm
; def followed by one v_perm_b32 with selector 0x5040100 and a dword store.
3105define void @v_shuffle_v2bf16_v4bf16__4_6(ptr addrspace(1) inreg %ptr) {
3106; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__4_6:
3107; GFX900:       ; %bb.0:
3108; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3109; GFX900-NEXT:    ;;#ASMSTART
3110; GFX900-NEXT:    ; def v[0:1]
3111; GFX900-NEXT:    ;;#ASMEND
3112; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3113; GFX900-NEXT:    v_mov_b32_e32 v2, 0
3114; GFX900-NEXT:    v_perm_b32 v0, v1, v0, s4
3115; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
3116; GFX900-NEXT:    s_waitcnt vmcnt(0)
3117; GFX900-NEXT:    s_setpc_b64 s[30:31]
3118;
3119; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__4_6:
3120; GFX90A:       ; %bb.0:
3121; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3122; GFX90A-NEXT:    ;;#ASMSTART
3123; GFX90A-NEXT:    ; def v[0:1]
3124; GFX90A-NEXT:    ;;#ASMEND
3125; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3126; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
3127; GFX90A-NEXT:    v_perm_b32 v0, v1, v0, s4
3128; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
3129; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3130; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3131;
3132; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__4_6:
3133; GFX940:       ; %bb.0:
3134; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3135; GFX940-NEXT:    ;;#ASMSTART
3136; GFX940-NEXT:    ; def v[0:1]
3137; GFX940-NEXT:    ;;#ASMEND
3138; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3139; GFX940-NEXT:    v_mov_b32_e32 v2, 0
3140; GFX940-NEXT:    v_perm_b32 v0, v1, v0, s2
3141; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
3142; GFX940-NEXT:    s_waitcnt vmcnt(0)
3143; GFX940-NEXT:    s_setpc_b64 s[30:31]
3144  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3145  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3146  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 4, i32 6>
3147  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
3148  ret void
3149}
3150
; Shuffle mask <5, 6>: lane 0 is element 1 of %vec1 and lane 1 is element 2 of
; %vec1; %vec0 is never selected. The expected output has a single inline-asm
; def and combines the two dwords with v_alignbit_b32 using a shift of 16.
3151define void @v_shuffle_v2bf16_v4bf16__5_6(ptr addrspace(1) inreg %ptr) {
3152; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_6:
3153; GFX900:       ; %bb.0:
3154; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3155; GFX900-NEXT:    ;;#ASMSTART
3156; GFX900-NEXT:    ; def v[0:1]
3157; GFX900-NEXT:    ;;#ASMEND
3158; GFX900-NEXT:    v_mov_b32_e32 v2, 0
3159; GFX900-NEXT:    v_alignbit_b32 v0, v1, v0, 16
3160; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
3161; GFX900-NEXT:    s_waitcnt vmcnt(0)
3162; GFX900-NEXT:    s_setpc_b64 s[30:31]
3163;
3164; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_6:
3165; GFX90A:       ; %bb.0:
3166; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3167; GFX90A-NEXT:    ;;#ASMSTART
3168; GFX90A-NEXT:    ; def v[0:1]
3169; GFX90A-NEXT:    ;;#ASMEND
3170; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
3171; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v0, 16
3172; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
3173; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3174; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3175;
3176; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_6:
3177; GFX940:       ; %bb.0:
3178; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3179; GFX940-NEXT:    ;;#ASMSTART
3180; GFX940-NEXT:    ; def v[0:1]
3181; GFX940-NEXT:    ;;#ASMEND
3182; GFX940-NEXT:    v_mov_b32_e32 v2, 0
3183; GFX940-NEXT:    v_alignbit_b32 v0, v1, v0, 16
3184; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
3185; GFX940-NEXT:    s_waitcnt vmcnt(0)
3186; GFX940-NEXT:    s_setpc_b64 s[30:31]
3187  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3188  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3189  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 6>
3190  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
3191  ret void
3192}
3193
; Shuffle mask <6, 6>: element 2 of %vec1 is replicated into both result lanes;
; %vec0 is never selected. The expected output feeds the same register (v1) to
; both data operands of v_perm_b32 with selector 0x5040100.
3194define void @v_shuffle_v2bf16_v4bf16__6_6(ptr addrspace(1) inreg %ptr) {
3195; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_6:
3196; GFX900:       ; %bb.0:
3197; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3198; GFX900-NEXT:    ;;#ASMSTART
3199; GFX900-NEXT:    ; def v[0:1]
3200; GFX900-NEXT:    ;;#ASMEND
3201; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3202; GFX900-NEXT:    v_mov_b32_e32 v2, 0
3203; GFX900-NEXT:    v_perm_b32 v0, v1, v1, s4
3204; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
3205; GFX900-NEXT:    s_waitcnt vmcnt(0)
3206; GFX900-NEXT:    s_setpc_b64 s[30:31]
3207;
3208; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_6:
3209; GFX90A:       ; %bb.0:
3210; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3211; GFX90A-NEXT:    ;;#ASMSTART
3212; GFX90A-NEXT:    ; def v[0:1]
3213; GFX90A-NEXT:    ;;#ASMEND
3214; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3215; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
3216; GFX90A-NEXT:    v_perm_b32 v0, v1, v1, s4
3217; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
3218; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3219; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3220;
3221; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_6:
3222; GFX940:       ; %bb.0:
3223; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3224; GFX940-NEXT:    ;;#ASMSTART
3225; GFX940-NEXT:    ; def v[0:1]
3226; GFX940-NEXT:    ;;#ASMEND
3227; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3228; GFX940-NEXT:    v_mov_b32_e32 v2, 0
3229; GFX940-NEXT:    v_perm_b32 v0, v1, v1, s2
3230; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
3231; GFX940-NEXT:    s_waitcnt vmcnt(0)
3232; GFX940-NEXT:    s_setpc_b64 s[30:31]
3233  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3234  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3235  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 6>
3236  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
3237  ret void
3238}
3239
; Shuffle mask <poison, 7>: only lane 1 is defined, and it is element 3 of
; %vec1. The expected output contains no shuffle instruction: the second dword
; of the defined register pair (v1) is stored as-is.
3240define void @v_shuffle_v2bf16_v4bf16__u_7(ptr addrspace(1) inreg %ptr) {
3241; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__u_7:
3242; GFX900:       ; %bb.0:
3243; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3244; GFX900-NEXT:    v_mov_b32_e32 v2, 0
3245; GFX900-NEXT:    ;;#ASMSTART
3246; GFX900-NEXT:    ; def v[0:1]
3247; GFX900-NEXT:    ;;#ASMEND
3248; GFX900-NEXT:    global_store_dword v2, v1, s[16:17]
3249; GFX900-NEXT:    s_waitcnt vmcnt(0)
3250; GFX900-NEXT:    s_setpc_b64 s[30:31]
3251;
3252; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__u_7:
3253; GFX90A:       ; %bb.0:
3254; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3255; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
3256; GFX90A-NEXT:    ;;#ASMSTART
3257; GFX90A-NEXT:    ; def v[0:1]
3258; GFX90A-NEXT:    ;;#ASMEND
3259; GFX90A-NEXT:    global_store_dword v2, v1, s[16:17]
3260; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3261; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3262;
3263; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__u_7:
3264; GFX940:       ; %bb.0:
3265; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3266; GFX940-NEXT:    v_mov_b32_e32 v2, 0
3267; GFX940-NEXT:    ;;#ASMSTART
3268; GFX940-NEXT:    ; def v[0:1]
3269; GFX940-NEXT:    ;;#ASMEND
3270; GFX940-NEXT:    global_store_dword v2, v1, s[0:1] sc0 sc1
3271; GFX940-NEXT:    s_waitcnt vmcnt(0)
3272; GFX940-NEXT:    s_setpc_b64 s[30:31]
3273  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3274  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3275  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 poison, i32 7>
3276  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
3277  ret void
3278}
3279
; Shuffle mask <0, 7>: lane 0 is element 0 of %vec0 and lane 1 is element 3 of
; %vec1, so both inline-asm defs are live. The expected output merges the two
; dwords with v_bfi_b32 using the mask 0xffff.
; The gfx900 expectations place the second def in v[1:2], while the gfx90a and
; gfx940 expectations use v[2:3] (presumably an even-aligned register-pair
; requirement on those targets -- TODO confirm).
3280define void @v_shuffle_v2bf16_v4bf16__0_7(ptr addrspace(1) inreg %ptr) {
3281; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__0_7:
3282; GFX900:       ; %bb.0:
3283; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3284; GFX900-NEXT:    ;;#ASMSTART
3285; GFX900-NEXT:    ; def v[0:1]
3286; GFX900-NEXT:    ;;#ASMEND
3287; GFX900-NEXT:    s_mov_b32 s4, 0xffff
3288; GFX900-NEXT:    v_mov_b32_e32 v3, 0
3289; GFX900-NEXT:    ;;#ASMSTART
3290; GFX900-NEXT:    ; def v[1:2]
3291; GFX900-NEXT:    ;;#ASMEND
3292; GFX900-NEXT:    v_bfi_b32 v0, s4, v0, v2
3293; GFX900-NEXT:    global_store_dword v3, v0, s[16:17]
3294; GFX900-NEXT:    s_waitcnt vmcnt(0)
3295; GFX900-NEXT:    s_setpc_b64 s[30:31]
3296;
3297; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__0_7:
3298; GFX90A:       ; %bb.0:
3299; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3300; GFX90A-NEXT:    ;;#ASMSTART
3301; GFX90A-NEXT:    ; def v[0:1]
3302; GFX90A-NEXT:    ;;#ASMEND
3303; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
3304; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3305; GFX90A-NEXT:    ;;#ASMSTART
3306; GFX90A-NEXT:    ; def v[2:3]
3307; GFX90A-NEXT:    ;;#ASMEND
3308; GFX90A-NEXT:    v_bfi_b32 v0, s4, v0, v3
3309; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
3310; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3311; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3312;
3313; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__0_7:
3314; GFX940:       ; %bb.0:
3315; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3316; GFX940-NEXT:    ;;#ASMSTART
3317; GFX940-NEXT:    ; def v[0:1]
3318; GFX940-NEXT:    ;;#ASMEND
3319; GFX940-NEXT:    s_mov_b32 s2, 0xffff
3320; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3321; GFX940-NEXT:    ;;#ASMSTART
3322; GFX940-NEXT:    ; def v[2:3]
3323; GFX940-NEXT:    ;;#ASMEND
3324; GFX940-NEXT:    s_nop 0
3325; GFX940-NEXT:    v_bfi_b32 v0, s2, v0, v3
3326; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
3327; GFX940-NEXT:    s_waitcnt vmcnt(0)
3328; GFX940-NEXT:    s_setpc_b64 s[30:31]
3329  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3330  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3331  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 0, i32 7>
3332  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
3333  ret void
3334}
3335
; Shuffle mask <1, 7>: lane 0 is element 1 of %vec0 and lane 1 is element 3 of
; %vec1, so both inline-asm defs are live. The expected output combines the two
; dwords with v_perm_b32 using selector 0x7060302.
; The gfx900 expectations place the second def in v[1:2], while the gfx90a and
; gfx940 expectations use v[2:3].
3336define void @v_shuffle_v2bf16_v4bf16__1_7(ptr addrspace(1) inreg %ptr) {
3337; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__1_7:
3338; GFX900:       ; %bb.0:
3339; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3340; GFX900-NEXT:    ;;#ASMSTART
3341; GFX900-NEXT:    ; def v[0:1]
3342; GFX900-NEXT:    ;;#ASMEND
3343; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
3344; GFX900-NEXT:    v_mov_b32_e32 v3, 0
3345; GFX900-NEXT:    ;;#ASMSTART
3346; GFX900-NEXT:    ; def v[1:2]
3347; GFX900-NEXT:    ;;#ASMEND
3348; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
3349; GFX900-NEXT:    global_store_dword v3, v0, s[16:17]
3350; GFX900-NEXT:    s_waitcnt vmcnt(0)
3351; GFX900-NEXT:    s_setpc_b64 s[30:31]
3352;
3353; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__1_7:
3354; GFX90A:       ; %bb.0:
3355; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3356; GFX90A-NEXT:    ;;#ASMSTART
3357; GFX90A-NEXT:    ; def v[0:1]
3358; GFX90A-NEXT:    ;;#ASMEND
3359; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
3360; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3361; GFX90A-NEXT:    ;;#ASMSTART
3362; GFX90A-NEXT:    ; def v[2:3]
3363; GFX90A-NEXT:    ;;#ASMEND
3364; GFX90A-NEXT:    v_perm_b32 v0, v3, v0, s4
3365; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
3366; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3367; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3368;
3369; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__1_7:
3370; GFX940:       ; %bb.0:
3371; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3372; GFX940-NEXT:    ;;#ASMSTART
3373; GFX940-NEXT:    ; def v[0:1]
3374; GFX940-NEXT:    ;;#ASMEND
3375; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
3376; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3377; GFX940-NEXT:    ;;#ASMSTART
3378; GFX940-NEXT:    ; def v[2:3]
3379; GFX940-NEXT:    ;;#ASMEND
3380; GFX940-NEXT:    s_nop 0
3381; GFX940-NEXT:    v_perm_b32 v0, v3, v0, s2
3382; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
3383; GFX940-NEXT:    s_waitcnt vmcnt(0)
3384; GFX940-NEXT:    s_setpc_b64 s[30:31]
3385  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3386  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3387  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 1, i32 7>
3388  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
3389  ret void
3390}
3391
; Shuffle mask <2, 7>: lane 0 is element 2 of %vec0 and lane 1 is element 3 of
; %vec1, so both inline-asm defs are live (v[0:1] and v[2:3] on every target).
; The expected output merges v1 and v3 with v_bfi_b32 using the mask 0xffff.
3392define void @v_shuffle_v2bf16_v4bf16__2_7(ptr addrspace(1) inreg %ptr) {
3393; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__2_7:
3394; GFX900:       ; %bb.0:
3395; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3396; GFX900-NEXT:    ;;#ASMSTART
3397; GFX900-NEXT:    ; def v[0:1]
3398; GFX900-NEXT:    ;;#ASMEND
3399; GFX900-NEXT:    s_mov_b32 s4, 0xffff
3400; GFX900-NEXT:    v_mov_b32_e32 v4, 0
3401; GFX900-NEXT:    ;;#ASMSTART
3402; GFX900-NEXT:    ; def v[2:3]
3403; GFX900-NEXT:    ;;#ASMEND
3404; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v3
3405; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
3406; GFX900-NEXT:    s_waitcnt vmcnt(0)
3407; GFX900-NEXT:    s_setpc_b64 s[30:31]
3408;
3409; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__2_7:
3410; GFX90A:       ; %bb.0:
3411; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3412; GFX90A-NEXT:    ;;#ASMSTART
3413; GFX90A-NEXT:    ; def v[0:1]
3414; GFX90A-NEXT:    ;;#ASMEND
3415; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
3416; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3417; GFX90A-NEXT:    ;;#ASMSTART
3418; GFX90A-NEXT:    ; def v[2:3]
3419; GFX90A-NEXT:    ;;#ASMEND
3420; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v3
3421; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
3422; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3423; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3424;
3425; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__2_7:
3426; GFX940:       ; %bb.0:
3427; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3428; GFX940-NEXT:    ;;#ASMSTART
3429; GFX940-NEXT:    ; def v[0:1]
3430; GFX940-NEXT:    ;;#ASMEND
3431; GFX940-NEXT:    s_mov_b32 s2, 0xffff
3432; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3433; GFX940-NEXT:    ;;#ASMSTART
3434; GFX940-NEXT:    ; def v[2:3]
3435; GFX940-NEXT:    ;;#ASMEND
3436; GFX940-NEXT:    s_nop 0
3437; GFX940-NEXT:    v_bfi_b32 v0, s2, v1, v3
3438; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
3439; GFX940-NEXT:    s_waitcnt vmcnt(0)
3440; GFX940-NEXT:    s_setpc_b64 s[30:31]
3441  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3442  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3443  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 2, i32 7>
3444  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
3445  ret void
3446}
3447
; Shuffle mask <3, 7>: lane 0 is element 3 of %vec0 and lane 1 is element 3 of
; %vec1, so both inline-asm defs are live (v[0:1] and v[2:3] on every target).
; The expected output combines v1 and v3 with v_perm_b32 using selector
; 0x7060302.
3448define void @v_shuffle_v2bf16_v4bf16__3_7(ptr addrspace(1) inreg %ptr) {
3449; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__3_7:
3450; GFX900:       ; %bb.0:
3451; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3452; GFX900-NEXT:    ;;#ASMSTART
3453; GFX900-NEXT:    ; def v[0:1]
3454; GFX900-NEXT:    ;;#ASMEND
3455; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
3456; GFX900-NEXT:    v_mov_b32_e32 v4, 0
3457; GFX900-NEXT:    ;;#ASMSTART
3458; GFX900-NEXT:    ; def v[2:3]
3459; GFX900-NEXT:    ;;#ASMEND
3460; GFX900-NEXT:    v_perm_b32 v0, v3, v1, s4
3461; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
3462; GFX900-NEXT:    s_waitcnt vmcnt(0)
3463; GFX900-NEXT:    s_setpc_b64 s[30:31]
3464;
3465; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__3_7:
3466; GFX90A:       ; %bb.0:
3467; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3468; GFX90A-NEXT:    ;;#ASMSTART
3469; GFX90A-NEXT:    ; def v[0:1]
3470; GFX90A-NEXT:    ;;#ASMEND
3471; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
3472; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3473; GFX90A-NEXT:    ;;#ASMSTART
3474; GFX90A-NEXT:    ; def v[2:3]
3475; GFX90A-NEXT:    ;;#ASMEND
3476; GFX90A-NEXT:    v_perm_b32 v0, v3, v1, s4
3477; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
3478; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3479; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3480;
3481; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__3_7:
3482; GFX940:       ; %bb.0:
3483; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3484; GFX940-NEXT:    ;;#ASMSTART
3485; GFX940-NEXT:    ; def v[0:1]
3486; GFX940-NEXT:    ;;#ASMEND
3487; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
3488; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3489; GFX940-NEXT:    ;;#ASMSTART
3490; GFX940-NEXT:    ; def v[2:3]
3491; GFX940-NEXT:    ;;#ASMEND
3492; GFX940-NEXT:    s_nop 0
3493; GFX940-NEXT:    v_perm_b32 v0, v3, v1, s2
3494; GFX940-NEXT:    global_store_dword v4, v0, s[0:1] sc0 sc1
3495; GFX940-NEXT:    s_waitcnt vmcnt(0)
3496; GFX940-NEXT:    s_setpc_b64 s[30:31]
3497  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3498  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3499  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 3, i32 7>
3500  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
3501  ret void
3502}
3503
; Shuffle mask <4, 7>: lane 0 is element 0 of %vec1 and lane 1 is element 3 of
; %vec1; %vec0 is never selected. The expected output has a single inline-asm
; def and merges v0 and v1 with v_bfi_b32 using the mask 0xffff.
3504define void @v_shuffle_v2bf16_v4bf16__4_7(ptr addrspace(1) inreg %ptr) {
3505; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__4_7:
3506; GFX900:       ; %bb.0:
3507; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3508; GFX900-NEXT:    ;;#ASMSTART
3509; GFX900-NEXT:    ; def v[0:1]
3510; GFX900-NEXT:    ;;#ASMEND
3511; GFX900-NEXT:    s_mov_b32 s4, 0xffff
3512; GFX900-NEXT:    v_mov_b32_e32 v2, 0
3513; GFX900-NEXT:    v_bfi_b32 v0, s4, v0, v1
3514; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
3515; GFX900-NEXT:    s_waitcnt vmcnt(0)
3516; GFX900-NEXT:    s_setpc_b64 s[30:31]
3517;
3518; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__4_7:
3519; GFX90A:       ; %bb.0:
3520; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3521; GFX90A-NEXT:    ;;#ASMSTART
3522; GFX90A-NEXT:    ; def v[0:1]
3523; GFX90A-NEXT:    ;;#ASMEND
3524; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
3525; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
3526; GFX90A-NEXT:    v_bfi_b32 v0, s4, v0, v1
3527; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
3528; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3529; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3530;
3531; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__4_7:
3532; GFX940:       ; %bb.0:
3533; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3534; GFX940-NEXT:    ;;#ASMSTART
3535; GFX940-NEXT:    ; def v[0:1]
3536; GFX940-NEXT:    ;;#ASMEND
3537; GFX940-NEXT:    s_mov_b32 s2, 0xffff
3538; GFX940-NEXT:    v_mov_b32_e32 v2, 0
3539; GFX940-NEXT:    v_bfi_b32 v0, s2, v0, v1
3540; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
3541; GFX940-NEXT:    s_waitcnt vmcnt(0)
3542; GFX940-NEXT:    s_setpc_b64 s[30:31]
3543  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3544  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3545  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 4, i32 7>
3546  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
3547  ret void
3548}
3549
; Shuffle mask <5, 7>: lane 0 is element 1 of %vec1 and lane 1 is element 3 of
; %vec1; %vec0 is never selected. The expected output has a single inline-asm
; def and combines v0 and v1 with v_perm_b32 using selector 0x7060302.
3550define void @v_shuffle_v2bf16_v4bf16__5_7(ptr addrspace(1) inreg %ptr) {
3551; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__5_7:
3552; GFX900:       ; %bb.0:
3553; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3554; GFX900-NEXT:    ;;#ASMSTART
3555; GFX900-NEXT:    ; def v[0:1]
3556; GFX900-NEXT:    ;;#ASMEND
3557; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
3558; GFX900-NEXT:    v_mov_b32_e32 v2, 0
3559; GFX900-NEXT:    v_perm_b32 v0, v1, v0, s4
3560; GFX900-NEXT:    global_store_dword v2, v0, s[16:17]
3561; GFX900-NEXT:    s_waitcnt vmcnt(0)
3562; GFX900-NEXT:    s_setpc_b64 s[30:31]
3563;
3564; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__5_7:
3565; GFX90A:       ; %bb.0:
3566; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3567; GFX90A-NEXT:    ;;#ASMSTART
3568; GFX90A-NEXT:    ; def v[0:1]
3569; GFX90A-NEXT:    ;;#ASMEND
3570; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
3571; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
3572; GFX90A-NEXT:    v_perm_b32 v0, v1, v0, s4
3573; GFX90A-NEXT:    global_store_dword v2, v0, s[16:17]
3574; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3575; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3576;
3577; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__5_7:
3578; GFX940:       ; %bb.0:
3579; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3580; GFX940-NEXT:    ;;#ASMSTART
3581; GFX940-NEXT:    ; def v[0:1]
3582; GFX940-NEXT:    ;;#ASMEND
3583; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
3584; GFX940-NEXT:    v_mov_b32_e32 v2, 0
3585; GFX940-NEXT:    v_perm_b32 v0, v1, v0, s2
3586; GFX940-NEXT:    global_store_dword v2, v0, s[0:1] sc0 sc1
3587; GFX940-NEXT:    s_waitcnt vmcnt(0)
3588; GFX940-NEXT:    s_setpc_b64 s[30:31]
3589  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3590  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3591  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 7>
3592  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
3593  ret void
3594}
3595
; Shuffle mask <6, 7>: the result is elements 2 and 3 of %vec1 in order; %vec0
; is never selected. The expected output contains no shuffle instruction: the
; second dword of the defined register pair (v1) is stored as-is.
3596define void @v_shuffle_v2bf16_v4bf16__6_7(ptr addrspace(1) inreg %ptr) {
3597; GFX900-LABEL: v_shuffle_v2bf16_v4bf16__6_7:
3598; GFX900:       ; %bb.0:
3599; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3600; GFX900-NEXT:    v_mov_b32_e32 v2, 0
3601; GFX900-NEXT:    ;;#ASMSTART
3602; GFX900-NEXT:    ; def v[0:1]
3603; GFX900-NEXT:    ;;#ASMEND
3604; GFX900-NEXT:    global_store_dword v2, v1, s[16:17]
3605; GFX900-NEXT:    s_waitcnt vmcnt(0)
3606; GFX900-NEXT:    s_setpc_b64 s[30:31]
3607;
3608; GFX90A-LABEL: v_shuffle_v2bf16_v4bf16__6_7:
3609; GFX90A:       ; %bb.0:
3610; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3611; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
3612; GFX90A-NEXT:    ;;#ASMSTART
3613; GFX90A-NEXT:    ; def v[0:1]
3614; GFX90A-NEXT:    ;;#ASMEND
3615; GFX90A-NEXT:    global_store_dword v2, v1, s[16:17]
3616; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3617; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3618;
3619; GFX940-LABEL: v_shuffle_v2bf16_v4bf16__6_7:
3620; GFX940:       ; %bb.0:
3621; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3622; GFX940-NEXT:    v_mov_b32_e32 v2, 0
3623; GFX940-NEXT:    ;;#ASMSTART
3624; GFX940-NEXT:    ; def v[0:1]
3625; GFX940-NEXT:    ;;#ASMEND
3626; GFX940-NEXT:    global_store_dword v2, v1, s[0:1] sc0 sc1
3627; GFX940-NEXT:    s_waitcnt vmcnt(0)
3628; GFX940-NEXT:    s_setpc_b64 s[30:31]
3629  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3630  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3631  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 7>
3632  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
3633  ret void
3634}
3635
; Scalar-register variant: the source vector comes from an "=s" inline-asm def
; and the shuffle result is consumed by a side-effecting inline asm pinned to
; s8. With an all-poison mask no source element is needed, so the expected
; output (shared by all three run lines) contains only the "use s8" asm.
3636define void @s_shuffle_v2bf16_v4bf16__u_u() {
3637; GFX9-LABEL: s_shuffle_v2bf16_v4bf16__u_u:
3638; GFX9:       ; %bb.0:
3639; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3640; GFX9-NEXT:    ;;#ASMSTART
3641; GFX9-NEXT:    ; use s8
3642; GFX9-NEXT:    ;;#ASMEND
3643; GFX9-NEXT:    s_setpc_b64 s[30:31]
3644  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
3645  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> poison
3646  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
3647  ret void
3648}
3649
; Scalar-register variant with mask <0, poison>: only lane 0 is defined, and it
; is element 0 of %vec0. The expected output defines the vector directly in
; s[8:9], so the "use s8" asm reads it without any copy or shift. The gfx940
; expectations additionally contain an s_nop 0 between the def and the use.
3650define void @s_shuffle_v2bf16_v4bf16__0_u() {
3651; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_u:
3652; GFX900:       ; %bb.0:
3653; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3654; GFX900-NEXT:    ;;#ASMSTART
3655; GFX900-NEXT:    ; def s[8:9]
3656; GFX900-NEXT:    ;;#ASMEND
3657; GFX900-NEXT:    ;;#ASMSTART
3658; GFX900-NEXT:    ; use s8
3659; GFX900-NEXT:    ;;#ASMEND
3660; GFX900-NEXT:    s_setpc_b64 s[30:31]
3661;
3662; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_u:
3663; GFX90A:       ; %bb.0:
3664; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3665; GFX90A-NEXT:    ;;#ASMSTART
3666; GFX90A-NEXT:    ; def s[8:9]
3667; GFX90A-NEXT:    ;;#ASMEND
3668; GFX90A-NEXT:    ;;#ASMSTART
3669; GFX90A-NEXT:    ; use s8
3670; GFX90A-NEXT:    ;;#ASMEND
3671; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3672;
3673; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_u:
3674; GFX940:       ; %bb.0:
3675; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3676; GFX940-NEXT:    ;;#ASMSTART
3677; GFX940-NEXT:    ; def s[8:9]
3678; GFX940-NEXT:    ;;#ASMEND
3679; GFX940-NEXT:    s_nop 0
3680; GFX940-NEXT:    ;;#ASMSTART
3681; GFX940-NEXT:    ; use s8
3682; GFX940-NEXT:    ;;#ASMEND
3683; GFX940-NEXT:    s_setpc_b64 s[30:31]
3684  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
3685  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 0, i32 poison>
3686  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
3687  ret void
3688}
3689
; Scalar-register variant with mask <1, poison>: only lane 0 is defined, and it
; is element 1 of %vec0. The expected output shifts the first dword of the
; defined pair right by 16 with s_lshr_b32, writing the result to s8 for the
; "use s8" asm.
3690define void @s_shuffle_v2bf16_v4bf16__1_u() {
3691; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_u:
3692; GFX900:       ; %bb.0:
3693; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3694; GFX900-NEXT:    ;;#ASMSTART
3695; GFX900-NEXT:    ; def s[4:5]
3696; GFX900-NEXT:    ;;#ASMEND
3697; GFX900-NEXT:    s_lshr_b32 s8, s4, 16
3698; GFX900-NEXT:    ;;#ASMSTART
3699; GFX900-NEXT:    ; use s8
3700; GFX900-NEXT:    ;;#ASMEND
3701; GFX900-NEXT:    s_setpc_b64 s[30:31]
3702;
3703; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_u:
3704; GFX90A:       ; %bb.0:
3705; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3706; GFX90A-NEXT:    ;;#ASMSTART
3707; GFX90A-NEXT:    ; def s[4:5]
3708; GFX90A-NEXT:    ;;#ASMEND
3709; GFX90A-NEXT:    s_lshr_b32 s8, s4, 16
3710; GFX90A-NEXT:    ;;#ASMSTART
3711; GFX90A-NEXT:    ; use s8
3712; GFX90A-NEXT:    ;;#ASMEND
3713; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3714;
3715; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_u:
3716; GFX940:       ; %bb.0:
3717; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3718; GFX940-NEXT:    ;;#ASMSTART
3719; GFX940-NEXT:    ; def s[0:1]
3720; GFX940-NEXT:    ;;#ASMEND
3721; GFX940-NEXT:    s_lshr_b32 s8, s0, 16
3722; GFX940-NEXT:    ;;#ASMSTART
3723; GFX940-NEXT:    ; use s8
3724; GFX940-NEXT:    ;;#ASMEND
3725; GFX940-NEXT:    s_setpc_b64 s[30:31]
3726  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
3727  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 1, i32 poison>
3728  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
3729  ret void
3730}
3731
; Scalar-register variant with mask <2, poison>: only lane 0 is defined, and it
; is element 2 of %vec0. The expected output copies the second dword of the
; defined pair into s8 with s_mov_b32 for the "use s8" asm.
3732define void @s_shuffle_v2bf16_v4bf16__2_u() {
3733; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_u:
3734; GFX900:       ; %bb.0:
3735; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3736; GFX900-NEXT:    ;;#ASMSTART
3737; GFX900-NEXT:    ; def s[4:5]
3738; GFX900-NEXT:    ;;#ASMEND
3739; GFX900-NEXT:    s_mov_b32 s8, s5
3740; GFX900-NEXT:    ;;#ASMSTART
3741; GFX900-NEXT:    ; use s8
3742; GFX900-NEXT:    ;;#ASMEND
3743; GFX900-NEXT:    s_setpc_b64 s[30:31]
3744;
3745; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_u:
3746; GFX90A:       ; %bb.0:
3747; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3748; GFX90A-NEXT:    ;;#ASMSTART
3749; GFX90A-NEXT:    ; def s[4:5]
3750; GFX90A-NEXT:    ;;#ASMEND
3751; GFX90A-NEXT:    s_mov_b32 s8, s5
3752; GFX90A-NEXT:    ;;#ASMSTART
3753; GFX90A-NEXT:    ; use s8
3754; GFX90A-NEXT:    ;;#ASMEND
3755; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3756;
3757; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_u:
3758; GFX940:       ; %bb.0:
3759; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3760; GFX940-NEXT:    ;;#ASMSTART
3761; GFX940-NEXT:    ; def s[0:1]
3762; GFX940-NEXT:    ;;#ASMEND
3763; GFX940-NEXT:    s_mov_b32 s8, s1
3764; GFX940-NEXT:    ;;#ASMSTART
3765; GFX940-NEXT:    ; use s8
3766; GFX940-NEXT:    ;;#ASMEND
3767; GFX940-NEXT:    s_setpc_b64 s[30:31]
3768  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
3769  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 2, i32 poison>
3770  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
3771  ret void
3772}
3773
; Scalar-register variant with mask <3, poison>: only lane 0 is defined, and it
; is element 3 of %vec0. The expected output shifts the second dword of the
; defined pair right by 16 with s_lshr_b32, writing the result to s8 for the
; "use s8" asm.
3774define void @s_shuffle_v2bf16_v4bf16__3_u() {
3775; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_u:
3776; GFX900:       ; %bb.0:
3777; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3778; GFX900-NEXT:    ;;#ASMSTART
3779; GFX900-NEXT:    ; def s[4:5]
3780; GFX900-NEXT:    ;;#ASMEND
3781; GFX900-NEXT:    s_lshr_b32 s8, s5, 16
3782; GFX900-NEXT:    ;;#ASMSTART
3783; GFX900-NEXT:    ; use s8
3784; GFX900-NEXT:    ;;#ASMEND
3785; GFX900-NEXT:    s_setpc_b64 s[30:31]
3786;
3787; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_u:
3788; GFX90A:       ; %bb.0:
3789; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3790; GFX90A-NEXT:    ;;#ASMSTART
3791; GFX90A-NEXT:    ; def s[4:5]
3792; GFX90A-NEXT:    ;;#ASMEND
3793; GFX90A-NEXT:    s_lshr_b32 s8, s5, 16
3794; GFX90A-NEXT:    ;;#ASMSTART
3795; GFX90A-NEXT:    ; use s8
3796; GFX90A-NEXT:    ;;#ASMEND
3797; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3798;
3799; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_u:
3800; GFX940:       ; %bb.0:
3801; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3802; GFX940-NEXT:    ;;#ASMSTART
3803; GFX940-NEXT:    ; def s[0:1]
3804; GFX940-NEXT:    ;;#ASMEND
3805; GFX940-NEXT:    s_lshr_b32 s8, s1, 16
3806; GFX940-NEXT:    ;;#ASMSTART
3807; GFX940-NEXT:    ; use s8
3808; GFX940-NEXT:    ;;#ASMEND
3809; GFX940-NEXT:    s_setpc_b64 s[30:31]
3810  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
3811  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 3, i32 poison>
3812  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
3813  ret void
3814}
3815
; Scalar-register variant with mask <4, poison>. Index 4 selects element 0 of
; the SECOND shuffle operand, which is the literal poison vector here (unlike
; the __5_u and __6_u tests, this function defines no %vec1). The whole result
; is therefore poison, and the expected output, shared by all three run lines,
; contains only the "use s8" asm with no def.
; NOTE(review): as written this exercises the same codegen as the __u_u case
; rather than a real second-operand element -- confirm whether a %vec1 operand
; was intended; changing it requires regenerating the assertions.
3816define void @s_shuffle_v2bf16_v4bf16__4_u() {
3817; GFX9-LABEL: s_shuffle_v2bf16_v4bf16__4_u:
3818; GFX9:       ; %bb.0:
3819; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3820; GFX9-NEXT:    ;;#ASMSTART
3821; GFX9-NEXT:    ; use s8
3822; GFX9-NEXT:    ;;#ASMEND
3823; GFX9-NEXT:    s_setpc_b64 s[30:31]
3824  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
3825  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 4, i32 poison>
3826  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
3827  ret void
3828}
3829
; Scalar-register variant with mask <5, poison>: only lane 0 is defined, and it
; is element 1 of %vec1; %vec0 is never selected. The expected output has a
; single inline-asm def and shifts its first dword right by 16 with s_lshr_b32
; into s8 for the "use s8" asm.
3830define void @s_shuffle_v2bf16_v4bf16__5_u() {
3831; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_u:
3832; GFX900:       ; %bb.0:
3833; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3834; GFX900-NEXT:    ;;#ASMSTART
3835; GFX900-NEXT:    ; def s[4:5]
3836; GFX900-NEXT:    ;;#ASMEND
3837; GFX900-NEXT:    s_lshr_b32 s8, s4, 16
3838; GFX900-NEXT:    ;;#ASMSTART
3839; GFX900-NEXT:    ; use s8
3840; GFX900-NEXT:    ;;#ASMEND
3841; GFX900-NEXT:    s_setpc_b64 s[30:31]
3842;
3843; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_u:
3844; GFX90A:       ; %bb.0:
3845; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3846; GFX90A-NEXT:    ;;#ASMSTART
3847; GFX90A-NEXT:    ; def s[4:5]
3848; GFX90A-NEXT:    ;;#ASMEND
3849; GFX90A-NEXT:    s_lshr_b32 s8, s4, 16
3850; GFX90A-NEXT:    ;;#ASMSTART
3851; GFX90A-NEXT:    ; use s8
3852; GFX90A-NEXT:    ;;#ASMEND
3853; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3854;
3855; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_u:
3856; GFX940:       ; %bb.0:
3857; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3858; GFX940-NEXT:    ;;#ASMSTART
3859; GFX940-NEXT:    ; def s[0:1]
3860; GFX940-NEXT:    ;;#ASMEND
3861; GFX940-NEXT:    s_lshr_b32 s8, s0, 16
3862; GFX940-NEXT:    ;;#ASMSTART
3863; GFX940-NEXT:    ; use s8
3864; GFX940-NEXT:    ;;#ASMEND
3865; GFX940-NEXT:    s_setpc_b64 s[30:31]
3866  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
3867  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
3868  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 poison>
3869  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
3870  ret void
3871}
3872
; Scalar-register variant with mask <6, poison>: only lane 0 is defined, and it
; is element 2 of %vec1; %vec0 is never selected. The expected output has a
; single inline-asm def and copies its second dword into s8 with s_mov_b32 for
; the "use s8" asm.
3873define void @s_shuffle_v2bf16_v4bf16__6_u() {
3874; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_u:
3875; GFX900:       ; %bb.0:
3876; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3877; GFX900-NEXT:    ;;#ASMSTART
3878; GFX900-NEXT:    ; def s[4:5]
3879; GFX900-NEXT:    ;;#ASMEND
3880; GFX900-NEXT:    s_mov_b32 s8, s5
3881; GFX900-NEXT:    ;;#ASMSTART
3882; GFX900-NEXT:    ; use s8
3883; GFX900-NEXT:    ;;#ASMEND
3884; GFX900-NEXT:    s_setpc_b64 s[30:31]
3885;
3886; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_u:
3887; GFX90A:       ; %bb.0:
3888; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3889; GFX90A-NEXT:    ;;#ASMSTART
3890; GFX90A-NEXT:    ; def s[4:5]
3891; GFX90A-NEXT:    ;;#ASMEND
3892; GFX90A-NEXT:    s_mov_b32 s8, s5
3893; GFX90A-NEXT:    ;;#ASMSTART
3894; GFX90A-NEXT:    ; use s8
3895; GFX90A-NEXT:    ;;#ASMEND
3896; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3897;
3898; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_u:
3899; GFX940:       ; %bb.0:
3900; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3901; GFX940-NEXT:    ;;#ASMSTART
3902; GFX940-NEXT:    ; def s[0:1]
3903; GFX940-NEXT:    ;;#ASMEND
3904; GFX940-NEXT:    s_mov_b32 s8, s1
3905; GFX940-NEXT:    ;;#ASMSTART
3906; GFX940-NEXT:    ; use s8
3907; GFX940-NEXT:    ;;#ASMEND
3908; GFX940-NEXT:    s_setpc_b64 s[30:31]
3909  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
3910  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
3911  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 poison>
3912  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
3913  ret void
3914}
3915
; Scalar-register shuffle with mask <7, poison>: lane 0 takes element 3 (the
; last element) of %vec1; lane 1 is unconstrained.
; The checks expect a single asm def followed by one s_lshr_b32 by 16 of the
; pair's second register, written straight into s8 for the "{s8}" use.
3916define void @s_shuffle_v2bf16_v4bf16__7_u() {
3917; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_u:
3918; GFX900:       ; %bb.0:
3919; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3920; GFX900-NEXT:    ;;#ASMSTART
3921; GFX900-NEXT:    ; def s[4:5]
3922; GFX900-NEXT:    ;;#ASMEND
3923; GFX900-NEXT:    s_lshr_b32 s8, s5, 16
3924; GFX900-NEXT:    ;;#ASMSTART
3925; GFX900-NEXT:    ; use s8
3926; GFX900-NEXT:    ;;#ASMEND
3927; GFX900-NEXT:    s_setpc_b64 s[30:31]
3928;
3929; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_u:
3930; GFX90A:       ; %bb.0:
3931; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3932; GFX90A-NEXT:    ;;#ASMSTART
3933; GFX90A-NEXT:    ; def s[4:5]
3934; GFX90A-NEXT:    ;;#ASMEND
3935; GFX90A-NEXT:    s_lshr_b32 s8, s5, 16
3936; GFX90A-NEXT:    ;;#ASMSTART
3937; GFX90A-NEXT:    ; use s8
3938; GFX90A-NEXT:    ;;#ASMEND
3939; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3940;
3941; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_u:
3942; GFX940:       ; %bb.0:
3943; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3944; GFX940-NEXT:    ;;#ASMSTART
3945; GFX940-NEXT:    ; def s[0:1]
3946; GFX940-NEXT:    ;;#ASMEND
3947; GFX940-NEXT:    s_lshr_b32 s8, s1, 16
3948; GFX940-NEXT:    ;;#ASMSTART
3949; GFX940-NEXT:    ; use s8
3950; GFX940-NEXT:    ;;#ASMEND
3951; GFX940-NEXT:    s_setpc_b64 s[30:31]
3952  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
3953  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
3954  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 poison>
3955  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
3956  ret void
3957}
3958
; Scalar-register shuffle with mask <7, 0>: lane 0 takes element 3 of %vec1,
; lane 1 takes element 0 of %vec0, so both asm defs stay live.
; The checks expect one s_lshr_b32 by 16 on the last register of the second
; def, then s_pack_ll_b32_b16 combining it with the first register of the
; first def into s8.
3959define void @s_shuffle_v2bf16_v4bf16__7_0() {
3960; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_0:
3961; GFX900:       ; %bb.0:
3962; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3963; GFX900-NEXT:    ;;#ASMSTART
3964; GFX900-NEXT:    ; def s[4:5]
3965; GFX900-NEXT:    ;;#ASMEND
3966; GFX900-NEXT:    ;;#ASMSTART
3967; GFX900-NEXT:    ; def s[6:7]
3968; GFX900-NEXT:    ;;#ASMEND
3969; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
3970; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
3971; GFX900-NEXT:    ;;#ASMSTART
3972; GFX900-NEXT:    ; use s8
3973; GFX900-NEXT:    ;;#ASMEND
3974; GFX900-NEXT:    s_setpc_b64 s[30:31]
3975;
3976; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_0:
3977; GFX90A:       ; %bb.0:
3978; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3979; GFX90A-NEXT:    ;;#ASMSTART
3980; GFX90A-NEXT:    ; def s[4:5]
3981; GFX90A-NEXT:    ;;#ASMEND
3982; GFX90A-NEXT:    ;;#ASMSTART
3983; GFX90A-NEXT:    ; def s[6:7]
3984; GFX90A-NEXT:    ;;#ASMEND
3985; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
3986; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
3987; GFX90A-NEXT:    ;;#ASMSTART
3988; GFX90A-NEXT:    ; use s8
3989; GFX90A-NEXT:    ;;#ASMEND
3990; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3991;
3992; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_0:
3993; GFX940:       ; %bb.0:
3994; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3995; GFX940-NEXT:    ;;#ASMSTART
3996; GFX940-NEXT:    ; def s[0:1]
3997; GFX940-NEXT:    ;;#ASMEND
3998; GFX940-NEXT:    ;;#ASMSTART
3999; GFX940-NEXT:    ; def s[2:3]
4000; GFX940-NEXT:    ;;#ASMEND
4001; GFX940-NEXT:    s_lshr_b32 s1, s3, 16
4002; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
4003; GFX940-NEXT:    ;;#ASMSTART
4004; GFX940-NEXT:    ; use s8
4005; GFX940-NEXT:    ;;#ASMEND
4006; GFX940-NEXT:    s_setpc_b64 s[30:31]
4007  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4008  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
4009  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 0>
4010  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4011  ret void
4012}
4013
; Scalar-register shuffle with mask <7, 1>: lane 0 takes element 3 of %vec1,
; lane 1 takes element 1 of %vec0.
; The checks expect two s_lshr_b32 by 16 (first register of the first def,
; last register of the second def) feeding one s_pack_ll_b32_b16 into s8.
4014define void @s_shuffle_v2bf16_v4bf16__7_1() {
4015; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_1:
4016; GFX900:       ; %bb.0:
4017; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4018; GFX900-NEXT:    ;;#ASMSTART
4019; GFX900-NEXT:    ; def s[4:5]
4020; GFX900-NEXT:    ;;#ASMEND
4021; GFX900-NEXT:    ;;#ASMSTART
4022; GFX900-NEXT:    ; def s[6:7]
4023; GFX900-NEXT:    ;;#ASMEND
4024; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
4025; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
4026; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4027; GFX900-NEXT:    ;;#ASMSTART
4028; GFX900-NEXT:    ; use s8
4029; GFX900-NEXT:    ;;#ASMEND
4030; GFX900-NEXT:    s_setpc_b64 s[30:31]
4031;
4032; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_1:
4033; GFX90A:       ; %bb.0:
4034; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4035; GFX90A-NEXT:    ;;#ASMSTART
4036; GFX90A-NEXT:    ; def s[4:5]
4037; GFX90A-NEXT:    ;;#ASMEND
4038; GFX90A-NEXT:    ;;#ASMSTART
4039; GFX90A-NEXT:    ; def s[6:7]
4040; GFX90A-NEXT:    ;;#ASMEND
4041; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
4042; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
4043; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4044; GFX90A-NEXT:    ;;#ASMSTART
4045; GFX90A-NEXT:    ; use s8
4046; GFX90A-NEXT:    ;;#ASMEND
4047; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4048;
4049; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_1:
4050; GFX940:       ; %bb.0:
4051; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4052; GFX940-NEXT:    ;;#ASMSTART
4053; GFX940-NEXT:    ; def s[0:1]
4054; GFX940-NEXT:    ;;#ASMEND
4055; GFX940-NEXT:    ;;#ASMSTART
4056; GFX940-NEXT:    ; def s[2:3]
4057; GFX940-NEXT:    ;;#ASMEND
4058; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
4059; GFX940-NEXT:    s_lshr_b32 s1, s3, 16
4060; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
4061; GFX940-NEXT:    ;;#ASMSTART
4062; GFX940-NEXT:    ; use s8
4063; GFX940-NEXT:    ;;#ASMEND
4064; GFX940-NEXT:    s_setpc_b64 s[30:31]
4065  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4066  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
4067  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 1>
4068  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4069  ret void
4070}
4071
; Scalar-register shuffle with mask <7, 2>: lane 0 takes element 3 of %vec1,
; lane 1 takes element 2 of %vec0.
; The checks expect a single s_lshr_b32 by 16 on the last register of the
; second def; the second register of the first def is passed to
; s_pack_ll_b32_b16 without a shift.
4072define void @s_shuffle_v2bf16_v4bf16__7_2() {
4073; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_2:
4074; GFX900:       ; %bb.0:
4075; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4076; GFX900-NEXT:    ;;#ASMSTART
4077; GFX900-NEXT:    ; def s[4:5]
4078; GFX900-NEXT:    ;;#ASMEND
4079; GFX900-NEXT:    ;;#ASMSTART
4080; GFX900-NEXT:    ; def s[6:7]
4081; GFX900-NEXT:    ;;#ASMEND
4082; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
4083; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
4084; GFX900-NEXT:    ;;#ASMSTART
4085; GFX900-NEXT:    ; use s8
4086; GFX900-NEXT:    ;;#ASMEND
4087; GFX900-NEXT:    s_setpc_b64 s[30:31]
4088;
4089; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_2:
4090; GFX90A:       ; %bb.0:
4091; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4092; GFX90A-NEXT:    ;;#ASMSTART
4093; GFX90A-NEXT:    ; def s[4:5]
4094; GFX90A-NEXT:    ;;#ASMEND
4095; GFX90A-NEXT:    ;;#ASMSTART
4096; GFX90A-NEXT:    ; def s[6:7]
4097; GFX90A-NEXT:    ;;#ASMEND
4098; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
4099; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
4100; GFX90A-NEXT:    ;;#ASMSTART
4101; GFX90A-NEXT:    ; use s8
4102; GFX90A-NEXT:    ;;#ASMEND
4103; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4104;
4105; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_2:
4106; GFX940:       ; %bb.0:
4107; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4108; GFX940-NEXT:    ;;#ASMSTART
4109; GFX940-NEXT:    ; def s[0:1]
4110; GFX940-NEXT:    ;;#ASMEND
4111; GFX940-NEXT:    ;;#ASMSTART
4112; GFX940-NEXT:    ; def s[2:3]
4113; GFX940-NEXT:    ;;#ASMEND
4114; GFX940-NEXT:    s_lshr_b32 s0, s3, 16
4115; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
4116; GFX940-NEXT:    ;;#ASMSTART
4117; GFX940-NEXT:    ; use s8
4118; GFX940-NEXT:    ;;#ASMEND
4119; GFX940-NEXT:    s_setpc_b64 s[30:31]
4120  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4121  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
4122  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 2>
4123  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4124  ret void
4125}
4126
; Scalar-register shuffle with mask <7, 3>: lane 0 takes element 3 of %vec1,
; lane 1 takes element 3 of %vec0 (the last element of each input).
; The checks expect an s_lshr_b32 by 16 on the second register of each def,
; followed by one s_pack_ll_b32_b16 into s8.
4127define void @s_shuffle_v2bf16_v4bf16__7_3() {
4128; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_3:
4129; GFX900:       ; %bb.0:
4130; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4131; GFX900-NEXT:    ;;#ASMSTART
4132; GFX900-NEXT:    ; def s[4:5]
4133; GFX900-NEXT:    ;;#ASMEND
4134; GFX900-NEXT:    ;;#ASMSTART
4135; GFX900-NEXT:    ; def s[6:7]
4136; GFX900-NEXT:    ;;#ASMEND
4137; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
4138; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
4139; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4140; GFX900-NEXT:    ;;#ASMSTART
4141; GFX900-NEXT:    ; use s8
4142; GFX900-NEXT:    ;;#ASMEND
4143; GFX900-NEXT:    s_setpc_b64 s[30:31]
4144;
4145; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_3:
4146; GFX90A:       ; %bb.0:
4147; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4148; GFX90A-NEXT:    ;;#ASMSTART
4149; GFX90A-NEXT:    ; def s[4:5]
4150; GFX90A-NEXT:    ;;#ASMEND
4151; GFX90A-NEXT:    ;;#ASMSTART
4152; GFX90A-NEXT:    ; def s[6:7]
4153; GFX90A-NEXT:    ;;#ASMEND
4154; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
4155; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
4156; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4157; GFX90A-NEXT:    ;;#ASMSTART
4158; GFX90A-NEXT:    ; use s8
4159; GFX90A-NEXT:    ;;#ASMEND
4160; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4161;
4162; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_3:
4163; GFX940:       ; %bb.0:
4164; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4165; GFX940-NEXT:    ;;#ASMSTART
4166; GFX940-NEXT:    ; def s[0:1]
4167; GFX940-NEXT:    ;;#ASMEND
4168; GFX940-NEXT:    ;;#ASMSTART
4169; GFX940-NEXT:    ; def s[2:3]
4170; GFX940-NEXT:    ;;#ASMEND
4171; GFX940-NEXT:    s_lshr_b32 s0, s1, 16
4172; GFX940-NEXT:    s_lshr_b32 s1, s3, 16
4173; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
4174; GFX940-NEXT:    ;;#ASMSTART
4175; GFX940-NEXT:    ; use s8
4176; GFX940-NEXT:    ;;#ASMEND
4177; GFX940-NEXT:    s_setpc_b64 s[30:31]
4178  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4179  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
4180  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 3>
4181  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4182  ret void
4183}
4184
; Scalar-register shuffle with mask <7, 4>: both lanes come from %vec1
; (element 3 into lane 0, element 0 into lane 1); %vec0 is not referenced by
; the mask.
; The checks show only one asm def, an s_lshr_b32 by 16 of its second
; register, and an s_pack_ll_b32_b16 with its first register into s8.
4185define void @s_shuffle_v2bf16_v4bf16__7_4() {
4186; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_4:
4187; GFX900:       ; %bb.0:
4188; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4189; GFX900-NEXT:    ;;#ASMSTART
4190; GFX900-NEXT:    ; def s[4:5]
4191; GFX900-NEXT:    ;;#ASMEND
4192; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
4193; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4194; GFX900-NEXT:    ;;#ASMSTART
4195; GFX900-NEXT:    ; use s8
4196; GFX900-NEXT:    ;;#ASMEND
4197; GFX900-NEXT:    s_setpc_b64 s[30:31]
4198;
4199; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_4:
4200; GFX90A:       ; %bb.0:
4201; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4202; GFX90A-NEXT:    ;;#ASMSTART
4203; GFX90A-NEXT:    ; def s[4:5]
4204; GFX90A-NEXT:    ;;#ASMEND
4205; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
4206; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4207; GFX90A-NEXT:    ;;#ASMSTART
4208; GFX90A-NEXT:    ; use s8
4209; GFX90A-NEXT:    ;;#ASMEND
4210; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4211;
4212; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_4:
4213; GFX940:       ; %bb.0:
4214; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4215; GFX940-NEXT:    ;;#ASMSTART
4216; GFX940-NEXT:    ; def s[0:1]
4217; GFX940-NEXT:    ;;#ASMEND
4218; GFX940-NEXT:    s_lshr_b32 s1, s1, 16
4219; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
4220; GFX940-NEXT:    ;;#ASMSTART
4221; GFX940-NEXT:    ; use s8
4222; GFX940-NEXT:    ;;#ASMEND
4223; GFX940-NEXT:    s_setpc_b64 s[30:31]
4224  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4225  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
4226  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 4>
4227  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4228  ret void
4229}
4230
; Scalar-register shuffle with mask <7, 5>: both lanes come from %vec1
; (element 3 into lane 0, element 1 into lane 1); %vec0 is not referenced by
; the mask.
; The checks show one asm def, an s_lshr_b32 by 16 on each of its two
; registers, then an s_pack_ll_b32_b16 into s8.
4231define void @s_shuffle_v2bf16_v4bf16__7_5() {
4232; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_5:
4233; GFX900:       ; %bb.0:
4234; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4235; GFX900-NEXT:    ;;#ASMSTART
4236; GFX900-NEXT:    ; def s[4:5]
4237; GFX900-NEXT:    ;;#ASMEND
4238; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
4239; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
4240; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4241; GFX900-NEXT:    ;;#ASMSTART
4242; GFX900-NEXT:    ; use s8
4243; GFX900-NEXT:    ;;#ASMEND
4244; GFX900-NEXT:    s_setpc_b64 s[30:31]
4245;
4246; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_5:
4247; GFX90A:       ; %bb.0:
4248; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4249; GFX90A-NEXT:    ;;#ASMSTART
4250; GFX90A-NEXT:    ; def s[4:5]
4251; GFX90A-NEXT:    ;;#ASMEND
4252; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
4253; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
4254; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4255; GFX90A-NEXT:    ;;#ASMSTART
4256; GFX90A-NEXT:    ; use s8
4257; GFX90A-NEXT:    ;;#ASMEND
4258; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4259;
4260; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_5:
4261; GFX940:       ; %bb.0:
4262; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4263; GFX940-NEXT:    ;;#ASMSTART
4264; GFX940-NEXT:    ; def s[0:1]
4265; GFX940-NEXT:    ;;#ASMEND
4266; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
4267; GFX940-NEXT:    s_lshr_b32 s1, s1, 16
4268; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
4269; GFX940-NEXT:    ;;#ASMSTART
4270; GFX940-NEXT:    ; use s8
4271; GFX940-NEXT:    ;;#ASMEND
4272; GFX940-NEXT:    s_setpc_b64 s[30:31]
4273  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4274  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
4275  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 5>
4276  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4277  ret void
4278}
4279
; Scalar-register shuffle with mask <7, 6>: swaps the last two elements of
; %vec1 (element 3 into lane 0, element 2 into lane 1); %vec0 is not
; referenced by the mask.
; The checks show one asm def, an s_lshr_b32 by 16 of its second register
; into a scratch register, and an s_pack_ll_b32_b16 of that scratch value with
; the unshifted second register.
4280define void @s_shuffle_v2bf16_v4bf16__7_6() {
4281; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_6:
4282; GFX900:       ; %bb.0:
4283; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4284; GFX900-NEXT:    ;;#ASMSTART
4285; GFX900-NEXT:    ; def s[4:5]
4286; GFX900-NEXT:    ;;#ASMEND
4287; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
4288; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
4289; GFX900-NEXT:    ;;#ASMSTART
4290; GFX900-NEXT:    ; use s8
4291; GFX900-NEXT:    ;;#ASMEND
4292; GFX900-NEXT:    s_setpc_b64 s[30:31]
4293;
4294; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_6:
4295; GFX90A:       ; %bb.0:
4296; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4297; GFX90A-NEXT:    ;;#ASMSTART
4298; GFX90A-NEXT:    ; def s[4:5]
4299; GFX90A-NEXT:    ;;#ASMEND
4300; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
4301; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
4302; GFX90A-NEXT:    ;;#ASMSTART
4303; GFX90A-NEXT:    ; use s8
4304; GFX90A-NEXT:    ;;#ASMEND
4305; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4306;
4307; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_6:
4308; GFX940:       ; %bb.0:
4309; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4310; GFX940-NEXT:    ;;#ASMSTART
4311; GFX940-NEXT:    ; def s[0:1]
4312; GFX940-NEXT:    ;;#ASMEND
4313; GFX940-NEXT:    s_lshr_b32 s0, s1, 16
4314; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
4315; GFX940-NEXT:    ;;#ASMSTART
4316; GFX940-NEXT:    ; use s8
4317; GFX940-NEXT:    ;;#ASMEND
4318; GFX940-NEXT:    s_setpc_b64 s[30:31]
4319  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4320  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
4321  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 6>
4322  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4323  ret void
4324}
4325
; Scalar-register shuffle with mask <7, 7>: element 3 of %vec1 is copied into
; both result lanes; %vec0 is not referenced by the mask.
; The checks show one asm def, an s_lshr_b32 by 16 of its second register,
; and an s_pack_ll_b32_b16 that uses the shifted value for both operands.
4326define void @s_shuffle_v2bf16_v4bf16__7_7() {
4327; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__7_7:
4328; GFX900:       ; %bb.0:
4329; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4330; GFX900-NEXT:    ;;#ASMSTART
4331; GFX900-NEXT:    ; def s[4:5]
4332; GFX900-NEXT:    ;;#ASMEND
4333; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
4334; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
4335; GFX900-NEXT:    ;;#ASMSTART
4336; GFX900-NEXT:    ; use s8
4337; GFX900-NEXT:    ;;#ASMEND
4338; GFX900-NEXT:    s_setpc_b64 s[30:31]
4339;
4340; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__7_7:
4341; GFX90A:       ; %bb.0:
4342; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4343; GFX90A-NEXT:    ;;#ASMSTART
4344; GFX90A-NEXT:    ; def s[4:5]
4345; GFX90A-NEXT:    ;;#ASMEND
4346; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
4347; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
4348; GFX90A-NEXT:    ;;#ASMSTART
4349; GFX90A-NEXT:    ; use s8
4350; GFX90A-NEXT:    ;;#ASMEND
4351; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4352;
4353; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__7_7:
4354; GFX940:       ; %bb.0:
4355; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4356; GFX940-NEXT:    ;;#ASMSTART
4357; GFX940-NEXT:    ; def s[0:1]
4358; GFX940-NEXT:    ;;#ASMEND
4359; GFX940-NEXT:    s_lshr_b32 s0, s1, 16
4360; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
4361; GFX940-NEXT:    ;;#ASMSTART
4362; GFX940-NEXT:    ; use s8
4363; GFX940-NEXT:    ;;#ASMEND
4364; GFX940-NEXT:    s_setpc_b64 s[30:31]
4365  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4366  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
4367  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 7, i32 7>
4368  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4369  ret void
4370}
4371
; Scalar-register shuffle with mask <poison, 0> and a poison second operand:
; lane 1 takes element 0 of %vec0; lane 0 is unconstrained.
; The checks expect a single s_lshl_b32 by 16 of the def's first register,
; written straight into s8 for the "{s8}" use.
4372define void @s_shuffle_v2bf16_v4bf16__u_0() {
4373; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_0:
4374; GFX900:       ; %bb.0:
4375; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4376; GFX900-NEXT:    ;;#ASMSTART
4377; GFX900-NEXT:    ; def s[4:5]
4378; GFX900-NEXT:    ;;#ASMEND
4379; GFX900-NEXT:    s_lshl_b32 s8, s4, 16
4380; GFX900-NEXT:    ;;#ASMSTART
4381; GFX900-NEXT:    ; use s8
4382; GFX900-NEXT:    ;;#ASMEND
4383; GFX900-NEXT:    s_setpc_b64 s[30:31]
4384;
4385; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__u_0:
4386; GFX90A:       ; %bb.0:
4387; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4388; GFX90A-NEXT:    ;;#ASMSTART
4389; GFX90A-NEXT:    ; def s[4:5]
4390; GFX90A-NEXT:    ;;#ASMEND
4391; GFX90A-NEXT:    s_lshl_b32 s8, s4, 16
4392; GFX90A-NEXT:    ;;#ASMSTART
4393; GFX90A-NEXT:    ; use s8
4394; GFX90A-NEXT:    ;;#ASMEND
4395; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4396;
4397; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_0:
4398; GFX940:       ; %bb.0:
4399; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4400; GFX940-NEXT:    ;;#ASMSTART
4401; GFX940-NEXT:    ; def s[0:1]
4402; GFX940-NEXT:    ;;#ASMEND
4403; GFX940-NEXT:    s_lshl_b32 s8, s0, 16
4404; GFX940-NEXT:    ;;#ASMSTART
4405; GFX940-NEXT:    ; use s8
4406; GFX940-NEXT:    ;;#ASMEND
4407; GFX940-NEXT:    s_setpc_b64 s[30:31]
4408  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4409  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 poison, i32 0>
4410  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4411  ret void
4412}
4413
; Scalar-register shuffle with an all-zero mask (zeroinitializer, i.e. <0, 0>)
; and a poison second operand: element 0 of %vec0 is copied into both lanes.
; The checks expect a single s_pack_ll_b32_b16 that uses the def's first
; register for both operands, with no shift.
4414define void @s_shuffle_v2bf16_v4bf16__0_0() {
4415; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_0:
4416; GFX900:       ; %bb.0:
4417; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4418; GFX900-NEXT:    ;;#ASMSTART
4419; GFX900-NEXT:    ; def s[4:5]
4420; GFX900-NEXT:    ;;#ASMEND
4421; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
4422; GFX900-NEXT:    ;;#ASMSTART
4423; GFX900-NEXT:    ; use s8
4424; GFX900-NEXT:    ;;#ASMEND
4425; GFX900-NEXT:    s_setpc_b64 s[30:31]
4426;
4427; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_0:
4428; GFX90A:       ; %bb.0:
4429; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4430; GFX90A-NEXT:    ;;#ASMSTART
4431; GFX90A-NEXT:    ; def s[4:5]
4432; GFX90A-NEXT:    ;;#ASMEND
4433; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
4434; GFX90A-NEXT:    ;;#ASMSTART
4435; GFX90A-NEXT:    ; use s8
4436; GFX90A-NEXT:    ;;#ASMEND
4437; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4438;
4439; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_0:
4440; GFX940:       ; %bb.0:
4441; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4442; GFX940-NEXT:    ;;#ASMSTART
4443; GFX940-NEXT:    ; def s[0:1]
4444; GFX940-NEXT:    ;;#ASMEND
4445; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
4446; GFX940-NEXT:    ;;#ASMSTART
4447; GFX940-NEXT:    ; use s8
4448; GFX940-NEXT:    ;;#ASMEND
4449; GFX940-NEXT:    s_setpc_b64 s[30:31]
4450  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4451  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> zeroinitializer
4452  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4453  ret void
4454}
4455
; Scalar-register shuffle with mask <1, 0> and a poison second operand: swaps
; the first two elements of %vec0.
; The checks expect an s_lshr_b32 by 16 of the def's first register into a
; scratch register, then an s_pack_ll_b32_b16 of that scratch value with the
; unshifted first register.
4456define void @s_shuffle_v2bf16_v4bf16__1_0() {
4457; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_0:
4458; GFX900:       ; %bb.0:
4459; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4460; GFX900-NEXT:    ;;#ASMSTART
4461; GFX900-NEXT:    ; def s[4:5]
4462; GFX900-NEXT:    ;;#ASMEND
4463; GFX900-NEXT:    s_lshr_b32 s5, s4, 16
4464; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4465; GFX900-NEXT:    ;;#ASMSTART
4466; GFX900-NEXT:    ; use s8
4467; GFX900-NEXT:    ;;#ASMEND
4468; GFX900-NEXT:    s_setpc_b64 s[30:31]
4469;
4470; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_0:
4471; GFX90A:       ; %bb.0:
4472; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4473; GFX90A-NEXT:    ;;#ASMSTART
4474; GFX90A-NEXT:    ; def s[4:5]
4475; GFX90A-NEXT:    ;;#ASMEND
4476; GFX90A-NEXT:    s_lshr_b32 s5, s4, 16
4477; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4478; GFX90A-NEXT:    ;;#ASMSTART
4479; GFX90A-NEXT:    ; use s8
4480; GFX90A-NEXT:    ;;#ASMEND
4481; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4482;
4483; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_0:
4484; GFX940:       ; %bb.0:
4485; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4486; GFX940-NEXT:    ;;#ASMSTART
4487; GFX940-NEXT:    ; def s[0:1]
4488; GFX940-NEXT:    ;;#ASMEND
4489; GFX940-NEXT:    s_lshr_b32 s1, s0, 16
4490; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
4491; GFX940-NEXT:    ;;#ASMSTART
4492; GFX940-NEXT:    ; use s8
4493; GFX940-NEXT:    ;;#ASMEND
4494; GFX940-NEXT:    s_setpc_b64 s[30:31]
4495  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4496  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 1, i32 0>
4497  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4498  ret void
4499}
4500
; Scalar-register shuffle with mask <2, 0> and a poison second operand:
; lane 0 takes element 2 of %vec0, lane 1 takes element 0.
; The checks expect a single s_pack_ll_b32_b16 of the def's second register
; with its first register; no shift is emitted.
4501define void @s_shuffle_v2bf16_v4bf16__2_0() {
4502; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_0:
4503; GFX900:       ; %bb.0:
4504; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4505; GFX900-NEXT:    ;;#ASMSTART
4506; GFX900-NEXT:    ; def s[4:5]
4507; GFX900-NEXT:    ;;#ASMEND
4508; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4509; GFX900-NEXT:    ;;#ASMSTART
4510; GFX900-NEXT:    ; use s8
4511; GFX900-NEXT:    ;;#ASMEND
4512; GFX900-NEXT:    s_setpc_b64 s[30:31]
4513;
4514; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_0:
4515; GFX90A:       ; %bb.0:
4516; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4517; GFX90A-NEXT:    ;;#ASMSTART
4518; GFX90A-NEXT:    ; def s[4:5]
4519; GFX90A-NEXT:    ;;#ASMEND
4520; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4521; GFX90A-NEXT:    ;;#ASMSTART
4522; GFX90A-NEXT:    ; use s8
4523; GFX90A-NEXT:    ;;#ASMEND
4524; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4525;
4526; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_0:
4527; GFX940:       ; %bb.0:
4528; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4529; GFX940-NEXT:    ;;#ASMSTART
4530; GFX940-NEXT:    ; def s[0:1]
4531; GFX940-NEXT:    ;;#ASMEND
4532; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
4533; GFX940-NEXT:    ;;#ASMSTART
4534; GFX940-NEXT:    ; use s8
4535; GFX940-NEXT:    ;;#ASMEND
4536; GFX940-NEXT:    s_setpc_b64 s[30:31]
4537  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4538  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 2, i32 0>
4539  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4540  ret void
4541}
4542
; Scalar-register shuffle with mask <3, 0> and a poison second operand:
; lane 0 takes element 3 of %vec0, lane 1 takes element 0.
; The checks expect an in-place s_lshr_b32 by 16 of the def's second
; register, then an s_pack_ll_b32_b16 with its first register into s8.
4543define void @s_shuffle_v2bf16_v4bf16__3_0() {
4544; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_0:
4545; GFX900:       ; %bb.0:
4546; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4547; GFX900-NEXT:    ;;#ASMSTART
4548; GFX900-NEXT:    ; def s[4:5]
4549; GFX900-NEXT:    ;;#ASMEND
4550; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
4551; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4552; GFX900-NEXT:    ;;#ASMSTART
4553; GFX900-NEXT:    ; use s8
4554; GFX900-NEXT:    ;;#ASMEND
4555; GFX900-NEXT:    s_setpc_b64 s[30:31]
4556;
4557; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_0:
4558; GFX90A:       ; %bb.0:
4559; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4560; GFX90A-NEXT:    ;;#ASMSTART
4561; GFX90A-NEXT:    ; def s[4:5]
4562; GFX90A-NEXT:    ;;#ASMEND
4563; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
4564; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4565; GFX90A-NEXT:    ;;#ASMSTART
4566; GFX90A-NEXT:    ; use s8
4567; GFX90A-NEXT:    ;;#ASMEND
4568; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4569;
4570; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_0:
4571; GFX940:       ; %bb.0:
4572; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4573; GFX940-NEXT:    ;;#ASMSTART
4574; GFX940-NEXT:    ; def s[0:1]
4575; GFX940-NEXT:    ;;#ASMEND
4576; GFX940-NEXT:    s_lshr_b32 s1, s1, 16
4577; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
4578; GFX940-NEXT:    ;;#ASMSTART
4579; GFX940-NEXT:    ; use s8
4580; GFX940-NEXT:    ;;#ASMEND
4581; GFX940-NEXT:    s_setpc_b64 s[30:31]
4582  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4583  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 3, i32 0>
4584  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4585  ret void
4586}
4587
; Scalar-register shuffle with mask <4, 0>. The second operand is poison, so
; index 4 selects a poison element and only lane 1 (element 0 of %vec0) is
; constrained.
; The checks expect a single s_lshl_b32 by 16 of the def's first register
; into s8, the same sequence as the <poison, 0> case in this file.
; NOTE(review): the name suggests an element taken from a second input vector,
; but no %vec1 is defined here - confirm this is what the test generator
; intended.
4588define void @s_shuffle_v2bf16_v4bf16__4_0() {
4589; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_0:
4590; GFX900:       ; %bb.0:
4591; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4592; GFX900-NEXT:    ;;#ASMSTART
4593; GFX900-NEXT:    ; def s[4:5]
4594; GFX900-NEXT:    ;;#ASMEND
4595; GFX900-NEXT:    s_lshl_b32 s8, s4, 16
4596; GFX900-NEXT:    ;;#ASMSTART
4597; GFX900-NEXT:    ; use s8
4598; GFX900-NEXT:    ;;#ASMEND
4599; GFX900-NEXT:    s_setpc_b64 s[30:31]
4600;
4601; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_0:
4602; GFX90A:       ; %bb.0:
4603; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4604; GFX90A-NEXT:    ;;#ASMSTART
4605; GFX90A-NEXT:    ; def s[4:5]
4606; GFX90A-NEXT:    ;;#ASMEND
4607; GFX90A-NEXT:    s_lshl_b32 s8, s4, 16
4608; GFX90A-NEXT:    ;;#ASMSTART
4609; GFX90A-NEXT:    ; use s8
4610; GFX90A-NEXT:    ;;#ASMEND
4611; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4612;
4613; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_0:
4614; GFX940:       ; %bb.0:
4615; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4616; GFX940-NEXT:    ;;#ASMSTART
4617; GFX940-NEXT:    ; def s[0:1]
4618; GFX940-NEXT:    ;;#ASMEND
4619; GFX940-NEXT:    s_lshl_b32 s8, s0, 16
4620; GFX940-NEXT:    ;;#ASMSTART
4621; GFX940-NEXT:    ; use s8
4622; GFX940-NEXT:    ;;#ASMEND
4623; GFX940-NEXT:    s_setpc_b64 s[30:31]
4624  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4625  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 4, i32 0>
4626  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4627  ret void
4628}
4629
; Scalar-register shuffle with mask <5, 0>: lane 0 takes element 1 of %vec1,
; lane 1 takes element 0 of %vec0, so both asm defs stay live.
; The checks expect an s_lshr_b32 by 16 of the first register of the second
; def, then an s_pack_ll_b32_b16 with the first register of the first def.
4630define void @s_shuffle_v2bf16_v4bf16__5_0() {
4631; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_0:
4632; GFX900:       ; %bb.0:
4633; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4634; GFX900-NEXT:    ;;#ASMSTART
4635; GFX900-NEXT:    ; def s[4:5]
4636; GFX900-NEXT:    ;;#ASMEND
4637; GFX900-NEXT:    ;;#ASMSTART
4638; GFX900-NEXT:    ; def s[6:7]
4639; GFX900-NEXT:    ;;#ASMEND
4640; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
4641; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4642; GFX900-NEXT:    ;;#ASMSTART
4643; GFX900-NEXT:    ; use s8
4644; GFX900-NEXT:    ;;#ASMEND
4645; GFX900-NEXT:    s_setpc_b64 s[30:31]
4646;
4647; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_0:
4648; GFX90A:       ; %bb.0:
4649; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4650; GFX90A-NEXT:    ;;#ASMSTART
4651; GFX90A-NEXT:    ; def s[4:5]
4652; GFX90A-NEXT:    ;;#ASMEND
4653; GFX90A-NEXT:    ;;#ASMSTART
4654; GFX90A-NEXT:    ; def s[6:7]
4655; GFX90A-NEXT:    ;;#ASMEND
4656; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
4657; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4658; GFX90A-NEXT:    ;;#ASMSTART
4659; GFX90A-NEXT:    ; use s8
4660; GFX90A-NEXT:    ;;#ASMEND
4661; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4662;
4663; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_0:
4664; GFX940:       ; %bb.0:
4665; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4666; GFX940-NEXT:    ;;#ASMSTART
4667; GFX940-NEXT:    ; def s[0:1]
4668; GFX940-NEXT:    ;;#ASMEND
4669; GFX940-NEXT:    ;;#ASMSTART
4670; GFX940-NEXT:    ; def s[2:3]
4671; GFX940-NEXT:    ;;#ASMEND
4672; GFX940-NEXT:    s_lshr_b32 s1, s2, 16
4673; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
4674; GFX940-NEXT:    ;;#ASMSTART
4675; GFX940-NEXT:    ; use s8
4676; GFX940-NEXT:    ;;#ASMEND
4677; GFX940-NEXT:    s_setpc_b64 s[30:31]
4678  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4679  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
4680  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 0>
4681  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4682  ret void
4683}
4684
; Scalar-register shuffle with mask <6, 0>: lane 0 takes element 2 of %vec1,
; lane 1 takes element 0 of %vec0, so both asm defs stay live.
; The checks expect a single s_pack_ll_b32_b16 of the second register of the
; second def with the first register of the first def; no shift is emitted.
4685define void @s_shuffle_v2bf16_v4bf16__6_0() {
4686; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_0:
4687; GFX900:       ; %bb.0:
4688; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4689; GFX900-NEXT:    ;;#ASMSTART
4690; GFX900-NEXT:    ; def s[4:5]
4691; GFX900-NEXT:    ;;#ASMEND
4692; GFX900-NEXT:    ;;#ASMSTART
4693; GFX900-NEXT:    ; def s[6:7]
4694; GFX900-NEXT:    ;;#ASMEND
4695; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
4696; GFX900-NEXT:    ;;#ASMSTART
4697; GFX900-NEXT:    ; use s8
4698; GFX900-NEXT:    ;;#ASMEND
4699; GFX900-NEXT:    s_setpc_b64 s[30:31]
4700;
4701; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_0:
4702; GFX90A:       ; %bb.0:
4703; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4704; GFX90A-NEXT:    ;;#ASMSTART
4705; GFX90A-NEXT:    ; def s[4:5]
4706; GFX90A-NEXT:    ;;#ASMEND
4707; GFX90A-NEXT:    ;;#ASMSTART
4708; GFX90A-NEXT:    ; def s[6:7]
4709; GFX90A-NEXT:    ;;#ASMEND
4710; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
4711; GFX90A-NEXT:    ;;#ASMSTART
4712; GFX90A-NEXT:    ; use s8
4713; GFX90A-NEXT:    ;;#ASMEND
4714; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4715;
4716; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_0:
4717; GFX940:       ; %bb.0:
4718; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4719; GFX940-NEXT:    ;;#ASMSTART
4720; GFX940-NEXT:    ; def s[0:1]
4721; GFX940-NEXT:    ;;#ASMEND
4722; GFX940-NEXT:    ;;#ASMSTART
4723; GFX940-NEXT:    ; def s[2:3]
4724; GFX940-NEXT:    ;;#ASMEND
4725; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
4726; GFX940-NEXT:    ;;#ASMSTART
4727; GFX940-NEXT:    ; use s8
4728; GFX940-NEXT:    ;;#ASMEND
4729; GFX940-NEXT:    s_setpc_b64 s[30:31]
4730  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4731  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
4732  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 0>
4733  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4734  ret void
4735}
4736
; SGPR test, mask <poison, 1>: lane 0 is unconstrained and lane 1 asks for
; element 1 of %vec0, which the checks show already sits in the right place.
; Expected: the asm def is allocated straight into s[8:9] and s8 is used as-is,
; with no shift or pack. gfx940 additionally expects an `s_nop 0` between the
; def and the use.
4737define void @s_shuffle_v2bf16_v4bf16__u_1() {
4738; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_1:
4739; GFX900:       ; %bb.0:
4740; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4741; GFX900-NEXT:    ;;#ASMSTART
4742; GFX900-NEXT:    ; def s[8:9]
4743; GFX900-NEXT:    ;;#ASMEND
4744; GFX900-NEXT:    ;;#ASMSTART
4745; GFX900-NEXT:    ; use s8
4746; GFX900-NEXT:    ;;#ASMEND
4747; GFX900-NEXT:    s_setpc_b64 s[30:31]
4748;
4749; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__u_1:
4750; GFX90A:       ; %bb.0:
4751; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4752; GFX90A-NEXT:    ;;#ASMSTART
4753; GFX90A-NEXT:    ; def s[8:9]
4754; GFX90A-NEXT:    ;;#ASMEND
4755; GFX90A-NEXT:    ;;#ASMSTART
4756; GFX90A-NEXT:    ; use s8
4757; GFX90A-NEXT:    ;;#ASMEND
4758; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4759;
4760; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_1:
4761; GFX940:       ; %bb.0:
4762; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4763; GFX940-NEXT:    ;;#ASMSTART
4764; GFX940-NEXT:    ; def s[8:9]
4765; GFX940-NEXT:    ;;#ASMEND
4766; GFX940-NEXT:    s_nop 0
4767; GFX940-NEXT:    ;;#ASMSTART
4768; GFX940-NEXT:    ; use s8
4769; GFX940-NEXT:    ;;#ASMEND
4770; GFX940-NEXT:    s_setpc_b64 s[30:31]
4771  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4772  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 poison, i32 1>
4773  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4774  ret void
4775}
4776
; SGPR test, mask <0, 1>: extracts elements 0 and 1 of %vec0 in their original
; order.
; Expected: no data movement at all. The asm def is allocated into s[8:9] and
; s8 is handed to the use directly. gfx940 additionally expects an `s_nop 0`
; between the def and the use.
4777define void @s_shuffle_v2bf16_v4bf16__0_1() {
4778; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_1:
4779; GFX900:       ; %bb.0:
4780; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4781; GFX900-NEXT:    ;;#ASMSTART
4782; GFX900-NEXT:    ; def s[8:9]
4783; GFX900-NEXT:    ;;#ASMEND
4784; GFX900-NEXT:    ;;#ASMSTART
4785; GFX900-NEXT:    ; use s8
4786; GFX900-NEXT:    ;;#ASMEND
4787; GFX900-NEXT:    s_setpc_b64 s[30:31]
4788;
4789; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_1:
4790; GFX90A:       ; %bb.0:
4791; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4792; GFX90A-NEXT:    ;;#ASMSTART
4793; GFX90A-NEXT:    ; def s[8:9]
4794; GFX90A-NEXT:    ;;#ASMEND
4795; GFX90A-NEXT:    ;;#ASMSTART
4796; GFX90A-NEXT:    ; use s8
4797; GFX90A-NEXT:    ;;#ASMEND
4798; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4799;
4800; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_1:
4801; GFX940:       ; %bb.0:
4802; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4803; GFX940-NEXT:    ;;#ASMSTART
4804; GFX940-NEXT:    ; def s[8:9]
4805; GFX940-NEXT:    ;;#ASMEND
4806; GFX940-NEXT:    s_nop 0
4807; GFX940-NEXT:    ;;#ASMSTART
4808; GFX940-NEXT:    ; use s8
4809; GFX940-NEXT:    ;;#ASMEND
4810; GFX940-NEXT:    s_setpc_b64 s[30:31]
4811  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4812  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 0, i32 1>
4813  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4814  ret void
4815}
4816
; SGPR test, mask <1, 1>: both result lanes are element 1 of %vec0 (a splat).
; Expected: s_lshr_b32 by 16 on the first register of the input pair, then
; s_pack_ll_b32_b16 into s8 with that shifted register as both source operands.
4817define void @s_shuffle_v2bf16_v4bf16__1_1() {
4818; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_1:
4819; GFX900:       ; %bb.0:
4820; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4821; GFX900-NEXT:    ;;#ASMSTART
4822; GFX900-NEXT:    ; def s[4:5]
4823; GFX900-NEXT:    ;;#ASMEND
4824; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
4825; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
4826; GFX900-NEXT:    ;;#ASMSTART
4827; GFX900-NEXT:    ; use s8
4828; GFX900-NEXT:    ;;#ASMEND
4829; GFX900-NEXT:    s_setpc_b64 s[30:31]
4830;
4831; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_1:
4832; GFX90A:       ; %bb.0:
4833; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4834; GFX90A-NEXT:    ;;#ASMSTART
4835; GFX90A-NEXT:    ; def s[4:5]
4836; GFX90A-NEXT:    ;;#ASMEND
4837; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
4838; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
4839; GFX90A-NEXT:    ;;#ASMSTART
4840; GFX90A-NEXT:    ; use s8
4841; GFX90A-NEXT:    ;;#ASMEND
4842; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4843;
4844; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_1:
4845; GFX940:       ; %bb.0:
4846; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4847; GFX940-NEXT:    ;;#ASMSTART
4848; GFX940-NEXT:    ; def s[0:1]
4849; GFX940-NEXT:    ;;#ASMEND
4850; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
4851; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
4852; GFX940-NEXT:    ;;#ASMSTART
4853; GFX940-NEXT:    ; use s8
4854; GFX940-NEXT:    ;;#ASMEND
4855; GFX940-NEXT:    s_setpc_b64 s[30:31]
4856  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4857  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 1, i32 1>
4858  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4859  ret void
4860}
4861
; SGPR test, mask <2, 1>: lane 0 takes element 2 and lane 1 takes element 1 of
; %vec0.
; Expected: s_lshr_b32 by 16 on the first register of the pair, then
; s_pack_ll_b32_b16 into s8 of the untouched second register with the shifted
; first register.
4862define void @s_shuffle_v2bf16_v4bf16__2_1() {
4863; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_1:
4864; GFX900:       ; %bb.0:
4865; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4866; GFX900-NEXT:    ;;#ASMSTART
4867; GFX900-NEXT:    ; def s[4:5]
4868; GFX900-NEXT:    ;;#ASMEND
4869; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
4870; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4871; GFX900-NEXT:    ;;#ASMSTART
4872; GFX900-NEXT:    ; use s8
4873; GFX900-NEXT:    ;;#ASMEND
4874; GFX900-NEXT:    s_setpc_b64 s[30:31]
4875;
4876; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_1:
4877; GFX90A:       ; %bb.0:
4878; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4879; GFX90A-NEXT:    ;;#ASMSTART
4880; GFX90A-NEXT:    ; def s[4:5]
4881; GFX90A-NEXT:    ;;#ASMEND
4882; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
4883; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4884; GFX90A-NEXT:    ;;#ASMSTART
4885; GFX90A-NEXT:    ; use s8
4886; GFX90A-NEXT:    ;;#ASMEND
4887; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4888;
4889; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_1:
4890; GFX940:       ; %bb.0:
4891; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4892; GFX940-NEXT:    ;;#ASMSTART
4893; GFX940-NEXT:    ; def s[0:1]
4894; GFX940-NEXT:    ;;#ASMEND
4895; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
4896; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
4897; GFX940-NEXT:    ;;#ASMSTART
4898; GFX940-NEXT:    ; use s8
4899; GFX940-NEXT:    ;;#ASMEND
4900; GFX940-NEXT:    s_setpc_b64 s[30:31]
4901  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4902  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 2, i32 1>
4903  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4904  ret void
4905}
4906
; SGPR test, mask <3, 1>: lane 0 takes element 3 and lane 1 takes element 1 of
; %vec0.
; Expected: both registers of the pair are shifted right by 16 (first register,
; then second), followed by s_pack_ll_b32_b16 into s8 of the shifted second
; register with the shifted first register.
4907define void @s_shuffle_v2bf16_v4bf16__3_1() {
4908; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_1:
4909; GFX900:       ; %bb.0:
4910; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4911; GFX900-NEXT:    ;;#ASMSTART
4912; GFX900-NEXT:    ; def s[4:5]
4913; GFX900-NEXT:    ;;#ASMEND
4914; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
4915; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
4916; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4917; GFX900-NEXT:    ;;#ASMSTART
4918; GFX900-NEXT:    ; use s8
4919; GFX900-NEXT:    ;;#ASMEND
4920; GFX900-NEXT:    s_setpc_b64 s[30:31]
4921;
4922; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_1:
4923; GFX90A:       ; %bb.0:
4924; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4925; GFX90A-NEXT:    ;;#ASMSTART
4926; GFX90A-NEXT:    ; def s[4:5]
4927; GFX90A-NEXT:    ;;#ASMEND
4928; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
4929; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
4930; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
4931; GFX90A-NEXT:    ;;#ASMSTART
4932; GFX90A-NEXT:    ; use s8
4933; GFX90A-NEXT:    ;;#ASMEND
4934; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4935;
4936; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_1:
4937; GFX940:       ; %bb.0:
4938; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4939; GFX940-NEXT:    ;;#ASMSTART
4940; GFX940-NEXT:    ; def s[0:1]
4941; GFX940-NEXT:    ;;#ASMEND
4942; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
4943; GFX940-NEXT:    s_lshr_b32 s1, s1, 16
4944; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
4945; GFX940-NEXT:    ;;#ASMSTART
4946; GFX940-NEXT:    ; use s8
4947; GFX940-NEXT:    ;;#ASMEND
4948; GFX940-NEXT:    s_setpc_b64 s[30:31]
4949  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4950  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 3, i32 1>
4951  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4952  ret void
4953}
4954
; SGPR test, mask <4, 1> with a poison second operand: mask index 4 selects
; element 0 of that poison operand, so only lane 1 (element 1 of %vec0) carries
; a defined value.
; Expected: no shift or pack. The asm def is allocated into s[8:9] and s8 is
; used directly. gfx940 additionally expects an `s_nop 0` between the def and
; the use.
4955define void @s_shuffle_v2bf16_v4bf16__4_1() {
4956; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_1:
4957; GFX900:       ; %bb.0:
4958; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4959; GFX900-NEXT:    ;;#ASMSTART
4960; GFX900-NEXT:    ; def s[8:9]
4961; GFX900-NEXT:    ;;#ASMEND
4962; GFX900-NEXT:    ;;#ASMSTART
4963; GFX900-NEXT:    ; use s8
4964; GFX900-NEXT:    ;;#ASMEND
4965; GFX900-NEXT:    s_setpc_b64 s[30:31]
4966;
4967; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_1:
4968; GFX90A:       ; %bb.0:
4969; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4970; GFX90A-NEXT:    ;;#ASMSTART
4971; GFX90A-NEXT:    ; def s[8:9]
4972; GFX90A-NEXT:    ;;#ASMEND
4973; GFX90A-NEXT:    ;;#ASMSTART
4974; GFX90A-NEXT:    ; use s8
4975; GFX90A-NEXT:    ;;#ASMEND
4976; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4977;
4978; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_1:
4979; GFX940:       ; %bb.0:
4980; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4981; GFX940-NEXT:    ;;#ASMSTART
4982; GFX940-NEXT:    ; def s[8:9]
4983; GFX940-NEXT:    ;;#ASMEND
4984; GFX940-NEXT:    s_nop 0
4985; GFX940-NEXT:    ;;#ASMSTART
4986; GFX940-NEXT:    ; use s8
4987; GFX940-NEXT:    ;;#ASMEND
4988; GFX940-NEXT:    s_setpc_b64 s[30:31]
4989  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
4990  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 4, i32 1>
4991  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
4992  ret void
4993}
4994
; SGPR test, mask <5, 1>: lane 0 takes element 1 of %vec1 (mask index 5 = 4 + 1)
; and lane 1 takes element 1 of %vec0.
; Expected: two s_lshr_b32 by 16, one on the first register of each input pair.
; The %vec1 shift writes its result into the second register of %vec0's pair
; (s5 on gfx900/gfx90a, s1 on gfx940). A single s_pack_ll_b32_b16 into s8
; follows.
4995define void @s_shuffle_v2bf16_v4bf16__5_1() {
4996; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_1:
4997; GFX900:       ; %bb.0:
4998; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4999; GFX900-NEXT:    ;;#ASMSTART
5000; GFX900-NEXT:    ; def s[4:5]
5001; GFX900-NEXT:    ;;#ASMEND
5002; GFX900-NEXT:    ;;#ASMSTART
5003; GFX900-NEXT:    ; def s[6:7]
5004; GFX900-NEXT:    ;;#ASMEND
5005; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
5006; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
5007; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
5008; GFX900-NEXT:    ;;#ASMSTART
5009; GFX900-NEXT:    ; use s8
5010; GFX900-NEXT:    ;;#ASMEND
5011; GFX900-NEXT:    s_setpc_b64 s[30:31]
5012;
5013; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_1:
5014; GFX90A:       ; %bb.0:
5015; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5016; GFX90A-NEXT:    ;;#ASMSTART
5017; GFX90A-NEXT:    ; def s[4:5]
5018; GFX90A-NEXT:    ;;#ASMEND
5019; GFX90A-NEXT:    ;;#ASMSTART
5020; GFX90A-NEXT:    ; def s[6:7]
5021; GFX90A-NEXT:    ;;#ASMEND
5022; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
5023; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
5024; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
5025; GFX90A-NEXT:    ;;#ASMSTART
5026; GFX90A-NEXT:    ; use s8
5027; GFX90A-NEXT:    ;;#ASMEND
5028; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5029;
5030; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_1:
5031; GFX940:       ; %bb.0:
5032; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5033; GFX940-NEXT:    ;;#ASMSTART
5034; GFX940-NEXT:    ; def s[0:1]
5035; GFX940-NEXT:    ;;#ASMEND
5036; GFX940-NEXT:    ;;#ASMSTART
5037; GFX940-NEXT:    ; def s[2:3]
5038; GFX940-NEXT:    ;;#ASMEND
5039; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
5040; GFX940-NEXT:    s_lshr_b32 s1, s2, 16
5041; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
5042; GFX940-NEXT:    ;;#ASMSTART
5043; GFX940-NEXT:    ; use s8
5044; GFX940-NEXT:    ;;#ASMEND
5045; GFX940-NEXT:    s_setpc_b64 s[30:31]
5046  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5047  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
5048  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 1>
5049  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5050  ret void
5051}
5052
; SGPR test, mask <6, 1>: lane 0 takes element 2 of %vec1 (mask index 6 = 4 + 2)
; and lane 1 takes element 1 of %vec0.
; Expected: s_lshr_b32 by 16 on the first register of %vec0's pair, then
; s_pack_ll_b32_b16 into s8 of the second register of %vec1's pair with the
; shifted value. The checks pin the shift between the two asm defs, i.e. before
; %vec1 is defined.
5053define void @s_shuffle_v2bf16_v4bf16__6_1() {
5054; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_1:
5055; GFX900:       ; %bb.0:
5056; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5057; GFX900-NEXT:    ;;#ASMSTART
5058; GFX900-NEXT:    ; def s[4:5]
5059; GFX900-NEXT:    ;;#ASMEND
5060; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
5061; GFX900-NEXT:    ;;#ASMSTART
5062; GFX900-NEXT:    ; def s[6:7]
5063; GFX900-NEXT:    ;;#ASMEND
5064; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
5065; GFX900-NEXT:    ;;#ASMSTART
5066; GFX900-NEXT:    ; use s8
5067; GFX900-NEXT:    ;;#ASMEND
5068; GFX900-NEXT:    s_setpc_b64 s[30:31]
5069;
5070; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_1:
5071; GFX90A:       ; %bb.0:
5072; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5073; GFX90A-NEXT:    ;;#ASMSTART
5074; GFX90A-NEXT:    ; def s[4:5]
5075; GFX90A-NEXT:    ;;#ASMEND
5076; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
5077; GFX90A-NEXT:    ;;#ASMSTART
5078; GFX90A-NEXT:    ; def s[6:7]
5079; GFX90A-NEXT:    ;;#ASMEND
5080; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
5081; GFX90A-NEXT:    ;;#ASMSTART
5082; GFX90A-NEXT:    ; use s8
5083; GFX90A-NEXT:    ;;#ASMEND
5084; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5085;
5086; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_1:
5087; GFX940:       ; %bb.0:
5088; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5089; GFX940-NEXT:    ;;#ASMSTART
5090; GFX940-NEXT:    ; def s[0:1]
5091; GFX940-NEXT:    ;;#ASMEND
5092; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
5093; GFX940-NEXT:    ;;#ASMSTART
5094; GFX940-NEXT:    ; def s[2:3]
5095; GFX940-NEXT:    ;;#ASMEND
5096; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
5097; GFX940-NEXT:    ;;#ASMSTART
5098; GFX940-NEXT:    ; use s8
5099; GFX940-NEXT:    ;;#ASMEND
5100; GFX940-NEXT:    s_setpc_b64 s[30:31]
5101  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5102  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
5103  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 1>
5104  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5105  ret void
5106}
5107
; SGPR test, mask <poison, 2>: lane 0 is unconstrained and lane 1 takes
; element 2 of %vec0.
; Expected: a single s_lshl_b32 by 16 of the second register of the input pair,
; written straight into s8, with no pack instruction.
5108define void @s_shuffle_v2bf16_v4bf16__u_2() {
5109; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_2:
5110; GFX900:       ; %bb.0:
5111; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5112; GFX900-NEXT:    ;;#ASMSTART
5113; GFX900-NEXT:    ; def s[4:5]
5114; GFX900-NEXT:    ;;#ASMEND
5115; GFX900-NEXT:    s_lshl_b32 s8, s5, 16
5116; GFX900-NEXT:    ;;#ASMSTART
5117; GFX900-NEXT:    ; use s8
5118; GFX900-NEXT:    ;;#ASMEND
5119; GFX900-NEXT:    s_setpc_b64 s[30:31]
5120;
5121; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__u_2:
5122; GFX90A:       ; %bb.0:
5123; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5124; GFX90A-NEXT:    ;;#ASMSTART
5125; GFX90A-NEXT:    ; def s[4:5]
5126; GFX90A-NEXT:    ;;#ASMEND
5127; GFX90A-NEXT:    s_lshl_b32 s8, s5, 16
5128; GFX90A-NEXT:    ;;#ASMSTART
5129; GFX90A-NEXT:    ; use s8
5130; GFX90A-NEXT:    ;;#ASMEND
5131; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5132;
5133; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_2:
5134; GFX940:       ; %bb.0:
5135; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5136; GFX940-NEXT:    ;;#ASMSTART
5137; GFX940-NEXT:    ; def s[0:1]
5138; GFX940-NEXT:    ;;#ASMEND
5139; GFX940-NEXT:    s_lshl_b32 s8, s1, 16
5140; GFX940-NEXT:    ;;#ASMSTART
5141; GFX940-NEXT:    ; use s8
5142; GFX940-NEXT:    ;;#ASMEND
5143; GFX940-NEXT:    s_setpc_b64 s[30:31]
5144  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5145  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 poison, i32 2>
5146  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5147  ret void
5148}
5149
; SGPR test, mask <0, 2>: lane 0 takes element 0 and lane 1 takes element 2 of
; %vec0.
; Expected: one s_pack_ll_b32_b16 into s8 of the first and second registers of
; the input pair, in that order, with no shifts.
5150define void @s_shuffle_v2bf16_v4bf16__0_2() {
5151; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_2:
5152; GFX900:       ; %bb.0:
5153; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5154; GFX900-NEXT:    ;;#ASMSTART
5155; GFX900-NEXT:    ; def s[4:5]
5156; GFX900-NEXT:    ;;#ASMEND
5157; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
5158; GFX900-NEXT:    ;;#ASMSTART
5159; GFX900-NEXT:    ; use s8
5160; GFX900-NEXT:    ;;#ASMEND
5161; GFX900-NEXT:    s_setpc_b64 s[30:31]
5162;
5163; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_2:
5164; GFX90A:       ; %bb.0:
5165; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5166; GFX90A-NEXT:    ;;#ASMSTART
5167; GFX90A-NEXT:    ; def s[4:5]
5168; GFX90A-NEXT:    ;;#ASMEND
5169; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
5170; GFX90A-NEXT:    ;;#ASMSTART
5171; GFX90A-NEXT:    ; use s8
5172; GFX90A-NEXT:    ;;#ASMEND
5173; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5174;
5175; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_2:
5176; GFX940:       ; %bb.0:
5177; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5178; GFX940-NEXT:    ;;#ASMSTART
5179; GFX940-NEXT:    ; def s[0:1]
5180; GFX940-NEXT:    ;;#ASMEND
5181; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
5182; GFX940-NEXT:    ;;#ASMSTART
5183; GFX940-NEXT:    ; use s8
5184; GFX940-NEXT:    ;;#ASMEND
5185; GFX940-NEXT:    s_setpc_b64 s[30:31]
5186  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5187  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 0, i32 2>
5188  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5189  ret void
5190}
5191
; SGPR test, mask <1, 2>: lane 0 takes element 1 and lane 1 takes element 2 of
; %vec0.
; Expected: s_lshr_b32 by 16 on the first register of the pair, then
; s_pack_ll_b32_b16 into s8 of the shifted first register with the untouched
; second register.
5192define void @s_shuffle_v2bf16_v4bf16__1_2() {
5193; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_2:
5194; GFX900:       ; %bb.0:
5195; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5196; GFX900-NEXT:    ;;#ASMSTART
5197; GFX900-NEXT:    ; def s[4:5]
5198; GFX900-NEXT:    ;;#ASMEND
5199; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
5200; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
5201; GFX900-NEXT:    ;;#ASMSTART
5202; GFX900-NEXT:    ; use s8
5203; GFX900-NEXT:    ;;#ASMEND
5204; GFX900-NEXT:    s_setpc_b64 s[30:31]
5205;
5206; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_2:
5207; GFX90A:       ; %bb.0:
5208; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5209; GFX90A-NEXT:    ;;#ASMSTART
5210; GFX90A-NEXT:    ; def s[4:5]
5211; GFX90A-NEXT:    ;;#ASMEND
5212; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
5213; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
5214; GFX90A-NEXT:    ;;#ASMSTART
5215; GFX90A-NEXT:    ; use s8
5216; GFX90A-NEXT:    ;;#ASMEND
5217; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5218;
5219; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_2:
5220; GFX940:       ; %bb.0:
5221; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5222; GFX940-NEXT:    ;;#ASMSTART
5223; GFX940-NEXT:    ; def s[0:1]
5224; GFX940-NEXT:    ;;#ASMEND
5225; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
5226; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
5227; GFX940-NEXT:    ;;#ASMSTART
5228; GFX940-NEXT:    ; use s8
5229; GFX940-NEXT:    ;;#ASMEND
5230; GFX940-NEXT:    s_setpc_b64 s[30:31]
5231  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5232  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 1, i32 2>
5233  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5234  ret void
5235}
5236
; SGPR test, mask <2, 2>: both result lanes are element 2 of %vec0 (a splat).
; Expected: one s_pack_ll_b32_b16 into s8 with the second register of the input
; pair as both source operands, with no shift.
5237define void @s_shuffle_v2bf16_v4bf16__2_2() {
5238; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_2:
5239; GFX900:       ; %bb.0:
5240; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5241; GFX900-NEXT:    ;;#ASMSTART
5242; GFX900-NEXT:    ; def s[4:5]
5243; GFX900-NEXT:    ;;#ASMEND
5244; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
5245; GFX900-NEXT:    ;;#ASMSTART
5246; GFX900-NEXT:    ; use s8
5247; GFX900-NEXT:    ;;#ASMEND
5248; GFX900-NEXT:    s_setpc_b64 s[30:31]
5249;
5250; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_2:
5251; GFX90A:       ; %bb.0:
5252; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5253; GFX90A-NEXT:    ;;#ASMSTART
5254; GFX90A-NEXT:    ; def s[4:5]
5255; GFX90A-NEXT:    ;;#ASMEND
5256; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
5257; GFX90A-NEXT:    ;;#ASMSTART
5258; GFX90A-NEXT:    ; use s8
5259; GFX90A-NEXT:    ;;#ASMEND
5260; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5261;
5262; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_2:
5263; GFX940:       ; %bb.0:
5264; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5265; GFX940-NEXT:    ;;#ASMSTART
5266; GFX940-NEXT:    ; def s[0:1]
5267; GFX940-NEXT:    ;;#ASMEND
5268; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
5269; GFX940-NEXT:    ;;#ASMSTART
5270; GFX940-NEXT:    ; use s8
5271; GFX940-NEXT:    ;;#ASMEND
5272; GFX940-NEXT:    s_setpc_b64 s[30:31]
5273  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5274  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 2, i32 2>
5275  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5276  ret void
5277}
5278
; SGPR test, mask <3, 2>: lane 0 takes element 3 and lane 1 takes element 2 of
; %vec0, i.e. the two elements of the upper half in swapped order.
; Expected: s_lshr_b32 by 16 of the second register, written into the first
; register of the pair, then s_pack_ll_b32_b16 into s8 of that shifted value
; with the original second register.
5279define void @s_shuffle_v2bf16_v4bf16__3_2() {
5280; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_2:
5281; GFX900:       ; %bb.0:
5282; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5283; GFX900-NEXT:    ;;#ASMSTART
5284; GFX900-NEXT:    ; def s[4:5]
5285; GFX900-NEXT:    ;;#ASMEND
5286; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
5287; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
5288; GFX900-NEXT:    ;;#ASMSTART
5289; GFX900-NEXT:    ; use s8
5290; GFX900-NEXT:    ;;#ASMEND
5291; GFX900-NEXT:    s_setpc_b64 s[30:31]
5292;
5293; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_2:
5294; GFX90A:       ; %bb.0:
5295; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5296; GFX90A-NEXT:    ;;#ASMSTART
5297; GFX90A-NEXT:    ; def s[4:5]
5298; GFX90A-NEXT:    ;;#ASMEND
5299; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
5300; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
5301; GFX90A-NEXT:    ;;#ASMSTART
5302; GFX90A-NEXT:    ; use s8
5303; GFX90A-NEXT:    ;;#ASMEND
5304; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5305;
5306; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_2:
5307; GFX940:       ; %bb.0:
5308; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5309; GFX940-NEXT:    ;;#ASMSTART
5310; GFX940-NEXT:    ; def s[0:1]
5311; GFX940-NEXT:    ;;#ASMEND
5312; GFX940-NEXT:    s_lshr_b32 s0, s1, 16
5313; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
5314; GFX940-NEXT:    ;;#ASMSTART
5315; GFX940-NEXT:    ; use s8
5316; GFX940-NEXT:    ;;#ASMEND
5317; GFX940-NEXT:    s_setpc_b64 s[30:31]
5318  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5319  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 3, i32 2>
5320  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5321  ret void
5322}
5323
; SGPR test, mask <4, 2> with a poison second operand: mask index 4 selects
; element 0 of that poison operand, so only lane 1 (element 2 of %vec0) carries
; a defined value.
; Expected: a single s_lshl_b32 by 16 of the second register of the input pair,
; written straight into s8, with no pack instruction.
5324define void @s_shuffle_v2bf16_v4bf16__4_2() {
5325; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_2:
5326; GFX900:       ; %bb.0:
5327; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5328; GFX900-NEXT:    ;;#ASMSTART
5329; GFX900-NEXT:    ; def s[4:5]
5330; GFX900-NEXT:    ;;#ASMEND
5331; GFX900-NEXT:    s_lshl_b32 s8, s5, 16
5332; GFX900-NEXT:    ;;#ASMSTART
5333; GFX900-NEXT:    ; use s8
5334; GFX900-NEXT:    ;;#ASMEND
5335; GFX900-NEXT:    s_setpc_b64 s[30:31]
5336;
5337; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_2:
5338; GFX90A:       ; %bb.0:
5339; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5340; GFX90A-NEXT:    ;;#ASMSTART
5341; GFX90A-NEXT:    ; def s[4:5]
5342; GFX90A-NEXT:    ;;#ASMEND
5343; GFX90A-NEXT:    s_lshl_b32 s8, s5, 16
5344; GFX90A-NEXT:    ;;#ASMSTART
5345; GFX90A-NEXT:    ; use s8
5346; GFX90A-NEXT:    ;;#ASMEND
5347; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5348;
5349; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_2:
5350; GFX940:       ; %bb.0:
5351; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5352; GFX940-NEXT:    ;;#ASMSTART
5353; GFX940-NEXT:    ; def s[0:1]
5354; GFX940-NEXT:    ;;#ASMEND
5355; GFX940-NEXT:    s_lshl_b32 s8, s1, 16
5356; GFX940-NEXT:    ;;#ASMSTART
5357; GFX940-NEXT:    ; use s8
5358; GFX940-NEXT:    ;;#ASMEND
5359; GFX940-NEXT:    s_setpc_b64 s[30:31]
5360  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5361  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 4, i32 2>
5362  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5363  ret void
5364}
5365
; SGPR test, mask <5, 2>: lane 0 takes element 1 of %vec1 (mask index 5 = 4 + 1)
; and lane 1 takes element 2 of %vec0.
; Expected: s_lshr_b32 by 16 of the first register of %vec1's pair, written into
; the first register of %vec0's pair, then s_pack_ll_b32_b16 into s8 of that
; shifted value with the second register of %vec0's pair.
5366define void @s_shuffle_v2bf16_v4bf16__5_2() {
5367; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_2:
5368; GFX900:       ; %bb.0:
5369; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5370; GFX900-NEXT:    ;;#ASMSTART
5371; GFX900-NEXT:    ; def s[4:5]
5372; GFX900-NEXT:    ;;#ASMEND
5373; GFX900-NEXT:    ;;#ASMSTART
5374; GFX900-NEXT:    ; def s[6:7]
5375; GFX900-NEXT:    ;;#ASMEND
5376; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
5377; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
5378; GFX900-NEXT:    ;;#ASMSTART
5379; GFX900-NEXT:    ; use s8
5380; GFX900-NEXT:    ;;#ASMEND
5381; GFX900-NEXT:    s_setpc_b64 s[30:31]
5382;
5383; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_2:
5384; GFX90A:       ; %bb.0:
5385; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5386; GFX90A-NEXT:    ;;#ASMSTART
5387; GFX90A-NEXT:    ; def s[4:5]
5388; GFX90A-NEXT:    ;;#ASMEND
5389; GFX90A-NEXT:    ;;#ASMSTART
5390; GFX90A-NEXT:    ; def s[6:7]
5391; GFX90A-NEXT:    ;;#ASMEND
5392; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
5393; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
5394; GFX90A-NEXT:    ;;#ASMSTART
5395; GFX90A-NEXT:    ; use s8
5396; GFX90A-NEXT:    ;;#ASMEND
5397; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5398;
5399; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_2:
5400; GFX940:       ; %bb.0:
5401; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5402; GFX940-NEXT:    ;;#ASMSTART
5403; GFX940-NEXT:    ; def s[0:1]
5404; GFX940-NEXT:    ;;#ASMEND
5405; GFX940-NEXT:    ;;#ASMSTART
5406; GFX940-NEXT:    ; def s[2:3]
5407; GFX940-NEXT:    ;;#ASMEND
5408; GFX940-NEXT:    s_lshr_b32 s0, s2, 16
5409; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
5410; GFX940-NEXT:    ;;#ASMSTART
5411; GFX940-NEXT:    ; use s8
5412; GFX940-NEXT:    ;;#ASMEND
5413; GFX940-NEXT:    s_setpc_b64 s[30:31]
5414  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5415  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
5416  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 2>
5417  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5418  ret void
5419}
5420
; SGPR test, mask <6, 2>: lane 0 takes element 2 of %vec1 (mask index 6 = 4 + 2)
; and lane 1 takes element 2 of %vec0.
; Expected: one s_pack_ll_b32_b16 into s8 of the second register of %vec1's pair
; with the second register of %vec0's pair, with no shifts.
5421define void @s_shuffle_v2bf16_v4bf16__6_2() {
5422; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_2:
5423; GFX900:       ; %bb.0:
5424; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5425; GFX900-NEXT:    ;;#ASMSTART
5426; GFX900-NEXT:    ; def s[4:5]
5427; GFX900-NEXT:    ;;#ASMEND
5428; GFX900-NEXT:    ;;#ASMSTART
5429; GFX900-NEXT:    ; def s[6:7]
5430; GFX900-NEXT:    ;;#ASMEND
5431; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
5432; GFX900-NEXT:    ;;#ASMSTART
5433; GFX900-NEXT:    ; use s8
5434; GFX900-NEXT:    ;;#ASMEND
5435; GFX900-NEXT:    s_setpc_b64 s[30:31]
5436;
5437; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_2:
5438; GFX90A:       ; %bb.0:
5439; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5440; GFX90A-NEXT:    ;;#ASMSTART
5441; GFX90A-NEXT:    ; def s[4:5]
5442; GFX90A-NEXT:    ;;#ASMEND
5443; GFX90A-NEXT:    ;;#ASMSTART
5444; GFX90A-NEXT:    ; def s[6:7]
5445; GFX90A-NEXT:    ;;#ASMEND
5446; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
5447; GFX90A-NEXT:    ;;#ASMSTART
5448; GFX90A-NEXT:    ; use s8
5449; GFX90A-NEXT:    ;;#ASMEND
5450; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5451;
5452; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_2:
5453; GFX940:       ; %bb.0:
5454; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5455; GFX940-NEXT:    ;;#ASMSTART
5456; GFX940-NEXT:    ; def s[0:1]
5457; GFX940-NEXT:    ;;#ASMEND
5458; GFX940-NEXT:    ;;#ASMSTART
5459; GFX940-NEXT:    ; def s[2:3]
5460; GFX940-NEXT:    ;;#ASMEND
5461; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s1
5462; GFX940-NEXT:    ;;#ASMSTART
5463; GFX940-NEXT:    ; use s8
5464; GFX940-NEXT:    ;;#ASMEND
5465; GFX940-NEXT:    s_setpc_b64 s[30:31]
5466  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5467  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
5468  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 2>
5469  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5470  ret void
5471}
5472
; SGPR test, mask <poison, 3>: lane 0 is unconstrained and lane 1 takes
; element 3 of %vec0.
; Expected: a plain s_mov_b32 of the second register of the input pair into s8,
; with no shift or pack.
5473define void @s_shuffle_v2bf16_v4bf16__u_3() {
5474; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_3:
5475; GFX900:       ; %bb.0:
5476; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5477; GFX900-NEXT:    ;;#ASMSTART
5478; GFX900-NEXT:    ; def s[4:5]
5479; GFX900-NEXT:    ;;#ASMEND
5480; GFX900-NEXT:    s_mov_b32 s8, s5
5481; GFX900-NEXT:    ;;#ASMSTART
5482; GFX900-NEXT:    ; use s8
5483; GFX900-NEXT:    ;;#ASMEND
5484; GFX900-NEXT:    s_setpc_b64 s[30:31]
5485;
5486; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__u_3:
5487; GFX90A:       ; %bb.0:
5488; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5489; GFX90A-NEXT:    ;;#ASMSTART
5490; GFX90A-NEXT:    ; def s[4:5]
5491; GFX90A-NEXT:    ;;#ASMEND
5492; GFX90A-NEXT:    s_mov_b32 s8, s5
5493; GFX90A-NEXT:    ;;#ASMSTART
5494; GFX90A-NEXT:    ; use s8
5495; GFX90A-NEXT:    ;;#ASMEND
5496; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5497;
5498; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_3:
5499; GFX940:       ; %bb.0:
5500; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5501; GFX940-NEXT:    ;;#ASMSTART
5502; GFX940-NEXT:    ; def s[0:1]
5503; GFX940-NEXT:    ;;#ASMEND
5504; GFX940-NEXT:    s_mov_b32 s8, s1
5505; GFX940-NEXT:    ;;#ASMSTART
5506; GFX940-NEXT:    ; use s8
5507; GFX940-NEXT:    ;;#ASMEND
5508; GFX940-NEXT:    s_setpc_b64 s[30:31]
5509  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5510  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 poison, i32 3>
5511  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5512  ret void
5513}
5514
; SGPR test, mask <0, 3>: lane 0 takes element 0 and lane 1 takes element 3 of
; %vec0.
; Expected: s_lshr_b32 by 16 on the second register of the pair, then
; s_pack_ll_b32_b16 into s8 of the untouched first register with the shifted
; second register.
5515define void @s_shuffle_v2bf16_v4bf16__0_3() {
5516; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_3:
5517; GFX900:       ; %bb.0:
5518; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5519; GFX900-NEXT:    ;;#ASMSTART
5520; GFX900-NEXT:    ; def s[4:5]
5521; GFX900-NEXT:    ;;#ASMEND
5522; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
5523; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
5524; GFX900-NEXT:    ;;#ASMSTART
5525; GFX900-NEXT:    ; use s8
5526; GFX900-NEXT:    ;;#ASMEND
5527; GFX900-NEXT:    s_setpc_b64 s[30:31]
5528;
5529; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_3:
5530; GFX90A:       ; %bb.0:
5531; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5532; GFX90A-NEXT:    ;;#ASMSTART
5533; GFX90A-NEXT:    ; def s[4:5]
5534; GFX90A-NEXT:    ;;#ASMEND
5535; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
5536; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
5537; GFX90A-NEXT:    ;;#ASMSTART
5538; GFX90A-NEXT:    ; use s8
5539; GFX90A-NEXT:    ;;#ASMEND
5540; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5541;
5542; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_3:
5543; GFX940:       ; %bb.0:
5544; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5545; GFX940-NEXT:    ;;#ASMSTART
5546; GFX940-NEXT:    ; def s[0:1]
5547; GFX940-NEXT:    ;;#ASMEND
5548; GFX940-NEXT:    s_lshr_b32 s1, s1, 16
5549; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
5550; GFX940-NEXT:    ;;#ASMSTART
5551; GFX940-NEXT:    ; use s8
5552; GFX940-NEXT:    ;;#ASMEND
5553; GFX940-NEXT:    s_setpc_b64 s[30:31]
5554  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5555  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 0, i32 3>
5556  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5557  ret void
5558}
5559
; SGPR test, mask <1, 3>: lane 0 takes element 1 and lane 1 takes element 3 of
; %vec0.
; Expected: both registers of the pair are shifted right by 16 (the checks pin
; the second register's shift first), followed by s_pack_ll_b32_b16 into s8 of
; the shifted first register with the shifted second register.
5560define void @s_shuffle_v2bf16_v4bf16__1_3() {
5561; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_3:
5562; GFX900:       ; %bb.0:
5563; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5564; GFX900-NEXT:    ;;#ASMSTART
5565; GFX900-NEXT:    ; def s[4:5]
5566; GFX900-NEXT:    ;;#ASMEND
5567; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
5568; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
5569; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
5570; GFX900-NEXT:    ;;#ASMSTART
5571; GFX900-NEXT:    ; use s8
5572; GFX900-NEXT:    ;;#ASMEND
5573; GFX900-NEXT:    s_setpc_b64 s[30:31]
5574;
5575; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_3:
5576; GFX90A:       ; %bb.0:
5577; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5578; GFX90A-NEXT:    ;;#ASMSTART
5579; GFX90A-NEXT:    ; def s[4:5]
5580; GFX90A-NEXT:    ;;#ASMEND
5581; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
5582; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
5583; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
5584; GFX90A-NEXT:    ;;#ASMSTART
5585; GFX90A-NEXT:    ; use s8
5586; GFX90A-NEXT:    ;;#ASMEND
5587; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5588;
5589; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_3:
5590; GFX940:       ; %bb.0:
5591; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5592; GFX940-NEXT:    ;;#ASMSTART
5593; GFX940-NEXT:    ; def s[0:1]
5594; GFX940-NEXT:    ;;#ASMEND
5595; GFX940-NEXT:    s_lshr_b32 s1, s1, 16
5596; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
5597; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
5598; GFX940-NEXT:    ;;#ASMSTART
5599; GFX940-NEXT:    ; use s8
5600; GFX940-NEXT:    ;;#ASMEND
5601; GFX940-NEXT:    s_setpc_b64 s[30:31]
5602  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5603  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 1, i32 3>
5604  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5605  ret void
5606}
5607
; "=s" inline-asm source, mask <2, 3>: the two upper lanes of %vec0 in their
; original order. Every run line expects a single s_mov_b32 that copies the
; second register of the defined pair into s8.
5608define void @s_shuffle_v2bf16_v4bf16__2_3() {
5609; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_3:
5610; GFX900:       ; %bb.0:
5611; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5612; GFX900-NEXT:    ;;#ASMSTART
5613; GFX900-NEXT:    ; def s[4:5]
5614; GFX900-NEXT:    ;;#ASMEND
5615; GFX900-NEXT:    s_mov_b32 s8, s5
5616; GFX900-NEXT:    ;;#ASMSTART
5617; GFX900-NEXT:    ; use s8
5618; GFX900-NEXT:    ;;#ASMEND
5619; GFX900-NEXT:    s_setpc_b64 s[30:31]
5620;
5621; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_3:
5622; GFX90A:       ; %bb.0:
5623; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5624; GFX90A-NEXT:    ;;#ASMSTART
5625; GFX90A-NEXT:    ; def s[4:5]
5626; GFX90A-NEXT:    ;;#ASMEND
5627; GFX90A-NEXT:    s_mov_b32 s8, s5
5628; GFX90A-NEXT:    ;;#ASMSTART
5629; GFX90A-NEXT:    ; use s8
5630; GFX90A-NEXT:    ;;#ASMEND
5631; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5632;
5633; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_3:
5634; GFX940:       ; %bb.0:
5635; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5636; GFX940-NEXT:    ;;#ASMSTART
5637; GFX940-NEXT:    ; def s[0:1]
5638; GFX940-NEXT:    ;;#ASMEND
5639; GFX940-NEXT:    s_mov_b32 s8, s1
5640; GFX940-NEXT:    ;;#ASMSTART
5641; GFX940-NEXT:    ; use s8
5642; GFX940-NEXT:    ;;#ASMEND
5643; GFX940-NEXT:    s_setpc_b64 s[30:31]
5644  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5645  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 2, i32 3>
5646  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5647  ret void
5648}
5649
; "=s" inline-asm source, mask <3, 3>: lane 3 of %vec0 duplicated into both
; result lanes. Every run line expects one s_lshr_b32 by 16 of the second
; defined register, then s_pack_ll_b32_b16 using that value for both inputs.
5650define void @s_shuffle_v2bf16_v4bf16__3_3() {
5651; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_3:
5652; GFX900:       ; %bb.0:
5653; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5654; GFX900-NEXT:    ;;#ASMSTART
5655; GFX900-NEXT:    ; def s[4:5]
5656; GFX900-NEXT:    ;;#ASMEND
5657; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
5658; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
5659; GFX900-NEXT:    ;;#ASMSTART
5660; GFX900-NEXT:    ; use s8
5661; GFX900-NEXT:    ;;#ASMEND
5662; GFX900-NEXT:    s_setpc_b64 s[30:31]
5663;
5664; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_3:
5665; GFX90A:       ; %bb.0:
5666; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5667; GFX90A-NEXT:    ;;#ASMSTART
5668; GFX90A-NEXT:    ; def s[4:5]
5669; GFX90A-NEXT:    ;;#ASMEND
5670; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
5671; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
5672; GFX90A-NEXT:    ;;#ASMSTART
5673; GFX90A-NEXT:    ; use s8
5674; GFX90A-NEXT:    ;;#ASMEND
5675; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5676;
5677; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_3:
5678; GFX940:       ; %bb.0:
5679; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5680; GFX940-NEXT:    ;;#ASMSTART
5681; GFX940-NEXT:    ; def s[0:1]
5682; GFX940-NEXT:    ;;#ASMEND
5683; GFX940-NEXT:    s_lshr_b32 s0, s1, 16
5684; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
5685; GFX940-NEXT:    ;;#ASMSTART
5686; GFX940-NEXT:    ; use s8
5687; GFX940-NEXT:    ;;#ASMEND
5688; GFX940-NEXT:    s_setpc_b64 s[30:31]
5689  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5690  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 3, i32 3>
5691  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5692  ret void
5693}
5694
; "=s" inline-asm source, mask <4, 3>: index 4 addresses lane 0 of the second
; shuffle operand, which is poison here, so only result lane 1 (lane 3 of
; %vec0) is defined. Every run line expects a single s_mov_b32 of the second
; defined register into s8.
5695define void @s_shuffle_v2bf16_v4bf16__4_3() {
5696; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_3:
5697; GFX900:       ; %bb.0:
5698; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5699; GFX900-NEXT:    ;;#ASMSTART
5700; GFX900-NEXT:    ; def s[4:5]
5701; GFX900-NEXT:    ;;#ASMEND
5702; GFX900-NEXT:    s_mov_b32 s8, s5
5703; GFX900-NEXT:    ;;#ASMSTART
5704; GFX900-NEXT:    ; use s8
5705; GFX900-NEXT:    ;;#ASMEND
5706; GFX900-NEXT:    s_setpc_b64 s[30:31]
5707;
5708; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_3:
5709; GFX90A:       ; %bb.0:
5710; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5711; GFX90A-NEXT:    ;;#ASMSTART
5712; GFX90A-NEXT:    ; def s[4:5]
5713; GFX90A-NEXT:    ;;#ASMEND
5714; GFX90A-NEXT:    s_mov_b32 s8, s5
5715; GFX90A-NEXT:    ;;#ASMSTART
5716; GFX90A-NEXT:    ; use s8
5717; GFX90A-NEXT:    ;;#ASMEND
5718; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5719;
5720; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_3:
5721; GFX940:       ; %bb.0:
5722; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5723; GFX940-NEXT:    ;;#ASMSTART
5724; GFX940-NEXT:    ; def s[0:1]
5725; GFX940-NEXT:    ;;#ASMEND
5726; GFX940-NEXT:    s_mov_b32 s8, s1
5727; GFX940-NEXT:    ;;#ASMSTART
5728; GFX940-NEXT:    ; use s8
5729; GFX940-NEXT:    ;;#ASMEND
5730; GFX940-NEXT:    s_setpc_b64 s[30:31]
5731  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5732  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 4, i32 3>
5733  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5734  ret void
5735}
5736
; Two "=s" inline-asm sources, mask <5, 3>: result lane 0 is lane 1 of %vec1
; and result lane 1 is lane 3 of %vec0. Every run line expects both defs,
; two s_lshr_b32 by 16, and one s_pack_ll_b32_b16 into s8.
5737define void @s_shuffle_v2bf16_v4bf16__5_3() {
5738; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_3:
5739; GFX900:       ; %bb.0:
5740; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5741; GFX900-NEXT:    ;;#ASMSTART
5742; GFX900-NEXT:    ; def s[4:5]
5743; GFX900-NEXT:    ;;#ASMEND
5744; GFX900-NEXT:    ;;#ASMSTART
5745; GFX900-NEXT:    ; def s[6:7]
5746; GFX900-NEXT:    ;;#ASMEND
5747; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
5748; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
5749; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
5750; GFX900-NEXT:    ;;#ASMSTART
5751; GFX900-NEXT:    ; use s8
5752; GFX900-NEXT:    ;;#ASMEND
5753; GFX900-NEXT:    s_setpc_b64 s[30:31]
5754;
5755; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_3:
5756; GFX90A:       ; %bb.0:
5757; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5758; GFX90A-NEXT:    ;;#ASMSTART
5759; GFX90A-NEXT:    ; def s[4:5]
5760; GFX90A-NEXT:    ;;#ASMEND
5761; GFX90A-NEXT:    ;;#ASMSTART
5762; GFX90A-NEXT:    ; def s[6:7]
5763; GFX90A-NEXT:    ;;#ASMEND
5764; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
5765; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
5766; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
5767; GFX90A-NEXT:    ;;#ASMSTART
5768; GFX90A-NEXT:    ; use s8
5769; GFX90A-NEXT:    ;;#ASMEND
5770; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5771;
5772; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_3:
5773; GFX940:       ; %bb.0:
5774; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5775; GFX940-NEXT:    ;;#ASMSTART
5776; GFX940-NEXT:    ; def s[0:1]
5777; GFX940-NEXT:    ;;#ASMEND
5778; GFX940-NEXT:    ;;#ASMSTART
5779; GFX940-NEXT:    ; def s[2:3]
5780; GFX940-NEXT:    ;;#ASMEND
5781; GFX940-NEXT:    s_lshr_b32 s0, s1, 16
5782; GFX940-NEXT:    s_lshr_b32 s1, s2, 16
5783; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
5784; GFX940-NEXT:    ;;#ASMSTART
5785; GFX940-NEXT:    ; use s8
5786; GFX940-NEXT:    ;;#ASMEND
5787; GFX940-NEXT:    s_setpc_b64 s[30:31]
5788  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5789  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
5790  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 3>
5791  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5792  ret void
5793}
5794
; Two "=s" inline-asm sources, mask <6, 3>: result lane 0 is lane 2 of %vec1
; and result lane 1 is lane 3 of %vec0. Note the expected schedule: the
; s_lshr_b32 of the first def's second register is emitted between the two
; asm defs, and only one shift is needed before the s_pack_ll_b32_b16.
5795define void @s_shuffle_v2bf16_v4bf16__6_3() {
5796; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_3:
5797; GFX900:       ; %bb.0:
5798; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5799; GFX900-NEXT:    ;;#ASMSTART
5800; GFX900-NEXT:    ; def s[4:5]
5801; GFX900-NEXT:    ;;#ASMEND
5802; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
5803; GFX900-NEXT:    ;;#ASMSTART
5804; GFX900-NEXT:    ; def s[6:7]
5805; GFX900-NEXT:    ;;#ASMEND
5806; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
5807; GFX900-NEXT:    ;;#ASMSTART
5808; GFX900-NEXT:    ; use s8
5809; GFX900-NEXT:    ;;#ASMEND
5810; GFX900-NEXT:    s_setpc_b64 s[30:31]
5811;
5812; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_3:
5813; GFX90A:       ; %bb.0:
5814; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5815; GFX90A-NEXT:    ;;#ASMSTART
5816; GFX90A-NEXT:    ; def s[4:5]
5817; GFX90A-NEXT:    ;;#ASMEND
5818; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
5819; GFX90A-NEXT:    ;;#ASMSTART
5820; GFX90A-NEXT:    ; def s[6:7]
5821; GFX90A-NEXT:    ;;#ASMEND
5822; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
5823; GFX90A-NEXT:    ;;#ASMSTART
5824; GFX90A-NEXT:    ; use s8
5825; GFX90A-NEXT:    ;;#ASMEND
5826; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5827;
5828; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_3:
5829; GFX940:       ; %bb.0:
5830; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5831; GFX940-NEXT:    ;;#ASMSTART
5832; GFX940-NEXT:    ; def s[0:1]
5833; GFX940-NEXT:    ;;#ASMEND
5834; GFX940-NEXT:    s_lshr_b32 s0, s1, 16
5835; GFX940-NEXT:    ;;#ASMSTART
5836; GFX940-NEXT:    ; def s[2:3]
5837; GFX940-NEXT:    ;;#ASMEND
5838; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
5839; GFX940-NEXT:    ;;#ASMSTART
5840; GFX940-NEXT:    ; use s8
5841; GFX940-NEXT:    ;;#ASMEND
5842; GFX940-NEXT:    s_setpc_b64 s[30:31]
5843  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5844  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
5845  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 3>
5846  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5847  ret void
5848}
5849
; "=s" inline-asm source, mask <poison, 4>: lane 0 is poison and index 4
; addresses the poison second operand, so no lane of %vec0 is used. The shared
; checks for all three run lines expect no def and no data movement, only the
; asm use of s8.
5850define void @s_shuffle_v2bf16_v4bf16__u_4() {
5851; GFX9-LABEL: s_shuffle_v2bf16_v4bf16__u_4:
5852; GFX9:       ; %bb.0:
5853; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5854; GFX9-NEXT:    ;;#ASMSTART
5855; GFX9-NEXT:    ; use s8
5856; GFX9-NEXT:    ;;#ASMEND
5857; GFX9-NEXT:    s_setpc_b64 s[30:31]
5858  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5859  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 poison, i32 4>
5860  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5861  ret void
5862}
5863
; "=s" inline-asm source, mask <0, 4>: result lane 0 is lane 0 of %vec0 and
; result lane 1 addresses the poison second operand. Every run line expects
; the def to land directly in s[8:9] with no copy before the use of s8; the
; gfx940 checks additionally expect an s_nop 0 between the def and the use.
5864define void @s_shuffle_v2bf16_v4bf16__0_4() {
5865; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_4:
5866; GFX900:       ; %bb.0:
5867; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5868; GFX900-NEXT:    ;;#ASMSTART
5869; GFX900-NEXT:    ; def s[8:9]
5870; GFX900-NEXT:    ;;#ASMEND
5871; GFX900-NEXT:    ;;#ASMSTART
5872; GFX900-NEXT:    ; use s8
5873; GFX900-NEXT:    ;;#ASMEND
5874; GFX900-NEXT:    s_setpc_b64 s[30:31]
5875;
5876; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_4:
5877; GFX90A:       ; %bb.0:
5878; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5879; GFX90A-NEXT:    ;;#ASMSTART
5880; GFX90A-NEXT:    ; def s[8:9]
5881; GFX90A-NEXT:    ;;#ASMEND
5882; GFX90A-NEXT:    ;;#ASMSTART
5883; GFX90A-NEXT:    ; use s8
5884; GFX90A-NEXT:    ;;#ASMEND
5885; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5886;
5887; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_4:
5888; GFX940:       ; %bb.0:
5889; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5890; GFX940-NEXT:    ;;#ASMSTART
5891; GFX940-NEXT:    ; def s[8:9]
5892; GFX940-NEXT:    ;;#ASMEND
5893; GFX940-NEXT:    s_nop 0
5894; GFX940-NEXT:    ;;#ASMSTART
5895; GFX940-NEXT:    ; use s8
5896; GFX940-NEXT:    ;;#ASMEND
5897; GFX940-NEXT:    s_setpc_b64 s[30:31]
5898  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5899  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 0, i32 4>
5900  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5901  ret void
5902}
5903
; "=s" inline-asm source, mask <1, 4>: result lane 0 is lane 1 of %vec0 and
; result lane 1 addresses the poison second operand. Every run line expects a
; single s_lshr_b32 by 16 of the first defined register, written to s8.
5904define void @s_shuffle_v2bf16_v4bf16__1_4() {
5905; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_4:
5906; GFX900:       ; %bb.0:
5907; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5908; GFX900-NEXT:    ;;#ASMSTART
5909; GFX900-NEXT:    ; def s[4:5]
5910; GFX900-NEXT:    ;;#ASMEND
5911; GFX900-NEXT:    s_lshr_b32 s8, s4, 16
5912; GFX900-NEXT:    ;;#ASMSTART
5913; GFX900-NEXT:    ; use s8
5914; GFX900-NEXT:    ;;#ASMEND
5915; GFX900-NEXT:    s_setpc_b64 s[30:31]
5916;
5917; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_4:
5918; GFX90A:       ; %bb.0:
5919; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5920; GFX90A-NEXT:    ;;#ASMSTART
5921; GFX90A-NEXT:    ; def s[4:5]
5922; GFX90A-NEXT:    ;;#ASMEND
5923; GFX90A-NEXT:    s_lshr_b32 s8, s4, 16
5924; GFX90A-NEXT:    ;;#ASMSTART
5925; GFX90A-NEXT:    ; use s8
5926; GFX90A-NEXT:    ;;#ASMEND
5927; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5928;
5929; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_4:
5930; GFX940:       ; %bb.0:
5931; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5932; GFX940-NEXT:    ;;#ASMSTART
5933; GFX940-NEXT:    ; def s[0:1]
5934; GFX940-NEXT:    ;;#ASMEND
5935; GFX940-NEXT:    s_lshr_b32 s8, s0, 16
5936; GFX940-NEXT:    ;;#ASMSTART
5937; GFX940-NEXT:    ; use s8
5938; GFX940-NEXT:    ;;#ASMEND
5939; GFX940-NEXT:    s_setpc_b64 s[30:31]
5940  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5941  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 1, i32 4>
5942  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5943  ret void
5944}
5945
; "=s" inline-asm source, mask <2, 4>: result lane 0 is lane 2 of %vec0 and
; result lane 1 addresses the poison second operand. Every run line expects a
; single s_mov_b32 of the second defined register into s8.
5946define void @s_shuffle_v2bf16_v4bf16__2_4() {
5947; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_4:
5948; GFX900:       ; %bb.0:
5949; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5950; GFX900-NEXT:    ;;#ASMSTART
5951; GFX900-NEXT:    ; def s[4:5]
5952; GFX900-NEXT:    ;;#ASMEND
5953; GFX900-NEXT:    s_mov_b32 s8, s5
5954; GFX900-NEXT:    ;;#ASMSTART
5955; GFX900-NEXT:    ; use s8
5956; GFX900-NEXT:    ;;#ASMEND
5957; GFX900-NEXT:    s_setpc_b64 s[30:31]
5958;
5959; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_4:
5960; GFX90A:       ; %bb.0:
5961; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5962; GFX90A-NEXT:    ;;#ASMSTART
5963; GFX90A-NEXT:    ; def s[4:5]
5964; GFX90A-NEXT:    ;;#ASMEND
5965; GFX90A-NEXT:    s_mov_b32 s8, s5
5966; GFX90A-NEXT:    ;;#ASMSTART
5967; GFX90A-NEXT:    ; use s8
5968; GFX90A-NEXT:    ;;#ASMEND
5969; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5970;
5971; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_4:
5972; GFX940:       ; %bb.0:
5973; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5974; GFX940-NEXT:    ;;#ASMSTART
5975; GFX940-NEXT:    ; def s[0:1]
5976; GFX940-NEXT:    ;;#ASMEND
5977; GFX940-NEXT:    s_mov_b32 s8, s1
5978; GFX940-NEXT:    ;;#ASMSTART
5979; GFX940-NEXT:    ; use s8
5980; GFX940-NEXT:    ;;#ASMEND
5981; GFX940-NEXT:    s_setpc_b64 s[30:31]
5982  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
5983  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 2, i32 4>
5984  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
5985  ret void
5986}
5987
; "=s" inline-asm source, mask <3, 4>: result lane 0 is lane 3 of %vec0 and
; result lane 1 addresses the poison second operand. Every run line expects a
; single s_lshr_b32 by 16 of the second defined register, written to s8.
5988define void @s_shuffle_v2bf16_v4bf16__3_4() {
5989; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_4:
5990; GFX900:       ; %bb.0:
5991; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5992; GFX900-NEXT:    ;;#ASMSTART
5993; GFX900-NEXT:    ; def s[4:5]
5994; GFX900-NEXT:    ;;#ASMEND
5995; GFX900-NEXT:    s_lshr_b32 s8, s5, 16
5996; GFX900-NEXT:    ;;#ASMSTART
5997; GFX900-NEXT:    ; use s8
5998; GFX900-NEXT:    ;;#ASMEND
5999; GFX900-NEXT:    s_setpc_b64 s[30:31]
6000;
6001; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_4:
6002; GFX90A:       ; %bb.0:
6003; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6004; GFX90A-NEXT:    ;;#ASMSTART
6005; GFX90A-NEXT:    ; def s[4:5]
6006; GFX90A-NEXT:    ;;#ASMEND
6007; GFX90A-NEXT:    s_lshr_b32 s8, s5, 16
6008; GFX90A-NEXT:    ;;#ASMSTART
6009; GFX90A-NEXT:    ; use s8
6010; GFX90A-NEXT:    ;;#ASMEND
6011; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6012;
6013; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_4:
6014; GFX940:       ; %bb.0:
6015; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6016; GFX940-NEXT:    ;;#ASMSTART
6017; GFX940-NEXT:    ; def s[0:1]
6018; GFX940-NEXT:    ;;#ASMEND
6019; GFX940-NEXT:    s_lshr_b32 s8, s1, 16
6020; GFX940-NEXT:    ;;#ASMSTART
6021; GFX940-NEXT:    ; use s8
6022; GFX940-NEXT:    ;;#ASMEND
6023; GFX940-NEXT:    s_setpc_b64 s[30:31]
6024  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6025  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 3, i32 4>
6026  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6027  ret void
6028}
6029
; "=s" inline-asm source, mask <4, 4>: both result lanes address the poison
; second operand, so %vec0 is never read. The shared checks for all three run
; lines expect no def and no data movement, only the asm use of s8.
6030define void @s_shuffle_v2bf16_v4bf16__4_4() {
6031; GFX9-LABEL: s_shuffle_v2bf16_v4bf16__4_4:
6032; GFX9:       ; %bb.0:
6033; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6034; GFX9-NEXT:    ;;#ASMSTART
6035; GFX9-NEXT:    ; use s8
6036; GFX9-NEXT:    ;;#ASMEND
6037; GFX9-NEXT:    s_setpc_b64 s[30:31]
6038  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6039  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <2 x i32> <i32 4, i32 4>
6040  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6041  ret void
6042}
6043
; Two "=s" inline-asm sources, mask <5, 4>: both result lanes come from %vec1
; (its lanes 1 and 0, swapped); no lane of %vec0 is selected. Every run line
; expects only one asm def, one s_lshr_b32 by 16 of its first register, and an
; s_pack_ll_b32_b16 with the shifted value as the first input.
6044define void @s_shuffle_v2bf16_v4bf16__5_4() {
6045; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_4:
6046; GFX900:       ; %bb.0:
6047; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6048; GFX900-NEXT:    ;;#ASMSTART
6049; GFX900-NEXT:    ; def s[4:5]
6050; GFX900-NEXT:    ;;#ASMEND
6051; GFX900-NEXT:    s_lshr_b32 s5, s4, 16
6052; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
6053; GFX900-NEXT:    ;;#ASMSTART
6054; GFX900-NEXT:    ; use s8
6055; GFX900-NEXT:    ;;#ASMEND
6056; GFX900-NEXT:    s_setpc_b64 s[30:31]
6057;
6058; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_4:
6059; GFX90A:       ; %bb.0:
6060; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6061; GFX90A-NEXT:    ;;#ASMSTART
6062; GFX90A-NEXT:    ; def s[4:5]
6063; GFX90A-NEXT:    ;;#ASMEND
6064; GFX90A-NEXT:    s_lshr_b32 s5, s4, 16
6065; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
6066; GFX90A-NEXT:    ;;#ASMSTART
6067; GFX90A-NEXT:    ; use s8
6068; GFX90A-NEXT:    ;;#ASMEND
6069; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6070;
6071; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_4:
6072; GFX940:       ; %bb.0:
6073; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6074; GFX940-NEXT:    ;;#ASMSTART
6075; GFX940-NEXT:    ; def s[0:1]
6076; GFX940-NEXT:    ;;#ASMEND
6077; GFX940-NEXT:    s_lshr_b32 s1, s0, 16
6078; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
6079; GFX940-NEXT:    ;;#ASMSTART
6080; GFX940-NEXT:    ; use s8
6081; GFX940-NEXT:    ;;#ASMEND
6082; GFX940-NEXT:    s_setpc_b64 s[30:31]
6083  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6084  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
6085  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 4>
6086  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6087  ret void
6088}
6089
; Two "=s" inline-asm sources, mask <6, 4>: both result lanes come from %vec1
; (its lanes 2 and 0); no lane of %vec0 is selected. Every run line expects
; only one asm def followed by a single s_pack_ll_b32_b16 of the second and
; first defined registers, with no shifts.
6090define void @s_shuffle_v2bf16_v4bf16__6_4() {
6091; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_4:
6092; GFX900:       ; %bb.0:
6093; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6094; GFX900-NEXT:    ;;#ASMSTART
6095; GFX900-NEXT:    ; def s[4:5]
6096; GFX900-NEXT:    ;;#ASMEND
6097; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
6098; GFX900-NEXT:    ;;#ASMSTART
6099; GFX900-NEXT:    ; use s8
6100; GFX900-NEXT:    ;;#ASMEND
6101; GFX900-NEXT:    s_setpc_b64 s[30:31]
6102;
6103; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_4:
6104; GFX90A:       ; %bb.0:
6105; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6106; GFX90A-NEXT:    ;;#ASMSTART
6107; GFX90A-NEXT:    ; def s[4:5]
6108; GFX90A-NEXT:    ;;#ASMEND
6109; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
6110; GFX90A-NEXT:    ;;#ASMSTART
6111; GFX90A-NEXT:    ; use s8
6112; GFX90A-NEXT:    ;;#ASMEND
6113; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6114;
6115; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_4:
6116; GFX940:       ; %bb.0:
6117; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6118; GFX940-NEXT:    ;;#ASMSTART
6119; GFX940-NEXT:    ; def s[0:1]
6120; GFX940-NEXT:    ;;#ASMEND
6121; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
6122; GFX940-NEXT:    ;;#ASMSTART
6123; GFX940-NEXT:    ; use s8
6124; GFX940-NEXT:    ;;#ASMEND
6125; GFX940-NEXT:    s_setpc_b64 s[30:31]
6126  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6127  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
6128  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 4>
6129  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6130  ret void
6131}
6132
; Two "=s" inline-asm sources, mask <poison, 5>: result lane 0 is poison and
; result lane 1 is lane 1 of %vec1; no lane of %vec0 is selected. Every run
; line expects only one asm def, placed directly in s[8:9], with no copy
; before the use of s8; the gfx940 checks additionally expect an s_nop 0
; between the def and the use.
6133define void @s_shuffle_v2bf16_v4bf16__u_5() {
6134; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_5:
6135; GFX900:       ; %bb.0:
6136; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6137; GFX900-NEXT:    ;;#ASMSTART
6138; GFX900-NEXT:    ; def s[8:9]
6139; GFX900-NEXT:    ;;#ASMEND
6140; GFX900-NEXT:    ;;#ASMSTART
6141; GFX900-NEXT:    ; use s8
6142; GFX900-NEXT:    ;;#ASMEND
6143; GFX900-NEXT:    s_setpc_b64 s[30:31]
6144;
6145; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__u_5:
6146; GFX90A:       ; %bb.0:
6147; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6148; GFX90A-NEXT:    ;;#ASMSTART
6149; GFX90A-NEXT:    ; def s[8:9]
6150; GFX90A-NEXT:    ;;#ASMEND
6151; GFX90A-NEXT:    ;;#ASMSTART
6152; GFX90A-NEXT:    ; use s8
6153; GFX90A-NEXT:    ;;#ASMEND
6154; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6155;
6156; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_5:
6157; GFX940:       ; %bb.0:
6158; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6159; GFX940-NEXT:    ;;#ASMSTART
6160; GFX940-NEXT:    ; def s[8:9]
6161; GFX940-NEXT:    ;;#ASMEND
6162; GFX940-NEXT:    s_nop 0
6163; GFX940-NEXT:    ;;#ASMSTART
6164; GFX940-NEXT:    ; use s8
6165; GFX940-NEXT:    ;;#ASMEND
6166; GFX940-NEXT:    s_setpc_b64 s[30:31]
6167  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6168  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
6169  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 poison, i32 5>
6170  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6171  ret void
6172}
6173
; Two "=s" inline-asm sources, mask <0, 5>: result lane 0 is lane 0 of %vec0
; and result lane 1 is lane 1 of %vec1. Every run line expects both defs, one
; s_lshr_b32 by 16 of the second def's first register, and an
; s_pack_ll_b32_b16 into s8.
6174define void @s_shuffle_v2bf16_v4bf16__0_5() {
6175; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_5:
6176; GFX900:       ; %bb.0:
6177; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6178; GFX900-NEXT:    ;;#ASMSTART
6179; GFX900-NEXT:    ; def s[4:5]
6180; GFX900-NEXT:    ;;#ASMEND
6181; GFX900-NEXT:    ;;#ASMSTART
6182; GFX900-NEXT:    ; def s[6:7]
6183; GFX900-NEXT:    ;;#ASMEND
6184; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
6185; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
6186; GFX900-NEXT:    ;;#ASMSTART
6187; GFX900-NEXT:    ; use s8
6188; GFX900-NEXT:    ;;#ASMEND
6189; GFX900-NEXT:    s_setpc_b64 s[30:31]
6190;
6191; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_5:
6192; GFX90A:       ; %bb.0:
6193; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6194; GFX90A-NEXT:    ;;#ASMSTART
6195; GFX90A-NEXT:    ; def s[4:5]
6196; GFX90A-NEXT:    ;;#ASMEND
6197; GFX90A-NEXT:    ;;#ASMSTART
6198; GFX90A-NEXT:    ; def s[6:7]
6199; GFX90A-NEXT:    ;;#ASMEND
6200; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
6201; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
6202; GFX90A-NEXT:    ;;#ASMSTART
6203; GFX90A-NEXT:    ; use s8
6204; GFX90A-NEXT:    ;;#ASMEND
6205; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6206;
6207; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_5:
6208; GFX940:       ; %bb.0:
6209; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6210; GFX940-NEXT:    ;;#ASMSTART
6211; GFX940-NEXT:    ; def s[0:1]
6212; GFX940-NEXT:    ;;#ASMEND
6213; GFX940-NEXT:    ;;#ASMSTART
6214; GFX940-NEXT:    ; def s[2:3]
6215; GFX940-NEXT:    ;;#ASMEND
6216; GFX940-NEXT:    s_lshr_b32 s1, s2, 16
6217; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
6218; GFX940-NEXT:    ;;#ASMSTART
6219; GFX940-NEXT:    ; use s8
6220; GFX940-NEXT:    ;;#ASMEND
6221; GFX940-NEXT:    s_setpc_b64 s[30:31]
6222  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6223  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
6224  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 0, i32 5>
6225  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6226  ret void
6227}
6228
; Two "=s" inline-asm sources, mask <1, 5>: result lane 0 is lane 1 of %vec0
; and result lane 1 is lane 1 of %vec1. Every run line expects both defs, an
; s_lshr_b32 by 16 of each def's first register, and an s_pack_ll_b32_b16
; into s8.
6229define void @s_shuffle_v2bf16_v4bf16__1_5() {
6230; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_5:
6231; GFX900:       ; %bb.0:
6232; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6233; GFX900-NEXT:    ;;#ASMSTART
6234; GFX900-NEXT:    ; def s[4:5]
6235; GFX900-NEXT:    ;;#ASMEND
6236; GFX900-NEXT:    ;;#ASMSTART
6237; GFX900-NEXT:    ; def s[6:7]
6238; GFX900-NEXT:    ;;#ASMEND
6239; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
6240; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
6241; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
6242; GFX900-NEXT:    ;;#ASMSTART
6243; GFX900-NEXT:    ; use s8
6244; GFX900-NEXT:    ;;#ASMEND
6245; GFX900-NEXT:    s_setpc_b64 s[30:31]
6246;
6247; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_5:
6248; GFX90A:       ; %bb.0:
6249; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6250; GFX90A-NEXT:    ;;#ASMSTART
6251; GFX90A-NEXT:    ; def s[4:5]
6252; GFX90A-NEXT:    ;;#ASMEND
6253; GFX90A-NEXT:    ;;#ASMSTART
6254; GFX90A-NEXT:    ; def s[6:7]
6255; GFX90A-NEXT:    ;;#ASMEND
6256; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
6257; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
6258; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
6259; GFX90A-NEXT:    ;;#ASMSTART
6260; GFX90A-NEXT:    ; use s8
6261; GFX90A-NEXT:    ;;#ASMEND
6262; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6263;
6264; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_5:
6265; GFX940:       ; %bb.0:
6266; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6267; GFX940-NEXT:    ;;#ASMSTART
6268; GFX940-NEXT:    ; def s[0:1]
6269; GFX940-NEXT:    ;;#ASMEND
6270; GFX940-NEXT:    ;;#ASMSTART
6271; GFX940-NEXT:    ; def s[2:3]
6272; GFX940-NEXT:    ;;#ASMEND
6273; GFX940-NEXT:    s_lshr_b32 s1, s2, 16
6274; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
6275; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
6276; GFX940-NEXT:    ;;#ASMSTART
6277; GFX940-NEXT:    ; use s8
6278; GFX940-NEXT:    ;;#ASMEND
6279; GFX940-NEXT:    s_setpc_b64 s[30:31]
6280  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6281  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
6282  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 1, i32 5>
6283  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6284  ret void
6285}
6286
; Two "=s" inline-asm sources, mask <2, 5>: result lane 0 is lane 2 of %vec0
; and result lane 1 is lane 1 of %vec1. Every run line expects both defs, one
; s_lshr_b32 by 16 of the second def's first register, and an
; s_pack_ll_b32_b16 of the first def's second register with that shifted value.
6287define void @s_shuffle_v2bf16_v4bf16__2_5() {
6288; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_5:
6289; GFX900:       ; %bb.0:
6290; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6291; GFX900-NEXT:    ;;#ASMSTART
6292; GFX900-NEXT:    ; def s[4:5]
6293; GFX900-NEXT:    ;;#ASMEND
6294; GFX900-NEXT:    ;;#ASMSTART
6295; GFX900-NEXT:    ; def s[6:7]
6296; GFX900-NEXT:    ;;#ASMEND
6297; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
6298; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
6299; GFX900-NEXT:    ;;#ASMSTART
6300; GFX900-NEXT:    ; use s8
6301; GFX900-NEXT:    ;;#ASMEND
6302; GFX900-NEXT:    s_setpc_b64 s[30:31]
6303;
6304; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_5:
6305; GFX90A:       ; %bb.0:
6306; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6307; GFX90A-NEXT:    ;;#ASMSTART
6308; GFX90A-NEXT:    ; def s[4:5]
6309; GFX90A-NEXT:    ;;#ASMEND
6310; GFX90A-NEXT:    ;;#ASMSTART
6311; GFX90A-NEXT:    ; def s[6:7]
6312; GFX90A-NEXT:    ;;#ASMEND
6313; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
6314; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
6315; GFX90A-NEXT:    ;;#ASMSTART
6316; GFX90A-NEXT:    ; use s8
6317; GFX90A-NEXT:    ;;#ASMEND
6318; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6319;
6320; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_5:
6321; GFX940:       ; %bb.0:
6322; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6323; GFX940-NEXT:    ;;#ASMSTART
6324; GFX940-NEXT:    ; def s[0:1]
6325; GFX940-NEXT:    ;;#ASMEND
6326; GFX940-NEXT:    ;;#ASMSTART
6327; GFX940-NEXT:    ; def s[2:3]
6328; GFX940-NEXT:    ;;#ASMEND
6329; GFX940-NEXT:    s_lshr_b32 s0, s2, 16
6330; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
6331; GFX940-NEXT:    ;;#ASMSTART
6332; GFX940-NEXT:    ; use s8
6333; GFX940-NEXT:    ;;#ASMEND
6334; GFX940-NEXT:    s_setpc_b64 s[30:31]
6335  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6336  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
6337  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 2, i32 5>
6338  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6339  ret void
6340}
6341
; Two "=s" inline-asm sources, mask <3, 5>: result lane 0 is lane 3 of %vec0
; and result lane 1 is lane 1 of %vec1. Every run line expects both defs, two
; s_lshr_b32 by 16 (second def's first register, first def's second register),
; and an s_pack_ll_b32_b16 into s8.
6342define void @s_shuffle_v2bf16_v4bf16__3_5() {
6343; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_5:
6344; GFX900:       ; %bb.0:
6345; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6346; GFX900-NEXT:    ;;#ASMSTART
6347; GFX900-NEXT:    ; def s[4:5]
6348; GFX900-NEXT:    ;;#ASMEND
6349; GFX900-NEXT:    ;;#ASMSTART
6350; GFX900-NEXT:    ; def s[6:7]
6351; GFX900-NEXT:    ;;#ASMEND
6352; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
6353; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
6354; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
6355; GFX900-NEXT:    ;;#ASMSTART
6356; GFX900-NEXT:    ; use s8
6357; GFX900-NEXT:    ;;#ASMEND
6358; GFX900-NEXT:    s_setpc_b64 s[30:31]
6359;
6360; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_5:
6361; GFX90A:       ; %bb.0:
6362; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6363; GFX90A-NEXT:    ;;#ASMSTART
6364; GFX90A-NEXT:    ; def s[4:5]
6365; GFX90A-NEXT:    ;;#ASMEND
6366; GFX90A-NEXT:    ;;#ASMSTART
6367; GFX90A-NEXT:    ; def s[6:7]
6368; GFX90A-NEXT:    ;;#ASMEND
6369; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
6370; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
6371; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
6372; GFX90A-NEXT:    ;;#ASMSTART
6373; GFX90A-NEXT:    ; use s8
6374; GFX90A-NEXT:    ;;#ASMEND
6375; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6376;
6377; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_5:
6378; GFX940:       ; %bb.0:
6379; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6380; GFX940-NEXT:    ;;#ASMSTART
6381; GFX940-NEXT:    ; def s[0:1]
6382; GFX940-NEXT:    ;;#ASMEND
6383; GFX940-NEXT:    ;;#ASMSTART
6384; GFX940-NEXT:    ; def s[2:3]
6385; GFX940-NEXT:    ;;#ASMEND
6386; GFX940-NEXT:    s_lshr_b32 s0, s2, 16
6387; GFX940-NEXT:    s_lshr_b32 s1, s1, 16
6388; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
6389; GFX940-NEXT:    ;;#ASMSTART
6390; GFX940-NEXT:    ; use s8
6391; GFX940-NEXT:    ;;#ASMEND
6392; GFX940-NEXT:    s_setpc_b64 s[30:31]
6393  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6394  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
6395  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 3, i32 5>
6396  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6397  ret void
6398}
6399
; Two "=s" inline-asm sources, mask <4, 5>: the result is lanes 0 and 1 of
; %vec1 in order; no lane of %vec0 is selected. Every run line expects only
; one asm def, placed directly in s[8:9], with no copy before the use of s8;
; the gfx940 checks additionally expect an s_nop 0 between the def and the use.
6400define void @s_shuffle_v2bf16_v4bf16__4_5() {
6401; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_5:
6402; GFX900:       ; %bb.0:
6403; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6404; GFX900-NEXT:    ;;#ASMSTART
6405; GFX900-NEXT:    ; def s[8:9]
6406; GFX900-NEXT:    ;;#ASMEND
6407; GFX900-NEXT:    ;;#ASMSTART
6408; GFX900-NEXT:    ; use s8
6409; GFX900-NEXT:    ;;#ASMEND
6410; GFX900-NEXT:    s_setpc_b64 s[30:31]
6411;
6412; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_5:
6413; GFX90A:       ; %bb.0:
6414; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6415; GFX90A-NEXT:    ;;#ASMSTART
6416; GFX90A-NEXT:    ; def s[8:9]
6417; GFX90A-NEXT:    ;;#ASMEND
6418; GFX90A-NEXT:    ;;#ASMSTART
6419; GFX90A-NEXT:    ; use s8
6420; GFX90A-NEXT:    ;;#ASMEND
6421; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6422;
6423; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_5:
6424; GFX940:       ; %bb.0:
6425; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6426; GFX940-NEXT:    ;;#ASMSTART
6427; GFX940-NEXT:    ; def s[8:9]
6428; GFX940-NEXT:    ;;#ASMEND
6429; GFX940-NEXT:    s_nop 0
6430; GFX940-NEXT:    ;;#ASMSTART
6431; GFX940-NEXT:    ; use s8
6432; GFX940-NEXT:    ;;#ASMEND
6433; GFX940-NEXT:    s_setpc_b64 s[30:31]
6434  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6435  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
6436  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 4, i32 5>
6437  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6438  ret void
6439}
6440
; Scalar shuffle, mask <5, 5>: result is <vec1[1], vec1[1]>.
; Only one of the two defs appears in the expected output. The checks expect
; a 16-bit right shift of the first dword of the pair, then an
; s_pack_ll_b32_b16 that uses the shifted register for both operands.
; The three check blocks differ only in register numbering.
6441define void @s_shuffle_v2bf16_v4bf16__5_5() {
6442; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_5:
6443; GFX900:       ; %bb.0:
6444; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6445; GFX900-NEXT:    ;;#ASMSTART
6446; GFX900-NEXT:    ; def s[4:5]
6447; GFX900-NEXT:    ;;#ASMEND
6448; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
6449; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
6450; GFX900-NEXT:    ;;#ASMSTART
6451; GFX900-NEXT:    ; use s8
6452; GFX900-NEXT:    ;;#ASMEND
6453; GFX900-NEXT:    s_setpc_b64 s[30:31]
6454;
6455; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_5:
6456; GFX90A:       ; %bb.0:
6457; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6458; GFX90A-NEXT:    ;;#ASMSTART
6459; GFX90A-NEXT:    ; def s[4:5]
6460; GFX90A-NEXT:    ;;#ASMEND
6461; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
6462; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
6463; GFX90A-NEXT:    ;;#ASMSTART
6464; GFX90A-NEXT:    ; use s8
6465; GFX90A-NEXT:    ;;#ASMEND
6466; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6467;
6468; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_5:
6469; GFX940:       ; %bb.0:
6470; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6471; GFX940-NEXT:    ;;#ASMSTART
6472; GFX940-NEXT:    ; def s[0:1]
6473; GFX940-NEXT:    ;;#ASMEND
6474; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
6475; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
6476; GFX940-NEXT:    ;;#ASMSTART
6477; GFX940-NEXT:    ; use s8
6478; GFX940-NEXT:    ;;#ASMEND
6479; GFX940-NEXT:    s_setpc_b64 s[30:31]
6480  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6481  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
6482  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 5>
  ; The "{s8}" constraint fixes the register the checks look for.
6483  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6484  ret void
6485}
6486
; Scalar shuffle, mask <6, 5>: result is <vec1[2], vec1[1]>.
; Only one of the two defs appears in the expected output. The checks expect
; the first dword to be shifted right by 16, then an s_pack_ll_b32_b16 taking
; the second dword as its first source and the shifted value as its second.
; The three check blocks differ only in register numbering.
6487define void @s_shuffle_v2bf16_v4bf16__6_5() {
6488; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_5:
6489; GFX900:       ; %bb.0:
6490; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6491; GFX900-NEXT:    ;;#ASMSTART
6492; GFX900-NEXT:    ; def s[4:5]
6493; GFX900-NEXT:    ;;#ASMEND
6494; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
6495; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
6496; GFX900-NEXT:    ;;#ASMSTART
6497; GFX900-NEXT:    ; use s8
6498; GFX900-NEXT:    ;;#ASMEND
6499; GFX900-NEXT:    s_setpc_b64 s[30:31]
6500;
6501; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_5:
6502; GFX90A:       ; %bb.0:
6503; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6504; GFX90A-NEXT:    ;;#ASMSTART
6505; GFX90A-NEXT:    ; def s[4:5]
6506; GFX90A-NEXT:    ;;#ASMEND
6507; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
6508; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
6509; GFX90A-NEXT:    ;;#ASMSTART
6510; GFX90A-NEXT:    ; use s8
6511; GFX90A-NEXT:    ;;#ASMEND
6512; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6513;
6514; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_5:
6515; GFX940:       ; %bb.0:
6516; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6517; GFX940-NEXT:    ;;#ASMSTART
6518; GFX940-NEXT:    ; def s[0:1]
6519; GFX940-NEXT:    ;;#ASMEND
6520; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
6521; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
6522; GFX940-NEXT:    ;;#ASMSTART
6523; GFX940-NEXT:    ; use s8
6524; GFX940-NEXT:    ;;#ASMEND
6525; GFX940-NEXT:    s_setpc_b64 s[30:31]
6526  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6527  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
6528  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 5>
  ; The "{s8}" constraint fixes the register the checks look for.
6529  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6530  ret void
6531}
6532
; Scalar shuffle, mask <poison, 6>: lane 0 is unconstrained, lane 1 is vec1[2].
; Only one of the two defs appears in the expected output. The checks expect
; a single s_lshl_b32 by 16 of the second dword of the pair, written straight
; to s8, and no pack instruction.
; The three check blocks differ only in register numbering.
6533define void @s_shuffle_v2bf16_v4bf16__u_6() {
6534; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_6:
6535; GFX900:       ; %bb.0:
6536; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6537; GFX900-NEXT:    ;;#ASMSTART
6538; GFX900-NEXT:    ; def s[4:5]
6539; GFX900-NEXT:    ;;#ASMEND
6540; GFX900-NEXT:    s_lshl_b32 s8, s5, 16
6541; GFX900-NEXT:    ;;#ASMSTART
6542; GFX900-NEXT:    ; use s8
6543; GFX900-NEXT:    ;;#ASMEND
6544; GFX900-NEXT:    s_setpc_b64 s[30:31]
6545;
6546; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__u_6:
6547; GFX90A:       ; %bb.0:
6548; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6549; GFX90A-NEXT:    ;;#ASMSTART
6550; GFX90A-NEXT:    ; def s[4:5]
6551; GFX90A-NEXT:    ;;#ASMEND
6552; GFX90A-NEXT:    s_lshl_b32 s8, s5, 16
6553; GFX90A-NEXT:    ;;#ASMSTART
6554; GFX90A-NEXT:    ; use s8
6555; GFX90A-NEXT:    ;;#ASMEND
6556; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6557;
6558; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_6:
6559; GFX940:       ; %bb.0:
6560; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6561; GFX940-NEXT:    ;;#ASMSTART
6562; GFX940-NEXT:    ; def s[0:1]
6563; GFX940-NEXT:    ;;#ASMEND
6564; GFX940-NEXT:    s_lshl_b32 s8, s1, 16
6565; GFX940-NEXT:    ;;#ASMSTART
6566; GFX940-NEXT:    ; use s8
6567; GFX940-NEXT:    ;;#ASMEND
6568; GFX940-NEXT:    s_setpc_b64 s[30:31]
6569  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6570  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1; poison leaves a lane unspecified.
6571  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 poison, i32 6>
  ; The "{s8}" constraint fixes the register the checks look for.
6572  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6573  ret void
6574}
6575
; Scalar shuffle, mask <0, 6>: result is <vec0[0], vec1[2]>.
; Both operands are read, so both defs appear in the expected output. The
; checks expect one s_pack_ll_b32_b16 combining the first dword of the first
; def with the second dword of the second def, with no shift.
; The three check blocks differ only in register numbering.
6576define void @s_shuffle_v2bf16_v4bf16__0_6() {
6577; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_6:
6578; GFX900:       ; %bb.0:
6579; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6580; GFX900-NEXT:    ;;#ASMSTART
6581; GFX900-NEXT:    ; def s[4:5]
6582; GFX900-NEXT:    ;;#ASMEND
6583; GFX900-NEXT:    ;;#ASMSTART
6584; GFX900-NEXT:    ; def s[6:7]
6585; GFX900-NEXT:    ;;#ASMEND
6586; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
6587; GFX900-NEXT:    ;;#ASMSTART
6588; GFX900-NEXT:    ; use s8
6589; GFX900-NEXT:    ;;#ASMEND
6590; GFX900-NEXT:    s_setpc_b64 s[30:31]
6591;
6592; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_6:
6593; GFX90A:       ; %bb.0:
6594; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6595; GFX90A-NEXT:    ;;#ASMSTART
6596; GFX90A-NEXT:    ; def s[4:5]
6597; GFX90A-NEXT:    ;;#ASMEND
6598; GFX90A-NEXT:    ;;#ASMSTART
6599; GFX90A-NEXT:    ; def s[6:7]
6600; GFX90A-NEXT:    ;;#ASMEND
6601; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
6602; GFX90A-NEXT:    ;;#ASMSTART
6603; GFX90A-NEXT:    ; use s8
6604; GFX90A-NEXT:    ;;#ASMEND
6605; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6606;
6607; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_6:
6608; GFX940:       ; %bb.0:
6609; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6610; GFX940-NEXT:    ;;#ASMSTART
6611; GFX940-NEXT:    ; def s[0:1]
6612; GFX940-NEXT:    ;;#ASMEND
6613; GFX940-NEXT:    ;;#ASMSTART
6614; GFX940-NEXT:    ; def s[2:3]
6615; GFX940-NEXT:    ;;#ASMEND
6616; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
6617; GFX940-NEXT:    ;;#ASMSTART
6618; GFX940-NEXT:    ; use s8
6619; GFX940-NEXT:    ;;#ASMEND
6620; GFX940-NEXT:    s_setpc_b64 s[30:31]
6621  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6622  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
6623  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 0, i32 6>
  ; The "{s8}" constraint fixes the register the checks look for.
6624  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6625  ret void
6626}
6627
; Scalar shuffle, mask <1, 6>: result is <vec0[1], vec1[2]>.
; Both defs appear in the expected output. The checks expect the 16-bit right
; shift of the first def's first dword to sit between the two defs, followed
; by one s_pack_ll_b32_b16 with the second def's second dword.
; The three check blocks differ only in register numbering.
6628define void @s_shuffle_v2bf16_v4bf16__1_6() {
6629; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_6:
6630; GFX900:       ; %bb.0:
6631; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6632; GFX900-NEXT:    ;;#ASMSTART
6633; GFX900-NEXT:    ; def s[4:5]
6634; GFX900-NEXT:    ;;#ASMEND
6635; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
6636; GFX900-NEXT:    ;;#ASMSTART
6637; GFX900-NEXT:    ; def s[6:7]
6638; GFX900-NEXT:    ;;#ASMEND
6639; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
6640; GFX900-NEXT:    ;;#ASMSTART
6641; GFX900-NEXT:    ; use s8
6642; GFX900-NEXT:    ;;#ASMEND
6643; GFX900-NEXT:    s_setpc_b64 s[30:31]
6644;
6645; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_6:
6646; GFX90A:       ; %bb.0:
6647; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6648; GFX90A-NEXT:    ;;#ASMSTART
6649; GFX90A-NEXT:    ; def s[4:5]
6650; GFX90A-NEXT:    ;;#ASMEND
6651; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
6652; GFX90A-NEXT:    ;;#ASMSTART
6653; GFX90A-NEXT:    ; def s[6:7]
6654; GFX90A-NEXT:    ;;#ASMEND
6655; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
6656; GFX90A-NEXT:    ;;#ASMSTART
6657; GFX90A-NEXT:    ; use s8
6658; GFX90A-NEXT:    ;;#ASMEND
6659; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6660;
6661; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_6:
6662; GFX940:       ; %bb.0:
6663; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6664; GFX940-NEXT:    ;;#ASMSTART
6665; GFX940-NEXT:    ; def s[0:1]
6666; GFX940-NEXT:    ;;#ASMEND
6667; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
6668; GFX940-NEXT:    ;;#ASMSTART
6669; GFX940-NEXT:    ; def s[2:3]
6670; GFX940-NEXT:    ;;#ASMEND
6671; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
6672; GFX940-NEXT:    ;;#ASMSTART
6673; GFX940-NEXT:    ; use s8
6674; GFX940-NEXT:    ;;#ASMEND
6675; GFX940-NEXT:    s_setpc_b64 s[30:31]
6676  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6677  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
6678  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 1, i32 6>
  ; The "{s8}" constraint fixes the register the checks look for.
6679  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6680  ret void
6681}
6682
; Scalar shuffle, mask <2, 6>: result is <vec0[2], vec1[2]>.
; Both defs appear in the expected output. The checks expect one
; s_pack_ll_b32_b16 combining the second dword of each def, with no shift.
; The three check blocks differ only in register numbering.
6683define void @s_shuffle_v2bf16_v4bf16__2_6() {
6684; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_6:
6685; GFX900:       ; %bb.0:
6686; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6687; GFX900-NEXT:    ;;#ASMSTART
6688; GFX900-NEXT:    ; def s[4:5]
6689; GFX900-NEXT:    ;;#ASMEND
6690; GFX900-NEXT:    ;;#ASMSTART
6691; GFX900-NEXT:    ; def s[6:7]
6692; GFX900-NEXT:    ;;#ASMEND
6693; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s7
6694; GFX900-NEXT:    ;;#ASMSTART
6695; GFX900-NEXT:    ; use s8
6696; GFX900-NEXT:    ;;#ASMEND
6697; GFX900-NEXT:    s_setpc_b64 s[30:31]
6698;
6699; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_6:
6700; GFX90A:       ; %bb.0:
6701; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6702; GFX90A-NEXT:    ;;#ASMSTART
6703; GFX90A-NEXT:    ; def s[4:5]
6704; GFX90A-NEXT:    ;;#ASMEND
6705; GFX90A-NEXT:    ;;#ASMSTART
6706; GFX90A-NEXT:    ; def s[6:7]
6707; GFX90A-NEXT:    ;;#ASMEND
6708; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s7
6709; GFX90A-NEXT:    ;;#ASMSTART
6710; GFX90A-NEXT:    ; use s8
6711; GFX90A-NEXT:    ;;#ASMEND
6712; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6713;
6714; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_6:
6715; GFX940:       ; %bb.0:
6716; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6717; GFX940-NEXT:    ;;#ASMSTART
6718; GFX940-NEXT:    ; def s[0:1]
6719; GFX940-NEXT:    ;;#ASMEND
6720; GFX940-NEXT:    ;;#ASMSTART
6721; GFX940-NEXT:    ; def s[2:3]
6722; GFX940-NEXT:    ;;#ASMEND
6723; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s3
6724; GFX940-NEXT:    ;;#ASMSTART
6725; GFX940-NEXT:    ; use s8
6726; GFX940-NEXT:    ;;#ASMEND
6727; GFX940-NEXT:    s_setpc_b64 s[30:31]
6728  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6729  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
6730  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 2, i32 6>
  ; The "{s8}" constraint fixes the register the checks look for.
6731  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6732  ret void
6733}
6734
; Scalar shuffle, mask <3, 6>: result is <vec0[3], vec1[2]>.
; Both defs appear in the expected output. The checks expect the 16-bit right
; shift of the first def's second dword to sit between the two defs, followed
; by one s_pack_ll_b32_b16 with the second def's second dword.
; The three check blocks differ only in register numbering.
6735define void @s_shuffle_v2bf16_v4bf16__3_6() {
6736; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_6:
6737; GFX900:       ; %bb.0:
6738; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6739; GFX900-NEXT:    ;;#ASMSTART
6740; GFX900-NEXT:    ; def s[4:5]
6741; GFX900-NEXT:    ;;#ASMEND
6742; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
6743; GFX900-NEXT:    ;;#ASMSTART
6744; GFX900-NEXT:    ; def s[6:7]
6745; GFX900-NEXT:    ;;#ASMEND
6746; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
6747; GFX900-NEXT:    ;;#ASMSTART
6748; GFX900-NEXT:    ; use s8
6749; GFX900-NEXT:    ;;#ASMEND
6750; GFX900-NEXT:    s_setpc_b64 s[30:31]
6751;
6752; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_6:
6753; GFX90A:       ; %bb.0:
6754; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6755; GFX90A-NEXT:    ;;#ASMSTART
6756; GFX90A-NEXT:    ; def s[4:5]
6757; GFX90A-NEXT:    ;;#ASMEND
6758; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
6759; GFX90A-NEXT:    ;;#ASMSTART
6760; GFX90A-NEXT:    ; def s[6:7]
6761; GFX90A-NEXT:    ;;#ASMEND
6762; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
6763; GFX90A-NEXT:    ;;#ASMSTART
6764; GFX90A-NEXT:    ; use s8
6765; GFX90A-NEXT:    ;;#ASMEND
6766; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6767;
6768; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_6:
6769; GFX940:       ; %bb.0:
6770; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6771; GFX940-NEXT:    ;;#ASMSTART
6772; GFX940-NEXT:    ; def s[0:1]
6773; GFX940-NEXT:    ;;#ASMEND
6774; GFX940-NEXT:    s_lshr_b32 s0, s1, 16
6775; GFX940-NEXT:    ;;#ASMSTART
6776; GFX940-NEXT:    ; def s[2:3]
6777; GFX940-NEXT:    ;;#ASMEND
6778; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
6779; GFX940-NEXT:    ;;#ASMSTART
6780; GFX940-NEXT:    ; use s8
6781; GFX940-NEXT:    ;;#ASMEND
6782; GFX940-NEXT:    s_setpc_b64 s[30:31]
6783  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6784  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
6785  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 3, i32 6>
  ; The "{s8}" constraint fixes the register the checks look for.
6786  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6787  ret void
6788}
6789
; Scalar shuffle, mask <4, 6>: result is <vec1[0], vec1[2]>.
; Only one of the two defs appears in the expected output. The checks expect
; one s_pack_ll_b32_b16 combining the two dwords of that def, with no shift.
; The three check blocks differ only in register numbering.
6790define void @s_shuffle_v2bf16_v4bf16__4_6() {
6791; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_6:
6792; GFX900:       ; %bb.0:
6793; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6794; GFX900-NEXT:    ;;#ASMSTART
6795; GFX900-NEXT:    ; def s[4:5]
6796; GFX900-NEXT:    ;;#ASMEND
6797; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
6798; GFX900-NEXT:    ;;#ASMSTART
6799; GFX900-NEXT:    ; use s8
6800; GFX900-NEXT:    ;;#ASMEND
6801; GFX900-NEXT:    s_setpc_b64 s[30:31]
6802;
6803; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_6:
6804; GFX90A:       ; %bb.0:
6805; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6806; GFX90A-NEXT:    ;;#ASMSTART
6807; GFX90A-NEXT:    ; def s[4:5]
6808; GFX90A-NEXT:    ;;#ASMEND
6809; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
6810; GFX90A-NEXT:    ;;#ASMSTART
6811; GFX90A-NEXT:    ; use s8
6812; GFX90A-NEXT:    ;;#ASMEND
6813; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6814;
6815; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_6:
6816; GFX940:       ; %bb.0:
6817; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6818; GFX940-NEXT:    ;;#ASMSTART
6819; GFX940-NEXT:    ; def s[0:1]
6820; GFX940-NEXT:    ;;#ASMEND
6821; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
6822; GFX940-NEXT:    ;;#ASMSTART
6823; GFX940-NEXT:    ; use s8
6824; GFX940-NEXT:    ;;#ASMEND
6825; GFX940-NEXT:    s_setpc_b64 s[30:31]
6826  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6827  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
6828  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 4, i32 6>
  ; The "{s8}" constraint fixes the register the checks look for.
6829  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6830  ret void
6831}
6832
; Scalar shuffle, mask <5, 6>: result is <vec1[1], vec1[2]>.
; Only one of the two defs appears in the expected output. The checks expect
; the first dword to be shifted right by 16, then one s_pack_ll_b32_b16 of the
; shifted value with the second dword.
; The three check blocks differ only in register numbering.
6833define void @s_shuffle_v2bf16_v4bf16__5_6() {
6834; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_6:
6835; GFX900:       ; %bb.0:
6836; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6837; GFX900-NEXT:    ;;#ASMSTART
6838; GFX900-NEXT:    ; def s[4:5]
6839; GFX900-NEXT:    ;;#ASMEND
6840; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
6841; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
6842; GFX900-NEXT:    ;;#ASMSTART
6843; GFX900-NEXT:    ; use s8
6844; GFX900-NEXT:    ;;#ASMEND
6845; GFX900-NEXT:    s_setpc_b64 s[30:31]
6846;
6847; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_6:
6848; GFX90A:       ; %bb.0:
6849; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6850; GFX90A-NEXT:    ;;#ASMSTART
6851; GFX90A-NEXT:    ; def s[4:5]
6852; GFX90A-NEXT:    ;;#ASMEND
6853; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
6854; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
6855; GFX90A-NEXT:    ;;#ASMSTART
6856; GFX90A-NEXT:    ; use s8
6857; GFX90A-NEXT:    ;;#ASMEND
6858; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6859;
6860; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_6:
6861; GFX940:       ; %bb.0:
6862; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6863; GFX940-NEXT:    ;;#ASMSTART
6864; GFX940-NEXT:    ; def s[0:1]
6865; GFX940-NEXT:    ;;#ASMEND
6866; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
6867; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
6868; GFX940-NEXT:    ;;#ASMSTART
6869; GFX940-NEXT:    ; use s8
6870; GFX940-NEXT:    ;;#ASMEND
6871; GFX940-NEXT:    s_setpc_b64 s[30:31]
6872  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6873  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
6874  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 6>
  ; The "{s8}" constraint fixes the register the checks look for.
6875  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6876  ret void
6877}
6878
; Scalar shuffle, mask <6, 6>: result is <vec1[2], vec1[2]>.
; Only one of the two defs appears in the expected output. The checks expect
; one s_pack_ll_b32_b16 using the second dword of the pair for both source
; operands, with no shift.
; The three check blocks differ only in register numbering.
6879define void @s_shuffle_v2bf16_v4bf16__6_6() {
6880; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_6:
6881; GFX900:       ; %bb.0:
6882; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6883; GFX900-NEXT:    ;;#ASMSTART
6884; GFX900-NEXT:    ; def s[4:5]
6885; GFX900-NEXT:    ;;#ASMEND
6886; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
6887; GFX900-NEXT:    ;;#ASMSTART
6888; GFX900-NEXT:    ; use s8
6889; GFX900-NEXT:    ;;#ASMEND
6890; GFX900-NEXT:    s_setpc_b64 s[30:31]
6891;
6892; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_6:
6893; GFX90A:       ; %bb.0:
6894; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6895; GFX90A-NEXT:    ;;#ASMSTART
6896; GFX90A-NEXT:    ; def s[4:5]
6897; GFX90A-NEXT:    ;;#ASMEND
6898; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
6899; GFX90A-NEXT:    ;;#ASMSTART
6900; GFX90A-NEXT:    ; use s8
6901; GFX90A-NEXT:    ;;#ASMEND
6902; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6903;
6904; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_6:
6905; GFX940:       ; %bb.0:
6906; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6907; GFX940-NEXT:    ;;#ASMSTART
6908; GFX940-NEXT:    ; def s[0:1]
6909; GFX940-NEXT:    ;;#ASMEND
6910; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
6911; GFX940-NEXT:    ;;#ASMSTART
6912; GFX940-NEXT:    ; use s8
6913; GFX940-NEXT:    ;;#ASMEND
6914; GFX940-NEXT:    s_setpc_b64 s[30:31]
6915  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6916  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
6917  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 6>
  ; The "{s8}" constraint fixes the register the checks look for.
6918  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6919  ret void
6920}
6921
; Scalar shuffle, mask <poison, 7>: lane 0 is unconstrained, lane 1 is vec1[3].
; Only one of the two defs appears in the expected output. The checks expect
; a plain s_mov_b32 of the second dword of the pair into s8, with no shift or
; pack instruction.
; The three check blocks differ only in register numbering.
6922define void @s_shuffle_v2bf16_v4bf16__u_7() {
6923; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__u_7:
6924; GFX900:       ; %bb.0:
6925; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6926; GFX900-NEXT:    ;;#ASMSTART
6927; GFX900-NEXT:    ; def s[4:5]
6928; GFX900-NEXT:    ;;#ASMEND
6929; GFX900-NEXT:    s_mov_b32 s8, s5
6930; GFX900-NEXT:    ;;#ASMSTART
6931; GFX900-NEXT:    ; use s8
6932; GFX900-NEXT:    ;;#ASMEND
6933; GFX900-NEXT:    s_setpc_b64 s[30:31]
6934;
6935; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__u_7:
6936; GFX90A:       ; %bb.0:
6937; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6938; GFX90A-NEXT:    ;;#ASMSTART
6939; GFX90A-NEXT:    ; def s[4:5]
6940; GFX90A-NEXT:    ;;#ASMEND
6941; GFX90A-NEXT:    s_mov_b32 s8, s5
6942; GFX90A-NEXT:    ;;#ASMSTART
6943; GFX90A-NEXT:    ; use s8
6944; GFX90A-NEXT:    ;;#ASMEND
6945; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6946;
6947; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__u_7:
6948; GFX940:       ; %bb.0:
6949; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6950; GFX940-NEXT:    ;;#ASMSTART
6951; GFX940-NEXT:    ; def s[0:1]
6952; GFX940-NEXT:    ;;#ASMEND
6953; GFX940-NEXT:    s_mov_b32 s8, s1
6954; GFX940-NEXT:    ;;#ASMSTART
6955; GFX940-NEXT:    ; use s8
6956; GFX940-NEXT:    ;;#ASMEND
6957; GFX940-NEXT:    s_setpc_b64 s[30:31]
6958  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
6959  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1; poison leaves a lane unspecified.
6960  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 poison, i32 7>
  ; The "{s8}" constraint fixes the register the checks look for.
6961  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
6962  ret void
6963}
6964
; Scalar shuffle, mask <0, 7>: result is <vec0[0], vec1[3]>.
; Both defs appear in the expected output. The checks expect the second def's
; second dword to be shifted right by 16, then one s_pack_ll_b32_b16 of the
; first def's first dword with the shifted value.
; The three check blocks differ only in register numbering.
6965define void @s_shuffle_v2bf16_v4bf16__0_7() {
6966; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__0_7:
6967; GFX900:       ; %bb.0:
6968; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6969; GFX900-NEXT:    ;;#ASMSTART
6970; GFX900-NEXT:    ; def s[4:5]
6971; GFX900-NEXT:    ;;#ASMEND
6972; GFX900-NEXT:    ;;#ASMSTART
6973; GFX900-NEXT:    ; def s[6:7]
6974; GFX900-NEXT:    ;;#ASMEND
6975; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
6976; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
6977; GFX900-NEXT:    ;;#ASMSTART
6978; GFX900-NEXT:    ; use s8
6979; GFX900-NEXT:    ;;#ASMEND
6980; GFX900-NEXT:    s_setpc_b64 s[30:31]
6981;
6982; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__0_7:
6983; GFX90A:       ; %bb.0:
6984; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6985; GFX90A-NEXT:    ;;#ASMSTART
6986; GFX90A-NEXT:    ; def s[4:5]
6987; GFX90A-NEXT:    ;;#ASMEND
6988; GFX90A-NEXT:    ;;#ASMSTART
6989; GFX90A-NEXT:    ; def s[6:7]
6990; GFX90A-NEXT:    ;;#ASMEND
6991; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
6992; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
6993; GFX90A-NEXT:    ;;#ASMSTART
6994; GFX90A-NEXT:    ; use s8
6995; GFX90A-NEXT:    ;;#ASMEND
6996; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6997;
6998; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__0_7:
6999; GFX940:       ; %bb.0:
7000; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7001; GFX940-NEXT:    ;;#ASMSTART
7002; GFX940-NEXT:    ; def s[0:1]
7003; GFX940-NEXT:    ;;#ASMEND
7004; GFX940-NEXT:    ;;#ASMSTART
7005; GFX940-NEXT:    ; def s[2:3]
7006; GFX940-NEXT:    ;;#ASMEND
7007; GFX940-NEXT:    s_lshr_b32 s1, s3, 16
7008; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
7009; GFX940-NEXT:    ;;#ASMSTART
7010; GFX940-NEXT:    ; use s8
7011; GFX940-NEXT:    ;;#ASMEND
7012; GFX940-NEXT:    s_setpc_b64 s[30:31]
7013  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7014  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
7015  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 0, i32 7>
  ; The "{s8}" constraint fixes the register the checks look for.
7016  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
7017  ret void
7018}
7019
; Scalar shuffle, mask <1, 7>: result is <vec0[1], vec1[3]>.
; Both defs appear in the expected output. The checks expect two 16-bit right
; shifts (the second def's second dword, then the first def's first dword)
; followed by one s_pack_ll_b32_b16 of the two shifted values.
; The three check blocks differ only in register numbering.
7020define void @s_shuffle_v2bf16_v4bf16__1_7() {
7021; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__1_7:
7022; GFX900:       ; %bb.0:
7023; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7024; GFX900-NEXT:    ;;#ASMSTART
7025; GFX900-NEXT:    ; def s[4:5]
7026; GFX900-NEXT:    ;;#ASMEND
7027; GFX900-NEXT:    ;;#ASMSTART
7028; GFX900-NEXT:    ; def s[6:7]
7029; GFX900-NEXT:    ;;#ASMEND
7030; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
7031; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
7032; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
7033; GFX900-NEXT:    ;;#ASMSTART
7034; GFX900-NEXT:    ; use s8
7035; GFX900-NEXT:    ;;#ASMEND
7036; GFX900-NEXT:    s_setpc_b64 s[30:31]
7037;
7038; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__1_7:
7039; GFX90A:       ; %bb.0:
7040; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7041; GFX90A-NEXT:    ;;#ASMSTART
7042; GFX90A-NEXT:    ; def s[4:5]
7043; GFX90A-NEXT:    ;;#ASMEND
7044; GFX90A-NEXT:    ;;#ASMSTART
7045; GFX90A-NEXT:    ; def s[6:7]
7046; GFX90A-NEXT:    ;;#ASMEND
7047; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
7048; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
7049; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
7050; GFX90A-NEXT:    ;;#ASMSTART
7051; GFX90A-NEXT:    ; use s8
7052; GFX90A-NEXT:    ;;#ASMEND
7053; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7054;
7055; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__1_7:
7056; GFX940:       ; %bb.0:
7057; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7058; GFX940-NEXT:    ;;#ASMSTART
7059; GFX940-NEXT:    ; def s[0:1]
7060; GFX940-NEXT:    ;;#ASMEND
7061; GFX940-NEXT:    ;;#ASMSTART
7062; GFX940-NEXT:    ; def s[2:3]
7063; GFX940-NEXT:    ;;#ASMEND
7064; GFX940-NEXT:    s_lshr_b32 s1, s3, 16
7065; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
7066; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
7067; GFX940-NEXT:    ;;#ASMSTART
7068; GFX940-NEXT:    ; use s8
7069; GFX940-NEXT:    ;;#ASMEND
7070; GFX940-NEXT:    s_setpc_b64 s[30:31]
7071  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7072  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
7073  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 1, i32 7>
  ; The "{s8}" constraint fixes the register the checks look for.
7074  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
7075  ret void
7076}
7077
; Scalar shuffle, mask <2, 7>: result is <vec0[2], vec1[3]>.
; Both defs appear in the expected output. The checks expect the second def's
; second dword to be shifted right by 16, then one s_pack_ll_b32_b16 of the
; first def's second dword with the shifted value.
; The three check blocks differ only in register numbering.
7078define void @s_shuffle_v2bf16_v4bf16__2_7() {
7079; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__2_7:
7080; GFX900:       ; %bb.0:
7081; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7082; GFX900-NEXT:    ;;#ASMSTART
7083; GFX900-NEXT:    ; def s[4:5]
7084; GFX900-NEXT:    ;;#ASMEND
7085; GFX900-NEXT:    ;;#ASMSTART
7086; GFX900-NEXT:    ; def s[6:7]
7087; GFX900-NEXT:    ;;#ASMEND
7088; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
7089; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
7090; GFX900-NEXT:    ;;#ASMSTART
7091; GFX900-NEXT:    ; use s8
7092; GFX900-NEXT:    ;;#ASMEND
7093; GFX900-NEXT:    s_setpc_b64 s[30:31]
7094;
7095; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__2_7:
7096; GFX90A:       ; %bb.0:
7097; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7098; GFX90A-NEXT:    ;;#ASMSTART
7099; GFX90A-NEXT:    ; def s[4:5]
7100; GFX90A-NEXT:    ;;#ASMEND
7101; GFX90A-NEXT:    ;;#ASMSTART
7102; GFX90A-NEXT:    ; def s[6:7]
7103; GFX90A-NEXT:    ;;#ASMEND
7104; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
7105; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
7106; GFX90A-NEXT:    ;;#ASMSTART
7107; GFX90A-NEXT:    ; use s8
7108; GFX90A-NEXT:    ;;#ASMEND
7109; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7110;
7111; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__2_7:
7112; GFX940:       ; %bb.0:
7113; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7114; GFX940-NEXT:    ;;#ASMSTART
7115; GFX940-NEXT:    ; def s[0:1]
7116; GFX940-NEXT:    ;;#ASMEND
7117; GFX940-NEXT:    ;;#ASMSTART
7118; GFX940-NEXT:    ; def s[2:3]
7119; GFX940-NEXT:    ;;#ASMEND
7120; GFX940-NEXT:    s_lshr_b32 s0, s3, 16
7121; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
7122; GFX940-NEXT:    ;;#ASMSTART
7123; GFX940-NEXT:    ; use s8
7124; GFX940-NEXT:    ;;#ASMEND
7125; GFX940-NEXT:    s_setpc_b64 s[30:31]
7126  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7127  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
7128  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 2, i32 7>
  ; The "{s8}" constraint fixes the register the checks look for.
7129  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
7130  ret void
7131}
7132
; Scalar shuffle, mask <3, 7>: result is <vec0[3], vec1[3]>.
; Both defs appear in the expected output. The checks expect two 16-bit right
; shifts (the second dword of the second def, then of the first def) followed
; by one s_pack_ll_b32_b16 of the two shifted values.
; The three check blocks differ only in register numbering.
7133define void @s_shuffle_v2bf16_v4bf16__3_7() {
7134; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__3_7:
7135; GFX900:       ; %bb.0:
7136; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7137; GFX900-NEXT:    ;;#ASMSTART
7138; GFX900-NEXT:    ; def s[4:5]
7139; GFX900-NEXT:    ;;#ASMEND
7140; GFX900-NEXT:    ;;#ASMSTART
7141; GFX900-NEXT:    ; def s[6:7]
7142; GFX900-NEXT:    ;;#ASMEND
7143; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
7144; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
7145; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
7146; GFX900-NEXT:    ;;#ASMSTART
7147; GFX900-NEXT:    ; use s8
7148; GFX900-NEXT:    ;;#ASMEND
7149; GFX900-NEXT:    s_setpc_b64 s[30:31]
7150;
7151; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__3_7:
7152; GFX90A:       ; %bb.0:
7153; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7154; GFX90A-NEXT:    ;;#ASMSTART
7155; GFX90A-NEXT:    ; def s[4:5]
7156; GFX90A-NEXT:    ;;#ASMEND
7157; GFX90A-NEXT:    ;;#ASMSTART
7158; GFX90A-NEXT:    ; def s[6:7]
7159; GFX90A-NEXT:    ;;#ASMEND
7160; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
7161; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
7162; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
7163; GFX90A-NEXT:    ;;#ASMSTART
7164; GFX90A-NEXT:    ; use s8
7165; GFX90A-NEXT:    ;;#ASMEND
7166; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7167;
7168; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__3_7:
7169; GFX940:       ; %bb.0:
7170; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7171; GFX940-NEXT:    ;;#ASMSTART
7172; GFX940-NEXT:    ; def s[0:1]
7173; GFX940-NEXT:    ;;#ASMEND
7174; GFX940-NEXT:    ;;#ASMSTART
7175; GFX940-NEXT:    ; def s[2:3]
7176; GFX940-NEXT:    ;;#ASMEND
7177; GFX940-NEXT:    s_lshr_b32 s0, s3, 16
7178; GFX940-NEXT:    s_lshr_b32 s1, s1, 16
7179; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
7180; GFX940-NEXT:    ;;#ASMSTART
7181; GFX940-NEXT:    ; use s8
7182; GFX940-NEXT:    ;;#ASMEND
7183; GFX940-NEXT:    s_setpc_b64 s[30:31]
7184  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7185  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
7186  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 3, i32 7>
  ; The "{s8}" constraint fixes the register the checks look for.
7187  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
7188  ret void
7189}
7190
; Scalar shuffle, mask <4, 7>: result is <vec1[0], vec1[3]>.
; Only one of the two defs appears in the expected output. The checks expect
; the second dword to be shifted right by 16, then one s_pack_ll_b32_b16 of the
; first dword with the shifted value.
; The three check blocks differ only in register numbering.
7191define void @s_shuffle_v2bf16_v4bf16__4_7() {
7192; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__4_7:
7193; GFX900:       ; %bb.0:
7194; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7195; GFX900-NEXT:    ;;#ASMSTART
7196; GFX900-NEXT:    ; def s[4:5]
7197; GFX900-NEXT:    ;;#ASMEND
7198; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
7199; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
7200; GFX900-NEXT:    ;;#ASMSTART
7201; GFX900-NEXT:    ; use s8
7202; GFX900-NEXT:    ;;#ASMEND
7203; GFX900-NEXT:    s_setpc_b64 s[30:31]
7204;
7205; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__4_7:
7206; GFX90A:       ; %bb.0:
7207; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7208; GFX90A-NEXT:    ;;#ASMSTART
7209; GFX90A-NEXT:    ; def s[4:5]
7210; GFX90A-NEXT:    ;;#ASMEND
7211; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
7212; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
7213; GFX90A-NEXT:    ;;#ASMSTART
7214; GFX90A-NEXT:    ; use s8
7215; GFX90A-NEXT:    ;;#ASMEND
7216; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7217;
7218; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__4_7:
7219; GFX940:       ; %bb.0:
7220; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7221; GFX940-NEXT:    ;;#ASMSTART
7222; GFX940-NEXT:    ; def s[0:1]
7223; GFX940-NEXT:    ;;#ASMEND
7224; GFX940-NEXT:    s_lshr_b32 s1, s1, 16
7225; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
7226; GFX940-NEXT:    ;;#ASMSTART
7227; GFX940-NEXT:    ; use s8
7228; GFX940-NEXT:    ;;#ASMEND
7229; GFX940-NEXT:    s_setpc_b64 s[30:31]
7230  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7231  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
7232  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 4, i32 7>
  ; The "{s8}" constraint fixes the register the checks look for.
7233  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
7234  ret void
7235}
7236
; Scalar shuffle, mask <5, 7>: result is <vec1[1], vec1[3]>.
; Only one of the two defs appears in the expected output. The checks expect
; both dwords of that def to be shifted right by 16 (second dword first), then
; one s_pack_ll_b32_b16 of the two shifted values.
; The three check blocks differ only in register numbering.
7237define void @s_shuffle_v2bf16_v4bf16__5_7() {
7238; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__5_7:
7239; GFX900:       ; %bb.0:
7240; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7241; GFX900-NEXT:    ;;#ASMSTART
7242; GFX900-NEXT:    ; def s[4:5]
7243; GFX900-NEXT:    ;;#ASMEND
7244; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
7245; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
7246; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
7247; GFX900-NEXT:    ;;#ASMSTART
7248; GFX900-NEXT:    ; use s8
7249; GFX900-NEXT:    ;;#ASMEND
7250; GFX900-NEXT:    s_setpc_b64 s[30:31]
7251;
7252; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__5_7:
7253; GFX90A:       ; %bb.0:
7254; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7255; GFX90A-NEXT:    ;;#ASMSTART
7256; GFX90A-NEXT:    ; def s[4:5]
7257; GFX90A-NEXT:    ;;#ASMEND
7258; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
7259; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
7260; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
7261; GFX90A-NEXT:    ;;#ASMSTART
7262; GFX90A-NEXT:    ; use s8
7263; GFX90A-NEXT:    ;;#ASMEND
7264; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7265;
7266; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__5_7:
7267; GFX940:       ; %bb.0:
7268; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7269; GFX940-NEXT:    ;;#ASMSTART
7270; GFX940-NEXT:    ; def s[0:1]
7271; GFX940-NEXT:    ;;#ASMEND
7272; GFX940-NEXT:    s_lshr_b32 s1, s1, 16
7273; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
7274; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
7275; GFX940-NEXT:    ;;#ASMSTART
7276; GFX940-NEXT:    ; use s8
7277; GFX940-NEXT:    ;;#ASMEND
7278; GFX940-NEXT:    s_setpc_b64 s[30:31]
7279  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7280  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Mask indices 0-3 select from %vec0 and 4-7 from %vec1.
7281  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 5, i32 7>
  ; The "{s8}" constraint fixes the register the checks look for.
7282  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
7283  ret void
7284}
7285
; SGPR variant: shuffle two <4 x bfloat> values into a <2 x bfloat> using
; mask <6, 7>. Mask indices 4..7 address the second shufflevector operand, so
; the result is elements 2 and 3 of %vec1; %vec0 does not contribute to it.
; The expected output contains a single "; def" of a 64-bit SGPR pair
; (presumably %vec0's asm is dropped because it is not marked sideeffect and
; its result is unused -- confirm). No shift or pack is expected here: the
; checks require only one s_mov_b32 that copies the second register of the
; pair into s8, the register required by the "{s8}" constraint on the use.
; The gfx900 and gfx90a checks expect the copy from s5; gfx940 expects s1.
7286define void @s_shuffle_v2bf16_v4bf16__6_7() {
7287; GFX900-LABEL: s_shuffle_v2bf16_v4bf16__6_7:
7288; GFX900:       ; %bb.0:
7289; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7290; GFX900-NEXT:    ;;#ASMSTART
7291; GFX900-NEXT:    ; def s[4:5]
7292; GFX900-NEXT:    ;;#ASMEND
7293; GFX900-NEXT:    s_mov_b32 s8, s5
7294; GFX900-NEXT:    ;;#ASMSTART
7295; GFX900-NEXT:    ; use s8
7296; GFX900-NEXT:    ;;#ASMEND
7297; GFX900-NEXT:    s_setpc_b64 s[30:31]
7298;
7299; GFX90A-LABEL: s_shuffle_v2bf16_v4bf16__6_7:
7300; GFX90A:       ; %bb.0:
7301; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7302; GFX90A-NEXT:    ;;#ASMSTART
7303; GFX90A-NEXT:    ; def s[4:5]
7304; GFX90A-NEXT:    ;;#ASMEND
7305; GFX90A-NEXT:    s_mov_b32 s8, s5
7306; GFX90A-NEXT:    ;;#ASMSTART
7307; GFX90A-NEXT:    ; use s8
7308; GFX90A-NEXT:    ;;#ASMEND
7309; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7310;
7311; GFX940-LABEL: s_shuffle_v2bf16_v4bf16__6_7:
7312; GFX940:       ; %bb.0:
7313; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7314; GFX940-NEXT:    ;;#ASMSTART
7315; GFX940-NEXT:    ; def s[0:1]
7316; GFX940-NEXT:    ;;#ASMEND
7317; GFX940-NEXT:    s_mov_b32 s8, s1
7318; GFX940-NEXT:    ;;#ASMSTART
7319; GFX940-NEXT:    ; use s8
7320; GFX940-NEXT:    ;;#ASMEND
7321; GFX940-NEXT:    s_setpc_b64 s[30:31]
7322  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7323  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
7324  %shuf = shufflevector <4 x bfloat> %vec0, <4 x bfloat> %vec1, <2 x i32> <i32 6, i32 7>
7325  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
7326  ret void
7327}
7328;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
7329; GFX90APLUS: {{.*}}
7330