xref: /llvm-project/llvm/test/CodeGen/AMDGPU/shufflevector.v4bf16.v3bf16.ll (revision 585858aeb6247b3892218edb9d353c63f1c33186)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX940 %s
5
6
; Shuffle mask is entirely poison: no lane of the stored <4 x bfloat> is
; defined. The shared assertions below expect only the entry waitcnt and the
; return, i.e. neither the inline-asm def nor the store appears in the output.
7define void @v_shuffle_v4bf16_v3bf16__u_u_u_u(ptr addrspace(1) inreg %ptr) {
8; GFX9-LABEL: v_shuffle_v4bf16_v3bf16__u_u_u_u:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX9-NEXT:    s_setpc_b64 s[30:31]
12  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
13  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
14  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> poison
15  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
16  ret void
17}
18
; Mask <0,u,u,u>: lane 0 is element 0 of the first operand, the remaining
; lanes are poison. The assertions expect the asm-defined v[0:1] pair to be
; stored as is, with no shuffle instruction between the def and the store.
19define void @v_shuffle_v4bf16_v3bf16__0_u_u_u(ptr addrspace(1) inreg %ptr) {
20; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_u_u_u:
21; GFX900:       ; %bb.0:
22; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
23; GFX900-NEXT:    v_mov_b32_e32 v2, 0
24; GFX900-NEXT:    ;;#ASMSTART
25; GFX900-NEXT:    ; def v[0:1]
26; GFX900-NEXT:    ;;#ASMEND
27; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
28; GFX900-NEXT:    s_waitcnt vmcnt(0)
29; GFX900-NEXT:    s_setpc_b64 s[30:31]
30;
31; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_u_u_u:
32; GFX90A:       ; %bb.0:
33; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
35; GFX90A-NEXT:    ;;#ASMSTART
36; GFX90A-NEXT:    ; def v[0:1]
37; GFX90A-NEXT:    ;;#ASMEND
38; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
39; GFX90A-NEXT:    s_waitcnt vmcnt(0)
40; GFX90A-NEXT:    s_setpc_b64 s[30:31]
41;
42; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_u_u_u:
43; GFX940:       ; %bb.0:
44; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
45; GFX940-NEXT:    v_mov_b32_e32 v2, 0
46; GFX940-NEXT:    ;;#ASMSTART
47; GFX940-NEXT:    ; def v[0:1]
48; GFX940-NEXT:    ;;#ASMEND
49; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
50; GFX940-NEXT:    s_waitcnt vmcnt(0)
51; GFX940-NEXT:    s_setpc_b64 s[30:31]
52  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
53  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
54  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
55  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
56  ret void
57}
58
; Mask <1,u,u,u>: lane 0 is element 1 of the first operand, the remaining
; lanes are poison. The assertions expect one v_alignbit_b32 with a shift of
; 16 on v0 ahead of the 64-bit store.
59define void @v_shuffle_v4bf16_v3bf16__1_u_u_u(ptr addrspace(1) inreg %ptr) {
60; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_u_u_u:
61; GFX900:       ; %bb.0:
62; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
63; GFX900-NEXT:    ;;#ASMSTART
64; GFX900-NEXT:    ; def v[0:1]
65; GFX900-NEXT:    ;;#ASMEND
66; GFX900-NEXT:    v_mov_b32_e32 v2, 0
67; GFX900-NEXT:    v_alignbit_b32 v0, s4, v0, 16
68; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
69; GFX900-NEXT:    s_waitcnt vmcnt(0)
70; GFX900-NEXT:    s_setpc_b64 s[30:31]
71;
72; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_u_u_u:
73; GFX90A:       ; %bb.0:
74; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75; GFX90A-NEXT:    ;;#ASMSTART
76; GFX90A-NEXT:    ; def v[0:1]
77; GFX90A-NEXT:    ;;#ASMEND
78; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
79; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v0, 16
80; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
81; GFX90A-NEXT:    s_waitcnt vmcnt(0)
82; GFX90A-NEXT:    s_setpc_b64 s[30:31]
83;
84; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_u_u_u:
85; GFX940:       ; %bb.0:
86; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87; GFX940-NEXT:    ;;#ASMSTART
88; GFX940-NEXT:    ; def v[0:1]
89; GFX940-NEXT:    ;;#ASMEND
90; GFX940-NEXT:    v_mov_b32_e32 v2, 0
91; GFX940-NEXT:    v_alignbit_b32 v0, s0, v0, 16
92; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
93; GFX940-NEXT:    s_waitcnt vmcnt(0)
94; GFX940-NEXT:    s_setpc_b64 s[30:31]
95  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
96  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
97  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
98  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
99  ret void
100}
101
; Mask <2,u,u,u>: lane 0 is element 2 of the first operand, the remaining
; lanes are poison. The assertions expect a plain v_mov_b32 copying v1 into
; v0 ahead of the 64-bit store.
102define void @v_shuffle_v4bf16_v3bf16__2_u_u_u(ptr addrspace(1) inreg %ptr) {
103; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u:
104; GFX900:       ; %bb.0:
105; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106; GFX900-NEXT:    ;;#ASMSTART
107; GFX900-NEXT:    ; def v[0:1]
108; GFX900-NEXT:    ;;#ASMEND
109; GFX900-NEXT:    v_mov_b32_e32 v2, 0
110; GFX900-NEXT:    v_mov_b32_e32 v0, v1
111; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
112; GFX900-NEXT:    s_waitcnt vmcnt(0)
113; GFX900-NEXT:    s_setpc_b64 s[30:31]
114;
115; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u:
116; GFX90A:       ; %bb.0:
117; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
118; GFX90A-NEXT:    ;;#ASMSTART
119; GFX90A-NEXT:    ; def v[0:1]
120; GFX90A-NEXT:    ;;#ASMEND
121; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
122; GFX90A-NEXT:    v_mov_b32_e32 v0, v1
123; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
124; GFX90A-NEXT:    s_waitcnt vmcnt(0)
125; GFX90A-NEXT:    s_setpc_b64 s[30:31]
126;
127; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_u_u_u:
128; GFX940:       ; %bb.0:
129; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
130; GFX940-NEXT:    ;;#ASMSTART
131; GFX940-NEXT:    ; def v[0:1]
132; GFX940-NEXT:    ;;#ASMEND
133; GFX940-NEXT:    v_mov_b32_e32 v2, 0
134; GFX940-NEXT:    v_mov_b32_e32 v0, v1
135; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
136; GFX940-NEXT:    s_waitcnt vmcnt(0)
137; GFX940-NEXT:    s_setpc_b64 s[30:31]
138  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
139  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
140  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
141  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
142  ret void
143}
144
; Mask <3,u,u,u>: with 3-element inputs, index 3 names element 0 of the
; second operand, which is poison here, so every lane of the result is
; undefined. The shared assertions expect only the entry waitcnt and the
; return (no inline-asm def, no store).
145define void @v_shuffle_v4bf16_v3bf16__3_u_u_u(ptr addrspace(1) inreg %ptr) {
146; GFX9-LABEL: v_shuffle_v4bf16_v3bf16__3_u_u_u:
147; GFX9:       ; %bb.0:
148; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149; GFX9-NEXT:    s_setpc_b64 s[30:31]
150  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
151  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
152  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
153  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
154  ret void
155}
156
; Mask <4,u,u,u>: index 4 names element 1 of the second operand
; (%extract31, taken from %vec1); no lane reads the first operand. The
; assertions accordingly show a single inline-asm def, followed by a
; v_alignbit_b32 with a shift of 16 on v0 and the 64-bit store.
157define void @v_shuffle_v4bf16_v3bf16__4_u_u_u(ptr addrspace(1) inreg %ptr) {
158; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_u_u_u:
159; GFX900:       ; %bb.0:
160; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161; GFX900-NEXT:    ;;#ASMSTART
162; GFX900-NEXT:    ; def v[0:1]
163; GFX900-NEXT:    ;;#ASMEND
164; GFX900-NEXT:    v_mov_b32_e32 v2, 0
165; GFX900-NEXT:    v_alignbit_b32 v0, s4, v0, 16
166; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
167; GFX900-NEXT:    s_waitcnt vmcnt(0)
168; GFX900-NEXT:    s_setpc_b64 s[30:31]
169;
170; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_u_u_u:
171; GFX90A:       ; %bb.0:
172; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173; GFX90A-NEXT:    ;;#ASMSTART
174; GFX90A-NEXT:    ; def v[0:1]
175; GFX90A-NEXT:    ;;#ASMEND
176; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
177; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v0, 16
178; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
179; GFX90A-NEXT:    s_waitcnt vmcnt(0)
180; GFX90A-NEXT:    s_setpc_b64 s[30:31]
181;
182; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_u_u_u:
183; GFX940:       ; %bb.0:
184; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
185; GFX940-NEXT:    ;;#ASMSTART
186; GFX940-NEXT:    ; def v[0:1]
187; GFX940-NEXT:    ;;#ASMEND
188; GFX940-NEXT:    v_mov_b32_e32 v2, 0
189; GFX940-NEXT:    v_alignbit_b32 v0, s0, v0, 16
190; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
191; GFX940-NEXT:    s_waitcnt vmcnt(0)
192; GFX940-NEXT:    s_setpc_b64 s[30:31]
193  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
194  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
195  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
196  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
197  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 poison, i32 poison, i32 poison>
198  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
199  ret void
200}
201
; Mask <5,u,u,u>: index 5 names element 2 of the second operand
; (%extract31, taken from %vec1); no lane reads the first operand. The
; assertions show a single inline-asm def, then a v_mov_b32 copying v1 into
; v0 ahead of the 64-bit store.
202define void @v_shuffle_v4bf16_v3bf16__5_u_u_u(ptr addrspace(1) inreg %ptr) {
203; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u:
204; GFX900:       ; %bb.0:
205; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
206; GFX900-NEXT:    ;;#ASMSTART
207; GFX900-NEXT:    ; def v[0:1]
208; GFX900-NEXT:    ;;#ASMEND
209; GFX900-NEXT:    v_mov_b32_e32 v2, 0
210; GFX900-NEXT:    v_mov_b32_e32 v0, v1
211; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
212; GFX900-NEXT:    s_waitcnt vmcnt(0)
213; GFX900-NEXT:    s_setpc_b64 s[30:31]
214;
215; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u:
216; GFX90A:       ; %bb.0:
217; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218; GFX90A-NEXT:    ;;#ASMSTART
219; GFX90A-NEXT:    ; def v[0:1]
220; GFX90A-NEXT:    ;;#ASMEND
221; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
222; GFX90A-NEXT:    v_mov_b32_e32 v0, v1
223; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
224; GFX90A-NEXT:    s_waitcnt vmcnt(0)
225; GFX90A-NEXT:    s_setpc_b64 s[30:31]
226;
227; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_u_u:
228; GFX940:       ; %bb.0:
229; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
230; GFX940-NEXT:    ;;#ASMSTART
231; GFX940-NEXT:    ; def v[0:1]
232; GFX940-NEXT:    ;;#ASMEND
233; GFX940-NEXT:    v_mov_b32_e32 v2, 0
234; GFX940-NEXT:    v_mov_b32_e32 v0, v1
235; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
236; GFX940-NEXT:    s_waitcnt vmcnt(0)
237; GFX940-NEXT:    s_setpc_b64 s[30:31]
238  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
239  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
240  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
241  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
242  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 poison, i32 poison>
243  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
244  ret void
245}
246
; Mask <5,0,u,u>: lane 0 is element 2 of the second operand, lane 1 is
; element 0 of the first operand. Both inline-asm defs appear in the
; assertions, and the two halves are combined with one v_perm_b32 using the
; selector constant 0x5040100.
247define void @v_shuffle_v4bf16_v3bf16__5_0_u_u(ptr addrspace(1) inreg %ptr) {
248; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_u_u:
249; GFX900:       ; %bb.0:
250; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
251; GFX900-NEXT:    ;;#ASMSTART
252; GFX900-NEXT:    ; def v[0:1]
253; GFX900-NEXT:    ;;#ASMEND
254; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
255; GFX900-NEXT:    v_mov_b32_e32 v3, 0
256; GFX900-NEXT:    ;;#ASMSTART
257; GFX900-NEXT:    ; def v[1:2]
258; GFX900-NEXT:    ;;#ASMEND
259; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
260; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
261; GFX900-NEXT:    s_waitcnt vmcnt(0)
262; GFX900-NEXT:    s_setpc_b64 s[30:31]
263;
264; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_u_u:
265; GFX90A:       ; %bb.0:
266; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
267; GFX90A-NEXT:    ;;#ASMSTART
268; GFX90A-NEXT:    ; def v[0:1]
269; GFX90A-NEXT:    ;;#ASMEND
270; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
271; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
272; GFX90A-NEXT:    ;;#ASMSTART
273; GFX90A-NEXT:    ; def v[2:3]
274; GFX90A-NEXT:    ;;#ASMEND
275; GFX90A-NEXT:    v_perm_b32 v0, v0, v3, s4
276; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
277; GFX90A-NEXT:    s_waitcnt vmcnt(0)
278; GFX90A-NEXT:    s_setpc_b64 s[30:31]
279;
280; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_u_u:
281; GFX940:       ; %bb.0:
282; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
283; GFX940-NEXT:    ;;#ASMSTART
284; GFX940-NEXT:    ; def v[0:1]
285; GFX940-NEXT:    ;;#ASMEND
286; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
287; GFX940-NEXT:    v_mov_b32_e32 v4, 0
288; GFX940-NEXT:    ;;#ASMSTART
289; GFX940-NEXT:    ; def v[2:3]
290; GFX940-NEXT:    ;;#ASMEND
291; GFX940-NEXT:    s_nop 0
292; GFX940-NEXT:    v_perm_b32 v0, v0, v3, s2
293; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
294; GFX940-NEXT:    s_waitcnt vmcnt(0)
295; GFX940-NEXT:    s_setpc_b64 s[30:31]
296  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
297  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
298  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
299  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
300  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 poison, i32 poison>
301  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
302  ret void
303}
304
; Mask <5,1,u,u>: lane 0 is element 2 of the second operand, lane 1 is
; element 1 of the first operand. Both inline-asm defs appear in the
; assertions, and the two halves are merged with one v_bfi_b32 under the
; 0xffff mask.
305define void @v_shuffle_v4bf16_v3bf16__5_1_u_u(ptr addrspace(1) inreg %ptr) {
306; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_u_u:
307; GFX900:       ; %bb.0:
308; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
309; GFX900-NEXT:    ;;#ASMSTART
310; GFX900-NEXT:    ; def v[0:1]
311; GFX900-NEXT:    ;;#ASMEND
312; GFX900-NEXT:    s_mov_b32 s4, 0xffff
313; GFX900-NEXT:    v_mov_b32_e32 v3, 0
314; GFX900-NEXT:    ;;#ASMSTART
315; GFX900-NEXT:    ; def v[1:2]
316; GFX900-NEXT:    ;;#ASMEND
317; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v0
318; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
319; GFX900-NEXT:    s_waitcnt vmcnt(0)
320; GFX900-NEXT:    s_setpc_b64 s[30:31]
321;
322; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_u_u:
323; GFX90A:       ; %bb.0:
324; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325; GFX90A-NEXT:    ;;#ASMSTART
326; GFX90A-NEXT:    ; def v[0:1]
327; GFX90A-NEXT:    ;;#ASMEND
328; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
329; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
330; GFX90A-NEXT:    ;;#ASMSTART
331; GFX90A-NEXT:    ; def v[2:3]
332; GFX90A-NEXT:    ;;#ASMEND
333; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v0
334; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
335; GFX90A-NEXT:    s_waitcnt vmcnt(0)
336; GFX90A-NEXT:    s_setpc_b64 s[30:31]
337;
338; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_u_u:
339; GFX940:       ; %bb.0:
340; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
341; GFX940-NEXT:    ;;#ASMSTART
342; GFX940-NEXT:    ; def v[0:1]
343; GFX940-NEXT:    ;;#ASMEND
344; GFX940-NEXT:    s_mov_b32 s2, 0xffff
345; GFX940-NEXT:    v_mov_b32_e32 v4, 0
346; GFX940-NEXT:    ;;#ASMSTART
347; GFX940-NEXT:    ; def v[2:3]
348; GFX940-NEXT:    ;;#ASMEND
349; GFX940-NEXT:    s_nop 0
350; GFX940-NEXT:    v_bfi_b32 v0, s2, v3, v0
351; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
352; GFX940-NEXT:    s_waitcnt vmcnt(0)
353; GFX940-NEXT:    s_setpc_b64 s[30:31]
354  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
355  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
356  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
357  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
358  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 poison, i32 poison>
359  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
360  ret void
361}
362
; Mask <5,2,u,u>: lane 0 is element 2 of the second operand, lane 1 is
; element 2 of the first operand. Both inline-asm defs appear in the
; assertions, and the high registers of the two pairs (v1 and v3) are
; combined with one v_perm_b32 using the selector constant 0x5040100.
363define void @v_shuffle_v4bf16_v3bf16__5_2_u_u(ptr addrspace(1) inreg %ptr) {
364; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_u_u:
365; GFX900:       ; %bb.0:
366; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
367; GFX900-NEXT:    ;;#ASMSTART
368; GFX900-NEXT:    ; def v[0:1]
369; GFX900-NEXT:    ;;#ASMEND
370; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
371; GFX900-NEXT:    v_mov_b32_e32 v4, 0
372; GFX900-NEXT:    ;;#ASMSTART
373; GFX900-NEXT:    ; def v[2:3]
374; GFX900-NEXT:    ;;#ASMEND
375; GFX900-NEXT:    v_perm_b32 v0, v1, v3, s4
376; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
377; GFX900-NEXT:    s_waitcnt vmcnt(0)
378; GFX900-NEXT:    s_setpc_b64 s[30:31]
379;
380; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_u_u:
381; GFX90A:       ; %bb.0:
382; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383; GFX90A-NEXT:    ;;#ASMSTART
384; GFX90A-NEXT:    ; def v[0:1]
385; GFX90A-NEXT:    ;;#ASMEND
386; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
387; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
388; GFX90A-NEXT:    ;;#ASMSTART
389; GFX90A-NEXT:    ; def v[2:3]
390; GFX90A-NEXT:    ;;#ASMEND
391; GFX90A-NEXT:    v_perm_b32 v0, v1, v3, s4
392; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
393; GFX90A-NEXT:    s_waitcnt vmcnt(0)
394; GFX90A-NEXT:    s_setpc_b64 s[30:31]
395;
396; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_u_u:
397; GFX940:       ; %bb.0:
398; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
399; GFX940-NEXT:    ;;#ASMSTART
400; GFX940-NEXT:    ; def v[0:1]
401; GFX940-NEXT:    ;;#ASMEND
402; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
403; GFX940-NEXT:    v_mov_b32_e32 v4, 0
404; GFX940-NEXT:    ;;#ASMSTART
405; GFX940-NEXT:    ; def v[2:3]
406; GFX940-NEXT:    ;;#ASMEND
407; GFX940-NEXT:    s_nop 0
408; GFX940-NEXT:    v_perm_b32 v0, v1, v3, s2
409; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
410; GFX940-NEXT:    s_waitcnt vmcnt(0)
411; GFX940-NEXT:    s_setpc_b64 s[30:31]
412  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
413  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
414  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
415  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
416  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 poison, i32 poison>
417  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
418  ret void
419}
420
; Mask <5,3,u,u>: both defined lanes come from the second operand
; (elements 2 and 0 of %extract31); the first operand is never read. The
; assertions show a single inline-asm def and one v_perm_b32 combining v0
; and v1 of that pair with the selector constant 0x5040100.
421define void @v_shuffle_v4bf16_v3bf16__5_3_u_u(ptr addrspace(1) inreg %ptr) {
422; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_u_u:
423; GFX900:       ; %bb.0:
424; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
425; GFX900-NEXT:    ;;#ASMSTART
426; GFX900-NEXT:    ; def v[0:1]
427; GFX900-NEXT:    ;;#ASMEND
428; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
429; GFX900-NEXT:    v_mov_b32_e32 v2, 0
430; GFX900-NEXT:    v_perm_b32 v0, v0, v1, s4
431; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
432; GFX900-NEXT:    s_waitcnt vmcnt(0)
433; GFX900-NEXT:    s_setpc_b64 s[30:31]
434;
435; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_u_u:
436; GFX90A:       ; %bb.0:
437; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
438; GFX90A-NEXT:    ;;#ASMSTART
439; GFX90A-NEXT:    ; def v[0:1]
440; GFX90A-NEXT:    ;;#ASMEND
441; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
442; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
443; GFX90A-NEXT:    v_perm_b32 v0, v0, v1, s4
444; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
445; GFX90A-NEXT:    s_waitcnt vmcnt(0)
446; GFX90A-NEXT:    s_setpc_b64 s[30:31]
447;
448; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_u_u:
449; GFX940:       ; %bb.0:
450; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
451; GFX940-NEXT:    ;;#ASMSTART
452; GFX940-NEXT:    ; def v[0:1]
453; GFX940-NEXT:    ;;#ASMEND
454; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
455; GFX940-NEXT:    v_mov_b32_e32 v2, 0
456; GFX940-NEXT:    v_perm_b32 v0, v0, v1, s2
457; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
458; GFX940-NEXT:    s_waitcnt vmcnt(0)
459; GFX940-NEXT:    s_setpc_b64 s[30:31]
460  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
461  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
462  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
463  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
464  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 poison, i32 poison>
465  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
466  ret void
467}
468
; Mask <5,4,u,u>: both defined lanes come from the second operand
; (elements 2 and 1 of %extract31); the first operand is never read. The
; assertions show a single inline-asm def and one v_bfi_b32 merging v1 and
; v0 of that pair under the 0xffff mask.
469define void @v_shuffle_v4bf16_v3bf16__5_4_u_u(ptr addrspace(1) inreg %ptr) {
470; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_u_u:
471; GFX900:       ; %bb.0:
472; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
473; GFX900-NEXT:    ;;#ASMSTART
474; GFX900-NEXT:    ; def v[0:1]
475; GFX900-NEXT:    ;;#ASMEND
476; GFX900-NEXT:    s_mov_b32 s4, 0xffff
477; GFX900-NEXT:    v_mov_b32_e32 v2, 0
478; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v0
479; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
480; GFX900-NEXT:    s_waitcnt vmcnt(0)
481; GFX900-NEXT:    s_setpc_b64 s[30:31]
482;
483; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_u_u:
484; GFX90A:       ; %bb.0:
485; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
486; GFX90A-NEXT:    ;;#ASMSTART
487; GFX90A-NEXT:    ; def v[0:1]
488; GFX90A-NEXT:    ;;#ASMEND
489; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
490; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
491; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v0
492; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
493; GFX90A-NEXT:    s_waitcnt vmcnt(0)
494; GFX90A-NEXT:    s_setpc_b64 s[30:31]
495;
496; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_u_u:
497; GFX940:       ; %bb.0:
498; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
499; GFX940-NEXT:    ;;#ASMSTART
500; GFX940-NEXT:    ; def v[0:1]
501; GFX940-NEXT:    ;;#ASMEND
502; GFX940-NEXT:    s_mov_b32 s2, 0xffff
503; GFX940-NEXT:    v_mov_b32_e32 v2, 0
504; GFX940-NEXT:    v_bfi_b32 v0, s2, v1, v0
505; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
506; GFX940-NEXT:    s_waitcnt vmcnt(0)
507; GFX940-NEXT:    s_setpc_b64 s[30:31]
508  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
509  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
510  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
511  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
512  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 poison, i32 poison>
513  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
514  ret void
515}
516
; Mask <5,5,u,u>: lanes 0 and 1 both take element 2 of the second operand;
; the first operand is never read. The assertions show a single inline-asm
; def and one v_perm_b32 with v1 as both sources (selector 0x5040100).
517define void @v_shuffle_v4bf16_v3bf16__5_5_u_u(ptr addrspace(1) inreg %ptr) {
518; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_u:
519; GFX900:       ; %bb.0:
520; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
521; GFX900-NEXT:    ;;#ASMSTART
522; GFX900-NEXT:    ; def v[0:1]
523; GFX900-NEXT:    ;;#ASMEND
524; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
525; GFX900-NEXT:    v_mov_b32_e32 v2, 0
526; GFX900-NEXT:    v_perm_b32 v0, v1, v1, s4
527; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
528; GFX900-NEXT:    s_waitcnt vmcnt(0)
529; GFX900-NEXT:    s_setpc_b64 s[30:31]
530;
531; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_u:
532; GFX90A:       ; %bb.0:
533; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534; GFX90A-NEXT:    ;;#ASMSTART
535; GFX90A-NEXT:    ; def v[0:1]
536; GFX90A-NEXT:    ;;#ASMEND
537; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
538; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
539; GFX90A-NEXT:    v_perm_b32 v0, v1, v1, s4
540; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
541; GFX90A-NEXT:    s_waitcnt vmcnt(0)
542; GFX90A-NEXT:    s_setpc_b64 s[30:31]
543;
544; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_u:
545; GFX940:       ; %bb.0:
546; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
547; GFX940-NEXT:    ;;#ASMSTART
548; GFX940-NEXT:    ; def v[0:1]
549; GFX940-NEXT:    ;;#ASMEND
550; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
551; GFX940-NEXT:    v_mov_b32_e32 v2, 0
552; GFX940-NEXT:    v_perm_b32 v0, v1, v1, s2
553; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
554; GFX940-NEXT:    s_waitcnt vmcnt(0)
555; GFX940-NEXT:    s_setpc_b64 s[30:31]
556  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
557  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
558  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
559  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
560  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 poison>
561  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
562  ret void
563}
564
; Mask <5,5,0,u>: lanes 0 and 1 both take element 2 of the second operand,
; lane 2 takes element 0 of the first operand, lane 3 is poison. Both
; inline-asm defs appear in the assertions; the low dword is built with a
; v_perm_b32 (selector 0x5040100) and the high dword with a v_mov_b32 from
; v0.
565define void @v_shuffle_v4bf16_v3bf16__5_5_0_u(ptr addrspace(1) inreg %ptr) {
566; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_u:
567; GFX900:       ; %bb.0:
568; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
569; GFX900-NEXT:    ;;#ASMSTART
570; GFX900-NEXT:    ; def v[0:1]
571; GFX900-NEXT:    ;;#ASMEND
572; GFX900-NEXT:    ;;#ASMSTART
573; GFX900-NEXT:    ; def v[1:2]
574; GFX900-NEXT:    ;;#ASMEND
575; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
576; GFX900-NEXT:    v_mov_b32_e32 v3, 0
577; GFX900-NEXT:    v_perm_b32 v1, v2, v2, s4
578; GFX900-NEXT:    v_mov_b32_e32 v2, v0
579; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
580; GFX900-NEXT:    s_waitcnt vmcnt(0)
581; GFX900-NEXT:    s_setpc_b64 s[30:31]
582;
583; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_u:
584; GFX90A:       ; %bb.0:
585; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
586; GFX90A-NEXT:    ;;#ASMSTART
587; GFX90A-NEXT:    ; def v[2:3]
588; GFX90A-NEXT:    ;;#ASMEND
589; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
590; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
591; GFX90A-NEXT:    ;;#ASMSTART
592; GFX90A-NEXT:    ; def v[0:1]
593; GFX90A-NEXT:    ;;#ASMEND
594; GFX90A-NEXT:    v_perm_b32 v2, v3, v3, s4
595; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
596; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
597; GFX90A-NEXT:    s_waitcnt vmcnt(0)
598; GFX90A-NEXT:    s_setpc_b64 s[30:31]
599;
600; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_u:
601; GFX940:       ; %bb.0:
602; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
603; GFX940-NEXT:    ;;#ASMSTART
604; GFX940-NEXT:    ; def v[2:3]
605; GFX940-NEXT:    ;;#ASMEND
606; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
607; GFX940-NEXT:    v_mov_b32_e32 v4, 0
608; GFX940-NEXT:    ;;#ASMSTART
609; GFX940-NEXT:    ; def v[0:1]
610; GFX940-NEXT:    ;;#ASMEND
611; GFX940-NEXT:    v_perm_b32 v2, v3, v3, s2
612; GFX940-NEXT:    v_mov_b32_e32 v3, v0
613; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
614; GFX940-NEXT:    s_waitcnt vmcnt(0)
615; GFX940-NEXT:    s_setpc_b64 s[30:31]
616  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
617  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
618  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
619  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
620  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 poison>
621  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
622  ret void
623}
624
; Mask <5,5,1,u>: lanes 0 and 1 both take element 2 of the second operand,
; lane 2 takes element 1 of the first operand, lane 3 is poison. Both
; inline-asm defs appear in the assertions; the low dword is built with a
; v_perm_b32 (selector 0x5040100) and the high dword with a v_alignbit_b32
; shifting v0 by 16.
624define void @v_shuffle_v4bf16_v3bf16__5_5_1_u(ptr addrspace(1) inreg %ptr) {
625; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_u:
626; GFX900:       ; %bb.0:
627; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628; GFX900-NEXT:    ;;#ASMSTART
629; GFX900-NEXT:    ; def v[0:1]
630; GFX900-NEXT:    ;;#ASMEND
631; GFX900-NEXT:    ;;#ASMSTART
632; GFX900-NEXT:    ; def v[1:2]
633; GFX900-NEXT:    ;;#ASMEND
634; GFX900-NEXT:    v_alignbit_b32 v1, s4, v0, 16
635; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
636; GFX900-NEXT:    v_mov_b32_e32 v3, 0
637; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
638; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
639; GFX900-NEXT:    s_waitcnt vmcnt(0)
640; GFX900-NEXT:    s_setpc_b64 s[30:31]
641;
642; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_u:
643; GFX90A:       ; %bb.0:
644; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
645; GFX90A-NEXT:    ;;#ASMSTART
646; GFX90A-NEXT:    ; def v[0:1]
647; GFX90A-NEXT:    ;;#ASMEND
648; GFX90A-NEXT:    v_alignbit_b32 v1, s4, v0, 16
649; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
650; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
651; GFX90A-NEXT:    ;;#ASMSTART
652; GFX90A-NEXT:    ; def v[2:3]
653; GFX90A-NEXT:    ;;#ASMEND
654; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
655; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
656; GFX90A-NEXT:    s_waitcnt vmcnt(0)
657; GFX90A-NEXT:    s_setpc_b64 s[30:31]
658;
659; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_u:
660; GFX940:       ; %bb.0:
661; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
662; GFX940-NEXT:    ;;#ASMSTART
663; GFX940-NEXT:    ; def v[0:1]
664; GFX940-NEXT:    ;;#ASMEND
665; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
666; GFX940-NEXT:    v_mov_b32_e32 v4, 0
667; GFX940-NEXT:    ;;#ASMSTART
668; GFX940-NEXT:    ; def v[2:3]
669; GFX940-NEXT:    ;;#ASMEND
670; GFX940-NEXT:    v_alignbit_b32 v1, s0, v0, 16
671; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
672; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
673; GFX940-NEXT:    s_waitcnt vmcnt(0)
674; GFX940-NEXT:    s_setpc_b64 s[30:31]
675  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
676  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
677  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
678  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
679  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 poison>
680  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
681  ret void
682}
684
; Mask <5,5,2,u>: lanes 0 and 1 both take element 2 of the second operand,
; lane 2 takes element 2 of the first operand, lane 3 is poison. Both
; inline-asm defs appear in the assertions; only v0 is rewritten (one
; v_perm_b32 with v3 as both sources) and v1 from the first def is stored
; unchanged as the high dword.
685define void @v_shuffle_v4bf16_v3bf16__5_5_2_u(ptr addrspace(1) inreg %ptr) {
686; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_u:
687; GFX900:       ; %bb.0:
688; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
689; GFX900-NEXT:    ;;#ASMSTART
690; GFX900-NEXT:    ; def v[0:1]
691; GFX900-NEXT:    ;;#ASMEND
692; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
693; GFX900-NEXT:    v_mov_b32_e32 v4, 0
694; GFX900-NEXT:    ;;#ASMSTART
695; GFX900-NEXT:    ; def v[2:3]
696; GFX900-NEXT:    ;;#ASMEND
697; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
698; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
699; GFX900-NEXT:    s_waitcnt vmcnt(0)
700; GFX900-NEXT:    s_setpc_b64 s[30:31]
701;
702; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_u:
703; GFX90A:       ; %bb.0:
704; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
705; GFX90A-NEXT:    ;;#ASMSTART
706; GFX90A-NEXT:    ; def v[0:1]
707; GFX90A-NEXT:    ;;#ASMEND
708; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
709; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
710; GFX90A-NEXT:    ;;#ASMSTART
711; GFX90A-NEXT:    ; def v[2:3]
712; GFX90A-NEXT:    ;;#ASMEND
713; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
714; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
715; GFX90A-NEXT:    s_waitcnt vmcnt(0)
716; GFX90A-NEXT:    s_setpc_b64 s[30:31]
717;
718; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_u:
719; GFX940:       ; %bb.0:
720; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
721; GFX940-NEXT:    ;;#ASMSTART
722; GFX940-NEXT:    ; def v[0:1]
723; GFX940-NEXT:    ;;#ASMEND
724; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
725; GFX940-NEXT:    v_mov_b32_e32 v4, 0
726; GFX940-NEXT:    ;;#ASMSTART
727; GFX940-NEXT:    ; def v[2:3]
728; GFX940-NEXT:    ;;#ASMEND
729; GFX940-NEXT:    s_nop 0
730; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
731; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
732; GFX940-NEXT:    s_waitcnt vmcnt(0)
733; GFX940-NEXT:    s_setpc_b64 s[30:31]
734  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
735  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
736  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
737  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
738  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 poison>
739  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
740  ret void
741}
742
; Two-input shuffle with mask <5, 5, 3, poison>. Mask indices 3-5 select lanes
; of the second operand (%extract31), so no lane of %extract3 is referenced;
; the expected output for every target contains a single asm def. The
; <4 x bfloat> result is stored to %ptr.
743define void @v_shuffle_v4bf16_v3bf16__5_5_3_u(ptr addrspace(1) inreg %ptr) {
744; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_u:
745; GFX900:       ; %bb.0:
746; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
747; GFX900-NEXT:    ;;#ASMSTART
748; GFX900-NEXT:    ; def v[0:1]
749; GFX900-NEXT:    ;;#ASMEND
750; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
751; GFX900-NEXT:    v_mov_b32_e32 v3, 0
752; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
753; GFX900-NEXT:    v_mov_b32_e32 v2, v0
754; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
755; GFX900-NEXT:    s_waitcnt vmcnt(0)
756; GFX900-NEXT:    s_setpc_b64 s[30:31]
757;
758; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_u:
759; GFX90A:       ; %bb.0:
760; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
761; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
762; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
763; GFX90A-NEXT:    ;;#ASMSTART
764; GFX90A-NEXT:    ; def v[0:1]
765; GFX90A-NEXT:    ;;#ASMEND
766; GFX90A-NEXT:    v_perm_b32 v2, v1, v1, s4
767; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
768; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
769; GFX90A-NEXT:    s_waitcnt vmcnt(0)
770; GFX90A-NEXT:    s_setpc_b64 s[30:31]
771;
772; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_u:
773; GFX940:       ; %bb.0:
774; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
775; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
776; GFX940-NEXT:    v_mov_b32_e32 v4, 0
777; GFX940-NEXT:    ;;#ASMSTART
778; GFX940-NEXT:    ; def v[0:1]
779; GFX940-NEXT:    ;;#ASMEND
780; GFX940-NEXT:    s_nop 0
781; GFX940-NEXT:    v_perm_b32 v2, v1, v1, s2
782; GFX940-NEXT:    v_mov_b32_e32 v3, v0
783; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
784; GFX940-NEXT:    s_waitcnt vmcnt(0)
785; GFX940-NEXT:    s_setpc_b64 s[30:31]
786  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
787  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
788  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
789  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
790  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 poison>
791  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
792  ret void
793}
794
; Two-input shuffle with mask <5, 5, 4, poison>. Mask indices 3-5 select lanes
; of the second operand (%extract31), so no lane of %extract3 is referenced;
; the expected output for every target contains a single asm def. The
; <4 x bfloat> result is stored to %ptr.
795define void @v_shuffle_v4bf16_v3bf16__5_5_4_u(ptr addrspace(1) inreg %ptr) {
796; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_u:
797; GFX900:       ; %bb.0:
798; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
799; GFX900-NEXT:    ;;#ASMSTART
800; GFX900-NEXT:    ; def v[0:1]
801; GFX900-NEXT:    ;;#ASMEND
802; GFX900-NEXT:    v_alignbit_b32 v2, s4, v0, 16
803; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
804; GFX900-NEXT:    v_mov_b32_e32 v3, 0
805; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
806; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
807; GFX900-NEXT:    s_waitcnt vmcnt(0)
808; GFX900-NEXT:    s_setpc_b64 s[30:31]
809;
810; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_u:
811; GFX90A:       ; %bb.0:
812; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
813; GFX90A-NEXT:    ;;#ASMSTART
814; GFX90A-NEXT:    ; def v[0:1]
815; GFX90A-NEXT:    ;;#ASMEND
816; GFX90A-NEXT:    v_alignbit_b32 v3, s4, v0, 16
817; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
818; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
819; GFX90A-NEXT:    v_perm_b32 v2, v1, v1, s4
820; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
821; GFX90A-NEXT:    s_waitcnt vmcnt(0)
822; GFX90A-NEXT:    s_setpc_b64 s[30:31]
823;
824; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_u:
825; GFX940:       ; %bb.0:
826; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
827; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
828; GFX940-NEXT:    v_mov_b32_e32 v4, 0
829; GFX940-NEXT:    ;;#ASMSTART
830; GFX940-NEXT:    ; def v[0:1]
831; GFX940-NEXT:    ;;#ASMEND
832; GFX940-NEXT:    s_nop 0
833; GFX940-NEXT:    v_alignbit_b32 v3, s0, v0, 16
834; GFX940-NEXT:    v_perm_b32 v2, v1, v1, s2
835; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
836; GFX940-NEXT:    s_waitcnt vmcnt(0)
837; GFX940-NEXT:    s_setpc_b64 s[30:31]
838  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
839  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
840  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
841  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
842  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 poison>
843  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
844  ret void
845}
846
; Two-input shuffle with mask <5, 5, 5, poison>: the first three result lanes
; all take lane 2 of the second operand (%extract31); the last lane is poison.
; No lane of %extract3 is referenced, and the expected output for every target
; contains a single asm def. The <4 x bfloat> result is stored to %ptr.
847define void @v_shuffle_v4bf16_v3bf16__5_5_5_u(ptr addrspace(1) inreg %ptr) {
848; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_u:
849; GFX900:       ; %bb.0:
850; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
851; GFX900-NEXT:    ;;#ASMSTART
852; GFX900-NEXT:    ; def v[0:1]
853; GFX900-NEXT:    ;;#ASMEND
854; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
855; GFX900-NEXT:    v_mov_b32_e32 v2, 0
856; GFX900-NEXT:    v_perm_b32 v0, v1, v1, s4
857; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
858; GFX900-NEXT:    s_waitcnt vmcnt(0)
859; GFX900-NEXT:    s_setpc_b64 s[30:31]
860;
861; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_u:
862; GFX90A:       ; %bb.0:
863; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
864; GFX90A-NEXT:    ;;#ASMSTART
865; GFX90A-NEXT:    ; def v[0:1]
866; GFX90A-NEXT:    ;;#ASMEND
867; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
868; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
869; GFX90A-NEXT:    v_perm_b32 v0, v1, v1, s4
870; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
871; GFX90A-NEXT:    s_waitcnt vmcnt(0)
872; GFX90A-NEXT:    s_setpc_b64 s[30:31]
873;
874; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_u:
875; GFX940:       ; %bb.0:
876; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
877; GFX940-NEXT:    ;;#ASMSTART
878; GFX940-NEXT:    ; def v[0:1]
879; GFX940-NEXT:    ;;#ASMEND
880; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
881; GFX940-NEXT:    v_mov_b32_e32 v2, 0
882; GFX940-NEXT:    v_perm_b32 v0, v1, v1, s2
883; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
884; GFX940-NEXT:    s_waitcnt vmcnt(0)
885; GFX940-NEXT:    s_setpc_b64 s[30:31]
886  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
887  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
888  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
889  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
890  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 poison>
891  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
892  ret void
893}
894
; Two-input shuffle with mask <5, 5, 5, 0>: the first three result lanes take
; lane 2 of the second operand (%extract31) and the last takes lane 0 of the
; first operand (%extract3). Both inputs are the low three lanes of a
; <4 x bfloat> produced by inline asm. The <4 x bfloat> result is stored to
; %ptr.
895define void @v_shuffle_v4bf16_v3bf16__5_5_5_0(ptr addrspace(1) inreg %ptr) {
896; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_0:
897; GFX900:       ; %bb.0:
898; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
899; GFX900-NEXT:    ;;#ASMSTART
900; GFX900-NEXT:    ; def v[0:1]
901; GFX900-NEXT:    ;;#ASMEND
902; GFX900-NEXT:    ;;#ASMSTART
903; GFX900-NEXT:    ; def v[1:2]
904; GFX900-NEXT:    ;;#ASMEND
905; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
906; GFX900-NEXT:    v_mov_b32_e32 v3, 0
907; GFX900-NEXT:    v_perm_b32 v1, v0, v2, s4
908; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
909; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
910; GFX900-NEXT:    s_waitcnt vmcnt(0)
911; GFX900-NEXT:    s_setpc_b64 s[30:31]
912;
913; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_0:
914; GFX90A:       ; %bb.0:
915; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
916; GFX90A-NEXT:    ;;#ASMSTART
917; GFX90A-NEXT:    ; def v[0:1]
918; GFX90A-NEXT:    ;;#ASMEND
919; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
920; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
921; GFX90A-NEXT:    ;;#ASMSTART
922; GFX90A-NEXT:    ; def v[2:3]
923; GFX90A-NEXT:    ;;#ASMEND
924; GFX90A-NEXT:    v_perm_b32 v1, v0, v3, s4
925; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
926; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
927; GFX90A-NEXT:    s_waitcnt vmcnt(0)
928; GFX90A-NEXT:    s_setpc_b64 s[30:31]
929;
930; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_0:
931; GFX940:       ; %bb.0:
932; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
933; GFX940-NEXT:    ;;#ASMSTART
934; GFX940-NEXT:    ; def v[0:1]
935; GFX940-NEXT:    ;;#ASMEND
936; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
937; GFX940-NEXT:    v_mov_b32_e32 v4, 0
938; GFX940-NEXT:    ;;#ASMSTART
939; GFX940-NEXT:    ; def v[2:3]
940; GFX940-NEXT:    ;;#ASMEND
941; GFX940-NEXT:    s_nop 0
942; GFX940-NEXT:    v_perm_b32 v1, v0, v3, s2
943; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
944; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
945; GFX940-NEXT:    s_waitcnt vmcnt(0)
946; GFX940-NEXT:    s_setpc_b64 s[30:31]
947  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
948  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
949  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
950  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
951  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 0>
952  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
953  ret void
954}
955
; Two-input shuffle with mask <5, 5, 5, 1>: the first three result lanes take
; lane 2 of the second operand (%extract31) and the last takes lane 1 of the
; first operand (%extract3). Both inputs are the low three lanes of a
; <4 x bfloat> produced by inline asm. The <4 x bfloat> result is stored to
; %ptr.
956define void @v_shuffle_v4bf16_v3bf16__5_5_5_1(ptr addrspace(1) inreg %ptr) {
957; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_1:
958; GFX900:       ; %bb.0:
959; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
960; GFX900-NEXT:    ;;#ASMSTART
961; GFX900-NEXT:    ; def v[0:1]
962; GFX900-NEXT:    ;;#ASMEND
963; GFX900-NEXT:    ;;#ASMSTART
964; GFX900-NEXT:    ; def v[1:2]
965; GFX900-NEXT:    ;;#ASMEND
966; GFX900-NEXT:    s_mov_b32 s4, 0xffff
967; GFX900-NEXT:    v_bfi_b32 v1, s4, v2, v0
968; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
969; GFX900-NEXT:    v_mov_b32_e32 v3, 0
970; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
971; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
972; GFX900-NEXT:    s_waitcnt vmcnt(0)
973; GFX900-NEXT:    s_setpc_b64 s[30:31]
974;
975; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_1:
976; GFX90A:       ; %bb.0:
977; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
978; GFX90A-NEXT:    ;;#ASMSTART
979; GFX90A-NEXT:    ; def v[0:1]
980; GFX90A-NEXT:    ;;#ASMEND
981; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
982; GFX90A-NEXT:    ;;#ASMSTART
983; GFX90A-NEXT:    ; def v[2:3]
984; GFX90A-NEXT:    ;;#ASMEND
985; GFX90A-NEXT:    v_bfi_b32 v1, s4, v3, v0
986; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
987; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
988; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
989; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
990; GFX90A-NEXT:    s_waitcnt vmcnt(0)
991; GFX90A-NEXT:    s_setpc_b64 s[30:31]
992;
993; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_1:
994; GFX940:       ; %bb.0:
995; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
996; GFX940-NEXT:    ;;#ASMSTART
997; GFX940-NEXT:    ; def v[0:1]
998; GFX940-NEXT:    ;;#ASMEND
999; GFX940-NEXT:    s_mov_b32 s2, 0xffff
1000; GFX940-NEXT:    ;;#ASMSTART
1001; GFX940-NEXT:    ; def v[2:3]
1002; GFX940-NEXT:    ;;#ASMEND
1003; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1004; GFX940-NEXT:    v_bfi_b32 v1, s2, v3, v0
1005; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1006; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
1007; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
1008; GFX940-NEXT:    s_waitcnt vmcnt(0)
1009; GFX940-NEXT:    s_setpc_b64 s[30:31]
1010  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1011  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
1012  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1013  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1014  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 1>
1015  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1016  ret void
1017}
1018
; Two-input shuffle with mask <5, 5, 5, 2>: the first three result lanes take
; lane 2 of the second operand (%extract31) and the last takes lane 2 of the
; first operand (%extract3). Both inputs are the low three lanes of a
; <4 x bfloat> produced by inline asm. The <4 x bfloat> result is stored to
; %ptr.
1019define void @v_shuffle_v4bf16_v3bf16__5_5_5_2(ptr addrspace(1) inreg %ptr) {
1020; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_2:
1021; GFX900:       ; %bb.0:
1022; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1023; GFX900-NEXT:    ;;#ASMSTART
1024; GFX900-NEXT:    ; def v[0:1]
1025; GFX900-NEXT:    ;;#ASMEND
1026; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1027; GFX900-NEXT:    v_mov_b32_e32 v4, 0
1028; GFX900-NEXT:    ;;#ASMSTART
1029; GFX900-NEXT:    ; def v[2:3]
1030; GFX900-NEXT:    ;;#ASMEND
1031; GFX900-NEXT:    v_perm_b32 v1, v1, v3, s4
1032; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
1033; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
1034; GFX900-NEXT:    s_waitcnt vmcnt(0)
1035; GFX900-NEXT:    s_setpc_b64 s[30:31]
1036;
1037; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_2:
1038; GFX90A:       ; %bb.0:
1039; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1040; GFX90A-NEXT:    ;;#ASMSTART
1041; GFX90A-NEXT:    ; def v[0:1]
1042; GFX90A-NEXT:    ;;#ASMEND
1043; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1044; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1045; GFX90A-NEXT:    ;;#ASMSTART
1046; GFX90A-NEXT:    ; def v[2:3]
1047; GFX90A-NEXT:    ;;#ASMEND
1048; GFX90A-NEXT:    v_perm_b32 v1, v1, v3, s4
1049; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
1050; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
1051; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1052; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1053;
1054; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_2:
1055; GFX940:       ; %bb.0:
1056; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1057; GFX940-NEXT:    ;;#ASMSTART
1058; GFX940-NEXT:    ; def v[0:1]
1059; GFX940-NEXT:    ;;#ASMEND
1060; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1061; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1062; GFX940-NEXT:    ;;#ASMSTART
1063; GFX940-NEXT:    ; def v[2:3]
1064; GFX940-NEXT:    ;;#ASMEND
1065; GFX940-NEXT:    s_nop 0
1066; GFX940-NEXT:    v_perm_b32 v1, v1, v3, s2
1067; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
1068; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
1069; GFX940-NEXT:    s_waitcnt vmcnt(0)
1070; GFX940-NEXT:    s_setpc_b64 s[30:31]
1071  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1072  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
1073  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1074  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1075  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 2>
1076  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1077  ret void
1078}
1079
; Two-input shuffle with mask <5, 5, 5, 3>: the first three result lanes take
; lane 2 of the second operand (%extract31) and the last takes its lane 0.
; No lane of %extract3 is referenced, and the expected output for every target
; contains a single asm def. The <4 x bfloat> result is stored to %ptr.
1080define void @v_shuffle_v4bf16_v3bf16__5_5_5_3(ptr addrspace(1) inreg %ptr) {
1081; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_3:
1082; GFX900:       ; %bb.0:
1083; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1084; GFX900-NEXT:    ;;#ASMSTART
1085; GFX900-NEXT:    ; def v[0:1]
1086; GFX900-NEXT:    ;;#ASMEND
1087; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1088; GFX900-NEXT:    v_mov_b32_e32 v3, 0
1089; GFX900-NEXT:    v_perm_b32 v2, v0, v1, s4
1090; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
1091; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
1092; GFX900-NEXT:    s_waitcnt vmcnt(0)
1093; GFX900-NEXT:    s_setpc_b64 s[30:31]
1094;
1095; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_3:
1096; GFX90A:       ; %bb.0:
1097; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1098; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1099; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1100; GFX90A-NEXT:    ;;#ASMSTART
1101; GFX90A-NEXT:    ; def v[0:1]
1102; GFX90A-NEXT:    ;;#ASMEND
1103; GFX90A-NEXT:    v_perm_b32 v3, v0, v1, s4
1104; GFX90A-NEXT:    v_perm_b32 v2, v1, v1, s4
1105; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
1106; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1107; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1108;
1109; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_3:
1110; GFX940:       ; %bb.0:
1111; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1112; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1113; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1114; GFX940-NEXT:    ;;#ASMSTART
1115; GFX940-NEXT:    ; def v[0:1]
1116; GFX940-NEXT:    ;;#ASMEND
1117; GFX940-NEXT:    s_nop 0
1118; GFX940-NEXT:    v_perm_b32 v3, v0, v1, s2
1119; GFX940-NEXT:    v_perm_b32 v2, v1, v1, s2
1120; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
1121; GFX940-NEXT:    s_waitcnt vmcnt(0)
1122; GFX940-NEXT:    s_setpc_b64 s[30:31]
1123  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1124  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
1125  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1126  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1127  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 3>
1128  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1129  ret void
1130}
1131
; Two-input shuffle with mask <5, 5, 5, 4>: the first three result lanes take
; lane 2 of the second operand (%extract31) and the last takes its lane 1.
; No lane of %extract3 is referenced, and the expected output for every target
; contains a single asm def. The <4 x bfloat> result is stored to %ptr.
1132define void @v_shuffle_v4bf16_v3bf16__5_5_5_4(ptr addrspace(1) inreg %ptr) {
1133; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_4:
1134; GFX900:       ; %bb.0:
1135; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1136; GFX900-NEXT:    s_mov_b32 s4, 0xffff
1137; GFX900-NEXT:    ;;#ASMSTART
1138; GFX900-NEXT:    ; def v[0:1]
1139; GFX900-NEXT:    ;;#ASMEND
1140; GFX900-NEXT:    v_bfi_b32 v2, s4, v1, v0
1141; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1142; GFX900-NEXT:    v_mov_b32_e32 v3, 0
1143; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
1144; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
1145; GFX900-NEXT:    s_waitcnt vmcnt(0)
1146; GFX900-NEXT:    s_setpc_b64 s[30:31]
1147;
1148; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_4:
1149; GFX90A:       ; %bb.0:
1150; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1151; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
1152; GFX90A-NEXT:    ;;#ASMSTART
1153; GFX90A-NEXT:    ; def v[0:1]
1154; GFX90A-NEXT:    ;;#ASMEND
1155; GFX90A-NEXT:    v_bfi_b32 v3, s4, v1, v0
1156; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1157; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1158; GFX90A-NEXT:    v_perm_b32 v2, v1, v1, s4
1159; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
1160; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1161; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1162;
1163; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_4:
1164; GFX940:       ; %bb.0:
1165; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1166; GFX940-NEXT:    s_mov_b32 s2, 0xffff
1167; GFX940-NEXT:    ;;#ASMSTART
1168; GFX940-NEXT:    ; def v[0:1]
1169; GFX940-NEXT:    ;;#ASMEND
1170; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1171; GFX940-NEXT:    v_bfi_b32 v3, s2, v1, v0
1172; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1173; GFX940-NEXT:    v_perm_b32 v2, v1, v1, s2
1174; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
1175; GFX940-NEXT:    s_waitcnt vmcnt(0)
1176; GFX940-NEXT:    s_setpc_b64 s[30:31]
1177  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1178  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
1179  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1180  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1181  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 4>
1182  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1183  ret void
1184}
1185
; Two-input shuffle with mask <5, 5, 5, 5>: every result lane takes lane 2 of
; the second operand (%extract31), i.e. a splat. No lane of %extract3 is
; referenced, and the expected output for every target contains a single asm
; def. The <4 x bfloat> result is stored to %ptr.
1186define void @v_shuffle_v4bf16_v3bf16__5_5_5_5(ptr addrspace(1) inreg %ptr) {
1187; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_5:
1188; GFX900:       ; %bb.0:
1189; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1190; GFX900-NEXT:    ;;#ASMSTART
1191; GFX900-NEXT:    ; def v[0:1]
1192; GFX900-NEXT:    ;;#ASMEND
1193; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1194; GFX900-NEXT:    v_perm_b32 v0, v1, v1, s4
1195; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1196; GFX900-NEXT:    v_mov_b32_e32 v1, v0
1197; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
1198; GFX900-NEXT:    s_waitcnt vmcnt(0)
1199; GFX900-NEXT:    s_setpc_b64 s[30:31]
1200;
1201; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_5:
1202; GFX90A:       ; %bb.0:
1203; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1204; GFX90A-NEXT:    ;;#ASMSTART
1205; GFX90A-NEXT:    ; def v[0:1]
1206; GFX90A-NEXT:    ;;#ASMEND
1207; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1208; GFX90A-NEXT:    v_perm_b32 v0, v1, v1, s4
1209; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1210; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
1211; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
1212; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1213; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1214;
1215; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_5_5:
1216; GFX940:       ; %bb.0:
1217; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1218; GFX940-NEXT:    ;;#ASMSTART
1219; GFX940-NEXT:    ; def v[0:1]
1220; GFX940-NEXT:    ;;#ASMEND
1221; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1222; GFX940-NEXT:    v_perm_b32 v0, v1, v1, s2
1223; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1224; GFX940-NEXT:    v_mov_b32_e32 v1, v0
1225; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
1226; GFX940-NEXT:    s_waitcnt vmcnt(0)
1227; GFX940-NEXT:    s_setpc_b64 s[30:31]
1228  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1229  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
1230  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1231  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1232  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
1233  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1234  ret void
1235}
1236
; Single-input shuffle (second operand is poison) with mask
; <poison, 0, 0, 0>: result lanes 1-3 take lane 0 of %extract3, which is the
; low three lanes of a <4 x bfloat> produced by inline asm. The <4 x bfloat>
; result is stored to %ptr.
1237define void @v_shuffle_v4bf16_v3bf16__u_0_0_0(ptr addrspace(1) inreg %ptr) {
1238; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__u_0_0_0:
1239; GFX900:       ; %bb.0:
1240; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1241; GFX900-NEXT:    ;;#ASMSTART
1242; GFX900-NEXT:    ; def v[0:1]
1243; GFX900-NEXT:    ;;#ASMEND
1244; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1245; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1246; GFX900-NEXT:    v_perm_b32 v1, v0, v0, s4
1247; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1248; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
1249; GFX900-NEXT:    s_waitcnt vmcnt(0)
1250; GFX900-NEXT:    s_setpc_b64 s[30:31]
1251;
1252; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__u_0_0_0:
1253; GFX90A:       ; %bb.0:
1254; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1255; GFX90A-NEXT:    ;;#ASMSTART
1256; GFX90A-NEXT:    ; def v[0:1]
1257; GFX90A-NEXT:    ;;#ASMEND
1258; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1259; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1260; GFX90A-NEXT:    v_perm_b32 v1, v0, v0, s4
1261; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1262; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
1263; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1264; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1265;
1266; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__u_0_0_0:
1267; GFX940:       ; %bb.0:
1268; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1269; GFX940-NEXT:    ;;#ASMSTART
1270; GFX940-NEXT:    ; def v[0:1]
1271; GFX940-NEXT:    ;;#ASMEND
1272; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1273; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1274; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s2
1275; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1276; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
1277; GFX940-NEXT:    s_waitcnt vmcnt(0)
1278; GFX940-NEXT:    s_setpc_b64 s[30:31]
1279  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1280  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1281  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>
1282  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1283  ret void
1284}
1285
; Single-input shuffle (second operand is poison) with an all-zero mask
; (written as zeroinitializer): every result lane takes lane 0 of %extract3,
; which is the low three lanes of a <4 x bfloat> produced by inline asm. The
; <4 x bfloat> result is stored to %ptr.
1286define void @v_shuffle_v4bf16_v3bf16__0_0_0_0(ptr addrspace(1) inreg %ptr) {
1287; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_0_0_0:
1288; GFX900:       ; %bb.0:
1289; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1290; GFX900-NEXT:    ;;#ASMSTART
1291; GFX900-NEXT:    ; def v[0:1]
1292; GFX900-NEXT:    ;;#ASMEND
1293; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1294; GFX900-NEXT:    v_perm_b32 v0, v0, v0, s4
1295; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1296; GFX900-NEXT:    v_mov_b32_e32 v1, v0
1297; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
1298; GFX900-NEXT:    s_waitcnt vmcnt(0)
1299; GFX900-NEXT:    s_setpc_b64 s[30:31]
1300;
1301; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_0_0_0:
1302; GFX90A:       ; %bb.0:
1303; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1304; GFX90A-NEXT:    ;;#ASMSTART
1305; GFX90A-NEXT:    ; def v[0:1]
1306; GFX90A-NEXT:    ;;#ASMEND
1307; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1308; GFX90A-NEXT:    v_perm_b32 v0, v0, v0, s4
1309; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1310; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
1311; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
1312; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1313; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1314;
1315; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_0_0_0:
1316; GFX940:       ; %bb.0:
1317; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1318; GFX940-NEXT:    ;;#ASMSTART
1319; GFX940-NEXT:    ; def v[0:1]
1320; GFX940-NEXT:    ;;#ASMEND
1321; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1322; GFX940-NEXT:    v_perm_b32 v0, v0, v0, s2
1323; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1324; GFX940-NEXT:    v_mov_b32_e32 v1, v0
1325; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
1326; GFX940-NEXT:    s_waitcnt vmcnt(0)
1327; GFX940-NEXT:    s_setpc_b64 s[30:31]
1328  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1329  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1330  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> zeroinitializer
1331  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1332  ret void
1333}
1334
; Single-input shuffle (second operand is poison) with mask <1, 0, 0, 0>:
; result lane 0 takes lane 1 of %extract3 and lanes 1-3 take its lane 0.
; %extract3 is the low three lanes of a <4 x bfloat> produced by inline asm.
; The <4 x bfloat> result is stored to %ptr.
1335define void @v_shuffle_v4bf16_v3bf16__1_0_0_0(ptr addrspace(1) inreg %ptr) {
1336; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_0_0_0:
1337; GFX900:       ; %bb.0:
1338; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1339; GFX900-NEXT:    ;;#ASMSTART
1340; GFX900-NEXT:    ; def v[0:1]
1341; GFX900-NEXT:    ;;#ASMEND
1342; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1343; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1344; GFX900-NEXT:    v_perm_b32 v1, v0, v0, s4
1345; GFX900-NEXT:    v_alignbit_b32 v0, v0, v0, 16
1346; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
1347; GFX900-NEXT:    s_waitcnt vmcnt(0)
1348; GFX900-NEXT:    s_setpc_b64 s[30:31]
1349;
1350; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_0_0_0:
1351; GFX90A:       ; %bb.0:
1352; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1353; GFX90A-NEXT:    ;;#ASMSTART
1354; GFX90A-NEXT:    ; def v[0:1]
1355; GFX90A-NEXT:    ;;#ASMEND
1356; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1357; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1358; GFX90A-NEXT:    v_perm_b32 v1, v0, v0, s4
1359; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v0, 16
1360; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
1361; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1362; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1363;
1364; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_0_0_0:
1365; GFX940:       ; %bb.0:
1366; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1367; GFX940-NEXT:    ;;#ASMSTART
1368; GFX940-NEXT:    ; def v[0:1]
1369; GFX940-NEXT:    ;;#ASMEND
1370; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1371; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1372; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s2
1373; GFX940-NEXT:    v_alignbit_b32 v0, v0, v0, 16
1374; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
1375; GFX940-NEXT:    s_waitcnt vmcnt(0)
1376; GFX940-NEXT:    s_setpc_b64 s[30:31]
1377  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1378  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1379  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
1380  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1381  ret void
1382}
1383
; Single-input shuffle (second operand is poison) with mask <2, 0, 0, 0>:
; result lane 0 takes lane 2 of %extract3 and lanes 1-3 take its lane 0.
; %extract3 is the low three lanes of a <4 x bfloat> produced by inline asm.
; The <4 x bfloat> result is stored to %ptr.
1384define void @v_shuffle_v4bf16_v3bf16__2_0_0_0(ptr addrspace(1) inreg %ptr) {
1385; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_0_0_0:
1386; GFX900:       ; %bb.0:
1387; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1388; GFX900-NEXT:    ;;#ASMSTART
1389; GFX900-NEXT:    ; def v[0:1]
1390; GFX900-NEXT:    ;;#ASMEND
1391; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1392; GFX900-NEXT:    v_mov_b32_e32 v3, 0
1393; GFX900-NEXT:    v_perm_b32 v1, v0, v1, s4
1394; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
1395; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
1396; GFX900-NEXT:    s_waitcnt vmcnt(0)
1397; GFX900-NEXT:    s_setpc_b64 s[30:31]
1398;
1399; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_0_0_0:
1400; GFX90A:       ; %bb.0:
1401; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1402; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1403; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1404; GFX90A-NEXT:    ;;#ASMSTART
1405; GFX90A-NEXT:    ; def v[0:1]
1406; GFX90A-NEXT:    ;;#ASMEND
1407; GFX90A-NEXT:    v_perm_b32 v2, v0, v1, s4
1408; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
1409; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
1410; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1411; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1412;
1413; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_0_0_0:
1414; GFX940:       ; %bb.0:
1415; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1416; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1417; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1418; GFX940-NEXT:    ;;#ASMSTART
1419; GFX940-NEXT:    ; def v[0:1]
1420; GFX940-NEXT:    ;;#ASMEND
1421; GFX940-NEXT:    s_nop 0
1422; GFX940-NEXT:    v_perm_b32 v2, v0, v1, s2
1423; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
1424; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
1425; GFX940-NEXT:    s_waitcnt vmcnt(0)
1426; GFX940-NEXT:    s_setpc_b64 s[30:31]
1427  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1428  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1429  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
1430  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1431  ret void
1432}
1433
; Shuffle mask <3, 0, 0, 0> with a poison second operand. Mask index 3 is past
; the three elements of %extract3, so lane 0 selects from the poison operand;
; lanes 1-3 all take element 0 of %extract3. The <4 x bfloat> result is stored
; through %ptr. %vec0 comes from inline asm with a "=v" output constraint.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1434define void @v_shuffle_v4bf16_v3bf16__3_0_0_0(ptr addrspace(1) inreg %ptr) {
1435; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__3_0_0_0:
1436; GFX900:       ; %bb.0:
1437; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1438; GFX900-NEXT:    ;;#ASMSTART
1439; GFX900-NEXT:    ; def v[0:1]
1440; GFX900-NEXT:    ;;#ASMEND
1441; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1442; GFX900-NEXT:    v_mov_b32_e32 v2, 0
1443; GFX900-NEXT:    v_perm_b32 v1, v0, v0, s4
1444; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1445; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
1446; GFX900-NEXT:    s_waitcnt vmcnt(0)
1447; GFX900-NEXT:    s_setpc_b64 s[30:31]
1448;
1449; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__3_0_0_0:
1450; GFX90A:       ; %bb.0:
1451; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1452; GFX90A-NEXT:    ;;#ASMSTART
1453; GFX90A-NEXT:    ; def v[0:1]
1454; GFX90A-NEXT:    ;;#ASMEND
1455; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1456; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1457; GFX90A-NEXT:    v_perm_b32 v1, v0, v0, s4
1458; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1459; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
1460; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1461; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1462;
1463; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__3_0_0_0:
1464; GFX940:       ; %bb.0:
1465; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1466; GFX940-NEXT:    ;;#ASMSTART
1467; GFX940-NEXT:    ; def v[0:1]
1468; GFX940-NEXT:    ;;#ASMEND
1469; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1470; GFX940-NEXT:    v_mov_b32_e32 v2, 0
1471; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s2
1472; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1473; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
1474; GFX940-NEXT:    s_waitcnt vmcnt(0)
1475; GFX940-NEXT:    s_setpc_b64 s[30:31]
1476  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1477  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1478  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
1479  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1480  ret void
1481}
1482
; Two-input shuffle with mask <4, 0, 0, 0>. Mask indices 0-2 select from
; %extract3 and 3-5 from %extract31, so lane 0 takes element 1 of %extract31
; and lanes 1-3 all take element 0 of %extract3. The <4 x bfloat> result is
; stored through %ptr. Both sources come from inline asm with "=v" outputs.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1483define void @v_shuffle_v4bf16_v3bf16__4_0_0_0(ptr addrspace(1) inreg %ptr) {
1484; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_0_0_0:
1485; GFX900:       ; %bb.0:
1486; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1487; GFX900-NEXT:    ;;#ASMSTART
1488; GFX900-NEXT:    ; def v[0:1]
1489; GFX900-NEXT:    ;;#ASMEND
1490; GFX900-NEXT:    ;;#ASMSTART
1491; GFX900-NEXT:    ; def v[1:2]
1492; GFX900-NEXT:    ;;#ASMEND
1493; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1494; GFX900-NEXT:    v_mov_b32_e32 v3, 0
1495; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
1496; GFX900-NEXT:    v_alignbit_b32 v1, v0, v1, 16
1497; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
1498; GFX900-NEXT:    s_waitcnt vmcnt(0)
1499; GFX900-NEXT:    s_setpc_b64 s[30:31]
1500;
1501; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_0_0_0:
1502; GFX90A:       ; %bb.0:
1503; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1504; GFX90A-NEXT:    ;;#ASMSTART
1505; GFX90A-NEXT:    ; def v[0:1]
1506; GFX90A-NEXT:    ;;#ASMEND
1507; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1508; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1509; GFX90A-NEXT:    ;;#ASMSTART
1510; GFX90A-NEXT:    ; def v[2:3]
1511; GFX90A-NEXT:    ;;#ASMEND
1512; GFX90A-NEXT:    v_perm_b32 v1, v0, v0, s4
1513; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v2, 16
1514; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
1515; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1516; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1517;
1518; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_0_0_0:
1519; GFX940:       ; %bb.0:
1520; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1521; GFX940-NEXT:    ;;#ASMSTART
1522; GFX940-NEXT:    ; def v[0:1]
1523; GFX940-NEXT:    ;;#ASMEND
1524; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1525; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1526; GFX940-NEXT:    ;;#ASMSTART
1527; GFX940-NEXT:    ; def v[2:3]
1528; GFX940-NEXT:    ;;#ASMEND
1529; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s2
1530; GFX940-NEXT:    v_alignbit_b32 v0, v0, v2, 16
1531; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
1532; GFX940-NEXT:    s_waitcnt vmcnt(0)
1533; GFX940-NEXT:    s_setpc_b64 s[30:31]
1534  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1535  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
1536  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1537  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1538  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 0, i32 0, i32 0>
1539  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1540  ret void
1541}
1542
; Two-input shuffle with mask <5, 0, 0, 0>. Mask indices 0-2 select from
; %extract3 and 3-5 from %extract31, so lane 0 takes element 2 of %extract31
; and lanes 1-3 all take element 0 of %extract3. The <4 x bfloat> result is
; stored through %ptr. Both sources come from inline asm with "=v" outputs.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1543define void @v_shuffle_v4bf16_v3bf16__5_0_0_0(ptr addrspace(1) inreg %ptr) {
1544; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_0_0:
1545; GFX900:       ; %bb.0:
1546; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1547; GFX900-NEXT:    ;;#ASMSTART
1548; GFX900-NEXT:    ; def v[0:1]
1549; GFX900-NEXT:    ;;#ASMEND
1550; GFX900-NEXT:    ;;#ASMSTART
1551; GFX900-NEXT:    ; def v[1:2]
1552; GFX900-NEXT:    ;;#ASMEND
1553; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1554; GFX900-NEXT:    v_mov_b32_e32 v3, 0
1555; GFX900-NEXT:    v_perm_b32 v1, v0, v2, s4
1556; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
1557; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
1558; GFX900-NEXT:    s_waitcnt vmcnt(0)
1559; GFX900-NEXT:    s_setpc_b64 s[30:31]
1560;
1561; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_0_0:
1562; GFX90A:       ; %bb.0:
1563; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1564; GFX90A-NEXT:    ;;#ASMSTART
1565; GFX90A-NEXT:    ; def v[2:3]
1566; GFX90A-NEXT:    ;;#ASMEND
1567; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1568; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1569; GFX90A-NEXT:    ;;#ASMSTART
1570; GFX90A-NEXT:    ; def v[0:1]
1571; GFX90A-NEXT:    ;;#ASMEND
1572; GFX90A-NEXT:    v_perm_b32 v2, v0, v3, s4
1573; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
1574; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
1575; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1576; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1577;
1578; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_0_0:
1579; GFX940:       ; %bb.0:
1580; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1581; GFX940-NEXT:    ;;#ASMSTART
1582; GFX940-NEXT:    ; def v[2:3]
1583; GFX940-NEXT:    ;;#ASMEND
1584; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1585; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1586; GFX940-NEXT:    ;;#ASMSTART
1587; GFX940-NEXT:    ; def v[0:1]
1588; GFX940-NEXT:    ;;#ASMEND
1589; GFX940-NEXT:    s_nop 0
1590; GFX940-NEXT:    v_perm_b32 v2, v0, v3, s2
1591; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
1592; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
1593; GFX940-NEXT:    s_waitcnt vmcnt(0)
1594; GFX940-NEXT:    s_setpc_b64 s[30:31]
1595  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1596  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
1597  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1598  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1599  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 0, i32 0>
1600  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1601  ret void
1602}
1603
; Two-input shuffle with mask <5, poison, 0, 0>. Mask indices 0-2 select from
; %extract3 and 3-5 from %extract31, so lane 0 takes element 2 of %extract31,
; lane 1 is a poison (don't-care) mask entry, and lanes 2-3 take element 0 of
; %extract3. The <4 x bfloat> result is stored through %ptr. Both sources come
; from inline asm with "=v" outputs.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1604define void @v_shuffle_v4bf16_v3bf16__5_u_0_0(ptr addrspace(1) inreg %ptr) {
1605; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_0_0:
1606; GFX900:       ; %bb.0:
1607; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1608; GFX900-NEXT:    ;;#ASMSTART
1609; GFX900-NEXT:    ; def v[0:1]
1610; GFX900-NEXT:    ;;#ASMEND
1611; GFX900-NEXT:    ;;#ASMSTART
1612; GFX900-NEXT:    ; def v[1:2]
1613; GFX900-NEXT:    ;;#ASMEND
1614; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1615; GFX900-NEXT:    v_mov_b32_e32 v3, 0
1616; GFX900-NEXT:    v_perm_b32 v1, v0, v0, s4
1617; GFX900-NEXT:    v_mov_b32_e32 v0, v2
1618; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
1619; GFX900-NEXT:    s_waitcnt vmcnt(0)
1620; GFX900-NEXT:    s_setpc_b64 s[30:31]
1621;
1622; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_0_0:
1623; GFX90A:       ; %bb.0:
1624; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1625; GFX90A-NEXT:    ;;#ASMSTART
1626; GFX90A-NEXT:    ; def v[0:1]
1627; GFX90A-NEXT:    ;;#ASMEND
1628; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1629; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1630; GFX90A-NEXT:    ;;#ASMSTART
1631; GFX90A-NEXT:    ; def v[2:3]
1632; GFX90A-NEXT:    ;;#ASMEND
1633; GFX90A-NEXT:    v_perm_b32 v1, v0, v0, s4
1634; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
1635; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
1636; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1637; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1638;
1639; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_0_0:
1640; GFX940:       ; %bb.0:
1641; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1642; GFX940-NEXT:    ;;#ASMSTART
1643; GFX940-NEXT:    ; def v[0:1]
1644; GFX940-NEXT:    ;;#ASMEND
1645; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1646; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1647; GFX940-NEXT:    ;;#ASMSTART
1648; GFX940-NEXT:    ; def v[2:3]
1649; GFX940-NEXT:    ;;#ASMEND
1650; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s2
1651; GFX940-NEXT:    v_mov_b32_e32 v0, v3
1652; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
1653; GFX940-NEXT:    s_waitcnt vmcnt(0)
1654; GFX940-NEXT:    s_setpc_b64 s[30:31]
1655  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1656  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
1657  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1658  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1659  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 0, i32 0>
1660  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1661  ret void
1662}
1663
; Two-input shuffle with mask <5, 1, 0, 0>. Mask indices 0-2 select from
; %extract3 and 3-5 from %extract31, so lane 0 takes element 2 of %extract31,
; lane 1 takes element 1 of %extract3, and lanes 2-3 take element 0 of
; %extract3. The <4 x bfloat> result is stored through %ptr. Both sources come
; from inline asm with "=v" outputs.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1664define void @v_shuffle_v4bf16_v3bf16__5_1_0_0(ptr addrspace(1) inreg %ptr) {
1665; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_0_0:
1666; GFX900:       ; %bb.0:
1667; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1668; GFX900-NEXT:    ;;#ASMSTART
1669; GFX900-NEXT:    ; def v[0:1]
1670; GFX900-NEXT:    ;;#ASMEND
1671; GFX900-NEXT:    ;;#ASMSTART
1672; GFX900-NEXT:    ; def v[1:2]
1673; GFX900-NEXT:    ;;#ASMEND
1674; GFX900-NEXT:    s_mov_b32 s4, 0xffff
1675; GFX900-NEXT:    v_bfi_b32 v1, s4, v2, v0
1676; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1677; GFX900-NEXT:    v_mov_b32_e32 v3, 0
1678; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
1679; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
1680; GFX900-NEXT:    s_waitcnt vmcnt(0)
1681; GFX900-NEXT:    s_setpc_b64 s[30:31]
1682;
1683; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_0_0:
1684; GFX90A:       ; %bb.0:
1685; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1686; GFX90A-NEXT:    ;;#ASMSTART
1687; GFX90A-NEXT:    ; def v[2:3]
1688; GFX90A-NEXT:    ;;#ASMEND
1689; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
1690; GFX90A-NEXT:    ;;#ASMSTART
1691; GFX90A-NEXT:    ; def v[0:1]
1692; GFX90A-NEXT:    ;;#ASMEND
1693; GFX90A-NEXT:    v_bfi_b32 v2, s4, v3, v0
1694; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1695; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1696; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
1697; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
1698; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1699; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1700;
1701; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_0_0:
1702; GFX940:       ; %bb.0:
1703; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1704; GFX940-NEXT:    ;;#ASMSTART
1705; GFX940-NEXT:    ; def v[2:3]
1706; GFX940-NEXT:    ;;#ASMEND
1707; GFX940-NEXT:    s_mov_b32 s2, 0xffff
1708; GFX940-NEXT:    ;;#ASMSTART
1709; GFX940-NEXT:    ; def v[0:1]
1710; GFX940-NEXT:    ;;#ASMEND
1711; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1712; GFX940-NEXT:    v_bfi_b32 v2, s2, v3, v0
1713; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1714; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
1715; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
1716; GFX940-NEXT:    s_waitcnt vmcnt(0)
1717; GFX940-NEXT:    s_setpc_b64 s[30:31]
1718  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1719  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
1720  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1721  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1722  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 0, i32 0>
1723  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1724  ret void
1725}
1726
; Two-input shuffle with mask <5, 2, 0, 0>. Mask indices 0-2 select from
; %extract3 and 3-5 from %extract31, so lane 0 takes element 2 of %extract31,
; lane 1 takes element 2 of %extract3, and lanes 2-3 take element 0 of
; %extract3. The <4 x bfloat> result is stored through %ptr. Both sources come
; from inline asm with "=v" outputs.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1727define void @v_shuffle_v4bf16_v3bf16__5_2_0_0(ptr addrspace(1) inreg %ptr) {
1728; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_0_0:
1729; GFX900:       ; %bb.0:
1730; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1731; GFX900-NEXT:    ;;#ASMSTART
1732; GFX900-NEXT:    ; def v[0:1]
1733; GFX900-NEXT:    ;;#ASMEND
1734; GFX900-NEXT:    ;;#ASMSTART
1735; GFX900-NEXT:    ; def v[2:3]
1736; GFX900-NEXT:    ;;#ASMEND
1737; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1738; GFX900-NEXT:    v_mov_b32_e32 v4, 0
1739; GFX900-NEXT:    v_perm_b32 v1, v1, v3, s4
1740; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
1741; GFX900-NEXT:    global_store_dwordx2 v4, v[1:2], s[16:17]
1742; GFX900-NEXT:    s_waitcnt vmcnt(0)
1743; GFX900-NEXT:    s_setpc_b64 s[30:31]
1744;
1745; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_0_0:
1746; GFX90A:       ; %bb.0:
1747; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1748; GFX90A-NEXT:    ;;#ASMSTART
1749; GFX90A-NEXT:    ; def v[2:3]
1750; GFX90A-NEXT:    ;;#ASMEND
1751; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1752; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1753; GFX90A-NEXT:    ;;#ASMSTART
1754; GFX90A-NEXT:    ; def v[0:1]
1755; GFX90A-NEXT:    ;;#ASMEND
1756; GFX90A-NEXT:    v_perm_b32 v2, v1, v3, s4
1757; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
1758; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
1759; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1760; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1761;
1762; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_0_0:
1763; GFX940:       ; %bb.0:
1764; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1765; GFX940-NEXT:    ;;#ASMSTART
1766; GFX940-NEXT:    ; def v[2:3]
1767; GFX940-NEXT:    ;;#ASMEND
1768; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1769; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1770; GFX940-NEXT:    ;;#ASMSTART
1771; GFX940-NEXT:    ; def v[0:1]
1772; GFX940-NEXT:    ;;#ASMEND
1773; GFX940-NEXT:    s_nop 0
1774; GFX940-NEXT:    v_perm_b32 v2, v1, v3, s2
1775; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
1776; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
1777; GFX940-NEXT:    s_waitcnt vmcnt(0)
1778; GFX940-NEXT:    s_setpc_b64 s[30:31]
1779  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1780  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
1781  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1782  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1783  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 0, i32 0>
1784  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1785  ret void
1786}
1787
; Two-input shuffle with mask <5, 3, 0, 0>. Mask indices 0-2 select from
; %extract3 and 3-5 from %extract31, so lane 0 takes element 2 of %extract31,
; lane 1 takes element 0 of %extract31, and lanes 2-3 take element 0 of
; %extract3. The <4 x bfloat> result is stored through %ptr. Both sources come
; from inline asm with "=v" outputs.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1788define void @v_shuffle_v4bf16_v3bf16__5_3_0_0(ptr addrspace(1) inreg %ptr) {
1789; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_0_0:
1790; GFX900:       ; %bb.0:
1791; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1792; GFX900-NEXT:    ;;#ASMSTART
1793; GFX900-NEXT:    ; def v[0:1]
1794; GFX900-NEXT:    ;;#ASMEND
1795; GFX900-NEXT:    ;;#ASMSTART
1796; GFX900-NEXT:    ; def v[1:2]
1797; GFX900-NEXT:    ;;#ASMEND
1798; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1799; GFX900-NEXT:    v_mov_b32_e32 v3, 0
1800; GFX900-NEXT:    v_perm_b32 v1, v1, v2, s4
1801; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
1802; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
1803; GFX900-NEXT:    s_waitcnt vmcnt(0)
1804; GFX900-NEXT:    s_setpc_b64 s[30:31]
1805;
1806; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_0_0:
1807; GFX90A:       ; %bb.0:
1808; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1809; GFX90A-NEXT:    ;;#ASMSTART
1810; GFX90A-NEXT:    ; def v[2:3]
1811; GFX90A-NEXT:    ;;#ASMEND
1812; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1813; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1814; GFX90A-NEXT:    ;;#ASMSTART
1815; GFX90A-NEXT:    ; def v[0:1]
1816; GFX90A-NEXT:    ;;#ASMEND
1817; GFX90A-NEXT:    v_perm_b32 v2, v2, v3, s4
1818; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
1819; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
1820; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1821; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1822;
1823; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_0_0:
1824; GFX940:       ; %bb.0:
1825; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1826; GFX940-NEXT:    ;;#ASMSTART
1827; GFX940-NEXT:    ; def v[2:3]
1828; GFX940-NEXT:    ;;#ASMEND
1829; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1830; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1831; GFX940-NEXT:    ;;#ASMSTART
1832; GFX940-NEXT:    ; def v[0:1]
1833; GFX940-NEXT:    ;;#ASMEND
1834; GFX940-NEXT:    v_perm_b32 v2, v2, v3, s2
1835; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
1836; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
1837; GFX940-NEXT:    s_waitcnt vmcnt(0)
1838; GFX940-NEXT:    s_setpc_b64 s[30:31]
1839  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1840  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
1841  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1842  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1843  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 0, i32 0>
1844  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1845  ret void
1846}
1847
; Two-input shuffle with mask <5, 4, 0, 0>. Mask indices 0-2 select from
; %extract3 and 3-5 from %extract31, so lane 0 takes element 2 of %extract31,
; lane 1 takes element 1 of %extract31, and lanes 2-3 take element 0 of
; %extract3. The <4 x bfloat> result is stored through %ptr. Both sources come
; from inline asm with "=v" outputs.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1848define void @v_shuffle_v4bf16_v3bf16__5_4_0_0(ptr addrspace(1) inreg %ptr) {
1849; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_0_0:
1850; GFX900:       ; %bb.0:
1851; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1852; GFX900-NEXT:    ;;#ASMSTART
1853; GFX900-NEXT:    ; def v[0:1]
1854; GFX900-NEXT:    ;;#ASMEND
1855; GFX900-NEXT:    ;;#ASMSTART
1856; GFX900-NEXT:    ; def v[1:2]
1857; GFX900-NEXT:    ;;#ASMEND
1858; GFX900-NEXT:    s_mov_b32 s4, 0xffff
1859; GFX900-NEXT:    v_bfi_b32 v1, s4, v2, v1
1860; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1861; GFX900-NEXT:    v_mov_b32_e32 v3, 0
1862; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
1863; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
1864; GFX900-NEXT:    s_waitcnt vmcnt(0)
1865; GFX900-NEXT:    s_setpc_b64 s[30:31]
1866;
1867; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_0_0:
1868; GFX90A:       ; %bb.0:
1869; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1870; GFX90A-NEXT:    ;;#ASMSTART
1871; GFX90A-NEXT:    ; def v[2:3]
1872; GFX90A-NEXT:    ;;#ASMEND
1873; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
1874; GFX90A-NEXT:    v_bfi_b32 v2, s4, v3, v2
1875; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1876; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1877; GFX90A-NEXT:    ;;#ASMSTART
1878; GFX90A-NEXT:    ; def v[0:1]
1879; GFX90A-NEXT:    ;;#ASMEND
1880; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
1881; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
1882; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1883; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1884;
1885; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_0_0:
1886; GFX940:       ; %bb.0:
1887; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1888; GFX940-NEXT:    ;;#ASMSTART
1889; GFX940-NEXT:    ; def v[2:3]
1890; GFX940-NEXT:    ;;#ASMEND
1891; GFX940-NEXT:    s_mov_b32 s2, 0xffff
1892; GFX940-NEXT:    v_bfi_b32 v2, s2, v3, v2
1893; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1894; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1895; GFX940-NEXT:    ;;#ASMSTART
1896; GFX940-NEXT:    ; def v[0:1]
1897; GFX940-NEXT:    ;;#ASMEND
1898; GFX940-NEXT:    s_nop 0
1899; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
1900; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
1901; GFX940-NEXT:    s_waitcnt vmcnt(0)
1902; GFX940-NEXT:    s_setpc_b64 s[30:31]
1903  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1904  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
1905  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1906  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1907  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 0, i32 0>
1908  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1909  ret void
1910}
1911
; Two-input shuffle with mask <5, 5, 0, 0>. Mask indices 0-2 select from
; %extract3 and 3-5 from %extract31, so lanes 0-1 both take element 2 of
; %extract31 and lanes 2-3 both take element 0 of %extract3. The <4 x bfloat>
; result is stored through %ptr. Both sources come from inline asm with "=v"
; outputs.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1912define void @v_shuffle_v4bf16_v3bf16__5_5_0_0(ptr addrspace(1) inreg %ptr) {
1913; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_0:
1914; GFX900:       ; %bb.0:
1915; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1916; GFX900-NEXT:    ;;#ASMSTART
1917; GFX900-NEXT:    ; def v[0:1]
1918; GFX900-NEXT:    ;;#ASMEND
1919; GFX900-NEXT:    ;;#ASMSTART
1920; GFX900-NEXT:    ; def v[1:2]
1921; GFX900-NEXT:    ;;#ASMEND
1922; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1923; GFX900-NEXT:    v_mov_b32_e32 v3, 0
1924; GFX900-NEXT:    v_perm_b32 v1, v0, v0, s4
1925; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
1926; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
1927; GFX900-NEXT:    s_waitcnt vmcnt(0)
1928; GFX900-NEXT:    s_setpc_b64 s[30:31]
1929;
1930; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_0:
1931; GFX90A:       ; %bb.0:
1932; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1933; GFX90A-NEXT:    ;;#ASMSTART
1934; GFX90A-NEXT:    ; def v[0:1]
1935; GFX90A-NEXT:    ;;#ASMEND
1936; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1937; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1938; GFX90A-NEXT:    ;;#ASMSTART
1939; GFX90A-NEXT:    ; def v[2:3]
1940; GFX90A-NEXT:    ;;#ASMEND
1941; GFX90A-NEXT:    v_perm_b32 v1, v0, v0, s4
1942; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
1943; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
1944; GFX90A-NEXT:    s_waitcnt vmcnt(0)
1945; GFX90A-NEXT:    s_setpc_b64 s[30:31]
1946;
1947; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_0:
1948; GFX940:       ; %bb.0:
1949; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1950; GFX940-NEXT:    ;;#ASMSTART
1951; GFX940-NEXT:    ; def v[0:1]
1952; GFX940-NEXT:    ;;#ASMEND
1953; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
1954; GFX940-NEXT:    v_mov_b32_e32 v4, 0
1955; GFX940-NEXT:    ;;#ASMSTART
1956; GFX940-NEXT:    ; def v[2:3]
1957; GFX940-NEXT:    ;;#ASMEND
1958; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s2
1959; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
1960; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
1961; GFX940-NEXT:    s_waitcnt vmcnt(0)
1962; GFX940-NEXT:    s_setpc_b64 s[30:31]
1963  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
1964  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
1965  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1966  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
1967  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 0>
1968  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
1969  ret void
1970}
1971
; Two-input shuffle with mask <5, 5, poison, 0>. Mask indices 0-2 select from
; %extract3 and 3-5 from %extract31, so lanes 0-1 both take element 2 of
; %extract31, lane 2 is a poison (don't-care) mask entry, and lane 3 takes
; element 0 of %extract3. The <4 x bfloat> result is stored through %ptr. Both
; sources come from inline asm with "=v" outputs.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1972define void @v_shuffle_v4bf16_v3bf16__5_5_u_0(ptr addrspace(1) inreg %ptr) {
1973; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_0:
1974; GFX900:       ; %bb.0:
1975; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1976; GFX900-NEXT:    ;;#ASMSTART
1977; GFX900-NEXT:    ; def v[0:1]
1978; GFX900-NEXT:    ;;#ASMEND
1979; GFX900-NEXT:    ;;#ASMSTART
1980; GFX900-NEXT:    ; def v[1:2]
1981; GFX900-NEXT:    ;;#ASMEND
1982; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
1983; GFX900-NEXT:    v_mov_b32_e32 v3, 0
1984; GFX900-NEXT:    v_perm_b32 v1, v2, v2, s4
1985; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
1986; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
1987; GFX900-NEXT:    s_waitcnt vmcnt(0)
1988; GFX900-NEXT:    s_setpc_b64 s[30:31]
1989;
1990; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_0:
1991; GFX90A:       ; %bb.0:
1992; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1993; GFX90A-NEXT:    ;;#ASMSTART
1994; GFX90A-NEXT:    ; def v[2:3]
1995; GFX90A-NEXT:    ;;#ASMEND
1996; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
1997; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1998; GFX90A-NEXT:    ;;#ASMSTART
1999; GFX90A-NEXT:    ; def v[0:1]
2000; GFX90A-NEXT:    ;;#ASMEND
2001; GFX90A-NEXT:    v_perm_b32 v2, v3, v3, s4
2002; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
2003; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
2004; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2005; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2006;
2007; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_0:
2008; GFX940:       ; %bb.0:
2009; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2010; GFX940-NEXT:    ;;#ASMSTART
2011; GFX940-NEXT:    ; def v[2:3]
2012; GFX940-NEXT:    ;;#ASMEND
2013; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
2014; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2015; GFX940-NEXT:    ;;#ASMSTART
2016; GFX940-NEXT:    ; def v[0:1]
2017; GFX940-NEXT:    ;;#ASMEND
2018; GFX940-NEXT:    v_perm_b32 v2, v3, v3, s2
2019; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
2020; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
2021; GFX940-NEXT:    s_waitcnt vmcnt(0)
2022; GFX940-NEXT:    s_setpc_b64 s[30:31]
2023  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2024  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2025  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2026  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2027  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 0>
2028  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2029  ret void
2030}
2031
; Two-input shuffle with mask <5, 5, 1, 0>. Mask indices 0-2 select from
; %extract3 and 3-5 from %extract31, so lanes 0-1 both take element 2 of
; %extract31, lane 2 takes element 1 of %extract3, and lane 3 takes element 0
; of %extract3. The <4 x bfloat> result is stored through %ptr. Both sources
; come from inline asm with "=v" outputs.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2032define void @v_shuffle_v4bf16_v3bf16__5_5_1_0(ptr addrspace(1) inreg %ptr) {
2033; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_0:
2034; GFX900:       ; %bb.0:
2035; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2036; GFX900-NEXT:    ;;#ASMSTART
2037; GFX900-NEXT:    ; def v[0:1]
2038; GFX900-NEXT:    ;;#ASMEND
2039; GFX900-NEXT:    ;;#ASMSTART
2040; GFX900-NEXT:    ; def v[1:2]
2041; GFX900-NEXT:    ;;#ASMEND
2042; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
2043; GFX900-NEXT:    v_mov_b32_e32 v3, 0
2044; GFX900-NEXT:    v_alignbit_b32 v1, v0, v0, 16
2045; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
2046; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
2047; GFX900-NEXT:    s_waitcnt vmcnt(0)
2048; GFX900-NEXT:    s_setpc_b64 s[30:31]
2049;
2050; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_0:
2051; GFX90A:       ; %bb.0:
2052; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2053; GFX90A-NEXT:    ;;#ASMSTART
2054; GFX90A-NEXT:    ; def v[0:1]
2055; GFX90A-NEXT:    ;;#ASMEND
2056; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
2057; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2058; GFX90A-NEXT:    ;;#ASMSTART
2059; GFX90A-NEXT:    ; def v[2:3]
2060; GFX90A-NEXT:    ;;#ASMEND
2061; GFX90A-NEXT:    v_alignbit_b32 v1, v0, v0, 16
2062; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
2063; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
2064; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2065; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2066;
2067; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_0:
2068; GFX940:       ; %bb.0:
2069; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2070; GFX940-NEXT:    ;;#ASMSTART
2071; GFX940-NEXT:    ; def v[0:1]
2072; GFX940-NEXT:    ;;#ASMEND
2073; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
2074; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2075; GFX940-NEXT:    ;;#ASMSTART
2076; GFX940-NEXT:    ; def v[2:3]
2077; GFX940-NEXT:    ;;#ASMEND
2078; GFX940-NEXT:    v_alignbit_b32 v1, v0, v0, 16
2079; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
2080; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
2081; GFX940-NEXT:    s_waitcnt vmcnt(0)
2082; GFX940-NEXT:    s_setpc_b64 s[30:31]
2083  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2084  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2085  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2086  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2087  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 0>
2088  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2089  ret void
2090}
2091
; Two-input shuffle with mask <5, 5, 2, 0>. Mask indices 0-2 select from
; %extract3 and 3-5 from %extract31, so lanes 0-1 both take element 2 of
; %extract31, lane 2 takes element 2 of %extract3, and lane 3 takes element 0
; of %extract3. The <4 x bfloat> result is stored through %ptr. Both sources
; come from inline asm with "=v" outputs.
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2092define void @v_shuffle_v4bf16_v3bf16__5_5_2_0(ptr addrspace(1) inreg %ptr) {
2093; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_0:
2094; GFX900:       ; %bb.0:
2095; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2096; GFX900-NEXT:    ;;#ASMSTART
2097; GFX900-NEXT:    ; def v[0:1]
2098; GFX900-NEXT:    ;;#ASMEND
2099; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
2100; GFX900-NEXT:    v_mov_b32_e32 v4, 0
2101; GFX900-NEXT:    ;;#ASMSTART
2102; GFX900-NEXT:    ; def v[2:3]
2103; GFX900-NEXT:    ;;#ASMEND
2104; GFX900-NEXT:    v_perm_b32 v1, v0, v1, s4
2105; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
2106; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
2107; GFX900-NEXT:    s_waitcnt vmcnt(0)
2108; GFX900-NEXT:    s_setpc_b64 s[30:31]
2109;
2110; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_0:
2111; GFX90A:       ; %bb.0:
2112; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2113; GFX90A-NEXT:    ;;#ASMSTART
2114; GFX90A-NEXT:    ; def v[0:1]
2115; GFX90A-NEXT:    ;;#ASMEND
2116; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
2117; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2118; GFX90A-NEXT:    ;;#ASMSTART
2119; GFX90A-NEXT:    ; def v[2:3]
2120; GFX90A-NEXT:    ;;#ASMEND
2121; GFX90A-NEXT:    v_perm_b32 v1, v0, v1, s4
2122; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
2123; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
2124; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2125; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2126;
2127; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_0:
2128; GFX940:       ; %bb.0:
2129; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2130; GFX940-NEXT:    ;;#ASMSTART
2131; GFX940-NEXT:    ; def v[0:1]
2132; GFX940-NEXT:    ;;#ASMEND
2133; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
2134; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2135; GFX940-NEXT:    ;;#ASMSTART
2136; GFX940-NEXT:    ; def v[2:3]
2137; GFX940-NEXT:    ;;#ASMEND
2138; GFX940-NEXT:    v_perm_b32 v1, v0, v1, s2
2139; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
2140; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
2141; GFX940-NEXT:    s_waitcnt vmcnt(0)
2142; GFX940-NEXT:    s_setpc_b64 s[30:31]
2143  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2144  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2145  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2146  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2147  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 0>
2148  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2149  ret void
2150}
2151
; Two-input shuffle with mask <5, 5, 3, 0>. Mask indices 0-2 select lanes of
; %extract3 (lanes 0-2 of %vec0) and indices 3-5 select lanes of %extract31
; (lanes 0-2 of %vec1), so the stored <4 x bfloat> is
; { vec1[2], vec1[2], vec1[0], vec0[0] }.
; Both inputs are produced by inline asm with a "=v" output constraint.
2152define void @v_shuffle_v4bf16_v3bf16__5_5_3_0(ptr addrspace(1) inreg %ptr) {
2153; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_0:
2154; GFX900:       ; %bb.0:
2155; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2156; GFX900-NEXT:    ;;#ASMSTART
2157; GFX900-NEXT:    ; def v[0:1]
2158; GFX900-NEXT:    ;;#ASMEND
2159; GFX900-NEXT:    ;;#ASMSTART
2160; GFX900-NEXT:    ; def v[1:2]
2161; GFX900-NEXT:    ;;#ASMEND
2162; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
2163; GFX900-NEXT:    v_mov_b32_e32 v3, 0
2164; GFX900-NEXT:    v_perm_b32 v1, v0, v1, s4
2165; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
2166; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
2167; GFX900-NEXT:    s_waitcnt vmcnt(0)
2168; GFX900-NEXT:    s_setpc_b64 s[30:31]
2169;
2170; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_0:
2171; GFX90A:       ; %bb.0:
2172; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2173; GFX90A-NEXT:    ;;#ASMSTART
2174; GFX90A-NEXT:    ; def v[0:1]
2175; GFX90A-NEXT:    ;;#ASMEND
2176; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
2177; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2178; GFX90A-NEXT:    ;;#ASMSTART
2179; GFX90A-NEXT:    ; def v[2:3]
2180; GFX90A-NEXT:    ;;#ASMEND
2181; GFX90A-NEXT:    v_perm_b32 v1, v0, v2, s4
2182; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
2183; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
2184; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2185; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2186;
2187; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_0:
2188; GFX940:       ; %bb.0:
2189; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2190; GFX940-NEXT:    ;;#ASMSTART
2191; GFX940-NEXT:    ; def v[0:1]
2192; GFX940-NEXT:    ;;#ASMEND
2193; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
2194; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2195; GFX940-NEXT:    ;;#ASMSTART
2196; GFX940-NEXT:    ; def v[2:3]
2197; GFX940-NEXT:    ;;#ASMEND
2198; GFX940-NEXT:    s_nop 0
2199; GFX940-NEXT:    v_perm_b32 v1, v0, v2, s2
2200; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
2201; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
2202; GFX940-NEXT:    s_waitcnt vmcnt(0)
2203; GFX940-NEXT:    s_setpc_b64 s[30:31]
2204  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2205  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2206  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2207  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2208  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 0>
2209  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2210  ret void
2211}
2212
; Two-input shuffle with mask <5, 5, 4, 0>. Mask indices 0-2 select lanes of
; %extract3 (lanes 0-2 of %vec0) and indices 3-5 select lanes of %extract31
; (lanes 0-2 of %vec1), so the stored <4 x bfloat> is
; { vec1[2], vec1[2], vec1[1], vec0[0] }.
; Both inputs are produced by inline asm with a "=v" output constraint.
2213define void @v_shuffle_v4bf16_v3bf16__5_5_4_0(ptr addrspace(1) inreg %ptr) {
2214; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_0:
2215; GFX900:       ; %bb.0:
2216; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2217; GFX900-NEXT:    ;;#ASMSTART
2218; GFX900-NEXT:    ; def v[0:1]
2219; GFX900-NEXT:    ;;#ASMEND
2220; GFX900-NEXT:    ;;#ASMSTART
2221; GFX900-NEXT:    ; def v[1:2]
2222; GFX900-NEXT:    ;;#ASMEND
2223; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
2224; GFX900-NEXT:    v_mov_b32_e32 v3, 0
2225; GFX900-NEXT:    v_alignbit_b32 v1, v0, v1, 16
2226; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
2227; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
2228; GFX900-NEXT:    s_waitcnt vmcnt(0)
2229; GFX900-NEXT:    s_setpc_b64 s[30:31]
2230;
2231; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_0:
2232; GFX90A:       ; %bb.0:
2233; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2234; GFX90A-NEXT:    ;;#ASMSTART
2235; GFX90A-NEXT:    ; def v[0:1]
2236; GFX90A-NEXT:    ;;#ASMEND
2237; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
2238; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2239; GFX90A-NEXT:    ;;#ASMSTART
2240; GFX90A-NEXT:    ; def v[2:3]
2241; GFX90A-NEXT:    ;;#ASMEND
2242; GFX90A-NEXT:    v_alignbit_b32 v1, v0, v2, 16
2243; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
2244; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
2245; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2246; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2247;
2248; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_0:
2249; GFX940:       ; %bb.0:
2250; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2251; GFX940-NEXT:    ;;#ASMSTART
2252; GFX940-NEXT:    ; def v[0:1]
2253; GFX940-NEXT:    ;;#ASMEND
2254; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
2255; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2256; GFX940-NEXT:    ;;#ASMSTART
2257; GFX940-NEXT:    ; def v[2:3]
2258; GFX940-NEXT:    ;;#ASMEND
2259; GFX940-NEXT:    s_nop 0
2260; GFX940-NEXT:    v_alignbit_b32 v1, v0, v2, 16
2261; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
2262; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
2263; GFX940-NEXT:    s_waitcnt vmcnt(0)
2264; GFX940-NEXT:    s_setpc_b64 s[30:31]
2265  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2266  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2267  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2268  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2269  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 0>
2270  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2271  ret void
2272}
2273
; Single-input shuffle with mask <poison, 1, 1, 1>: result lane 0 is left
; unspecified and lanes 1-3 all take lane 1 of %extract3 (lanes 0-2 of %vec0).
; The second shuffle operand is poison; %vec0 is produced by inline asm with a
; "=v" output constraint.
2274define void @v_shuffle_v4bf16_v3bf16__u_1_1_1(ptr addrspace(1) inreg %ptr) {
2275; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__u_1_1_1:
2276; GFX900:       ; %bb.0:
2277; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2278; GFX900-NEXT:    ;;#ASMSTART
2279; GFX900-NEXT:    ; def v[0:1]
2280; GFX900-NEXT:    ;;#ASMEND
2281; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2282; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2283; GFX900-NEXT:    v_perm_b32 v1, v0, v0, s4
2284; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
2285; GFX900-NEXT:    s_waitcnt vmcnt(0)
2286; GFX900-NEXT:    s_setpc_b64 s[30:31]
2287;
2288; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__u_1_1_1:
2289; GFX90A:       ; %bb.0:
2290; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2291; GFX90A-NEXT:    ;;#ASMSTART
2292; GFX90A-NEXT:    ; def v[0:1]
2293; GFX90A-NEXT:    ;;#ASMEND
2294; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2295; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2296; GFX90A-NEXT:    v_perm_b32 v1, v0, v0, s4
2297; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
2298; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2299; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2300;
2301; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__u_1_1_1:
2302; GFX940:       ; %bb.0:
2303; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2304; GFX940-NEXT:    ;;#ASMSTART
2305; GFX940-NEXT:    ; def v[0:1]
2306; GFX940-NEXT:    ;;#ASMEND
2307; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2308; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2309; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s2
2310; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
2311; GFX940-NEXT:    s_waitcnt vmcnt(0)
2312; GFX940-NEXT:    s_setpc_b64 s[30:31]
2313  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2314  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2315  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
2316  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2317  ret void
2318}
2319
; Single-input shuffle with mask <0, 1, 1, 1>: the stored <4 x bfloat> is
; { vec0[0], vec0[1], vec0[1], vec0[1] }, taken from %extract3 (lanes 0-2 of
; %vec0). The second shuffle operand is poison; %vec0 is produced by inline asm
; with a "=v" output constraint.
2320define void @v_shuffle_v4bf16_v3bf16__0_1_1_1(ptr addrspace(1) inreg %ptr) {
2321; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_1_1_1:
2322; GFX900:       ; %bb.0:
2323; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2324; GFX900-NEXT:    ;;#ASMSTART
2325; GFX900-NEXT:    ; def v[0:1]
2326; GFX900-NEXT:    ;;#ASMEND
2327; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2328; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2329; GFX900-NEXT:    v_perm_b32 v1, v0, v0, s4
2330; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
2331; GFX900-NEXT:    s_waitcnt vmcnt(0)
2332; GFX900-NEXT:    s_setpc_b64 s[30:31]
2333;
2334; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_1_1_1:
2335; GFX90A:       ; %bb.0:
2336; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2337; GFX90A-NEXT:    ;;#ASMSTART
2338; GFX90A-NEXT:    ; def v[0:1]
2339; GFX90A-NEXT:    ;;#ASMEND
2340; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2341; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2342; GFX90A-NEXT:    v_perm_b32 v1, v0, v0, s4
2343; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
2344; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2345; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2346;
2347; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_1_1_1:
2348; GFX940:       ; %bb.0:
2349; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2350; GFX940-NEXT:    ;;#ASMSTART
2351; GFX940-NEXT:    ; def v[0:1]
2352; GFX940-NEXT:    ;;#ASMEND
2353; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2354; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2355; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s2
2356; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
2357; GFX940-NEXT:    s_waitcnt vmcnt(0)
2358; GFX940-NEXT:    s_setpc_b64 s[30:31]
2359  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2360  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2361  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
2362  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2363  ret void
2364}
2365
; Single-input shuffle with mask <1, 1, 1, 1>: every result lane is lane 1 of
; %extract3 (lanes 0-2 of %vec0), i.e. a splat of vec0[1]. The second shuffle
; operand is poison; %vec0 is produced by inline asm with a "=v" output
; constraint.
2366define void @v_shuffle_v4bf16_v3bf16__1_1_1_1(ptr addrspace(1) inreg %ptr) {
2367; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_1_1_1:
2368; GFX900:       ; %bb.0:
2369; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2370; GFX900-NEXT:    ;;#ASMSTART
2371; GFX900-NEXT:    ; def v[0:1]
2372; GFX900-NEXT:    ;;#ASMEND
2373; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2374; GFX900-NEXT:    v_perm_b32 v0, v0, v0, s4
2375; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2376; GFX900-NEXT:    v_mov_b32_e32 v1, v0
2377; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
2378; GFX900-NEXT:    s_waitcnt vmcnt(0)
2379; GFX900-NEXT:    s_setpc_b64 s[30:31]
2380;
2381; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_1_1_1:
2382; GFX90A:       ; %bb.0:
2383; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2384; GFX90A-NEXT:    ;;#ASMSTART
2385; GFX90A-NEXT:    ; def v[0:1]
2386; GFX90A-NEXT:    ;;#ASMEND
2387; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2388; GFX90A-NEXT:    v_perm_b32 v0, v0, v0, s4
2389; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2390; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
2391; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
2392; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2393; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2394;
2395; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_1_1_1:
2396; GFX940:       ; %bb.0:
2397; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2398; GFX940-NEXT:    ;;#ASMSTART
2399; GFX940-NEXT:    ; def v[0:1]
2400; GFX940-NEXT:    ;;#ASMEND
2401; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2402; GFX940-NEXT:    v_perm_b32 v0, v0, v0, s2
2403; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2404; GFX940-NEXT:    v_mov_b32_e32 v1, v0
2405; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
2406; GFX940-NEXT:    s_waitcnt vmcnt(0)
2407; GFX940-NEXT:    s_setpc_b64 s[30:31]
2408  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2409  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2410  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2411  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2412  ret void
2413}
2414
; Single-input shuffle with mask <2, 1, 1, 1>: the stored <4 x bfloat> is
; { vec0[2], vec0[1], vec0[1], vec0[1] }, taken from %extract3 (lanes 0-2 of
; %vec0). The second shuffle operand is poison; %vec0 is produced by inline asm
; with a "=v" output constraint.
2415define void @v_shuffle_v4bf16_v3bf16__2_1_1_1(ptr addrspace(1) inreg %ptr) {
2416; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_1_1_1:
2417; GFX900:       ; %bb.0:
2418; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2419; GFX900-NEXT:    ;;#ASMSTART
2420; GFX900-NEXT:    ; def v[0:1]
2421; GFX900-NEXT:    ;;#ASMEND
2422; GFX900-NEXT:    s_mov_b32 s4, 0xffff
2423; GFX900-NEXT:    v_bfi_b32 v1, s4, v1, v0
2424; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2425; GFX900-NEXT:    v_mov_b32_e32 v3, 0
2426; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
2427; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
2428; GFX900-NEXT:    s_waitcnt vmcnt(0)
2429; GFX900-NEXT:    s_setpc_b64 s[30:31]
2430;
2431; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_1_1_1:
2432; GFX90A:       ; %bb.0:
2433; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2434; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
2435; GFX90A-NEXT:    ;;#ASMSTART
2436; GFX90A-NEXT:    ; def v[0:1]
2437; GFX90A-NEXT:    ;;#ASMEND
2438; GFX90A-NEXT:    v_bfi_b32 v2, s4, v1, v0
2439; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2440; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2441; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
2442; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
2443; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2444; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2445;
2446; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_1_1_1:
2447; GFX940:       ; %bb.0:
2448; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2449; GFX940-NEXT:    s_mov_b32 s2, 0xffff
2450; GFX940-NEXT:    ;;#ASMSTART
2451; GFX940-NEXT:    ; def v[0:1]
2452; GFX940-NEXT:    ;;#ASMEND
2453; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2454; GFX940-NEXT:    v_bfi_b32 v2, s2, v1, v0
2455; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2456; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
2457; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
2458; GFX940-NEXT:    s_waitcnt vmcnt(0)
2459; GFX940-NEXT:    s_setpc_b64 s[30:31]
2460  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2461  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2462  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
2463  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2464  ret void
2465}
2466
; Single-input shuffle with mask <3, 1, 1, 1>. Index 3 addresses lane 0 of the
; second shuffle operand, which is poison here, so result lane 0 is
; unconstrained; lanes 1-3 all take lane 1 of %extract3 (lanes 0-2 of %vec0).
; %vec0 is produced by inline asm with a "=v" output constraint.
2467define void @v_shuffle_v4bf16_v3bf16__3_1_1_1(ptr addrspace(1) inreg %ptr) {
2468; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__3_1_1_1:
2469; GFX900:       ; %bb.0:
2470; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2471; GFX900-NEXT:    ;;#ASMSTART
2472; GFX900-NEXT:    ; def v[0:1]
2473; GFX900-NEXT:    ;;#ASMEND
2474; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2475; GFX900-NEXT:    v_mov_b32_e32 v2, 0
2476; GFX900-NEXT:    v_perm_b32 v1, v0, v0, s4
2477; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
2478; GFX900-NEXT:    s_waitcnt vmcnt(0)
2479; GFX900-NEXT:    s_setpc_b64 s[30:31]
2480;
2481; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__3_1_1_1:
2482; GFX90A:       ; %bb.0:
2483; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2484; GFX90A-NEXT:    ;;#ASMSTART
2485; GFX90A-NEXT:    ; def v[0:1]
2486; GFX90A-NEXT:    ;;#ASMEND
2487; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2488; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
2489; GFX90A-NEXT:    v_perm_b32 v1, v0, v0, s4
2490; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
2491; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2492; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2493;
2494; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__3_1_1_1:
2495; GFX940:       ; %bb.0:
2496; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2497; GFX940-NEXT:    ;;#ASMSTART
2498; GFX940-NEXT:    ; def v[0:1]
2499; GFX940-NEXT:    ;;#ASMEND
2500; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2501; GFX940-NEXT:    v_mov_b32_e32 v2, 0
2502; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s2
2503; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
2504; GFX940-NEXT:    s_waitcnt vmcnt(0)
2505; GFX940-NEXT:    s_setpc_b64 s[30:31]
2506  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2507  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2508  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
2509  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2510  ret void
2511}
2512
; Two-input shuffle with mask <4, 1, 1, 1>. Mask indices 0-2 select lanes of
; %extract3 (lanes 0-2 of %vec0) and indices 3-5 select lanes of %extract31
; (lanes 0-2 of %vec1), so the stored <4 x bfloat> is
; { vec1[1], vec0[1], vec0[1], vec0[1] }.
; Both inputs are produced by inline asm with a "=v" output constraint.
2513define void @v_shuffle_v4bf16_v3bf16__4_1_1_1(ptr addrspace(1) inreg %ptr) {
2514; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_1_1_1:
2515; GFX900:       ; %bb.0:
2516; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2517; GFX900-NEXT:    ;;#ASMSTART
2518; GFX900-NEXT:    ; def v[0:1]
2519; GFX900-NEXT:    ;;#ASMEND
2520; GFX900-NEXT:    ;;#ASMSTART
2521; GFX900-NEXT:    ; def v[1:2]
2522; GFX900-NEXT:    ;;#ASMEND
2523; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2524; GFX900-NEXT:    v_mov_b32_e32 v3, 0
2525; GFX900-NEXT:    v_perm_b32 v1, v0, v1, s4
2526; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
2527; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
2528; GFX900-NEXT:    s_waitcnt vmcnt(0)
2529; GFX900-NEXT:    s_setpc_b64 s[30:31]
2530;
2531; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_1_1_1:
2532; GFX90A:       ; %bb.0:
2533; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2534; GFX90A-NEXT:    ;;#ASMSTART
2535; GFX90A-NEXT:    ; def v[2:3]
2536; GFX90A-NEXT:    ;;#ASMEND
2537; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2538; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2539; GFX90A-NEXT:    ;;#ASMSTART
2540; GFX90A-NEXT:    ; def v[0:1]
2541; GFX90A-NEXT:    ;;#ASMEND
2542; GFX90A-NEXT:    v_perm_b32 v2, v0, v2, s4
2543; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
2544; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
2545; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2546; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2547;
2548; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_1_1_1:
2549; GFX940:       ; %bb.0:
2550; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2551; GFX940-NEXT:    ;;#ASMSTART
2552; GFX940-NEXT:    ; def v[2:3]
2553; GFX940-NEXT:    ;;#ASMEND
2554; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2555; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2556; GFX940-NEXT:    ;;#ASMSTART
2557; GFX940-NEXT:    ; def v[0:1]
2558; GFX940-NEXT:    ;;#ASMEND
2559; GFX940-NEXT:    s_nop 0
2560; GFX940-NEXT:    v_perm_b32 v2, v0, v2, s2
2561; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
2562; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
2563; GFX940-NEXT:    s_waitcnt vmcnt(0)
2564; GFX940-NEXT:    s_setpc_b64 s[30:31]
2565  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2566  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2567  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2568  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2569  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 1, i32 1, i32 1>
2570  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2571  ret void
2572}
2573
; Two-input shuffle with mask <5, 1, 1, 1>. Mask indices 0-2 select lanes of
; %extract3 (lanes 0-2 of %vec0) and indices 3-5 select lanes of %extract31
; (lanes 0-2 of %vec1), so the stored <4 x bfloat> is
; { vec1[2], vec0[1], vec0[1], vec0[1] }.
; Both inputs are produced by inline asm with a "=v" output constraint.
2574define void @v_shuffle_v4bf16_v3bf16__5_1_1_1(ptr addrspace(1) inreg %ptr) {
2575; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_1_1:
2576; GFX900:       ; %bb.0:
2577; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2578; GFX900-NEXT:    ;;#ASMSTART
2579; GFX900-NEXT:    ; def v[0:1]
2580; GFX900-NEXT:    ;;#ASMEND
2581; GFX900-NEXT:    ;;#ASMSTART
2582; GFX900-NEXT:    ; def v[1:2]
2583; GFX900-NEXT:    ;;#ASMEND
2584; GFX900-NEXT:    s_mov_b32 s4, 0xffff
2585; GFX900-NEXT:    v_bfi_b32 v1, s4, v2, v0
2586; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2587; GFX900-NEXT:    v_mov_b32_e32 v3, 0
2588; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
2589; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
2590; GFX900-NEXT:    s_waitcnt vmcnt(0)
2591; GFX900-NEXT:    s_setpc_b64 s[30:31]
2592;
2593; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_1_1:
2594; GFX90A:       ; %bb.0:
2595; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2596; GFX90A-NEXT:    ;;#ASMSTART
2597; GFX90A-NEXT:    ; def v[2:3]
2598; GFX90A-NEXT:    ;;#ASMEND
2599; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
2600; GFX90A-NEXT:    ;;#ASMSTART
2601; GFX90A-NEXT:    ; def v[0:1]
2602; GFX90A-NEXT:    ;;#ASMEND
2603; GFX90A-NEXT:    v_bfi_b32 v2, s4, v3, v0
2604; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2605; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2606; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
2607; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
2608; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2609; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2610;
2611; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_1_1:
2612; GFX940:       ; %bb.0:
2613; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2614; GFX940-NEXT:    ;;#ASMSTART
2615; GFX940-NEXT:    ; def v[2:3]
2616; GFX940-NEXT:    ;;#ASMEND
2617; GFX940-NEXT:    s_mov_b32 s2, 0xffff
2618; GFX940-NEXT:    ;;#ASMSTART
2619; GFX940-NEXT:    ; def v[0:1]
2620; GFX940-NEXT:    ;;#ASMEND
2621; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2622; GFX940-NEXT:    v_bfi_b32 v2, s2, v3, v0
2623; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2624; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
2625; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
2626; GFX940-NEXT:    s_waitcnt vmcnt(0)
2627; GFX940-NEXT:    s_setpc_b64 s[30:31]
2628  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2629  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2630  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2631  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2632  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 1, i32 1>
2633  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2634  ret void
2635}
2636
; Two-input shuffle with mask <5, poison, 1, 1>. Mask indices 0-2 select lanes
; of %extract3 (lanes 0-2 of %vec0) and indices 3-5 select lanes of %extract31
; (lanes 0-2 of %vec1), so the stored <4 x bfloat> is
; { vec1[2], <unspecified>, vec0[1], vec0[1] }.
; Both inputs are produced by inline asm with a "=v" output constraint.
2637define void @v_shuffle_v4bf16_v3bf16__5_u_1_1(ptr addrspace(1) inreg %ptr) {
2638; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_1_1:
2639; GFX900:       ; %bb.0:
2640; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2641; GFX900-NEXT:    ;;#ASMSTART
2642; GFX900-NEXT:    ; def v[0:1]
2643; GFX900-NEXT:    ;;#ASMEND
2644; GFX900-NEXT:    ;;#ASMSTART
2645; GFX900-NEXT:    ; def v[1:2]
2646; GFX900-NEXT:    ;;#ASMEND
2647; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2648; GFX900-NEXT:    v_mov_b32_e32 v3, 0
2649; GFX900-NEXT:    v_perm_b32 v1, v0, v0, s4
2650; GFX900-NEXT:    v_mov_b32_e32 v0, v2
2651; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
2652; GFX900-NEXT:    s_waitcnt vmcnt(0)
2653; GFX900-NEXT:    s_setpc_b64 s[30:31]
2654;
2655; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_1_1:
2656; GFX90A:       ; %bb.0:
2657; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2658; GFX90A-NEXT:    ;;#ASMSTART
2659; GFX90A-NEXT:    ; def v[0:1]
2660; GFX90A-NEXT:    ;;#ASMEND
2661; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2662; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2663; GFX90A-NEXT:    ;;#ASMSTART
2664; GFX90A-NEXT:    ; def v[2:3]
2665; GFX90A-NEXT:    ;;#ASMEND
2666; GFX90A-NEXT:    v_perm_b32 v1, v0, v0, s4
2667; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
2668; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
2669; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2670; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2671;
2672; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_1_1:
2673; GFX940:       ; %bb.0:
2674; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2675; GFX940-NEXT:    ;;#ASMSTART
2676; GFX940-NEXT:    ; def v[0:1]
2677; GFX940-NEXT:    ;;#ASMEND
2678; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2679; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2680; GFX940-NEXT:    ;;#ASMSTART
2681; GFX940-NEXT:    ; def v[2:3]
2682; GFX940-NEXT:    ;;#ASMEND
2683; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s2
2684; GFX940-NEXT:    v_mov_b32_e32 v0, v3
2685; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
2686; GFX940-NEXT:    s_waitcnt vmcnt(0)
2687; GFX940-NEXT:    s_setpc_b64 s[30:31]
2688  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2689  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2690  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2691  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2692  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 1, i32 1>
2693  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2694  ret void
2695}
2696
; Two-input shuffle with mask <5, 0, 1, 1>. Mask indices 0-2 select lanes of
; %extract3 (lanes 0-2 of %vec0) and indices 3-5 select lanes of %extract31
; (lanes 0-2 of %vec1), so the stored <4 x bfloat> is
; { vec1[2], vec0[0], vec0[1], vec0[1] }.
; Both inputs are produced by inline asm with a "=v" output constraint.
2697define void @v_shuffle_v4bf16_v3bf16__5_0_1_1(ptr addrspace(1) inreg %ptr) {
2698; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_1_1:
2699; GFX900:       ; %bb.0:
2700; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2701; GFX900-NEXT:    ;;#ASMSTART
2702; GFX900-NEXT:    ; def v[0:1]
2703; GFX900-NEXT:    ;;#ASMEND
2704; GFX900-NEXT:    ;;#ASMSTART
2705; GFX900-NEXT:    ; def v[1:2]
2706; GFX900-NEXT:    ;;#ASMEND
2707; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
2708; GFX900-NEXT:    v_perm_b32 v1, v0, v2, s4
2709; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2710; GFX900-NEXT:    v_mov_b32_e32 v3, 0
2711; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
2712; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
2713; GFX900-NEXT:    s_waitcnt vmcnt(0)
2714; GFX900-NEXT:    s_setpc_b64 s[30:31]
2715;
2716; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_1_1:
2717; GFX90A:       ; %bb.0:
2718; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2719; GFX90A-NEXT:    ;;#ASMSTART
2720; GFX90A-NEXT:    ; def v[2:3]
2721; GFX90A-NEXT:    ;;#ASMEND
2722; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
2723; GFX90A-NEXT:    ;;#ASMSTART
2724; GFX90A-NEXT:    ; def v[0:1]
2725; GFX90A-NEXT:    ;;#ASMEND
2726; GFX90A-NEXT:    v_perm_b32 v2, v0, v3, s4
2727; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2728; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2729; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
2730; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
2731; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2732; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2733;
2734; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_1_1:
2735; GFX940:       ; %bb.0:
2736; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2737; GFX940-NEXT:    ;;#ASMSTART
2738; GFX940-NEXT:    ; def v[2:3]
2739; GFX940-NEXT:    ;;#ASMEND
2740; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
2741; GFX940-NEXT:    ;;#ASMSTART
2742; GFX940-NEXT:    ; def v[0:1]
2743; GFX940-NEXT:    ;;#ASMEND
2744; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2745; GFX940-NEXT:    v_perm_b32 v2, v0, v3, s2
2746; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2747; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
2748; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
2749; GFX940-NEXT:    s_waitcnt vmcnt(0)
2750; GFX940-NEXT:    s_setpc_b64 s[30:31]
2751  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2752  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2753  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2754  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2755  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 1, i32 1>
2756  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2757  ret void
2758}
2759
; Two-input shuffle with mask <5, 2, 1, 1>. Mask indices 0-2 select lanes of
; %extract3 (lanes 0-2 of %vec0) and indices 3-5 select lanes of %extract31
; (lanes 0-2 of %vec1), so the stored <4 x bfloat> is
; { vec1[2], vec0[2], vec0[1], vec0[1] }.
; Both inputs are produced by inline asm with a "=v" output constraint.
2760define void @v_shuffle_v4bf16_v3bf16__5_2_1_1(ptr addrspace(1) inreg %ptr) {
2761; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_1_1:
2762; GFX900:       ; %bb.0:
2763; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2764; GFX900-NEXT:    ;;#ASMSTART
2765; GFX900-NEXT:    ; def v[0:1]
2766; GFX900-NEXT:    ;;#ASMEND
2767; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
2768; GFX900-NEXT:    ;;#ASMSTART
2769; GFX900-NEXT:    ; def v[2:3]
2770; GFX900-NEXT:    ;;#ASMEND
2771; GFX900-NEXT:    v_perm_b32 v1, v1, v3, s4
2772; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2773; GFX900-NEXT:    v_mov_b32_e32 v4, 0
2774; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
2775; GFX900-NEXT:    global_store_dwordx2 v4, v[1:2], s[16:17]
2776; GFX900-NEXT:    s_waitcnt vmcnt(0)
2777; GFX900-NEXT:    s_setpc_b64 s[30:31]
2778;
2779; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_1_1:
2780; GFX90A:       ; %bb.0:
2781; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2782; GFX90A-NEXT:    ;;#ASMSTART
2783; GFX90A-NEXT:    ; def v[2:3]
2784; GFX90A-NEXT:    ;;#ASMEND
2785; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
2786; GFX90A-NEXT:    ;;#ASMSTART
2787; GFX90A-NEXT:    ; def v[0:1]
2788; GFX90A-NEXT:    ;;#ASMEND
2789; GFX90A-NEXT:    v_perm_b32 v2, v1, v3, s4
2790; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2791; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2792; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
2793; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
2794; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2795; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2796;
2797; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_1_1:
2798; GFX940:       ; %bb.0:
2799; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2800; GFX940-NEXT:    ;;#ASMSTART
2801; GFX940-NEXT:    ; def v[2:3]
2802; GFX940-NEXT:    ;;#ASMEND
2803; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
2804; GFX940-NEXT:    ;;#ASMSTART
2805; GFX940-NEXT:    ; def v[0:1]
2806; GFX940-NEXT:    ;;#ASMEND
2807; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2808; GFX940-NEXT:    v_perm_b32 v2, v1, v3, s2
2809; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2810; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
2811; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
2812; GFX940-NEXT:    s_waitcnt vmcnt(0)
2813; GFX940-NEXT:    s_setpc_b64 s[30:31]
2814  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2815  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2816  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2817  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2818  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 1, i32 1>
2819  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2820  ret void
2821}
2822
; Two-input shuffle with mask <5, 3, 1, 1>. Mask indices 0-2 select lanes of
; %extract3 (lanes 0-2 of %vec0) and indices 3-5 select lanes of %extract31
; (lanes 0-2 of %vec1), so the stored <4 x bfloat> is
; { vec1[2], vec1[0], vec0[1], vec0[1] }.
; Both inputs are produced by inline asm with a "=v" output constraint.
2823define void @v_shuffle_v4bf16_v3bf16__5_3_1_1(ptr addrspace(1) inreg %ptr) {
2824; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_1_1:
2825; GFX900:       ; %bb.0:
2826; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2827; GFX900-NEXT:    ;;#ASMSTART
2828; GFX900-NEXT:    ; def v[0:1]
2829; GFX900-NEXT:    ;;#ASMEND
2830; GFX900-NEXT:    ;;#ASMSTART
2831; GFX900-NEXT:    ; def v[1:2]
2832; GFX900-NEXT:    ;;#ASMEND
2833; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
2834; GFX900-NEXT:    v_perm_b32 v1, v1, v2, s4
2835; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2836; GFX900-NEXT:    v_mov_b32_e32 v3, 0
2837; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
2838; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
2839; GFX900-NEXT:    s_waitcnt vmcnt(0)
2840; GFX900-NEXT:    s_setpc_b64 s[30:31]
2841;
2842; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_1_1:
2843; GFX90A:       ; %bb.0:
2844; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2845; GFX90A-NEXT:    ;;#ASMSTART
2846; GFX90A-NEXT:    ; def v[2:3]
2847; GFX90A-NEXT:    ;;#ASMEND
2848; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
2849; GFX90A-NEXT:    v_perm_b32 v2, v2, v3, s4
2850; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2851; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2852; GFX90A-NEXT:    ;;#ASMSTART
2853; GFX90A-NEXT:    ; def v[0:1]
2854; GFX90A-NEXT:    ;;#ASMEND
2855; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
2856; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
2857; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2858; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2859;
2860; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_1_1:
2861; GFX940:       ; %bb.0:
2862; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2863; GFX940-NEXT:    ;;#ASMSTART
2864; GFX940-NEXT:    ; def v[2:3]
2865; GFX940-NEXT:    ;;#ASMEND
2866; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
2867; GFX940-NEXT:    v_perm_b32 v2, v2, v3, s2
2868; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2869; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2870; GFX940-NEXT:    ;;#ASMSTART
2871; GFX940-NEXT:    ; def v[0:1]
2872; GFX940-NEXT:    ;;#ASMEND
2873; GFX940-NEXT:    s_nop 0
2874; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
2875; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
2876; GFX940-NEXT:    s_waitcnt vmcnt(0)
2877; GFX940-NEXT:    s_setpc_b64 s[30:31]
2878  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2879  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2880  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2881  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2882  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 1, i32 1>
2883  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2884  ret void
2885}
2886
; Two-source shuffle with mask <5, 4, 1, 1>. Mask indices 0-2 pick from
; %extract3 and 3-5 pick from %extract31, so lane 0 is %extract31[2],
; lane 1 is %extract31[1], and lanes 2-3 both repeat %extract3[1].
2887define void @v_shuffle_v4bf16_v3bf16__5_4_1_1(ptr addrspace(1) inreg %ptr) {
2888; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_1_1:
2889; GFX900:       ; %bb.0:
2890; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2891; GFX900-NEXT:    ;;#ASMSTART
2892; GFX900-NEXT:    ; def v[0:1]
2893; GFX900-NEXT:    ;;#ASMEND
2894; GFX900-NEXT:    ;;#ASMSTART
2895; GFX900-NEXT:    ; def v[1:2]
2896; GFX900-NEXT:    ;;#ASMEND
2897; GFX900-NEXT:    s_mov_b32 s4, 0xffff
2898; GFX900-NEXT:    v_bfi_b32 v1, s4, v2, v1
2899; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2900; GFX900-NEXT:    v_mov_b32_e32 v3, 0
2901; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
2902; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
2903; GFX900-NEXT:    s_waitcnt vmcnt(0)
2904; GFX900-NEXT:    s_setpc_b64 s[30:31]
2905;
2906; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_1_1:
2907; GFX90A:       ; %bb.0:
2908; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2909; GFX90A-NEXT:    ;;#ASMSTART
2910; GFX90A-NEXT:    ; def v[2:3]
2911; GFX90A-NEXT:    ;;#ASMEND
2912; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
2913; GFX90A-NEXT:    v_bfi_b32 v2, s4, v3, v2
2914; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2915; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2916; GFX90A-NEXT:    ;;#ASMSTART
2917; GFX90A-NEXT:    ; def v[0:1]
2918; GFX90A-NEXT:    ;;#ASMEND
2919; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
2920; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
2921; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2922; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2923;
2924; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_1_1:
2925; GFX940:       ; %bb.0:
2926; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2927; GFX940-NEXT:    ;;#ASMSTART
2928; GFX940-NEXT:    ; def v[2:3]
2929; GFX940-NEXT:    ;;#ASMEND
2930; GFX940-NEXT:    s_mov_b32 s2, 0xffff
2931; GFX940-NEXT:    v_bfi_b32 v2, s2, v3, v2
2932; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2933; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2934; GFX940-NEXT:    ;;#ASMSTART
2935; GFX940-NEXT:    ; def v[0:1]
2936; GFX940-NEXT:    ;;#ASMEND
2937; GFX940-NEXT:    s_nop 0
2938; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
2939; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
2940; GFX940-NEXT:    s_waitcnt vmcnt(0)
2941; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both inputs are opaque inline-asm results, each narrowed to its first three
  ; elements before the shuffle widens the result back to four lanes.
2942  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
2943  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
2944  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2945  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
2946  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 1, i32 1>
2947  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
2948  ret void
2949}
2950
; Two-source shuffle with mask <5, 5, 1, 1>. Mask indices 0-2 pick from
; %extract3 and 3-5 pick from %extract31, so lanes 0-1 both repeat
; %extract31[2] and lanes 2-3 both repeat %extract3[1].
2951define void @v_shuffle_v4bf16_v3bf16__5_5_1_1(ptr addrspace(1) inreg %ptr) {
2952; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_1:
2953; GFX900:       ; %bb.0:
2954; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2955; GFX900-NEXT:    ;;#ASMSTART
2956; GFX900-NEXT:    ; def v[0:1]
2957; GFX900-NEXT:    ;;#ASMEND
2958; GFX900-NEXT:    ;;#ASMSTART
2959; GFX900-NEXT:    ; def v[1:2]
2960; GFX900-NEXT:    ;;#ASMEND
2961; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
2962; GFX900-NEXT:    v_perm_b32 v1, v0, v0, s4
2963; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
2964; GFX900-NEXT:    v_mov_b32_e32 v3, 0
2965; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
2966; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
2967; GFX900-NEXT:    s_waitcnt vmcnt(0)
2968; GFX900-NEXT:    s_setpc_b64 s[30:31]
2969;
2970; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_1:
2971; GFX90A:       ; %bb.0:
2972; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2973; GFX90A-NEXT:    ;;#ASMSTART
2974; GFX90A-NEXT:    ; def v[0:1]
2975; GFX90A-NEXT:    ;;#ASMEND
2976; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
2977; GFX90A-NEXT:    v_perm_b32 v1, v0, v0, s4
2978; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
2979; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2980; GFX90A-NEXT:    ;;#ASMSTART
2981; GFX90A-NEXT:    ; def v[2:3]
2982; GFX90A-NEXT:    ;;#ASMEND
2983; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
2984; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
2985; GFX90A-NEXT:    s_waitcnt vmcnt(0)
2986; GFX90A-NEXT:    s_setpc_b64 s[30:31]
2987;
2988; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_1:
2989; GFX940:       ; %bb.0:
2990; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2991; GFX940-NEXT:    ;;#ASMSTART
2992; GFX940-NEXT:    ; def v[0:1]
2993; GFX940-NEXT:    ;;#ASMEND
2994; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
2995; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s2
2996; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
2997; GFX940-NEXT:    v_mov_b32_e32 v4, 0
2998; GFX940-NEXT:    ;;#ASMSTART
2999; GFX940-NEXT:    ; def v[2:3]
3000; GFX940-NEXT:    ;;#ASMEND
3001; GFX940-NEXT:    s_nop 0
3002; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
3003; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
3004; GFX940-NEXT:    s_waitcnt vmcnt(0)
3005; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both inputs are opaque inline-asm results, each narrowed to its first three
  ; elements before the shuffle widens the result back to four lanes.
3006  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3007  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3008  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3009  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3010  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 1>
3011  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3012  ret void
3013}
3014
; Two-source shuffle with mask <5, 5, poison, 1>. Mask indices 0-2 pick from
; %extract3 and 3-5 pick from %extract31, so lanes 0-1 both repeat
; %extract31[2], lane 2 is unconstrained, and lane 3 is %extract3[1].
3015define void @v_shuffle_v4bf16_v3bf16__5_5_u_1(ptr addrspace(1) inreg %ptr) {
3016; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_1:
3017; GFX900:       ; %bb.0:
3018; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3019; GFX900-NEXT:    ;;#ASMSTART
3020; GFX900-NEXT:    ; def v[0:1]
3021; GFX900-NEXT:    ;;#ASMEND
3022; GFX900-NEXT:    ;;#ASMSTART
3023; GFX900-NEXT:    ; def v[1:2]
3024; GFX900-NEXT:    ;;#ASMEND
3025; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3026; GFX900-NEXT:    v_mov_b32_e32 v3, 0
3027; GFX900-NEXT:    v_perm_b32 v1, v2, v2, s4
3028; GFX900-NEXT:    v_mov_b32_e32 v2, v0
3029; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
3030; GFX900-NEXT:    s_waitcnt vmcnt(0)
3031; GFX900-NEXT:    s_setpc_b64 s[30:31]
3032;
3033; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_1:
3034; GFX90A:       ; %bb.0:
3035; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3036; GFX90A-NEXT:    ;;#ASMSTART
3037; GFX90A-NEXT:    ; def v[2:3]
3038; GFX90A-NEXT:    ;;#ASMEND
3039; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3040; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3041; GFX90A-NEXT:    ;;#ASMSTART
3042; GFX90A-NEXT:    ; def v[0:1]
3043; GFX90A-NEXT:    ;;#ASMEND
3044; GFX90A-NEXT:    v_perm_b32 v2, v3, v3, s4
3045; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
3046; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
3047; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3048; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3049;
3050; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_1:
3051; GFX940:       ; %bb.0:
3052; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3053; GFX940-NEXT:    ;;#ASMSTART
3054; GFX940-NEXT:    ; def v[2:3]
3055; GFX940-NEXT:    ;;#ASMEND
3056; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3057; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3058; GFX940-NEXT:    ;;#ASMSTART
3059; GFX940-NEXT:    ; def v[0:1]
3060; GFX940-NEXT:    ;;#ASMEND
3061; GFX940-NEXT:    v_perm_b32 v2, v3, v3, s2
3062; GFX940-NEXT:    v_mov_b32_e32 v3, v0
3063; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
3064; GFX940-NEXT:    s_waitcnt vmcnt(0)
3065; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both inputs are opaque inline-asm results, each narrowed to its first three
  ; elements before the shuffle widens the result back to four lanes.
3066  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3067  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3068  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3069  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3070  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 1>
3071  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3072  ret void
3073}
3074
; Two-source shuffle with mask <5, 5, 0, 1>. Mask indices 0-2 pick from
; %extract3 and 3-5 pick from %extract31, so lanes 0-1 both repeat
; %extract31[2] and lanes 2-3 are %extract3[0] and %extract3[1] in order.
3075define void @v_shuffle_v4bf16_v3bf16__5_5_0_1(ptr addrspace(1) inreg %ptr) {
3076; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_1:
3077; GFX900:       ; %bb.0:
3078; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3079; GFX900-NEXT:    ;;#ASMSTART
3080; GFX900-NEXT:    ; def v[0:1]
3081; GFX900-NEXT:    ;;#ASMEND
3082; GFX900-NEXT:    ;;#ASMSTART
3083; GFX900-NEXT:    ; def v[1:2]
3084; GFX900-NEXT:    ;;#ASMEND
3085; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3086; GFX900-NEXT:    v_mov_b32_e32 v3, 0
3087; GFX900-NEXT:    v_perm_b32 v1, v2, v2, s4
3088; GFX900-NEXT:    v_mov_b32_e32 v2, v0
3089; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
3090; GFX900-NEXT:    s_waitcnt vmcnt(0)
3091; GFX900-NEXT:    s_setpc_b64 s[30:31]
3092;
3093; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_1:
3094; GFX90A:       ; %bb.0:
3095; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3096; GFX90A-NEXT:    ;;#ASMSTART
3097; GFX90A-NEXT:    ; def v[2:3]
3098; GFX90A-NEXT:    ;;#ASMEND
3099; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3100; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3101; GFX90A-NEXT:    ;;#ASMSTART
3102; GFX90A-NEXT:    ; def v[0:1]
3103; GFX90A-NEXT:    ;;#ASMEND
3104; GFX90A-NEXT:    v_perm_b32 v2, v3, v3, s4
3105; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
3106; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
3107; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3108; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3109;
3110; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_1:
3111; GFX940:       ; %bb.0:
3112; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3113; GFX940-NEXT:    ;;#ASMSTART
3114; GFX940-NEXT:    ; def v[2:3]
3115; GFX940-NEXT:    ;;#ASMEND
3116; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3117; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3118; GFX940-NEXT:    ;;#ASMSTART
3119; GFX940-NEXT:    ; def v[0:1]
3120; GFX940-NEXT:    ;;#ASMEND
3121; GFX940-NEXT:    v_perm_b32 v2, v3, v3, s2
3122; GFX940-NEXT:    v_mov_b32_e32 v3, v0
3123; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
3124; GFX940-NEXT:    s_waitcnt vmcnt(0)
3125; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both inputs are opaque inline-asm results, each narrowed to its first three
  ; elements before the shuffle widens the result back to four lanes.
3126  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3127  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3128  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3129  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3130  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 1>
3131  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3132  ret void
3133}
3134
; Two-source shuffle with mask <5, 5, 2, 1>. Mask indices 0-2 pick from
; %extract3 and 3-5 pick from %extract31, so lanes 0-1 both repeat
; %extract31[2], lane 2 is %extract3[2], and lane 3 is %extract3[1].
3135define void @v_shuffle_v4bf16_v3bf16__5_5_2_1(ptr addrspace(1) inreg %ptr) {
3136; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_1:
3137; GFX900:       ; %bb.0:
3138; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3139; GFX900-NEXT:    ;;#ASMSTART
3140; GFX900-NEXT:    ; def v[0:1]
3141; GFX900-NEXT:    ;;#ASMEND
3142; GFX900-NEXT:    s_mov_b32 s4, 0xffff
3143; GFX900-NEXT:    v_bfi_b32 v1, s4, v1, v0
3144; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3145; GFX900-NEXT:    v_mov_b32_e32 v4, 0
3146; GFX900-NEXT:    ;;#ASMSTART
3147; GFX900-NEXT:    ; def v[2:3]
3148; GFX900-NEXT:    ;;#ASMEND
3149; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
3150; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3151; GFX900-NEXT:    s_waitcnt vmcnt(0)
3152; GFX900-NEXT:    s_setpc_b64 s[30:31]
3153;
3154; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_1:
3155; GFX90A:       ; %bb.0:
3156; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3157; GFX90A-NEXT:    ;;#ASMSTART
3158; GFX90A-NEXT:    ; def v[0:1]
3159; GFX90A-NEXT:    ;;#ASMEND
3160; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
3161; GFX90A-NEXT:    v_bfi_b32 v1, s4, v1, v0
3162; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3163; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3164; GFX90A-NEXT:    ;;#ASMSTART
3165; GFX90A-NEXT:    ; def v[2:3]
3166; GFX90A-NEXT:    ;;#ASMEND
3167; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
3168; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3169; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3170; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3171;
3172; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_1:
3173; GFX940:       ; %bb.0:
3174; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3175; GFX940-NEXT:    ;;#ASMSTART
3176; GFX940-NEXT:    ; def v[0:1]
3177; GFX940-NEXT:    ;;#ASMEND
3178; GFX940-NEXT:    s_mov_b32 s2, 0xffff
3179; GFX940-NEXT:    v_bfi_b32 v1, s2, v1, v0
3180; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3181; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3182; GFX940-NEXT:    ;;#ASMSTART
3183; GFX940-NEXT:    ; def v[2:3]
3184; GFX940-NEXT:    ;;#ASMEND
3185; GFX940-NEXT:    s_nop 0
3186; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
3187; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
3188; GFX940-NEXT:    s_waitcnt vmcnt(0)
3189; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both inputs are opaque inline-asm results, each narrowed to its first three
  ; elements before the shuffle widens the result back to four lanes.
3190  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3191  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3192  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3193  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3194  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 1>
3195  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3196  ret void
3197}
3198
; Two-source shuffle with mask <5, 5, 3, 1>. Mask indices 0-2 pick from
; %extract3 and 3-5 pick from %extract31, so lanes 0-1 both repeat
; %extract31[2], lane 2 is %extract31[0], and lane 3 is %extract3[1].
3199define void @v_shuffle_v4bf16_v3bf16__5_5_3_1(ptr addrspace(1) inreg %ptr) {
3200; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_1:
3201; GFX900:       ; %bb.0:
3202; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3203; GFX900-NEXT:    ;;#ASMSTART
3204; GFX900-NEXT:    ; def v[0:1]
3205; GFX900-NEXT:    ;;#ASMEND
3206; GFX900-NEXT:    ;;#ASMSTART
3207; GFX900-NEXT:    ; def v[1:2]
3208; GFX900-NEXT:    ;;#ASMEND
3209; GFX900-NEXT:    s_mov_b32 s4, 0xffff
3210; GFX900-NEXT:    v_bfi_b32 v1, s4, v1, v0
3211; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3212; GFX900-NEXT:    v_mov_b32_e32 v3, 0
3213; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
3214; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
3215; GFX900-NEXT:    s_waitcnt vmcnt(0)
3216; GFX900-NEXT:    s_setpc_b64 s[30:31]
3217;
3218; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_1:
3219; GFX90A:       ; %bb.0:
3220; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3221; GFX90A-NEXT:    ;;#ASMSTART
3222; GFX90A-NEXT:    ; def v[0:1]
3223; GFX90A-NEXT:    ;;#ASMEND
3224; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
3225; GFX90A-NEXT:    ;;#ASMSTART
3226; GFX90A-NEXT:    ; def v[2:3]
3227; GFX90A-NEXT:    ;;#ASMEND
3228; GFX90A-NEXT:    v_bfi_b32 v1, s4, v2, v0
3229; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3230; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3231; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
3232; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3233; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3234; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3235;
3236; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_1:
3237; GFX940:       ; %bb.0:
3238; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3239; GFX940-NEXT:    ;;#ASMSTART
3240; GFX940-NEXT:    ; def v[0:1]
3241; GFX940-NEXT:    ;;#ASMEND
3242; GFX940-NEXT:    s_mov_b32 s2, 0xffff
3243; GFX940-NEXT:    ;;#ASMSTART
3244; GFX940-NEXT:    ; def v[2:3]
3245; GFX940-NEXT:    ;;#ASMEND
3246; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3247; GFX940-NEXT:    v_bfi_b32 v1, s2, v2, v0
3248; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3249; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
3250; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
3251; GFX940-NEXT:    s_waitcnt vmcnt(0)
3252; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both inputs are opaque inline-asm results, each narrowed to its first three
  ; elements before the shuffle widens the result back to four lanes.
3253  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3254  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3255  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3256  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3257  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 1>
3258  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3259  ret void
3260}
3261
; Two-source shuffle with mask <5, 5, 4, 1>. Mask indices 0-2 pick from
; %extract3 and 3-5 pick from %extract31, so lanes 0-1 both repeat
; %extract31[2], lane 2 is %extract31[1], and lane 3 is %extract3[1].
3262define void @v_shuffle_v4bf16_v3bf16__5_5_4_1(ptr addrspace(1) inreg %ptr) {
3263; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_1:
3264; GFX900:       ; %bb.0:
3265; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3266; GFX900-NEXT:    ;;#ASMSTART
3267; GFX900-NEXT:    ; def v[0:1]
3268; GFX900-NEXT:    ;;#ASMEND
3269; GFX900-NEXT:    ;;#ASMSTART
3270; GFX900-NEXT:    ; def v[1:2]
3271; GFX900-NEXT:    ;;#ASMEND
3272; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
3273; GFX900-NEXT:    v_perm_b32 v1, v0, v1, s4
3274; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3275; GFX900-NEXT:    v_mov_b32_e32 v3, 0
3276; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
3277; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
3278; GFX900-NEXT:    s_waitcnt vmcnt(0)
3279; GFX900-NEXT:    s_setpc_b64 s[30:31]
3280;
3281; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_1:
3282; GFX90A:       ; %bb.0:
3283; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3284; GFX90A-NEXT:    ;;#ASMSTART
3285; GFX90A-NEXT:    ; def v[0:1]
3286; GFX90A-NEXT:    ;;#ASMEND
3287; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
3288; GFX90A-NEXT:    ;;#ASMSTART
3289; GFX90A-NEXT:    ; def v[2:3]
3290; GFX90A-NEXT:    ;;#ASMEND
3291; GFX90A-NEXT:    v_perm_b32 v1, v0, v2, s4
3292; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3293; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3294; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
3295; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3296; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3297; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3298;
3299; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_1:
3300; GFX940:       ; %bb.0:
3301; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3302; GFX940-NEXT:    ;;#ASMSTART
3303; GFX940-NEXT:    ; def v[0:1]
3304; GFX940-NEXT:    ;;#ASMEND
3305; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
3306; GFX940-NEXT:    ;;#ASMSTART
3307; GFX940-NEXT:    ; def v[2:3]
3308; GFX940-NEXT:    ;;#ASMEND
3309; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3310; GFX940-NEXT:    v_perm_b32 v1, v0, v2, s2
3311; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3312; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
3313; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
3314; GFX940-NEXT:    s_waitcnt vmcnt(0)
3315; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both inputs are opaque inline-asm results, each narrowed to its first three
  ; elements before the shuffle widens the result back to four lanes.
3316  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3317  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3318  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3319  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3320  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 1>
3321  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3322  ret void
3323}
3324
; Single-source shuffle with mask <poison, 2, 2, 2>: lane 0 is unconstrained
; and lanes 1-3 all repeat %extract3[2]. The second shuffle operand is poison.
3325define void @v_shuffle_v4bf16_v3bf16__u_2_2_2(ptr addrspace(1) inreg %ptr) {
3326; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__u_2_2_2:
3327; GFX900:       ; %bb.0:
3328; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3329; GFX900-NEXT:    ;;#ASMSTART
3330; GFX900-NEXT:    ; def v[0:1]
3331; GFX900-NEXT:    ;;#ASMEND
3332; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3333; GFX900-NEXT:    v_mov_b32_e32 v3, 0
3334; GFX900-NEXT:    v_perm_b32 v2, v1, v1, s4
3335; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3336; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
3337; GFX900-NEXT:    s_waitcnt vmcnt(0)
3338; GFX900-NEXT:    s_setpc_b64 s[30:31]
3339;
3340; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__u_2_2_2:
3341; GFX90A:       ; %bb.0:
3342; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3343; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3344; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3345; GFX90A-NEXT:    ;;#ASMSTART
3346; GFX90A-NEXT:    ; def v[0:1]
3347; GFX90A-NEXT:    ;;#ASMEND
3348; GFX90A-NEXT:    v_perm_b32 v3, v1, v1, s4
3349; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
3350; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
3351; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3352; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3353;
3354; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__u_2_2_2:
3355; GFX940:       ; %bb.0:
3356; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3357; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3358; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3359; GFX940-NEXT:    ;;#ASMSTART
3360; GFX940-NEXT:    ; def v[0:1]
3361; GFX940-NEXT:    ;;#ASMEND
3362; GFX940-NEXT:    s_nop 0
3363; GFX940-NEXT:    v_perm_b32 v3, v1, v1, s2
3364; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
3365; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
3366; GFX940-NEXT:    s_waitcnt vmcnt(0)
3367; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; The input is an opaque inline-asm result narrowed to its first three
  ; elements before the shuffle widens the result back to four lanes.
3368  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3369  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3370  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2>
3371  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3372  ret void
3373}
3374
; Single-source shuffle with mask <0, 2, 2, 2>: lane 0 is %extract3[0] and
; lanes 1-3 all repeat %extract3[2]. The second shuffle operand is poison.
3375define void @v_shuffle_v4bf16_v3bf16__0_2_2_2(ptr addrspace(1) inreg %ptr) {
3376; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_2_2_2:
3377; GFX900:       ; %bb.0:
3378; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3379; GFX900-NEXT:    ;;#ASMSTART
3380; GFX900-NEXT:    ; def v[0:1]
3381; GFX900-NEXT:    ;;#ASMEND
3382; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3383; GFX900-NEXT:    v_mov_b32_e32 v2, 0
3384; GFX900-NEXT:    v_perm_b32 v0, v1, v0, s4
3385; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
3386; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
3387; GFX900-NEXT:    s_waitcnt vmcnt(0)
3388; GFX900-NEXT:    s_setpc_b64 s[30:31]
3389;
3390; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_2_2_2:
3391; GFX90A:       ; %bb.0:
3392; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3393; GFX90A-NEXT:    ;;#ASMSTART
3394; GFX90A-NEXT:    ; def v[0:1]
3395; GFX90A-NEXT:    ;;#ASMEND
3396; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3397; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
3398; GFX90A-NEXT:    v_perm_b32 v0, v1, v0, s4
3399; GFX90A-NEXT:    v_perm_b32 v1, v1, v1, s4
3400; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
3401; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3402; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3403;
3404; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_2_2_2:
3405; GFX940:       ; %bb.0:
3406; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3407; GFX940-NEXT:    ;;#ASMSTART
3408; GFX940-NEXT:    ; def v[0:1]
3409; GFX940-NEXT:    ;;#ASMEND
3410; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3411; GFX940-NEXT:    v_mov_b32_e32 v2, 0
3412; GFX940-NEXT:    v_perm_b32 v0, v1, v0, s2
3413; GFX940-NEXT:    v_perm_b32 v1, v1, v1, s2
3414; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
3415; GFX940-NEXT:    s_waitcnt vmcnt(0)
3416; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; The input is an opaque inline-asm result narrowed to its first three
  ; elements before the shuffle widens the result back to four lanes.
3417  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3418  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3419  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2>
3420  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3421  ret void
3422}
3423
; Single-source shuffle with mask <1, 2, 2, 2>: lane 0 is %extract3[1] and
; lanes 1-3 all repeat %extract3[2]. The second shuffle operand is poison.
3424define void @v_shuffle_v4bf16_v3bf16__1_2_2_2(ptr addrspace(1) inreg %ptr) {
3425; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_2_2_2:
3426; GFX900:       ; %bb.0:
3427; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3428; GFX900-NEXT:    ;;#ASMSTART
3429; GFX900-NEXT:    ; def v[0:1]
3430; GFX900-NEXT:    ;;#ASMEND
3431; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3432; GFX900-NEXT:    v_mov_b32_e32 v3, 0
3433; GFX900-NEXT:    v_perm_b32 v2, v1, v1, s4
3434; GFX900-NEXT:    v_alignbit_b32 v1, v1, v0, 16
3435; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
3436; GFX900-NEXT:    s_waitcnt vmcnt(0)
3437; GFX900-NEXT:    s_setpc_b64 s[30:31]
3438;
3439; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_2_2_2:
3440; GFX90A:       ; %bb.0:
3441; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3442; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3443; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3444; GFX90A-NEXT:    ;;#ASMSTART
3445; GFX90A-NEXT:    ; def v[0:1]
3446; GFX90A-NEXT:    ;;#ASMEND
3447; GFX90A-NEXT:    v_perm_b32 v3, v1, v1, s4
3448; GFX90A-NEXT:    v_alignbit_b32 v2, v1, v0, 16
3449; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
3450; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3451; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3452;
3453; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_2_2_2:
3454; GFX940:       ; %bb.0:
3455; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3456; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3457; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3458; GFX940-NEXT:    ;;#ASMSTART
3459; GFX940-NEXT:    ; def v[0:1]
3460; GFX940-NEXT:    ;;#ASMEND
3461; GFX940-NEXT:    s_nop 0
3462; GFX940-NEXT:    v_perm_b32 v3, v1, v1, s2
3463; GFX940-NEXT:    v_alignbit_b32 v2, v1, v0, 16
3464; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
3465; GFX940-NEXT:    s_waitcnt vmcnt(0)
3466; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; The input is an opaque inline-asm result narrowed to its first three
  ; elements before the shuffle widens the result back to four lanes.
3467  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3468  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3469  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
3470  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3471  ret void
3472}
3473
; Single-source shuffle with mask <2, 2, 2, 2>: all four lanes repeat
; %extract3[2] (a splat). The second shuffle operand is poison.
3474define void @v_shuffle_v4bf16_v3bf16__2_2_2_2(ptr addrspace(1) inreg %ptr) {
3475; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_2_2_2:
3476; GFX900:       ; %bb.0:
3477; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3478; GFX900-NEXT:    ;;#ASMSTART
3479; GFX900-NEXT:    ; def v[0:1]
3480; GFX900-NEXT:    ;;#ASMEND
3481; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3482; GFX900-NEXT:    v_perm_b32 v0, v1, v1, s4
3483; GFX900-NEXT:    v_mov_b32_e32 v2, 0
3484; GFX900-NEXT:    v_mov_b32_e32 v1, v0
3485; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
3486; GFX900-NEXT:    s_waitcnt vmcnt(0)
3487; GFX900-NEXT:    s_setpc_b64 s[30:31]
3488;
3489; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_2_2_2:
3490; GFX90A:       ; %bb.0:
3491; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3492; GFX90A-NEXT:    ;;#ASMSTART
3493; GFX90A-NEXT:    ; def v[0:1]
3494; GFX90A-NEXT:    ;;#ASMEND
3495; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3496; GFX90A-NEXT:    v_perm_b32 v0, v1, v1, s4
3497; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
3498; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
3499; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
3500; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3501; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3502;
3503; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_2_2_2:
3504; GFX940:       ; %bb.0:
3505; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3506; GFX940-NEXT:    ;;#ASMSTART
3507; GFX940-NEXT:    ; def v[0:1]
3508; GFX940-NEXT:    ;;#ASMEND
3509; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3510; GFX940-NEXT:    v_perm_b32 v0, v1, v1, s2
3511; GFX940-NEXT:    v_mov_b32_e32 v2, 0
3512; GFX940-NEXT:    v_mov_b32_e32 v1, v0
3513; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
3514; GFX940-NEXT:    s_waitcnt vmcnt(0)
3515; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; The input is an opaque inline-asm result narrowed to its first three
  ; elements before the shuffle widens the result back to four lanes.
3516  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3517  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3518  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
3519  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3520  ret void
3521}
3522
; Single-source shuffle with mask <3, 2, 2, 2>. Index 3 addresses element 0 of
; the second shuffle operand, which is poison here, so lane 0 is unconstrained;
; lanes 1-3 all repeat %extract3[2]. The expected instruction sequences match
; those of the <poison, 2, 2, 2> variant of this test.
3523define void @v_shuffle_v4bf16_v3bf16__3_2_2_2(ptr addrspace(1) inreg %ptr) {
3524; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__3_2_2_2:
3525; GFX900:       ; %bb.0:
3526; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3527; GFX900-NEXT:    ;;#ASMSTART
3528; GFX900-NEXT:    ; def v[0:1]
3529; GFX900-NEXT:    ;;#ASMEND
3530; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3531; GFX900-NEXT:    v_mov_b32_e32 v3, 0
3532; GFX900-NEXT:    v_perm_b32 v2, v1, v1, s4
3533; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3534; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
3535; GFX900-NEXT:    s_waitcnt vmcnt(0)
3536; GFX900-NEXT:    s_setpc_b64 s[30:31]
3537;
3538; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__3_2_2_2:
3539; GFX90A:       ; %bb.0:
3540; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3541; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3542; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3543; GFX90A-NEXT:    ;;#ASMSTART
3544; GFX90A-NEXT:    ; def v[0:1]
3545; GFX90A-NEXT:    ;;#ASMEND
3546; GFX90A-NEXT:    v_perm_b32 v3, v1, v1, s4
3547; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
3548; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
3549; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3550; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3551;
3552; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__3_2_2_2:
3553; GFX940:       ; %bb.0:
3554; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3555; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3556; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3557; GFX940-NEXT:    ;;#ASMSTART
3558; GFX940-NEXT:    ; def v[0:1]
3559; GFX940-NEXT:    ;;#ASMEND
3560; GFX940-NEXT:    s_nop 0
3561; GFX940-NEXT:    v_perm_b32 v3, v1, v1, s2
3562; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
3563; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
3564; GFX940-NEXT:    s_waitcnt vmcnt(0)
3565; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; The input is an opaque inline-asm result narrowed to its first three
  ; elements before the shuffle widens the result back to four lanes.
3566  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3567  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3568  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
3569  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3570  ret void
3571}
3572
; Two-source shuffle with mask <4, 2, 2, 2>. Mask indices 0-2 pick from
; %extract3 and 3-5 pick from %extract31, so lane 0 is %extract31[1] and
; lanes 1-3 all repeat %extract3[2].
3573define void @v_shuffle_v4bf16_v3bf16__4_2_2_2(ptr addrspace(1) inreg %ptr) {
3574; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_2_2_2:
3575; GFX900:       ; %bb.0:
3576; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3577; GFX900-NEXT:    ;;#ASMSTART
3578; GFX900-NEXT:    ; def v[2:3]
3579; GFX900-NEXT:    ;;#ASMEND
3580; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3581; GFX900-NEXT:    v_mov_b32_e32 v4, 0
3582; GFX900-NEXT:    ;;#ASMSTART
3583; GFX900-NEXT:    ; def v[0:1]
3584; GFX900-NEXT:    ;;#ASMEND
3585; GFX900-NEXT:    v_perm_b32 v3, v1, v1, s4
3586; GFX900-NEXT:    v_alignbit_b32 v2, v1, v2, 16
3587; GFX900-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
3588; GFX900-NEXT:    s_waitcnt vmcnt(0)
3589; GFX900-NEXT:    s_setpc_b64 s[30:31]
3590;
3591; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_2_2_2:
3592; GFX90A:       ; %bb.0:
3593; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3594; GFX90A-NEXT:    ;;#ASMSTART
3595; GFX90A-NEXT:    ; def v[2:3]
3596; GFX90A-NEXT:    ;;#ASMEND
3597; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3598; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3599; GFX90A-NEXT:    ;;#ASMSTART
3600; GFX90A-NEXT:    ; def v[0:1]
3601; GFX90A-NEXT:    ;;#ASMEND
3602; GFX90A-NEXT:    v_perm_b32 v3, v1, v1, s4
3603; GFX90A-NEXT:    v_alignbit_b32 v2, v1, v2, 16
3604; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
3605; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3606; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3607;
3608; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_2_2_2:
3609; GFX940:       ; %bb.0:
3610; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3611; GFX940-NEXT:    ;;#ASMSTART
3612; GFX940-NEXT:    ; def v[2:3]
3613; GFX940-NEXT:    ;;#ASMEND
3614; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3615; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3616; GFX940-NEXT:    ;;#ASMSTART
3617; GFX940-NEXT:    ; def v[0:1]
3618; GFX940-NEXT:    ;;#ASMEND
3619; GFX940-NEXT:    s_nop 0
3620; GFX940-NEXT:    v_perm_b32 v3, v1, v1, s2
3621; GFX940-NEXT:    v_alignbit_b32 v2, v1, v2, 16
3622; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
3623; GFX940-NEXT:    s_waitcnt vmcnt(0)
3624; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both inputs are opaque inline-asm results, each narrowed to its first three
  ; elements before the shuffle widens the result back to four lanes.
3625  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3626  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
3627  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3628  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3629  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 2, i32 2, i32 2>
3630  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3631  ret void
3632}
3633
; Shuffle of two <3 x bfloat> values into <4 x bfloat> with mask <5, 2, 2, 2>.
; Mask indices 0-2 read %extract3 (= %vec0[0..2]); indices 3-5 read %extract31
; (= %vec1[0..2]). Stored result: { %vec1[2], %vec0[2], %vec0[2], %vec0[2] }.
3634define void @v_shuffle_v4bf16_v3bf16__5_2_2_2(ptr addrspace(1) inreg %ptr) {
3635; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_2_2:
3636; GFX900:       ; %bb.0:
3637; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3638; GFX900-NEXT:    ;;#ASMSTART
3639; GFX900-NEXT:    ; def v[0:1]
3640; GFX900-NEXT:    ;;#ASMEND
3641; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3642; GFX900-NEXT:    v_mov_b32_e32 v4, 0
3643; GFX900-NEXT:    ;;#ASMSTART
3644; GFX900-NEXT:    ; def v[2:3]
3645; GFX900-NEXT:    ;;#ASMEND
3646; GFX900-NEXT:    v_perm_b32 v0, v1, v3, s4
3647; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
3648; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3649; GFX900-NEXT:    s_waitcnt vmcnt(0)
3650; GFX900-NEXT:    s_setpc_b64 s[30:31]
3651;
3652; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_2_2:
3653; GFX90A:       ; %bb.0:
3654; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3655; GFX90A-NEXT:    ;;#ASMSTART
3656; GFX90A-NEXT:    ; def v[0:1]
3657; GFX90A-NEXT:    ;;#ASMEND
3658; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3659; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3660; GFX90A-NEXT:    ;;#ASMSTART
3661; GFX90A-NEXT:    ; def v[2:3]
3662; GFX90A-NEXT:    ;;#ASMEND
3663; GFX90A-NEXT:    v_perm_b32 v0, v1, v3, s4
3664; GFX90A-NEXT:    v_perm_b32 v1, v1, v1, s4
3665; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3666; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3667; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3668;
3669; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_2_2:
3670; GFX940:       ; %bb.0:
3671; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3672; GFX940-NEXT:    ;;#ASMSTART
3673; GFX940-NEXT:    ; def v[0:1]
3674; GFX940-NEXT:    ;;#ASMEND
3675; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3676; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3677; GFX940-NEXT:    ;;#ASMSTART
3678; GFX940-NEXT:    ; def v[2:3]
3679; GFX940-NEXT:    ;;#ASMEND
3680; GFX940-NEXT:    s_nop 0
3681; GFX940-NEXT:    v_perm_b32 v0, v1, v3, s2
3682; GFX940-NEXT:    v_perm_b32 v1, v1, v1, s2
3683; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
3684; GFX940-NEXT:    s_waitcnt vmcnt(0)
3685; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both source vectors are defined by inline asm through a "=v" output operand.
3686  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3687  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  ; Keep lanes 0-2 of each source to form the <3 x bfloat> shuffle operands.
3688  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3689  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Lane i of %shuf is element mask[i] of the concatenation (%extract3, %extract31).
3690  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 2, i32 2>
3691  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3692  ret void
3693}
3694
; Shuffle of two <3 x bfloat> values into <4 x bfloat> with mask <5, poison, 2, 2>.
; Mask indices 0-2 read %extract3 (= %vec0[0..2]); indices 3-5 read %extract31
; (= %vec1[0..2]). Stored result: { %vec1[2], poison, %vec0[2], %vec0[2] };
; lane 1 has a poison mask element, so its stored value is unconstrained.
3695define void @v_shuffle_v4bf16_v3bf16__5_u_2_2(ptr addrspace(1) inreg %ptr) {
3696; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_2_2:
3697; GFX900:       ; %bb.0:
3698; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3699; GFX900-NEXT:    ;;#ASMSTART
3700; GFX900-NEXT:    ; def v[0:1]
3701; GFX900-NEXT:    ;;#ASMEND
3702; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3703; GFX900-NEXT:    v_mov_b32_e32 v4, 0
3704; GFX900-NEXT:    ;;#ASMSTART
3705; GFX900-NEXT:    ; def v[2:3]
3706; GFX900-NEXT:    ;;#ASMEND
3707; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
3708; GFX900-NEXT:    v_mov_b32_e32 v0, v3
3709; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3710; GFX900-NEXT:    s_waitcnt vmcnt(0)
3711; GFX900-NEXT:    s_setpc_b64 s[30:31]
3712;
3713; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_2_2:
3714; GFX90A:       ; %bb.0:
3715; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3716; GFX90A-NEXT:    ;;#ASMSTART
3717; GFX90A-NEXT:    ; def v[0:1]
3718; GFX90A-NEXT:    ;;#ASMEND
3719; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3720; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3721; GFX90A-NEXT:    ;;#ASMSTART
3722; GFX90A-NEXT:    ; def v[2:3]
3723; GFX90A-NEXT:    ;;#ASMEND
3724; GFX90A-NEXT:    v_perm_b32 v1, v1, v1, s4
3725; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
3726; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3727; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3728; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3729;
3730; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_2_2:
3731; GFX940:       ; %bb.0:
3732; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3733; GFX940-NEXT:    ;;#ASMSTART
3734; GFX940-NEXT:    ; def v[0:1]
3735; GFX940-NEXT:    ;;#ASMEND
3736; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3737; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3738; GFX940-NEXT:    ;;#ASMSTART
3739; GFX940-NEXT:    ; def v[2:3]
3740; GFX940-NEXT:    ;;#ASMEND
3741; GFX940-NEXT:    v_perm_b32 v1, v1, v1, s2
3742; GFX940-NEXT:    v_mov_b32_e32 v0, v3
3743; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
3744; GFX940-NEXT:    s_waitcnt vmcnt(0)
3745; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both source vectors are defined by inline asm through a "=v" output operand.
3746  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3747  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  ; Keep lanes 0-2 of each source to form the <3 x bfloat> shuffle operands.
3748  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3749  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Lane i of %shuf is element mask[i] of the concatenation (%extract3, %extract31).
3750  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 2, i32 2>
3751  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3752  ret void
3753}
3754
; Shuffle of two <3 x bfloat> values into <4 x bfloat> with mask <5, 0, 2, 2>.
; Mask indices 0-2 read %extract3 (= %vec0[0..2]); indices 3-5 read %extract31
; (= %vec1[0..2]). Stored result: { %vec1[2], %vec0[0], %vec0[2], %vec0[2] }.
3755define void @v_shuffle_v4bf16_v3bf16__5_0_2_2(ptr addrspace(1) inreg %ptr) {
3756; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_2_2:
3757; GFX900:       ; %bb.0:
3758; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3759; GFX900-NEXT:    ;;#ASMSTART
3760; GFX900-NEXT:    ; def v[0:1]
3761; GFX900-NEXT:    ;;#ASMEND
3762; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3763; GFX900-NEXT:    v_mov_b32_e32 v4, 0
3764; GFX900-NEXT:    ;;#ASMSTART
3765; GFX900-NEXT:    ; def v[2:3]
3766; GFX900-NEXT:    ;;#ASMEND
3767; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
3768; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
3769; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3770; GFX900-NEXT:    s_waitcnt vmcnt(0)
3771; GFX900-NEXT:    s_setpc_b64 s[30:31]
3772;
3773; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_2_2:
3774; GFX90A:       ; %bb.0:
3775; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3776; GFX90A-NEXT:    ;;#ASMSTART
3777; GFX90A-NEXT:    ; def v[0:1]
3778; GFX90A-NEXT:    ;;#ASMEND
3779; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3780; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3781; GFX90A-NEXT:    ;;#ASMSTART
3782; GFX90A-NEXT:    ; def v[2:3]
3783; GFX90A-NEXT:    ;;#ASMEND
3784; GFX90A-NEXT:    v_perm_b32 v0, v0, v3, s4
3785; GFX90A-NEXT:    v_perm_b32 v1, v1, v1, s4
3786; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3787; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3788; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3789;
3790; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_2_2:
3791; GFX940:       ; %bb.0:
3792; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3793; GFX940-NEXT:    ;;#ASMSTART
3794; GFX940-NEXT:    ; def v[0:1]
3795; GFX940-NEXT:    ;;#ASMEND
3796; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3797; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3798; GFX940-NEXT:    ;;#ASMSTART
3799; GFX940-NEXT:    ; def v[2:3]
3800; GFX940-NEXT:    ;;#ASMEND
3801; GFX940-NEXT:    v_perm_b32 v1, v1, v1, s2
3802; GFX940-NEXT:    v_perm_b32 v0, v0, v3, s2
3803; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
3804; GFX940-NEXT:    s_waitcnt vmcnt(0)
3805; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both source vectors are defined by inline asm through a "=v" output operand.
3806  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3807  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  ; Keep lanes 0-2 of each source to form the <3 x bfloat> shuffle operands.
3808  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3809  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Lane i of %shuf is element mask[i] of the concatenation (%extract3, %extract31).
3810  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 2, i32 2>
3811  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3812  ret void
3813}
3814
; Shuffle of two <3 x bfloat> values into <4 x bfloat> with mask <5, 1, 2, 2>.
; Mask indices 0-2 read %extract3 (= %vec0[0..2]); indices 3-5 read %extract31
; (= %vec1[0..2]). Stored result: { %vec1[2], %vec0[1], %vec0[2], %vec0[2] }.
3815define void @v_shuffle_v4bf16_v3bf16__5_1_2_2(ptr addrspace(1) inreg %ptr) {
3816; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_2_2:
3817; GFX900:       ; %bb.0:
3818; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3819; GFX900-NEXT:    ;;#ASMSTART
3820; GFX900-NEXT:    ; def v[0:1]
3821; GFX900-NEXT:    ;;#ASMEND
3822; GFX900-NEXT:    s_mov_b32 s4, 0xffff
3823; GFX900-NEXT:    ;;#ASMSTART
3824; GFX900-NEXT:    ; def v[2:3]
3825; GFX900-NEXT:    ;;#ASMEND
3826; GFX900-NEXT:    v_bfi_b32 v0, s4, v3, v0
3827; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3828; GFX900-NEXT:    v_mov_b32_e32 v4, 0
3829; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
3830; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3831; GFX900-NEXT:    s_waitcnt vmcnt(0)
3832; GFX900-NEXT:    s_setpc_b64 s[30:31]
3833;
3834; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_2_2:
3835; GFX90A:       ; %bb.0:
3836; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3837; GFX90A-NEXT:    ;;#ASMSTART
3838; GFX90A-NEXT:    ; def v[0:1]
3839; GFX90A-NEXT:    ;;#ASMEND
3840; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
3841; GFX90A-NEXT:    ;;#ASMSTART
3842; GFX90A-NEXT:    ; def v[2:3]
3843; GFX90A-NEXT:    ;;#ASMEND
3844; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v0
3845; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3846; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3847; GFX90A-NEXT:    v_perm_b32 v1, v1, v1, s4
3848; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3849; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3850; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3851;
3852; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_2_2:
3853; GFX940:       ; %bb.0:
3854; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3855; GFX940-NEXT:    ;;#ASMSTART
3856; GFX940-NEXT:    ; def v[0:1]
3857; GFX940-NEXT:    ;;#ASMEND
3858; GFX940-NEXT:    s_mov_b32 s2, 0xffff
3859; GFX940-NEXT:    ;;#ASMSTART
3860; GFX940-NEXT:    ; def v[2:3]
3861; GFX940-NEXT:    ;;#ASMEND
3862; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3863; GFX940-NEXT:    v_bfi_b32 v0, s2, v3, v0
3864; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3865; GFX940-NEXT:    v_perm_b32 v1, v1, v1, s2
3866; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
3867; GFX940-NEXT:    s_waitcnt vmcnt(0)
3868; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both source vectors are defined by inline asm through a "=v" output operand.
3869  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3870  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  ; Keep lanes 0-2 of each source to form the <3 x bfloat> shuffle operands.
3871  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3872  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Lane i of %shuf is element mask[i] of the concatenation (%extract3, %extract31).
3873  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 2, i32 2>
3874  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3875  ret void
3876}
3877
; Shuffle of two <3 x bfloat> values into <4 x bfloat> with mask <5, 3, 2, 2>.
; Mask indices 0-2 read %extract3 (= %vec0[0..2]); indices 3-5 read %extract31
; (= %vec1[0..2]). Stored result: { %vec1[2], %vec1[0], %vec0[2], %vec0[2] }.
3878define void @v_shuffle_v4bf16_v3bf16__5_3_2_2(ptr addrspace(1) inreg %ptr) {
3879; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_2_2:
3880; GFX900:       ; %bb.0:
3881; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3882; GFX900-NEXT:    ;;#ASMSTART
3883; GFX900-NEXT:    ; def v[0:1]
3884; GFX900-NEXT:    ;;#ASMEND
3885; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3886; GFX900-NEXT:    v_mov_b32_e32 v4, 0
3887; GFX900-NEXT:    ;;#ASMSTART
3888; GFX900-NEXT:    ; def v[2:3]
3889; GFX900-NEXT:    ;;#ASMEND
3890; GFX900-NEXT:    v_perm_b32 v0, v2, v3, s4
3891; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
3892; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3893; GFX900-NEXT:    s_waitcnt vmcnt(0)
3894; GFX900-NEXT:    s_setpc_b64 s[30:31]
3895;
3896; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_2_2:
3897; GFX90A:       ; %bb.0:
3898; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3899; GFX90A-NEXT:    ;;#ASMSTART
3900; GFX90A-NEXT:    ; def v[0:1]
3901; GFX90A-NEXT:    ;;#ASMEND
3902; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3903; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3904; GFX90A-NEXT:    ;;#ASMSTART
3905; GFX90A-NEXT:    ; def v[2:3]
3906; GFX90A-NEXT:    ;;#ASMEND
3907; GFX90A-NEXT:    v_perm_b32 v0, v2, v3, s4
3908; GFX90A-NEXT:    v_perm_b32 v1, v1, v1, s4
3909; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3910; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3911; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3912;
3913; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_2_2:
3914; GFX940:       ; %bb.0:
3915; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3916; GFX940-NEXT:    ;;#ASMSTART
3917; GFX940-NEXT:    ; def v[0:1]
3918; GFX940-NEXT:    ;;#ASMEND
3919; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3920; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3921; GFX940-NEXT:    ;;#ASMSTART
3922; GFX940-NEXT:    ; def v[2:3]
3923; GFX940-NEXT:    ;;#ASMEND
3924; GFX940-NEXT:    v_perm_b32 v1, v1, v1, s2
3925; GFX940-NEXT:    v_perm_b32 v0, v2, v3, s2
3926; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
3927; GFX940-NEXT:    s_waitcnt vmcnt(0)
3928; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both source vectors are defined by inline asm through a "=v" output operand.
3929  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3930  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  ; Keep lanes 0-2 of each source to form the <3 x bfloat> shuffle operands.
3931  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3932  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Lane i of %shuf is element mask[i] of the concatenation (%extract3, %extract31).
3933  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 2, i32 2>
3934  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3935  ret void
3936}
3937
; Shuffle of two <3 x bfloat> values into <4 x bfloat> with mask <5, 4, 2, 2>.
; Mask indices 0-2 read %extract3 (= %vec0[0..2]); indices 3-5 read %extract31
; (= %vec1[0..2]). Stored result: { %vec1[2], %vec1[1], %vec0[2], %vec0[2] }.
3938define void @v_shuffle_v4bf16_v3bf16__5_4_2_2(ptr addrspace(1) inreg %ptr) {
3939; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_2_2:
3940; GFX900:       ; %bb.0:
3941; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3942; GFX900-NEXT:    ;;#ASMSTART
3943; GFX900-NEXT:    ; def v[0:1]
3944; GFX900-NEXT:    ;;#ASMEND
3945; GFX900-NEXT:    s_mov_b32 s4, 0xffff
3946; GFX900-NEXT:    ;;#ASMSTART
3947; GFX900-NEXT:    ; def v[2:3]
3948; GFX900-NEXT:    ;;#ASMEND
3949; GFX900-NEXT:    v_bfi_b32 v0, s4, v3, v2
3950; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
3951; GFX900-NEXT:    v_mov_b32_e32 v4, 0
3952; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
3953; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3954; GFX900-NEXT:    s_waitcnt vmcnt(0)
3955; GFX900-NEXT:    s_setpc_b64 s[30:31]
3956;
3957; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_2_2:
3958; GFX90A:       ; %bb.0:
3959; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3960; GFX90A-NEXT:    ;;#ASMSTART
3961; GFX90A-NEXT:    ; def v[0:1]
3962; GFX90A-NEXT:    ;;#ASMEND
3963; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
3964; GFX90A-NEXT:    ;;#ASMSTART
3965; GFX90A-NEXT:    ; def v[2:3]
3966; GFX90A-NEXT:    ;;#ASMEND
3967; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v2
3968; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
3969; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3970; GFX90A-NEXT:    v_perm_b32 v1, v1, v1, s4
3971; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
3972; GFX90A-NEXT:    s_waitcnt vmcnt(0)
3973; GFX90A-NEXT:    s_setpc_b64 s[30:31]
3974;
3975; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_2_2:
3976; GFX940:       ; %bb.0:
3977; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3978; GFX940-NEXT:    ;;#ASMSTART
3979; GFX940-NEXT:    ; def v[0:1]
3980; GFX940-NEXT:    ;;#ASMEND
3981; GFX940-NEXT:    s_mov_b32 s2, 0xffff
3982; GFX940-NEXT:    ;;#ASMSTART
3983; GFX940-NEXT:    ; def v[2:3]
3984; GFX940-NEXT:    ;;#ASMEND
3985; GFX940-NEXT:    v_mov_b32_e32 v4, 0
3986; GFX940-NEXT:    v_bfi_b32 v0, s2, v3, v2
3987; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
3988; GFX940-NEXT:    v_perm_b32 v1, v1, v1, s2
3989; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
3990; GFX940-NEXT:    s_waitcnt vmcnt(0)
3991; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both source vectors are defined by inline asm through a "=v" output operand.
3992  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
3993  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  ; Keep lanes 0-2 of each source to form the <3 x bfloat> shuffle operands.
3994  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
3995  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Lane i of %shuf is element mask[i] of the concatenation (%extract3, %extract31).
3996  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 2, i32 2>
3997  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
3998  ret void
3999}
4000
; Shuffle of two <3 x bfloat> values into <4 x bfloat> with mask <5, 5, 2, 2>.
; Mask indices 0-2 read %extract3 (= %vec0[0..2]); indices 3-5 read %extract31
; (= %vec1[0..2]). Stored result: { %vec1[2], %vec1[2], %vec0[2], %vec0[2] }.
4001define void @v_shuffle_v4bf16_v3bf16__5_5_2_2(ptr addrspace(1) inreg %ptr) {
4002; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_2:
4003; GFX900:       ; %bb.0:
4004; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4005; GFX900-NEXT:    ;;#ASMSTART
4006; GFX900-NEXT:    ; def v[0:1]
4007; GFX900-NEXT:    ;;#ASMEND
4008; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
4009; GFX900-NEXT:    v_mov_b32_e32 v4, 0
4010; GFX900-NEXT:    ;;#ASMSTART
4011; GFX900-NEXT:    ; def v[2:3]
4012; GFX900-NEXT:    ;;#ASMEND
4013; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
4014; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
4015; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4016; GFX900-NEXT:    s_waitcnt vmcnt(0)
4017; GFX900-NEXT:    s_setpc_b64 s[30:31]
4018;
4019; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_2:
4020; GFX90A:       ; %bb.0:
4021; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4022; GFX90A-NEXT:    ;;#ASMSTART
4023; GFX90A-NEXT:    ; def v[0:1]
4024; GFX90A-NEXT:    ;;#ASMEND
4025; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
4026; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
4027; GFX90A-NEXT:    ;;#ASMSTART
4028; GFX90A-NEXT:    ; def v[2:3]
4029; GFX90A-NEXT:    ;;#ASMEND
4030; GFX90A-NEXT:    v_perm_b32 v1, v1, v1, s4
4031; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
4032; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4033; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4034; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4035;
4036; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_2:
4037; GFX940:       ; %bb.0:
4038; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4039; GFX940-NEXT:    ;;#ASMSTART
4040; GFX940-NEXT:    ; def v[0:1]
4041; GFX940-NEXT:    ;;#ASMEND
4042; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
4043; GFX940-NEXT:    v_mov_b32_e32 v4, 0
4044; GFX940-NEXT:    ;;#ASMSTART
4045; GFX940-NEXT:    ; def v[2:3]
4046; GFX940-NEXT:    ;;#ASMEND
4047; GFX940-NEXT:    v_perm_b32 v1, v1, v1, s2
4048; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
4049; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
4050; GFX940-NEXT:    s_waitcnt vmcnt(0)
4051; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both source vectors are defined by inline asm through a "=v" output operand.
4052  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4053  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  ; Keep lanes 0-2 of each source to form the <3 x bfloat> shuffle operands.
4054  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4055  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Lane i of %shuf is element mask[i] of the concatenation (%extract3, %extract31).
4056  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 2>
4057  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4058  ret void
4059}
4060
; Shuffle of two <3 x bfloat> values into <4 x bfloat> with mask <5, 5, poison, 2>.
; Mask indices 0-2 read %extract3 (= %vec0[0..2]); indices 3-5 read %extract31
; (= %vec1[0..2]). Stored result: { %vec1[2], %vec1[2], poison, %vec0[2] };
; lane 2 has a poison mask element, so its stored value is unconstrained.
4061define void @v_shuffle_v4bf16_v3bf16__5_5_u_2(ptr addrspace(1) inreg %ptr) {
4062; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_2:
4063; GFX900:       ; %bb.0:
4064; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4065; GFX900-NEXT:    ;;#ASMSTART
4066; GFX900-NEXT:    ; def v[0:1]
4067; GFX900-NEXT:    ;;#ASMEND
4068; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
4069; GFX900-NEXT:    v_mov_b32_e32 v4, 0
4070; GFX900-NEXT:    ;;#ASMSTART
4071; GFX900-NEXT:    ; def v[2:3]
4072; GFX900-NEXT:    ;;#ASMEND
4073; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
4074; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4075; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4076; GFX900-NEXT:    s_waitcnt vmcnt(0)
4077; GFX900-NEXT:    s_setpc_b64 s[30:31]
4078;
4079; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_2:
4080; GFX90A:       ; %bb.0:
4081; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4082; GFX90A-NEXT:    ;;#ASMSTART
4083; GFX90A-NEXT:    ; def v[0:1]
4084; GFX90A-NEXT:    ;;#ASMEND
4085; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
4086; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
4087; GFX90A-NEXT:    ;;#ASMSTART
4088; GFX90A-NEXT:    ; def v[2:3]
4089; GFX90A-NEXT:    ;;#ASMEND
4090; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
4091; GFX90A-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4092; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4093; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4094; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4095;
4096; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_2:
4097; GFX940:       ; %bb.0:
4098; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4099; GFX940-NEXT:    ;;#ASMSTART
4100; GFX940-NEXT:    ; def v[0:1]
4101; GFX940-NEXT:    ;;#ASMEND
4102; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
4103; GFX940-NEXT:    v_mov_b32_e32 v4, 0
4104; GFX940-NEXT:    ;;#ASMSTART
4105; GFX940-NEXT:    ; def v[2:3]
4106; GFX940-NEXT:    ;;#ASMEND
4107; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4108; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
4109; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
4110; GFX940-NEXT:    s_waitcnt vmcnt(0)
4111; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both source vectors are defined by inline asm through a "=v" output operand.
4112  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4113  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  ; Keep lanes 0-2 of each source to form the <3 x bfloat> shuffle operands.
4114  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4115  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Lane i of %shuf is element mask[i] of the concatenation (%extract3, %extract31).
4116  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 2>
4117  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4118  ret void
4119}
4120
; Shuffle of two <3 x bfloat> values into <4 x bfloat> with mask <5, 5, 0, 2>.
; Mask indices 0-2 read %extract3 (= %vec0[0..2]); indices 3-5 read %extract31
; (= %vec1[0..2]). Stored result: { %vec1[2], %vec1[2], %vec0[0], %vec0[2] }.
4121define void @v_shuffle_v4bf16_v3bf16__5_5_0_2(ptr addrspace(1) inreg %ptr) {
4122; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_2:
4123; GFX900:       ; %bb.0:
4124; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4125; GFX900-NEXT:    ;;#ASMSTART
4126; GFX900-NEXT:    ; def v[0:1]
4127; GFX900-NEXT:    ;;#ASMEND
4128; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
4129; GFX900-NEXT:    v_mov_b32_e32 v4, 0
4130; GFX900-NEXT:    ;;#ASMSTART
4131; GFX900-NEXT:    ; def v[2:3]
4132; GFX900-NEXT:    ;;#ASMEND
4133; GFX900-NEXT:    v_perm_b32 v1, v1, v0, s4
4134; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
4135; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4136; GFX900-NEXT:    s_waitcnt vmcnt(0)
4137; GFX900-NEXT:    s_setpc_b64 s[30:31]
4138;
4139; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_2:
4140; GFX90A:       ; %bb.0:
4141; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4142; GFX90A-NEXT:    ;;#ASMSTART
4143; GFX90A-NEXT:    ; def v[0:1]
4144; GFX90A-NEXT:    ;;#ASMEND
4145; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
4146; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
4147; GFX90A-NEXT:    ;;#ASMSTART
4148; GFX90A-NEXT:    ; def v[2:3]
4149; GFX90A-NEXT:    ;;#ASMEND
4150; GFX90A-NEXT:    v_perm_b32 v1, v1, v0, s4
4151; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
4152; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4153; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4154; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4155;
4156; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_2:
4157; GFX940:       ; %bb.0:
4158; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4159; GFX940-NEXT:    ;;#ASMSTART
4160; GFX940-NEXT:    ; def v[0:1]
4161; GFX940-NEXT:    ;;#ASMEND
4162; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
4163; GFX940-NEXT:    v_mov_b32_e32 v4, 0
4164; GFX940-NEXT:    ;;#ASMSTART
4165; GFX940-NEXT:    ; def v[2:3]
4166; GFX940-NEXT:    ;;#ASMEND
4167; GFX940-NEXT:    v_perm_b32 v1, v1, v0, s2
4168; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
4169; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
4170; GFX940-NEXT:    s_waitcnt vmcnt(0)
4171; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both source vectors are defined by inline asm through a "=v" output operand.
4172  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4173  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  ; Keep lanes 0-2 of each source to form the <3 x bfloat> shuffle operands.
4174  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4175  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Lane i of %shuf is element mask[i] of the concatenation (%extract3, %extract31).
4176  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 2>
4177  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4178  ret void
4179}
4180
; Shuffle of two <3 x bfloat> values into <4 x bfloat> with mask <5, 5, 1, 2>.
; Mask indices 0-2 read %extract3 (= %vec0[0..2]); indices 3-5 read %extract31
; (= %vec1[0..2]). Stored result: { %vec1[2], %vec1[2], %vec0[1], %vec0[2] }.
4181define void @v_shuffle_v4bf16_v3bf16__5_5_1_2(ptr addrspace(1) inreg %ptr) {
4182; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_2:
4183; GFX900:       ; %bb.0:
4184; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4185; GFX900-NEXT:    ;;#ASMSTART
4186; GFX900-NEXT:    ; def v[0:1]
4187; GFX900-NEXT:    ;;#ASMEND
4188; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
4189; GFX900-NEXT:    v_mov_b32_e32 v4, 0
4190; GFX900-NEXT:    ;;#ASMSTART
4191; GFX900-NEXT:    ; def v[2:3]
4192; GFX900-NEXT:    ;;#ASMEND
4193; GFX900-NEXT:    v_alignbit_b32 v1, v1, v0, 16
4194; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
4195; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4196; GFX900-NEXT:    s_waitcnt vmcnt(0)
4197; GFX900-NEXT:    s_setpc_b64 s[30:31]
4198;
4199; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_2:
4200; GFX90A:       ; %bb.0:
4201; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4202; GFX90A-NEXT:    ;;#ASMSTART
4203; GFX90A-NEXT:    ; def v[0:1]
4204; GFX90A-NEXT:    ;;#ASMEND
4205; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
4206; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
4207; GFX90A-NEXT:    ;;#ASMSTART
4208; GFX90A-NEXT:    ; def v[2:3]
4209; GFX90A-NEXT:    ;;#ASMEND
4210; GFX90A-NEXT:    v_alignbit_b32 v1, v1, v0, 16
4211; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
4212; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4213; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4214; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4215;
4216; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_2:
4217; GFX940:       ; %bb.0:
4218; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4219; GFX940-NEXT:    ;;#ASMSTART
4220; GFX940-NEXT:    ; def v[0:1]
4221; GFX940-NEXT:    ;;#ASMEND
4222; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
4223; GFX940-NEXT:    v_mov_b32_e32 v4, 0
4224; GFX940-NEXT:    ;;#ASMSTART
4225; GFX940-NEXT:    ; def v[2:3]
4226; GFX940-NEXT:    ;;#ASMEND
4227; GFX940-NEXT:    v_alignbit_b32 v1, v1, v0, 16
4228; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
4229; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
4230; GFX940-NEXT:    s_waitcnt vmcnt(0)
4231; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both source vectors are defined by inline asm through a "=v" output operand.
4232  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4233  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  ; Keep lanes 0-2 of each source to form the <3 x bfloat> shuffle operands.
4234  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4235  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Lane i of %shuf is element mask[i] of the concatenation (%extract3, %extract31).
4236  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 2>
4237  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4238  ret void
4239}
4240
; Shuffle of two <3 x bfloat> values into <4 x bfloat> with mask <5, 5, 3, 2>.
; Mask indices 0-2 read %extract3 (= %vec0[0..2]); indices 3-5 read %extract31
; (= %vec1[0..2]). Stored result: { %vec1[2], %vec1[2], %vec1[0], %vec0[2] }.
4241define void @v_shuffle_v4bf16_v3bf16__5_5_3_2(ptr addrspace(1) inreg %ptr) {
4242; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_2:
4243; GFX900:       ; %bb.0:
4244; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4245; GFX900-NEXT:    ;;#ASMSTART
4246; GFX900-NEXT:    ; def v[0:1]
4247; GFX900-NEXT:    ;;#ASMEND
4248; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
4249; GFX900-NEXT:    v_mov_b32_e32 v4, 0
4250; GFX900-NEXT:    ;;#ASMSTART
4251; GFX900-NEXT:    ; def v[2:3]
4252; GFX900-NEXT:    ;;#ASMEND
4253; GFX900-NEXT:    v_perm_b32 v1, v1, v2, s4
4254; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
4255; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4256; GFX900-NEXT:    s_waitcnt vmcnt(0)
4257; GFX900-NEXT:    s_setpc_b64 s[30:31]
4258;
4259; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_2:
4260; GFX90A:       ; %bb.0:
4261; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4262; GFX90A-NEXT:    ;;#ASMSTART
4263; GFX90A-NEXT:    ; def v[0:1]
4264; GFX90A-NEXT:    ;;#ASMEND
4265; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
4266; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
4267; GFX90A-NEXT:    ;;#ASMSTART
4268; GFX90A-NEXT:    ; def v[2:3]
4269; GFX90A-NEXT:    ;;#ASMEND
4270; GFX90A-NEXT:    v_perm_b32 v1, v1, v2, s4
4271; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
4272; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4273; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4274; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4275;
4276; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_2:
4277; GFX940:       ; %bb.0:
4278; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4279; GFX940-NEXT:    ;;#ASMSTART
4280; GFX940-NEXT:    ; def v[0:1]
4281; GFX940-NEXT:    ;;#ASMEND
4282; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
4283; GFX940-NEXT:    v_mov_b32_e32 v4, 0
4284; GFX940-NEXT:    ;;#ASMSTART
4285; GFX940-NEXT:    ; def v[2:3]
4286; GFX940-NEXT:    ;;#ASMEND
4287; GFX940-NEXT:    s_nop 0
4288; GFX940-NEXT:    v_perm_b32 v1, v1, v2, s2
4289; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
4290; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
4291; GFX940-NEXT:    s_waitcnt vmcnt(0)
4292; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both source vectors are defined by inline asm through a "=v" output operand.
4293  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4294  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  ; Keep lanes 0-2 of each source to form the <3 x bfloat> shuffle operands.
4295  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4296  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Lane i of %shuf is element mask[i] of the concatenation (%extract3, %extract31).
4297  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 2>
4298  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4299  ret void
4300}
4301
; Shuffle of two <3 x bfloat> values into <4 x bfloat> with mask <5, 5, 4, 2>.
; Mask indices 0-2 read %extract3 (= %vec0[0..2]); indices 3-5 read %extract31
; (= %vec1[0..2]). Stored result: { %vec1[2], %vec1[2], %vec1[1], %vec0[2] }.
4302define void @v_shuffle_v4bf16_v3bf16__5_5_4_2(ptr addrspace(1) inreg %ptr) {
4303; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_2:
4304; GFX900:       ; %bb.0:
4305; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4306; GFX900-NEXT:    ;;#ASMSTART
4307; GFX900-NEXT:    ; def v[0:1]
4308; GFX900-NEXT:    ;;#ASMEND
4309; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
4310; GFX900-NEXT:    v_mov_b32_e32 v4, 0
4311; GFX900-NEXT:    ;;#ASMSTART
4312; GFX900-NEXT:    ; def v[2:3]
4313; GFX900-NEXT:    ;;#ASMEND
4314; GFX900-NEXT:    v_alignbit_b32 v1, v1, v2, 16
4315; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
4316; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4317; GFX900-NEXT:    s_waitcnt vmcnt(0)
4318; GFX900-NEXT:    s_setpc_b64 s[30:31]
4319;
4320; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_2:
4321; GFX90A:       ; %bb.0:
4322; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4323; GFX90A-NEXT:    ;;#ASMSTART
4324; GFX90A-NEXT:    ; def v[0:1]
4325; GFX90A-NEXT:    ;;#ASMEND
4326; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
4327; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
4328; GFX90A-NEXT:    ;;#ASMSTART
4329; GFX90A-NEXT:    ; def v[2:3]
4330; GFX90A-NEXT:    ;;#ASMEND
4331; GFX90A-NEXT:    v_alignbit_b32 v1, v1, v2, 16
4332; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
4333; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4334; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4335; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4336;
4337; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_2:
4338; GFX940:       ; %bb.0:
4339; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4340; GFX940-NEXT:    ;;#ASMSTART
4341; GFX940-NEXT:    ; def v[0:1]
4342; GFX940-NEXT:    ;;#ASMEND
4343; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
4344; GFX940-NEXT:    v_mov_b32_e32 v4, 0
4345; GFX940-NEXT:    ;;#ASMSTART
4346; GFX940-NEXT:    ; def v[2:3]
4347; GFX940-NEXT:    ;;#ASMEND
4348; GFX940-NEXT:    s_nop 0
4349; GFX940-NEXT:    v_alignbit_b32 v1, v1, v2, 16
4350; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
4351; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
4352; GFX940-NEXT:    s_waitcnt vmcnt(0)
4353; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Both source vectors are defined by inline asm through a "=v" output operand.
4354  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4355  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
  ; Keep lanes 0-2 of each source to form the <3 x bfloat> shuffle operands.
4356  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4357  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Lane i of %shuf is element mask[i] of the concatenation (%extract3, %extract31).
4358  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 2>
4359  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4360  ret void
4361}
4362
; Shuffle with mask <poison, 3, 3, 3>. The second shuffle operand is poison
; here (there is no %vec1 in this function), so every mask index 3 selects a
; poison lane and the whole stored value is poison.
; The checks shared by all three targets expect only the entry s_waitcnt and
; the return: neither the inline-asm def nor a store is expected in the output.
; NOTE(review): the name suggests element 3 of a real second source; confirm
; whether the poison operand is intentional in the test generator.
4363define void @v_shuffle_v4bf16_v3bf16__u_3_3_3(ptr addrspace(1) inreg %ptr) {
4364; GFX9-LABEL: v_shuffle_v4bf16_v3bf16__u_3_3_3:
4365; GFX9:       ; %bb.0:
4366; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4367; GFX9-NEXT:    s_setpc_b64 s[30:31]
4368  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4369  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4370  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 3, i32 3, i32 3>
4371  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4372  ret void
4373}
4374
; Shuffle with mask <0, 3, 3, 3>. Lane 0 takes element 0 of %extract3; the
; second shuffle operand is poison, so the three lanes using index 3 are
; poison.
; Every target is expected to store the inline-asm-defined register pair
; v[0:1] as-is, with no permute or shift instruction in between. The gfx940
; checks expect the sc0 sc1 modifiers on the store.
4375define void @v_shuffle_v4bf16_v3bf16__0_3_3_3(ptr addrspace(1) inreg %ptr) {
4376; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_3_3_3:
4377; GFX900:       ; %bb.0:
4378; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4379; GFX900-NEXT:    v_mov_b32_e32 v2, 0
4380; GFX900-NEXT:    ;;#ASMSTART
4381; GFX900-NEXT:    ; def v[0:1]
4382; GFX900-NEXT:    ;;#ASMEND
4383; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
4384; GFX900-NEXT:    s_waitcnt vmcnt(0)
4385; GFX900-NEXT:    s_setpc_b64 s[30:31]
4386;
4387; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_3_3_3:
4388; GFX90A:       ; %bb.0:
4389; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4390; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
4391; GFX90A-NEXT:    ;;#ASMSTART
4392; GFX90A-NEXT:    ; def v[0:1]
4393; GFX90A-NEXT:    ;;#ASMEND
4394; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
4395; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4396; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4397;
4398; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_3_3_3:
4399; GFX940:       ; %bb.0:
4400; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4401; GFX940-NEXT:    v_mov_b32_e32 v2, 0
4402; GFX940-NEXT:    ;;#ASMSTART
4403; GFX940-NEXT:    ; def v[0:1]
4404; GFX940-NEXT:    ;;#ASMEND
4405; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
4406; GFX940-NEXT:    s_waitcnt vmcnt(0)
4407; GFX940-NEXT:    s_setpc_b64 s[30:31]
4408  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4409  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4410  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 3, i32 3, i32 3>
4411  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4412  ret void
4413}
4414
; Shuffle with mask <1, 3, 3, 3>. Lane 0 takes element 1 of %extract3; the
; second shuffle operand is poison, so the three lanes using index 3 are
; poison.
; Every target is expected to emit a single v_alignbit_b32 by 16 into v0
; before the store. Its first source is an SGPR (s4, or s0 on gfx940) that the
; checked sequence never writes.
; NOTE(review): presumably that SGPR is a placeholder for the poison upper
; half of the dword; confirm if the check lines are regenerated.
4415define void @v_shuffle_v4bf16_v3bf16__1_3_3_3(ptr addrspace(1) inreg %ptr) {
4416; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_3_3_3:
4417; GFX900:       ; %bb.0:
4418; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4419; GFX900-NEXT:    ;;#ASMSTART
4420; GFX900-NEXT:    ; def v[0:1]
4421; GFX900-NEXT:    ;;#ASMEND
4422; GFX900-NEXT:    v_mov_b32_e32 v2, 0
4423; GFX900-NEXT:    v_alignbit_b32 v0, s4, v0, 16
4424; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
4425; GFX900-NEXT:    s_waitcnt vmcnt(0)
4426; GFX900-NEXT:    s_setpc_b64 s[30:31]
4427;
4428; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_3_3_3:
4429; GFX90A:       ; %bb.0:
4430; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4431; GFX90A-NEXT:    ;;#ASMSTART
4432; GFX90A-NEXT:    ; def v[0:1]
4433; GFX90A-NEXT:    ;;#ASMEND
4434; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
4435; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v0, 16
4436; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
4437; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4438; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4439;
4440; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_3_3_3:
4441; GFX940:       ; %bb.0:
4442; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4443; GFX940-NEXT:    ;;#ASMSTART
4444; GFX940-NEXT:    ; def v[0:1]
4445; GFX940-NEXT:    ;;#ASMEND
4446; GFX940-NEXT:    v_mov_b32_e32 v2, 0
4447; GFX940-NEXT:    v_alignbit_b32 v0, s0, v0, 16
4448; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
4449; GFX940-NEXT:    s_waitcnt vmcnt(0)
4450; GFX940-NEXT:    s_setpc_b64 s[30:31]
4451  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4452  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4453  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3>
4454  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4455  ret void
4456}
4457
; Shuffle with mask <2, 3, 3, 3>. Lane 0 takes element 2 of %extract3; the
; second shuffle operand is poison, so the three lanes using index 3 are
; poison.
; Every target is expected to copy v1 into v0 with a plain v_mov_b32 and then
; store v[0:1]; no permute or shift instruction is expected. The gfx940 checks
; expect the sc0 sc1 modifiers on the store.
4458define void @v_shuffle_v4bf16_v3bf16__2_3_3_3(ptr addrspace(1) inreg %ptr) {
4459; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3:
4460; GFX900:       ; %bb.0:
4461; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4462; GFX900-NEXT:    ;;#ASMSTART
4463; GFX900-NEXT:    ; def v[0:1]
4464; GFX900-NEXT:    ;;#ASMEND
4465; GFX900-NEXT:    v_mov_b32_e32 v2, 0
4466; GFX900-NEXT:    v_mov_b32_e32 v0, v1
4467; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
4468; GFX900-NEXT:    s_waitcnt vmcnt(0)
4469; GFX900-NEXT:    s_setpc_b64 s[30:31]
4470;
4471; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3:
4472; GFX90A:       ; %bb.0:
4473; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4474; GFX90A-NEXT:    ;;#ASMSTART
4475; GFX90A-NEXT:    ; def v[0:1]
4476; GFX90A-NEXT:    ;;#ASMEND
4477; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
4478; GFX90A-NEXT:    v_mov_b32_e32 v0, v1
4479; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
4480; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4481; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4482;
4483; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_3_3_3:
4484; GFX940:       ; %bb.0:
4485; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4486; GFX940-NEXT:    ;;#ASMSTART
4487; GFX940-NEXT:    ; def v[0:1]
4488; GFX940-NEXT:    ;;#ASMEND
4489; GFX940-NEXT:    v_mov_b32_e32 v2, 0
4490; GFX940-NEXT:    v_mov_b32_e32 v0, v1
4491; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
4492; GFX940-NEXT:    s_waitcnt vmcnt(0)
4493; GFX940-NEXT:    s_setpc_b64 s[30:31]
4494  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4495  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4496  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
4497  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4498  ret void
4499}
4500
; Shuffle with mask <3, 3, 3, 3>. The second shuffle operand is poison, so
; all four result lanes select a poison lane and the stored value is poison.
; The checks shared by all three targets expect only the entry s_waitcnt and
; the return: neither the inline-asm def nor a store is expected in the output.
; NOTE(review): the name suggests a splat of element 3 of a real second
; source; confirm whether the poison operand is intentional.
4501define void @v_shuffle_v4bf16_v3bf16__3_3_3_3(ptr addrspace(1) inreg %ptr) {
4502; GFX9-LABEL: v_shuffle_v4bf16_v3bf16__3_3_3_3:
4503; GFX9:       ; %bb.0:
4504; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4505; GFX9-NEXT:    s_setpc_b64 s[30:31]
4506  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4507  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4508  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4509  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4510  ret void
4511}
4512
; Two-source shuffle with mask <4, 3, 3, 3>. Every mask index is in the 3-5
; range, so all lanes come from %extract31: lane 0 takes its element 1 and
; lanes 1-3 take its element 0. %extract3 is not referenced by the mask.
; Only one inline-asm def is expected in the output for each target, followed
; by a v_perm_b32 and a v_alignbit_b32 that both read v0 only. The gfx940
; checks expect the sc0 sc1 modifiers on the store.
4513define void @v_shuffle_v4bf16_v3bf16__4_3_3_3(ptr addrspace(1) inreg %ptr) {
4514; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_3_3_3:
4515; GFX900:       ; %bb.0:
4516; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4517; GFX900-NEXT:    ;;#ASMSTART
4518; GFX900-NEXT:    ; def v[0:1]
4519; GFX900-NEXT:    ;;#ASMEND
4520; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
4521; GFX900-NEXT:    v_mov_b32_e32 v2, 0
4522; GFX900-NEXT:    v_perm_b32 v1, v0, v0, s4
4523; GFX900-NEXT:    v_alignbit_b32 v0, v0, v0, 16
4524; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
4525; GFX900-NEXT:    s_waitcnt vmcnt(0)
4526; GFX900-NEXT:    s_setpc_b64 s[30:31]
4527;
4528; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_3_3_3:
4529; GFX90A:       ; %bb.0:
4530; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4531; GFX90A-NEXT:    ;;#ASMSTART
4532; GFX90A-NEXT:    ; def v[0:1]
4533; GFX90A-NEXT:    ;;#ASMEND
4534; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
4535; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
4536; GFX90A-NEXT:    v_perm_b32 v1, v0, v0, s4
4537; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v0, 16
4538; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
4539; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4540; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4541;
4542; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_3_3_3:
4543; GFX940:       ; %bb.0:
4544; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4545; GFX940-NEXT:    ;;#ASMSTART
4546; GFX940-NEXT:    ; def v[0:1]
4547; GFX940-NEXT:    ;;#ASMEND
4548; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
4549; GFX940-NEXT:    v_mov_b32_e32 v2, 0
4550; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s2
4551; GFX940-NEXT:    v_alignbit_b32 v0, v0, v0, 16
4552; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
4553; GFX940-NEXT:    s_waitcnt vmcnt(0)
4554; GFX940-NEXT:    s_setpc_b64 s[30:31]
4555  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4556  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
4557  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4558  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4559  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 3, i32 3, i32 3>
4560  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4561  ret void
4562}
4563
; Two-source shuffle with mask <5, 3, 3, 3>. Every mask index is in the 3-5
; range, so all lanes come from %extract31: lane 0 takes its element 2 and
; lanes 1-3 take its element 0. %extract3 is not referenced by the mask.
; Only one inline-asm def is expected per target, followed by two v_perm_b32
; instructions. The gfx900 checks store from v[1:2], while the gfx90a and
; gfx940 checks store from v[2:3]; gfx940 additionally expects an s_nop and
; the sc0 sc1 modifiers on the store.
4564define void @v_shuffle_v4bf16_v3bf16__5_3_3_3(ptr addrspace(1) inreg %ptr) {
4565; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_3_3:
4566; GFX900:       ; %bb.0:
4567; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4568; GFX900-NEXT:    ;;#ASMSTART
4569; GFX900-NEXT:    ; def v[0:1]
4570; GFX900-NEXT:    ;;#ASMEND
4571; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
4572; GFX900-NEXT:    v_mov_b32_e32 v3, 0
4573; GFX900-NEXT:    v_perm_b32 v1, v0, v1, s4
4574; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
4575; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
4576; GFX900-NEXT:    s_waitcnt vmcnt(0)
4577; GFX900-NEXT:    s_setpc_b64 s[30:31]
4578;
4579; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_3_3:
4580; GFX90A:       ; %bb.0:
4581; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4582; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
4583; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
4584; GFX90A-NEXT:    ;;#ASMSTART
4585; GFX90A-NEXT:    ; def v[0:1]
4586; GFX90A-NEXT:    ;;#ASMEND
4587; GFX90A-NEXT:    v_perm_b32 v2, v0, v1, s4
4588; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
4589; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
4590; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4591; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4592;
4593; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_3_3:
4594; GFX940:       ; %bb.0:
4595; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4596; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
4597; GFX940-NEXT:    v_mov_b32_e32 v4, 0
4598; GFX940-NEXT:    ;;#ASMSTART
4599; GFX940-NEXT:    ; def v[0:1]
4600; GFX940-NEXT:    ;;#ASMEND
4601; GFX940-NEXT:    s_nop 0
4602; GFX940-NEXT:    v_perm_b32 v2, v0, v1, s2
4603; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
4604; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
4605; GFX940-NEXT:    s_waitcnt vmcnt(0)
4606; GFX940-NEXT:    s_setpc_b64 s[30:31]
4607  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4608  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
4609  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4610  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4611  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 3, i32 3>
4612  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4613  ret void
4614}
4615
; Two-source shuffle with mask <5, poison, 3, 3>. Lane 0 takes element 2 of
; %extract31, lane 1 is poison, and lanes 2-3 take element 0 of %extract31.
; %extract3 is not referenced by the mask.
; Only one inline-asm def is expected per target, with a single v_perm_b32
; producing the upper dword. The gfx900 checks store v[1:2] directly, reusing
; v1 from the def; the gfx90a and gfx940 checks instead expect v1 to be copied
; into v2 so that v[2:3] is stored. gfx940 additionally expects an s_nop and
; the sc0 sc1 modifiers on the store.
4616define void @v_shuffle_v4bf16_v3bf16__5_u_3_3(ptr addrspace(1) inreg %ptr) {
4617; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_3_3:
4618; GFX900:       ; %bb.0:
4619; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4620; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
4621; GFX900-NEXT:    v_mov_b32_e32 v3, 0
4622; GFX900-NEXT:    ;;#ASMSTART
4623; GFX900-NEXT:    ; def v[0:1]
4624; GFX900-NEXT:    ;;#ASMEND
4625; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
4626; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
4627; GFX900-NEXT:    s_waitcnt vmcnt(0)
4628; GFX900-NEXT:    s_setpc_b64 s[30:31]
4629;
4630; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_3_3:
4631; GFX90A:       ; %bb.0:
4632; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4633; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
4634; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
4635; GFX90A-NEXT:    ;;#ASMSTART
4636; GFX90A-NEXT:    ; def v[0:1]
4637; GFX90A-NEXT:    ;;#ASMEND
4638; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
4639; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
4640; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
4641; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4642; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4643;
4644; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_3_3:
4645; GFX940:       ; %bb.0:
4646; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4647; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
4648; GFX940-NEXT:    v_mov_b32_e32 v4, 0
4649; GFX940-NEXT:    ;;#ASMSTART
4650; GFX940-NEXT:    ; def v[0:1]
4651; GFX940-NEXT:    ;;#ASMEND
4652; GFX940-NEXT:    s_nop 0
4653; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
4654; GFX940-NEXT:    v_mov_b32_e32 v2, v1
4655; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
4656; GFX940-NEXT:    s_waitcnt vmcnt(0)
4657; GFX940-NEXT:    s_setpc_b64 s[30:31]
4658  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4659  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
4660  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4661  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4662  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 3, i32 3>
4663  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4664  ret void
4665}
4666
; Two-source shuffle with mask <5, 0, 3, 3>. Lane 0 takes element 2 of
; %extract31, lane 1 takes element 0 of %extract3, and lanes 2-3 take element
; 0 of %extract31.
; Both inline-asm defs are expected in the output, followed by two v_perm_b32
; instructions and one 64-bit store. In the gfx900 checks the second def is
; v[1:2], overlapping v1 of the first def; the gfx90a and gfx940 checks use
; v[2:3] instead. gfx940 additionally expects an s_nop and the sc0 sc1
; modifiers on the store.
4667define void @v_shuffle_v4bf16_v3bf16__5_0_3_3(ptr addrspace(1) inreg %ptr) {
4668; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_3_3:
4669; GFX900:       ; %bb.0:
4670; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4671; GFX900-NEXT:    ;;#ASMSTART
4672; GFX900-NEXT:    ; def v[0:1]
4673; GFX900-NEXT:    ;;#ASMEND
4674; GFX900-NEXT:    ;;#ASMSTART
4675; GFX900-NEXT:    ; def v[1:2]
4676; GFX900-NEXT:    ;;#ASMEND
4677; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
4678; GFX900-NEXT:    v_mov_b32_e32 v3, 0
4679; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
4680; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
4681; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
4682; GFX900-NEXT:    s_waitcnt vmcnt(0)
4683; GFX900-NEXT:    s_setpc_b64 s[30:31]
4684;
4685; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_3_3:
4686; GFX90A:       ; %bb.0:
4687; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4688; GFX90A-NEXT:    ;;#ASMSTART
4689; GFX90A-NEXT:    ; def v[0:1]
4690; GFX90A-NEXT:    ;;#ASMEND
4691; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
4692; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
4693; GFX90A-NEXT:    ;;#ASMSTART
4694; GFX90A-NEXT:    ; def v[2:3]
4695; GFX90A-NEXT:    ;;#ASMEND
4696; GFX90A-NEXT:    v_perm_b32 v0, v0, v3, s4
4697; GFX90A-NEXT:    v_perm_b32 v1, v2, v2, s4
4698; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4699; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4700; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4701;
4702; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_3_3:
4703; GFX940:       ; %bb.0:
4704; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4705; GFX940-NEXT:    ;;#ASMSTART
4706; GFX940-NEXT:    ; def v[0:1]
4707; GFX940-NEXT:    ;;#ASMEND
4708; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
4709; GFX940-NEXT:    v_mov_b32_e32 v4, 0
4710; GFX940-NEXT:    ;;#ASMSTART
4711; GFX940-NEXT:    ; def v[2:3]
4712; GFX940-NEXT:    ;;#ASMEND
4713; GFX940-NEXT:    s_nop 0
4714; GFX940-NEXT:    v_perm_b32 v0, v0, v3, s2
4715; GFX940-NEXT:    v_perm_b32 v1, v2, v2, s2
4716; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
4717; GFX940-NEXT:    s_waitcnt vmcnt(0)
4718; GFX940-NEXT:    s_setpc_b64 s[30:31]
4719  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4720  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
4721  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4722  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4723  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 3, i32 3>
4724  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4725  ret void
4726}
4727
; Two-source shuffle with mask <5, 1, 3, 3>. Lane 0 takes element 2 of
; %extract31, lane 1 takes element 1 of %extract3, and lanes 2-3 take element
; 0 of %extract31.
; Both inline-asm defs are expected in the output. The lower dword is expected
; to be formed with v_bfi_b32 under a 0xffff mask held in an SGPR; that same
; SGPR is then reloaded with 0x5040100 for the v_perm_b32 that forms the upper
; dword. The gfx940 checks expect the sc0 sc1 modifiers on the store.
4728define void @v_shuffle_v4bf16_v3bf16__5_1_3_3(ptr addrspace(1) inreg %ptr) {
4729; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_3_3:
4730; GFX900:       ; %bb.0:
4731; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4732; GFX900-NEXT:    ;;#ASMSTART
4733; GFX900-NEXT:    ; def v[0:1]
4734; GFX900-NEXT:    ;;#ASMEND
4735; GFX900-NEXT:    s_mov_b32 s4, 0xffff
4736; GFX900-NEXT:    ;;#ASMSTART
4737; GFX900-NEXT:    ; def v[1:2]
4738; GFX900-NEXT:    ;;#ASMEND
4739; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v0
4740; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
4741; GFX900-NEXT:    v_mov_b32_e32 v3, 0
4742; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
4743; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
4744; GFX900-NEXT:    s_waitcnt vmcnt(0)
4745; GFX900-NEXT:    s_setpc_b64 s[30:31]
4746;
4747; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_3_3:
4748; GFX90A:       ; %bb.0:
4749; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4750; GFX90A-NEXT:    ;;#ASMSTART
4751; GFX90A-NEXT:    ; def v[0:1]
4752; GFX90A-NEXT:    ;;#ASMEND
4753; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
4754; GFX90A-NEXT:    ;;#ASMSTART
4755; GFX90A-NEXT:    ; def v[2:3]
4756; GFX90A-NEXT:    ;;#ASMEND
4757; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v0
4758; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
4759; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
4760; GFX90A-NEXT:    v_perm_b32 v1, v2, v2, s4
4761; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4762; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4763; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4764;
4765; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_3_3:
4766; GFX940:       ; %bb.0:
4767; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4768; GFX940-NEXT:    ;;#ASMSTART
4769; GFX940-NEXT:    ; def v[0:1]
4770; GFX940-NEXT:    ;;#ASMEND
4771; GFX940-NEXT:    s_mov_b32 s2, 0xffff
4772; GFX940-NEXT:    ;;#ASMSTART
4773; GFX940-NEXT:    ; def v[2:3]
4774; GFX940-NEXT:    ;;#ASMEND
4775; GFX940-NEXT:    v_mov_b32_e32 v4, 0
4776; GFX940-NEXT:    v_bfi_b32 v0, s2, v3, v0
4777; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
4778; GFX940-NEXT:    v_perm_b32 v1, v2, v2, s2
4779; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
4780; GFX940-NEXT:    s_waitcnt vmcnt(0)
4781; GFX940-NEXT:    s_setpc_b64 s[30:31]
4782  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4783  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
4784  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4785  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4786  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 3, i32 3>
4787  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4788  ret void
4789}
4790
; Two-source shuffle with mask <5, 2, 3, 3>. Lane 0 takes element 2 of
; %extract31, lane 1 takes element 2 of %extract3, and lanes 2-3 take element
; 0 of %extract31.
; Both inline-asm defs are expected in the output, followed by two v_perm_b32
; instructions and one 64-bit store. The gfx900 and gfx90a expectations are
; instruction-for-instruction the same; the gfx940 checks additionally expect
; an s_nop and the sc0 sc1 modifiers on the store.
4791define void @v_shuffle_v4bf16_v3bf16__5_2_3_3(ptr addrspace(1) inreg %ptr) {
4792; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_3_3:
4793; GFX900:       ; %bb.0:
4794; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4795; GFX900-NEXT:    ;;#ASMSTART
4796; GFX900-NEXT:    ; def v[0:1]
4797; GFX900-NEXT:    ;;#ASMEND
4798; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
4799; GFX900-NEXT:    v_mov_b32_e32 v4, 0
4800; GFX900-NEXT:    ;;#ASMSTART
4801; GFX900-NEXT:    ; def v[2:3]
4802; GFX900-NEXT:    ;;#ASMEND
4803; GFX900-NEXT:    v_perm_b32 v0, v1, v3, s4
4804; GFX900-NEXT:    v_perm_b32 v1, v2, v2, s4
4805; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4806; GFX900-NEXT:    s_waitcnt vmcnt(0)
4807; GFX900-NEXT:    s_setpc_b64 s[30:31]
4808;
4809; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_3_3:
4810; GFX90A:       ; %bb.0:
4811; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4812; GFX90A-NEXT:    ;;#ASMSTART
4813; GFX90A-NEXT:    ; def v[0:1]
4814; GFX90A-NEXT:    ;;#ASMEND
4815; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
4816; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
4817; GFX90A-NEXT:    ;;#ASMSTART
4818; GFX90A-NEXT:    ; def v[2:3]
4819; GFX90A-NEXT:    ;;#ASMEND
4820; GFX90A-NEXT:    v_perm_b32 v0, v1, v3, s4
4821; GFX90A-NEXT:    v_perm_b32 v1, v2, v2, s4
4822; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
4823; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4824; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4825;
4826; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_3_3:
4827; GFX940:       ; %bb.0:
4828; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4829; GFX940-NEXT:    ;;#ASMSTART
4830; GFX940-NEXT:    ; def v[0:1]
4831; GFX940-NEXT:    ;;#ASMEND
4832; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
4833; GFX940-NEXT:    v_mov_b32_e32 v4, 0
4834; GFX940-NEXT:    ;;#ASMSTART
4835; GFX940-NEXT:    ; def v[2:3]
4836; GFX940-NEXT:    ;;#ASMEND
4837; GFX940-NEXT:    s_nop 0
4838; GFX940-NEXT:    v_perm_b32 v0, v1, v3, s2
4839; GFX940-NEXT:    v_perm_b32 v1, v2, v2, s2
4840; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
4841; GFX940-NEXT:    s_waitcnt vmcnt(0)
4842; GFX940-NEXT:    s_setpc_b64 s[30:31]
4843  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4844  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
4845  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4846  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4847  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 3, i32 3>
4848  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4849  ret void
4850}
4851
; Two-source shuffle with mask <5, 4, 3, 3>. Every mask index is in the 3-5
; range, so all lanes come from %extract31: lane 0 takes its element 2, lane 1
; its element 1, and lanes 2-3 its element 0. %extract3 is not referenced by
; the mask.
; Only one inline-asm def is expected per target. The lower dword is expected
; to be formed with v_bfi_b32 under a 0xffff mask held in an SGPR; that SGPR
; is then reloaded with 0x5040100 for the v_perm_b32 that forms the upper
; dword. The gfx940 checks expect the sc0 sc1 modifiers on the store.
4852define void @v_shuffle_v4bf16_v3bf16__5_4_3_3(ptr addrspace(1) inreg %ptr) {
4853; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_3_3:
4854; GFX900:       ; %bb.0:
4855; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4856; GFX900-NEXT:    ;;#ASMSTART
4857; GFX900-NEXT:    ; def v[0:1]
4858; GFX900-NEXT:    ;;#ASMEND
4859; GFX900-NEXT:    s_mov_b32 s4, 0xffff
4860; GFX900-NEXT:    v_bfi_b32 v1, s4, v1, v0
4861; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
4862; GFX900-NEXT:    v_mov_b32_e32 v3, 0
4863; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
4864; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
4865; GFX900-NEXT:    s_waitcnt vmcnt(0)
4866; GFX900-NEXT:    s_setpc_b64 s[30:31]
4867;
4868; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_3_3:
4869; GFX90A:       ; %bb.0:
4870; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4871; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
4872; GFX90A-NEXT:    ;;#ASMSTART
4873; GFX90A-NEXT:    ; def v[0:1]
4874; GFX90A-NEXT:    ;;#ASMEND
4875; GFX90A-NEXT:    v_bfi_b32 v2, s4, v1, v0
4876; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
4877; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
4878; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
4879; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
4880; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4881; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4882;
4883; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_3_3:
4884; GFX940:       ; %bb.0:
4885; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4886; GFX940-NEXT:    s_mov_b32 s2, 0xffff
4887; GFX940-NEXT:    ;;#ASMSTART
4888; GFX940-NEXT:    ; def v[0:1]
4889; GFX940-NEXT:    ;;#ASMEND
4890; GFX940-NEXT:    v_mov_b32_e32 v4, 0
4891; GFX940-NEXT:    v_bfi_b32 v2, s2, v1, v0
4892; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
4893; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
4894; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
4895; GFX940-NEXT:    s_waitcnt vmcnt(0)
4896; GFX940-NEXT:    s_setpc_b64 s[30:31]
4897  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4898  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
4899  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4900  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4901  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 3, i32 3>
4902  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4903  ret void
4904}
4905
; Two-source shuffle with mask <5, 5, 3, 3>. Every mask index is in the 3-5
; range, so all lanes come from %extract31: lanes 0-1 take its element 2 and
; lanes 2-3 take its element 0. %extract3 is not referenced by the mask.
; Only one inline-asm def is expected per target, followed by two v_perm_b32
; instructions that each use the same VGPR for both vector sources. The gfx900
; checks store from v[1:2], while the gfx90a and gfx940 checks store from
; v[2:3]; gfx940 additionally expects an s_nop and the sc0 sc1 modifiers on
; the store.
4906define void @v_shuffle_v4bf16_v3bf16__5_5_3_3(ptr addrspace(1) inreg %ptr) {
4907; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_3:
4908; GFX900:       ; %bb.0:
4909; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4910; GFX900-NEXT:    ;;#ASMSTART
4911; GFX900-NEXT:    ; def v[0:1]
4912; GFX900-NEXT:    ;;#ASMEND
4913; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
4914; GFX900-NEXT:    v_mov_b32_e32 v3, 0
4915; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
4916; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
4917; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
4918; GFX900-NEXT:    s_waitcnt vmcnt(0)
4919; GFX900-NEXT:    s_setpc_b64 s[30:31]
4920;
4921; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_3:
4922; GFX90A:       ; %bb.0:
4923; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4924; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
4925; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
4926; GFX90A-NEXT:    ;;#ASMSTART
4927; GFX90A-NEXT:    ; def v[0:1]
4928; GFX90A-NEXT:    ;;#ASMEND
4929; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
4930; GFX90A-NEXT:    v_perm_b32 v2, v1, v1, s4
4931; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
4932; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4933; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4934;
4935; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_3:
4936; GFX940:       ; %bb.0:
4937; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4938; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
4939; GFX940-NEXT:    v_mov_b32_e32 v4, 0
4940; GFX940-NEXT:    ;;#ASMSTART
4941; GFX940-NEXT:    ; def v[0:1]
4942; GFX940-NEXT:    ;;#ASMEND
4943; GFX940-NEXT:    s_nop 0
4944; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
4945; GFX940-NEXT:    v_perm_b32 v2, v1, v1, s2
4946; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
4947; GFX940-NEXT:    s_waitcnt vmcnt(0)
4948; GFX940-NEXT:    s_setpc_b64 s[30:31]
4949  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
4950  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
4951  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4952  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
4953  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 3>
4954  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
4955  ret void
4956}
4957
; Two-source shuffle with mask <5, 5, poison, 3>. Lanes 0-1 take element 2 of
; %extract31, lane 2 is poison, and lane 3 takes element 0 of %extract31.
; %extract3 is not referenced by the mask.
; Only one inline-asm def is expected per target. The lower dword is expected
; to come from a v_perm_b32 of v1 with itself, and the upper dword from a
; v_lshlrev_b32 of v0 by 16. The gfx900 checks store from v[1:2], while the
; gfx90a and gfx940 checks store from v[2:3]; gfx940 additionally expects an
; s_nop and the sc0 sc1 modifiers on the store.
4958define void @v_shuffle_v4bf16_v3bf16__5_5_u_3(ptr addrspace(1) inreg %ptr) {
4959; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_3:
4960; GFX900:       ; %bb.0:
4961; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4962; GFX900-NEXT:    ;;#ASMSTART
4963; GFX900-NEXT:    ; def v[0:1]
4964; GFX900-NEXT:    ;;#ASMEND
4965; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
4966; GFX900-NEXT:    v_mov_b32_e32 v3, 0
4967; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
4968; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
4969; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
4970; GFX900-NEXT:    s_waitcnt vmcnt(0)
4971; GFX900-NEXT:    s_setpc_b64 s[30:31]
4972;
4973; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_3:
4974; GFX90A:       ; %bb.0:
4975; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4976; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
4977; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
4978; GFX90A-NEXT:    ;;#ASMSTART
4979; GFX90A-NEXT:    ; def v[0:1]
4980; GFX90A-NEXT:    ;;#ASMEND
4981; GFX90A-NEXT:    v_perm_b32 v2, v1, v1, s4
4982; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
4983; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
4984; GFX90A-NEXT:    s_waitcnt vmcnt(0)
4985; GFX90A-NEXT:    s_setpc_b64 s[30:31]
4986;
4987; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_3:
4988; GFX940:       ; %bb.0:
4989; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4990; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
4991; GFX940-NEXT:    v_mov_b32_e32 v4, 0
4992; GFX940-NEXT:    ;;#ASMSTART
4993; GFX940-NEXT:    ; def v[0:1]
4994; GFX940-NEXT:    ;;#ASMEND
4995; GFX940-NEXT:    s_nop 0
4996; GFX940-NEXT:    v_perm_b32 v2, v1, v1, s2
4997; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
4998; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
4999; GFX940-NEXT:    s_waitcnt vmcnt(0)
5000; GFX940-NEXT:    s_setpc_b64 s[30:31]
5001  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5002  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5003  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5004  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5005  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 3>
5006  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5007  ret void
5008}
5009
; Two-source shuffle with mask <5, 5, 0, 3>. Lanes 0-1 take element 2 of
; %extract31, lane 2 takes element 0 of %extract3, and lane 3 takes element 0
; of %extract31.
; Both inline-asm defs are expected in the output, followed by two v_perm_b32
; instructions and one 64-bit store. In the gfx900 checks the second def is
; v[1:2], overlapping v1 of the first def; the gfx90a and gfx940 checks use
; v[2:3] instead. gfx940 additionally expects an s_nop and the sc0 sc1
; modifiers on the store.
5010define void @v_shuffle_v4bf16_v3bf16__5_5_0_3(ptr addrspace(1) inreg %ptr) {
5011; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_3:
5012; GFX900:       ; %bb.0:
5013; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5014; GFX900-NEXT:    ;;#ASMSTART
5015; GFX900-NEXT:    ; def v[0:1]
5016; GFX900-NEXT:    ;;#ASMEND
5017; GFX900-NEXT:    ;;#ASMSTART
5018; GFX900-NEXT:    ; def v[1:2]
5019; GFX900-NEXT:    ;;#ASMEND
5020; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
5021; GFX900-NEXT:    v_mov_b32_e32 v3, 0
5022; GFX900-NEXT:    v_perm_b32 v1, v1, v0, s4
5023; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
5024; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
5025; GFX900-NEXT:    s_waitcnt vmcnt(0)
5026; GFX900-NEXT:    s_setpc_b64 s[30:31]
5027;
5028; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_3:
5029; GFX90A:       ; %bb.0:
5030; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5031; GFX90A-NEXT:    ;;#ASMSTART
5032; GFX90A-NEXT:    ; def v[0:1]
5033; GFX90A-NEXT:    ;;#ASMEND
5034; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
5035; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
5036; GFX90A-NEXT:    ;;#ASMSTART
5037; GFX90A-NEXT:    ; def v[2:3]
5038; GFX90A-NEXT:    ;;#ASMEND
5039; GFX90A-NEXT:    v_perm_b32 v1, v2, v0, s4
5040; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
5041; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
5042; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5043; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5044;
5045; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_3:
5046; GFX940:       ; %bb.0:
5047; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5048; GFX940-NEXT:    ;;#ASMSTART
5049; GFX940-NEXT:    ; def v[0:1]
5050; GFX940-NEXT:    ;;#ASMEND
5051; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
5052; GFX940-NEXT:    v_mov_b32_e32 v4, 0
5053; GFX940-NEXT:    ;;#ASMSTART
5054; GFX940-NEXT:    ; def v[2:3]
5055; GFX940-NEXT:    ;;#ASMEND
5056; GFX940-NEXT:    s_nop 0
5057; GFX940-NEXT:    v_perm_b32 v1, v2, v0, s2
5058; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
5059; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
5060; GFX940-NEXT:    s_waitcnt vmcnt(0)
5061; GFX940-NEXT:    s_setpc_b64 s[30:31]
5062  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5063  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5064  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5065  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5066  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 3>
5067  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5068  ret void
5069}
5070
; Shuffle of two <3 x bfloat> inputs into a <4 x bfloat> with mask <5, 5, 1, 3>.
; Mask indices 0-2 pick from %extract3 and 3-5 pick from %extract31, so the
; result is { %extract31[2], %extract31[2], %extract3[1], %extract31[0] }.
; Each input is lanes 0-2 of a <4 x bfloat> produced by an inline-asm def; the
; shuffled vector is stored to %ptr. The check lines below are autogenerated
; (update_llc_test_checks.py) for gfx900, gfx90a and gfx940.
5071define void @v_shuffle_v4bf16_v3bf16__5_5_1_3(ptr addrspace(1) inreg %ptr) {
5072; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_3:
5073; GFX900:       ; %bb.0:
5074; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5075; GFX900-NEXT:    ;;#ASMSTART
5076; GFX900-NEXT:    ; def v[0:1]
5077; GFX900-NEXT:    ;;#ASMEND
5078; GFX900-NEXT:    ;;#ASMSTART
5079; GFX900-NEXT:    ; def v[1:2]
5080; GFX900-NEXT:    ;;#ASMEND
5081; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
5082; GFX900-NEXT:    v_mov_b32_e32 v3, 0
5083; GFX900-NEXT:    v_alignbit_b32 v1, v1, v0, 16
5084; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
5085; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
5086; GFX900-NEXT:    s_waitcnt vmcnt(0)
5087; GFX900-NEXT:    s_setpc_b64 s[30:31]
5088;
5089; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_3:
5090; GFX90A:       ; %bb.0:
5091; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5092; GFX90A-NEXT:    ;;#ASMSTART
5093; GFX90A-NEXT:    ; def v[0:1]
5094; GFX90A-NEXT:    ;;#ASMEND
5095; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
5096; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
5097; GFX90A-NEXT:    ;;#ASMSTART
5098; GFX90A-NEXT:    ; def v[2:3]
5099; GFX90A-NEXT:    ;;#ASMEND
5100; GFX90A-NEXT:    v_alignbit_b32 v1, v2, v0, 16
5101; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
5102; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
5103; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5104; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5105;
5106; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_3:
5107; GFX940:       ; %bb.0:
5108; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5109; GFX940-NEXT:    ;;#ASMSTART
5110; GFX940-NEXT:    ; def v[0:1]
5111; GFX940-NEXT:    ;;#ASMEND
5112; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
5113; GFX940-NEXT:    v_mov_b32_e32 v4, 0
5114; GFX940-NEXT:    ;;#ASMSTART
5115; GFX940-NEXT:    ; def v[2:3]
5116; GFX940-NEXT:    ;;#ASMEND
5117; GFX940-NEXT:    s_nop 0
5118; GFX940-NEXT:    v_alignbit_b32 v1, v2, v0, 16
5119; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
5120; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
5121; GFX940-NEXT:    s_waitcnt vmcnt(0)
5122; GFX940-NEXT:    s_setpc_b64 s[30:31]
5123  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5124  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5125  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5126  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5127  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 3>
5128  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5129  ret void
5130}
5131
; Shuffle of two <3 x bfloat> inputs into a <4 x bfloat> with mask <5, 5, 2, 3>.
; Mask indices 0-2 pick from %extract3 and 3-5 pick from %extract31, so the
; result is { %extract31[2], %extract31[2], %extract3[2], %extract31[0] }.
; Each input is lanes 0-2 of a <4 x bfloat> produced by an inline-asm def; the
; shuffled vector is stored to %ptr. The check lines below are autogenerated
; (update_llc_test_checks.py) for gfx900, gfx90a and gfx940.
5132define void @v_shuffle_v4bf16_v3bf16__5_5_2_3(ptr addrspace(1) inreg %ptr) {
5133; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_3:
5134; GFX900:       ; %bb.0:
5135; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5136; GFX900-NEXT:    ;;#ASMSTART
5137; GFX900-NEXT:    ; def v[0:1]
5138; GFX900-NEXT:    ;;#ASMEND
5139; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
5140; GFX900-NEXT:    v_mov_b32_e32 v4, 0
5141; GFX900-NEXT:    ;;#ASMSTART
5142; GFX900-NEXT:    ; def v[2:3]
5143; GFX900-NEXT:    ;;#ASMEND
5144; GFX900-NEXT:    v_perm_b32 v1, v2, v1, s4
5145; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
5146; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
5147; GFX900-NEXT:    s_waitcnt vmcnt(0)
5148; GFX900-NEXT:    s_setpc_b64 s[30:31]
5149;
5150; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_3:
5151; GFX90A:       ; %bb.0:
5152; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5153; GFX90A-NEXT:    ;;#ASMSTART
5154; GFX90A-NEXT:    ; def v[0:1]
5155; GFX90A-NEXT:    ;;#ASMEND
5156; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
5157; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
5158; GFX90A-NEXT:    ;;#ASMSTART
5159; GFX90A-NEXT:    ; def v[2:3]
5160; GFX90A-NEXT:    ;;#ASMEND
5161; GFX90A-NEXT:    v_perm_b32 v1, v2, v1, s4
5162; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
5163; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
5164; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5165; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5166;
5167; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_3:
5168; GFX940:       ; %bb.0:
5169; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5170; GFX940-NEXT:    ;;#ASMSTART
5171; GFX940-NEXT:    ; def v[0:1]
5172; GFX940-NEXT:    ;;#ASMEND
5173; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
5174; GFX940-NEXT:    v_mov_b32_e32 v4, 0
5175; GFX940-NEXT:    ;;#ASMSTART
5176; GFX940-NEXT:    ; def v[2:3]
5177; GFX940-NEXT:    ;;#ASMEND
5178; GFX940-NEXT:    s_nop 0
5179; GFX940-NEXT:    v_perm_b32 v1, v2, v1, s2
5180; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
5181; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
5182; GFX940-NEXT:    s_waitcnt vmcnt(0)
5183; GFX940-NEXT:    s_setpc_b64 s[30:31]
5184  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5185  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5186  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5187  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5188  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
5189  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5190  ret void
5191}
5192
; Shuffle of two <3 x bfloat> inputs into a <4 x bfloat> with mask <5, 5, 4, 3>.
; Mask indices 3-5 pick from %extract31, so the result is
; { %extract31[2], %extract31[2], %extract31[1], %extract31[0] }; no lane of
; %extract3 is referenced, and the checked output for every target contains a
; single inline-asm def. The shuffled vector is stored to %ptr. The check lines
; below are autogenerated (update_llc_test_checks.py).
5193define void @v_shuffle_v4bf16_v3bf16__5_5_4_3(ptr addrspace(1) inreg %ptr) {
5194; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_3:
5195; GFX900:       ; %bb.0:
5196; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5197; GFX900-NEXT:    ;;#ASMSTART
5198; GFX900-NEXT:    ; def v[0:1]
5199; GFX900-NEXT:    ;;#ASMEND
5200; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
5201; GFX900-NEXT:    v_mov_b32_e32 v3, 0
5202; GFX900-NEXT:    v_alignbit_b32 v2, v0, v0, 16
5203; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
5204; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
5205; GFX900-NEXT:    s_waitcnt vmcnt(0)
5206; GFX900-NEXT:    s_setpc_b64 s[30:31]
5207;
5208; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_3:
5209; GFX90A:       ; %bb.0:
5210; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5211; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
5212; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
5213; GFX90A-NEXT:    ;;#ASMSTART
5214; GFX90A-NEXT:    ; def v[0:1]
5215; GFX90A-NEXT:    ;;#ASMEND
5216; GFX90A-NEXT:    v_alignbit_b32 v3, v0, v0, 16
5217; GFX90A-NEXT:    v_perm_b32 v2, v1, v1, s4
5218; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
5219; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5220; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5221;
5222; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_3:
5223; GFX940:       ; %bb.0:
5224; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5225; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
5226; GFX940-NEXT:    v_mov_b32_e32 v4, 0
5227; GFX940-NEXT:    ;;#ASMSTART
5228; GFX940-NEXT:    ; def v[0:1]
5229; GFX940-NEXT:    ;;#ASMEND
5230; GFX940-NEXT:    s_nop 0
5231; GFX940-NEXT:    v_alignbit_b32 v3, v0, v0, 16
5232; GFX940-NEXT:    v_perm_b32 v2, v1, v1, s2
5233; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
5234; GFX940-NEXT:    s_waitcnt vmcnt(0)
5235; GFX940-NEXT:    s_setpc_b64 s[30:31]
5236  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5237  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5238  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5239  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5240  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 3>
5241  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5242  ret void
5243}
5244
; Shuffle of two <3 x bfloat> inputs into a <4 x bfloat> with mask
; <poison, 4, 4, 4> (the "u" in the function name is the poison lane).
; Result lane 0 is unspecified; lanes 1-3 are all %extract31[1]. No lane of
; %extract3 is referenced, and the checked output for every target contains a
; single inline-asm def. The shuffled vector is stored to %ptr. The check lines
; below are autogenerated (update_llc_test_checks.py).
5245define void @v_shuffle_v4bf16_v3bf16__u_4_4_4(ptr addrspace(1) inreg %ptr) {
5246; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__u_4_4_4:
5247; GFX900:       ; %bb.0:
5248; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5249; GFX900-NEXT:    ;;#ASMSTART
5250; GFX900-NEXT:    ; def v[0:1]
5251; GFX900-NEXT:    ;;#ASMEND
5252; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
5253; GFX900-NEXT:    v_mov_b32_e32 v2, 0
5254; GFX900-NEXT:    v_perm_b32 v1, v0, v0, s4
5255; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
5256; GFX900-NEXT:    s_waitcnt vmcnt(0)
5257; GFX900-NEXT:    s_setpc_b64 s[30:31]
5258;
5259; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__u_4_4_4:
5260; GFX90A:       ; %bb.0:
5261; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5262; GFX90A-NEXT:    ;;#ASMSTART
5263; GFX90A-NEXT:    ; def v[0:1]
5264; GFX90A-NEXT:    ;;#ASMEND
5265; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
5266; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
5267; GFX90A-NEXT:    v_perm_b32 v1, v0, v0, s4
5268; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
5269; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5270; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5271;
5272; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__u_4_4_4:
5273; GFX940:       ; %bb.0:
5274; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5275; GFX940-NEXT:    ;;#ASMSTART
5276; GFX940-NEXT:    ; def v[0:1]
5277; GFX940-NEXT:    ;;#ASMEND
5278; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
5279; GFX940-NEXT:    v_mov_b32_e32 v2, 0
5280; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s2
5281; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
5282; GFX940-NEXT:    s_waitcnt vmcnt(0)
5283; GFX940-NEXT:    s_setpc_b64 s[30:31]
5284  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5285  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5286  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5287  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5288  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 poison, i32 4, i32 4, i32 4>
5289  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5290  ret void
5291}
5292
; Shuffle of two <3 x bfloat> inputs into a <4 x bfloat> with mask <0, 4, 4, 4>.
; Mask indices 0-2 pick from %extract3 and 3-5 pick from %extract31, so the
; result is { %extract3[0], %extract31[1], %extract31[1], %extract31[1] }.
; Each input is lanes 0-2 of a <4 x bfloat> produced by an inline-asm def; the
; shuffled vector is stored to %ptr. The check lines below are autogenerated
; (update_llc_test_checks.py) for gfx900, gfx90a and gfx940.
5293define void @v_shuffle_v4bf16_v3bf16__0_4_4_4(ptr addrspace(1) inreg %ptr) {
5294; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_4_4_4:
5295; GFX900:       ; %bb.0:
5296; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5297; GFX900-NEXT:    ;;#ASMSTART
5298; GFX900-NEXT:    ; def v[0:1]
5299; GFX900-NEXT:    ;;#ASMEND
5300; GFX900-NEXT:    s_mov_b32 s4, 0xffff
5301; GFX900-NEXT:    ;;#ASMSTART
5302; GFX900-NEXT:    ; def v[1:2]
5303; GFX900-NEXT:    ;;#ASMEND
5304; GFX900-NEXT:    v_bfi_b32 v0, s4, v0, v1
5305; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
5306; GFX900-NEXT:    v_mov_b32_e32 v3, 0
5307; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
5308; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
5309; GFX900-NEXT:    s_waitcnt vmcnt(0)
5310; GFX900-NEXT:    s_setpc_b64 s[30:31]
5311;
5312; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_4_4_4:
5313; GFX90A:       ; %bb.0:
5314; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5315; GFX90A-NEXT:    ;;#ASMSTART
5316; GFX90A-NEXT:    ; def v[0:1]
5317; GFX90A-NEXT:    ;;#ASMEND
5318; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
5319; GFX90A-NEXT:    ;;#ASMSTART
5320; GFX90A-NEXT:    ; def v[2:3]
5321; GFX90A-NEXT:    ;;#ASMEND
5322; GFX90A-NEXT:    v_bfi_b32 v0, s4, v0, v2
5323; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
5324; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
5325; GFX90A-NEXT:    v_perm_b32 v1, v2, v2, s4
5326; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
5327; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5328; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5329;
5330; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_4_4_4:
5331; GFX940:       ; %bb.0:
5332; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5333; GFX940-NEXT:    ;;#ASMSTART
5334; GFX940-NEXT:    ; def v[0:1]
5335; GFX940-NEXT:    ;;#ASMEND
5336; GFX940-NEXT:    s_mov_b32 s2, 0xffff
5337; GFX940-NEXT:    ;;#ASMSTART
5338; GFX940-NEXT:    ; def v[2:3]
5339; GFX940-NEXT:    ;;#ASMEND
5340; GFX940-NEXT:    v_mov_b32_e32 v4, 0
5341; GFX940-NEXT:    v_bfi_b32 v0, s2, v0, v2
5342; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
5343; GFX940-NEXT:    v_perm_b32 v1, v2, v2, s2
5344; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
5345; GFX940-NEXT:    s_waitcnt vmcnt(0)
5346; GFX940-NEXT:    s_setpc_b64 s[30:31]
5347  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5348  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5349  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5350  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5351  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
5352  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5353  ret void
5354}
5355
; Shuffle of two <3 x bfloat> inputs into a <4 x bfloat> with mask <1, 4, 4, 4>.
; Mask indices 0-2 pick from %extract3 and 3-5 pick from %extract31, so the
; result is { %extract3[1], %extract31[1], %extract31[1], %extract31[1] }.
; Each input is lanes 0-2 of a <4 x bfloat> produced by an inline-asm def; the
; shuffled vector is stored to %ptr. The check lines below are autogenerated
; (update_llc_test_checks.py) for gfx900, gfx90a and gfx940.
5356define void @v_shuffle_v4bf16_v3bf16__1_4_4_4(ptr addrspace(1) inreg %ptr) {
5357; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_4_4_4:
5358; GFX900:       ; %bb.0:
5359; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5360; GFX900-NEXT:    ;;#ASMSTART
5361; GFX900-NEXT:    ; def v[0:1]
5362; GFX900-NEXT:    ;;#ASMEND
5363; GFX900-NEXT:    ;;#ASMSTART
5364; GFX900-NEXT:    ; def v[1:2]
5365; GFX900-NEXT:    ;;#ASMEND
5366; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
5367; GFX900-NEXT:    v_mov_b32_e32 v3, 0
5368; GFX900-NEXT:    v_perm_b32 v0, v1, v0, s4
5369; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
5370; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
5371; GFX900-NEXT:    s_waitcnt vmcnt(0)
5372; GFX900-NEXT:    s_setpc_b64 s[30:31]
5373;
5374; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_4_4_4:
5375; GFX90A:       ; %bb.0:
5376; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5377; GFX90A-NEXT:    ;;#ASMSTART
5378; GFX90A-NEXT:    ; def v[0:1]
5379; GFX90A-NEXT:    ;;#ASMEND
5380; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
5381; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
5382; GFX90A-NEXT:    ;;#ASMSTART
5383; GFX90A-NEXT:    ; def v[2:3]
5384; GFX90A-NEXT:    ;;#ASMEND
5385; GFX90A-NEXT:    v_perm_b32 v0, v2, v0, s4
5386; GFX90A-NEXT:    v_perm_b32 v1, v2, v2, s4
5387; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
5388; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5389; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5390;
5391; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_4_4_4:
5392; GFX940:       ; %bb.0:
5393; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5394; GFX940-NEXT:    ;;#ASMSTART
5395; GFX940-NEXT:    ; def v[0:1]
5396; GFX940-NEXT:    ;;#ASMEND
5397; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
5398; GFX940-NEXT:    v_mov_b32_e32 v4, 0
5399; GFX940-NEXT:    ;;#ASMSTART
5400; GFX940-NEXT:    ; def v[2:3]
5401; GFX940-NEXT:    ;;#ASMEND
5402; GFX940-NEXT:    s_nop 0
5403; GFX940-NEXT:    v_perm_b32 v0, v2, v0, s2
5404; GFX940-NEXT:    v_perm_b32 v1, v2, v2, s2
5405; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
5406; GFX940-NEXT:    s_waitcnt vmcnt(0)
5407; GFX940-NEXT:    s_setpc_b64 s[30:31]
5408  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5409  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5410  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5411  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5412  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 1, i32 4, i32 4, i32 4>
5413  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5414  ret void
5415}
5416
; Shuffle of two <3 x bfloat> inputs into a <4 x bfloat> with mask <2, 4, 4, 4>.
; Mask indices 0-2 pick from %extract3 and 3-5 pick from %extract31, so the
; result is { %extract3[2], %extract31[1], %extract31[1], %extract31[1] }.
; Each input is lanes 0-2 of a <4 x bfloat> produced by an inline-asm def; the
; shuffled vector is stored to %ptr. The check lines below are autogenerated
; (update_llc_test_checks.py) for gfx900, gfx90a and gfx940.
5417define void @v_shuffle_v4bf16_v3bf16__2_4_4_4(ptr addrspace(1) inreg %ptr) {
5418; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_4_4_4:
5419; GFX900:       ; %bb.0:
5420; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5421; GFX900-NEXT:    ;;#ASMSTART
5422; GFX900-NEXT:    ; def v[0:1]
5423; GFX900-NEXT:    ;;#ASMEND
5424; GFX900-NEXT:    s_mov_b32 s4, 0xffff
5425; GFX900-NEXT:    ;;#ASMSTART
5426; GFX900-NEXT:    ; def v[2:3]
5427; GFX900-NEXT:    ;;#ASMEND
5428; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v2
5429; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
5430; GFX900-NEXT:    v_mov_b32_e32 v4, 0
5431; GFX900-NEXT:    v_perm_b32 v1, v2, v2, s4
5432; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
5433; GFX900-NEXT:    s_waitcnt vmcnt(0)
5434; GFX900-NEXT:    s_setpc_b64 s[30:31]
5435;
5436; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_4_4_4:
5437; GFX90A:       ; %bb.0:
5438; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5439; GFX90A-NEXT:    ;;#ASMSTART
5440; GFX90A-NEXT:    ; def v[0:1]
5441; GFX90A-NEXT:    ;;#ASMEND
5442; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
5443; GFX90A-NEXT:    ;;#ASMSTART
5444; GFX90A-NEXT:    ; def v[2:3]
5445; GFX90A-NEXT:    ;;#ASMEND
5446; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v2
5447; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
5448; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
5449; GFX90A-NEXT:    v_perm_b32 v1, v2, v2, s4
5450; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
5451; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5452; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5453;
5454; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_4_4_4:
5455; GFX940:       ; %bb.0:
5456; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5457; GFX940-NEXT:    ;;#ASMSTART
5458; GFX940-NEXT:    ; def v[0:1]
5459; GFX940-NEXT:    ;;#ASMEND
5460; GFX940-NEXT:    s_mov_b32 s2, 0xffff
5461; GFX940-NEXT:    ;;#ASMSTART
5462; GFX940-NEXT:    ; def v[2:3]
5463; GFX940-NEXT:    ;;#ASMEND
5464; GFX940-NEXT:    v_mov_b32_e32 v4, 0
5465; GFX940-NEXT:    v_bfi_b32 v0, s2, v1, v2
5466; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
5467; GFX940-NEXT:    v_perm_b32 v1, v2, v2, s2
5468; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
5469; GFX940-NEXT:    s_waitcnt vmcnt(0)
5470; GFX940-NEXT:    s_setpc_b64 s[30:31]
5471  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5472  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5473  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5474  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5475  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 2, i32 4, i32 4, i32 4>
5476  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5477  ret void
5478}
5479
; Shuffle of two <3 x bfloat> inputs into a <4 x bfloat> with mask <3, 4, 4, 4>.
; Mask indices 3-5 pick from %extract31, so the result is
; { %extract31[0], %extract31[1], %extract31[1], %extract31[1] }; no lane of
; %extract3 is referenced, and the checked output for every target contains a
; single inline-asm def. The shuffled vector is stored to %ptr. The check lines
; below are autogenerated (update_llc_test_checks.py).
5480define void @v_shuffle_v4bf16_v3bf16__3_4_4_4(ptr addrspace(1) inreg %ptr) {
5481; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__3_4_4_4:
5482; GFX900:       ; %bb.0:
5483; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5484; GFX900-NEXT:    ;;#ASMSTART
5485; GFX900-NEXT:    ; def v[0:1]
5486; GFX900-NEXT:    ;;#ASMEND
5487; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
5488; GFX900-NEXT:    v_mov_b32_e32 v2, 0
5489; GFX900-NEXT:    v_perm_b32 v1, v0, v0, s4
5490; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
5491; GFX900-NEXT:    s_waitcnt vmcnt(0)
5492; GFX900-NEXT:    s_setpc_b64 s[30:31]
5493;
5494; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__3_4_4_4:
5495; GFX90A:       ; %bb.0:
5496; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5497; GFX90A-NEXT:    ;;#ASMSTART
5498; GFX90A-NEXT:    ; def v[0:1]
5499; GFX90A-NEXT:    ;;#ASMEND
5500; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
5501; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
5502; GFX90A-NEXT:    v_perm_b32 v1, v0, v0, s4
5503; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
5504; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5505; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5506;
5507; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__3_4_4_4:
5508; GFX940:       ; %bb.0:
5509; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5510; GFX940-NEXT:    ;;#ASMSTART
5511; GFX940-NEXT:    ; def v[0:1]
5512; GFX940-NEXT:    ;;#ASMEND
5513; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
5514; GFX940-NEXT:    v_mov_b32_e32 v2, 0
5515; GFX940-NEXT:    v_perm_b32 v1, v0, v0, s2
5516; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
5517; GFX940-NEXT:    s_waitcnt vmcnt(0)
5518; GFX940-NEXT:    s_setpc_b64 s[30:31]
5519  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5520  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5521  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5522  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5523  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
5524  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5525  ret void
5526}
5527
; Shuffle of two <3 x bfloat> inputs into a <4 x bfloat> with mask <4, 4, 4, 4>.
; Mask index 4 is lane 1 of the second operand, so every result lane is
; %extract31[1]; no lane of %extract3 is referenced, and the checked
; output for every target contains a single inline-asm def. The shuffled vector
; is stored to %ptr. The check lines below are autogenerated
; (update_llc_test_checks.py).
5528define void @v_shuffle_v4bf16_v3bf16__4_4_4_4(ptr addrspace(1) inreg %ptr) {
5529; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_4_4_4:
5530; GFX900:       ; %bb.0:
5531; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5532; GFX900-NEXT:    ;;#ASMSTART
5533; GFX900-NEXT:    ; def v[0:1]
5534; GFX900-NEXT:    ;;#ASMEND
5535; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
5536; GFX900-NEXT:    v_perm_b32 v0, v0, v0, s4
5537; GFX900-NEXT:    v_mov_b32_e32 v2, 0
5538; GFX900-NEXT:    v_mov_b32_e32 v1, v0
5539; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
5540; GFX900-NEXT:    s_waitcnt vmcnt(0)
5541; GFX900-NEXT:    s_setpc_b64 s[30:31]
5542;
5543; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_4_4_4:
5544; GFX90A:       ; %bb.0:
5545; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5546; GFX90A-NEXT:    ;;#ASMSTART
5547; GFX90A-NEXT:    ; def v[0:1]
5548; GFX90A-NEXT:    ;;#ASMEND
5549; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
5550; GFX90A-NEXT:    v_perm_b32 v0, v0, v0, s4
5551; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
5552; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
5553; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
5554; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5555; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5556;
5557; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_4_4_4:
5558; GFX940:       ; %bb.0:
5559; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5560; GFX940-NEXT:    ;;#ASMSTART
5561; GFX940-NEXT:    ; def v[0:1]
5562; GFX940-NEXT:    ;;#ASMEND
5563; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
5564; GFX940-NEXT:    v_perm_b32 v0, v0, v0, s2
5565; GFX940-NEXT:    v_mov_b32_e32 v2, 0
5566; GFX940-NEXT:    v_mov_b32_e32 v1, v0
5567; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
5568; GFX940-NEXT:    s_waitcnt vmcnt(0)
5569; GFX940-NEXT:    s_setpc_b64 s[30:31]
5570  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5571  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5572  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5573  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5574  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
5575  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5576  ret void
5577}
5578
; Shuffle of two <3 x bfloat> inputs into a <4 x bfloat> with mask <5, 4, 4, 4>.
; Mask indices 3-5 pick from %extract31, so the result is
; { %extract31[2], %extract31[1], %extract31[1], %extract31[1] }; no lane of
; %extract3 is referenced, and the checked output for every target contains a
; single inline-asm def. The shuffled vector is stored to %ptr. The check lines
; below are autogenerated (update_llc_test_checks.py).
5579define void @v_shuffle_v4bf16_v3bf16__5_4_4_4(ptr addrspace(1) inreg %ptr) {
5580; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_4_4:
5581; GFX900:       ; %bb.0:
5582; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5583; GFX900-NEXT:    ;;#ASMSTART
5584; GFX900-NEXT:    ; def v[0:1]
5585; GFX900-NEXT:    ;;#ASMEND
5586; GFX900-NEXT:    s_mov_b32 s4, 0xffff
5587; GFX900-NEXT:    v_bfi_b32 v1, s4, v1, v0
5588; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
5589; GFX900-NEXT:    v_mov_b32_e32 v3, 0
5590; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
5591; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
5592; GFX900-NEXT:    s_waitcnt vmcnt(0)
5593; GFX900-NEXT:    s_setpc_b64 s[30:31]
5594;
5595; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_4_4:
5596; GFX90A:       ; %bb.0:
5597; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5598; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
5599; GFX90A-NEXT:    ;;#ASMSTART
5600; GFX90A-NEXT:    ; def v[0:1]
5601; GFX90A-NEXT:    ;;#ASMEND
5602; GFX90A-NEXT:    v_bfi_b32 v2, s4, v1, v0
5603; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
5604; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
5605; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
5606; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
5607; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5608; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5609;
5610; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_4_4:
5611; GFX940:       ; %bb.0:
5612; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5613; GFX940-NEXT:    s_mov_b32 s2, 0xffff
5614; GFX940-NEXT:    ;;#ASMSTART
5615; GFX940-NEXT:    ; def v[0:1]
5616; GFX940-NEXT:    ;;#ASMEND
5617; GFX940-NEXT:    v_mov_b32_e32 v4, 0
5618; GFX940-NEXT:    v_bfi_b32 v2, s2, v1, v0
5619; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
5620; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
5621; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
5622; GFX940-NEXT:    s_waitcnt vmcnt(0)
5623; GFX940-NEXT:    s_setpc_b64 s[30:31]
5624  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5625  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5626  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5627  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5628  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 4, i32 4>
5629  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5630  ret void
5631}
5632
; Shuffle of two <3 x bfloat> inputs into a <4 x bfloat> with mask
; <5, poison, 4, 4> (the "u" in the function name is the poison lane).
; Result lane 0 is %extract31[2], lane 1 is unspecified, and lanes 2-3 are
; %extract31[1]. No lane of %extract3 is referenced, and the checked output for
; every target contains a single inline-asm def. The shuffled vector is stored
; to %ptr. The check lines below are autogenerated (update_llc_test_checks.py).
5633define void @v_shuffle_v4bf16_v3bf16__5_u_4_4(ptr addrspace(1) inreg %ptr) {
5634; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_4_4:
5635; GFX900:       ; %bb.0:
5636; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5637; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
5638; GFX900-NEXT:    v_mov_b32_e32 v3, 0
5639; GFX900-NEXT:    ;;#ASMSTART
5640; GFX900-NEXT:    ; def v[0:1]
5641; GFX900-NEXT:    ;;#ASMEND
5642; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
5643; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
5644; GFX900-NEXT:    s_waitcnt vmcnt(0)
5645; GFX900-NEXT:    s_setpc_b64 s[30:31]
5646;
5647; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_4_4:
5648; GFX90A:       ; %bb.0:
5649; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5650; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
5651; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
5652; GFX90A-NEXT:    ;;#ASMSTART
5653; GFX90A-NEXT:    ; def v[0:1]
5654; GFX90A-NEXT:    ;;#ASMEND
5655; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
5656; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
5657; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
5658; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5659; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5660;
5661; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_4_4:
5662; GFX940:       ; %bb.0:
5663; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5664; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
5665; GFX940-NEXT:    v_mov_b32_e32 v4, 0
5666; GFX940-NEXT:    ;;#ASMSTART
5667; GFX940-NEXT:    ; def v[0:1]
5668; GFX940-NEXT:    ;;#ASMEND
5669; GFX940-NEXT:    s_nop 0
5670; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
5671; GFX940-NEXT:    v_mov_b32_e32 v2, v1
5672; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
5673; GFX940-NEXT:    s_waitcnt vmcnt(0)
5674; GFX940-NEXT:    s_setpc_b64 s[30:31]
5675  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5676  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5677  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5678  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5679  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 4, i32 4>
5680  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5681  ret void
5682}
5683
; Shuffle of two <3 x bfloat> inputs into a <4 x bfloat> with mask <5, 0, 4, 4>.
; Mask indices 0-2 pick from %extract3 and 3-5 pick from %extract31, so the
; result is { %extract31[2], %extract3[0], %extract31[1], %extract31[1] }.
; Each input is lanes 0-2 of a <4 x bfloat> produced by an inline-asm def; the
; shuffled vector is stored to %ptr. The check lines below are autogenerated
; (update_llc_test_checks.py) for gfx900, gfx90a and gfx940.
5684define void @v_shuffle_v4bf16_v3bf16__5_0_4_4(ptr addrspace(1) inreg %ptr) {
5685; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_4_4:
5686; GFX900:       ; %bb.0:
5687; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5688; GFX900-NEXT:    ;;#ASMSTART
5689; GFX900-NEXT:    ; def v[0:1]
5690; GFX900-NEXT:    ;;#ASMEND
5691; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
5692; GFX900-NEXT:    ;;#ASMSTART
5693; GFX900-NEXT:    ; def v[1:2]
5694; GFX900-NEXT:    ;;#ASMEND
5695; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
5696; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
5697; GFX900-NEXT:    v_mov_b32_e32 v3, 0
5698; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
5699; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
5700; GFX900-NEXT:    s_waitcnt vmcnt(0)
5701; GFX900-NEXT:    s_setpc_b64 s[30:31]
5702;
5703; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_4_4:
5704; GFX90A:       ; %bb.0:
5705; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5706; GFX90A-NEXT:    ;;#ASMSTART
5707; GFX90A-NEXT:    ; def v[0:1]
5708; GFX90A-NEXT:    ;;#ASMEND
5709; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
5710; GFX90A-NEXT:    ;;#ASMSTART
5711; GFX90A-NEXT:    ; def v[2:3]
5712; GFX90A-NEXT:    ;;#ASMEND
5713; GFX90A-NEXT:    v_perm_b32 v0, v0, v3, s4
5714; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
5715; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
5716; GFX90A-NEXT:    v_perm_b32 v1, v2, v2, s4
5717; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
5718; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5719; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5720;
5721; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_4_4:
5722; GFX940:       ; %bb.0:
5723; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5724; GFX940-NEXT:    ;;#ASMSTART
5725; GFX940-NEXT:    ; def v[0:1]
5726; GFX940-NEXT:    ;;#ASMEND
5727; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
5728; GFX940-NEXT:    ;;#ASMSTART
5729; GFX940-NEXT:    ; def v[2:3]
5730; GFX940-NEXT:    ;;#ASMEND
5731; GFX940-NEXT:    v_mov_b32_e32 v4, 0
5732; GFX940-NEXT:    v_perm_b32 v0, v0, v3, s2
5733; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
5734; GFX940-NEXT:    v_perm_b32 v1, v2, v2, s2
5735; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
5736; GFX940-NEXT:    s_waitcnt vmcnt(0)
5737; GFX940-NEXT:    s_setpc_b64 s[30:31]
5738  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5739  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5740  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5741  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5742  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 4, i32 4>
5743  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5744  ret void
5745}
5746
; Shuffle mask <5, 1, 4, 4> (spelled out in the function-name suffix).
; Mask indices 0-2 select lanes of %extract3 (lanes 0-2 of %vec0) and indices
; 3-5 select lanes of %extract31 (lanes 0-2 of %vec1), so the stored vector is
; { %vec1[2], %vec0[1], %vec1[1], %vec1[1] }.
5747define void @v_shuffle_v4bf16_v3bf16__5_1_4_4(ptr addrspace(1) inreg %ptr) {
5748; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_4_4:
5749; GFX900:       ; %bb.0:
5750; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5751; GFX900-NEXT:    ;;#ASMSTART
5752; GFX900-NEXT:    ; def v[0:1]
5753; GFX900-NEXT:    ;;#ASMEND
5754; GFX900-NEXT:    s_mov_b32 s4, 0xffff
5755; GFX900-NEXT:    ;;#ASMSTART
5756; GFX900-NEXT:    ; def v[1:2]
5757; GFX900-NEXT:    ;;#ASMEND
5758; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v0
5759; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
5760; GFX900-NEXT:    v_mov_b32_e32 v3, 0
5761; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
5762; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
5763; GFX900-NEXT:    s_waitcnt vmcnt(0)
5764; GFX900-NEXT:    s_setpc_b64 s[30:31]
5765;
5766; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_4_4:
5767; GFX90A:       ; %bb.0:
5768; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5769; GFX90A-NEXT:    ;;#ASMSTART
5770; GFX90A-NEXT:    ; def v[0:1]
5771; GFX90A-NEXT:    ;;#ASMEND
5772; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
5773; GFX90A-NEXT:    ;;#ASMSTART
5774; GFX90A-NEXT:    ; def v[2:3]
5775; GFX90A-NEXT:    ;;#ASMEND
5776; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v0
5777; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
5778; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
5779; GFX90A-NEXT:    v_perm_b32 v1, v2, v2, s4
5780; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
5781; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5782; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5783;
5784; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_4_4:
5785; GFX940:       ; %bb.0:
5786; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5787; GFX940-NEXT:    ;;#ASMSTART
5788; GFX940-NEXT:    ; def v[0:1]
5789; GFX940-NEXT:    ;;#ASMEND
5790; GFX940-NEXT:    s_mov_b32 s2, 0xffff
5791; GFX940-NEXT:    ;;#ASMSTART
5792; GFX940-NEXT:    ; def v[2:3]
5793; GFX940-NEXT:    ;;#ASMEND
5794; GFX940-NEXT:    v_mov_b32_e32 v4, 0
5795; GFX940-NEXT:    v_bfi_b32 v0, s2, v3, v0
5796; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
5797; GFX940-NEXT:    v_perm_b32 v1, v2, v2, s2
5798; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
5799; GFX940-NEXT:    s_waitcnt vmcnt(0)
5800; GFX940-NEXT:    s_setpc_b64 s[30:31]
5801  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5802  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5803  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5804  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5805  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 4, i32 4>
5806  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5807  ret void
5808}
5809
; Shuffle mask <5, 2, 4, 4> (spelled out in the function-name suffix).
; Mask indices 0-2 select lanes of %extract3 (lanes 0-2 of %vec0) and indices
; 3-5 select lanes of %extract31 (lanes 0-2 of %vec1), so the stored vector is
; { %vec1[2], %vec0[2], %vec1[1], %vec1[1] }.
5810define void @v_shuffle_v4bf16_v3bf16__5_2_4_4(ptr addrspace(1) inreg %ptr) {
5811; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_4_4:
5812; GFX900:       ; %bb.0:
5813; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5814; GFX900-NEXT:    ;;#ASMSTART
5815; GFX900-NEXT:    ; def v[0:1]
5816; GFX900-NEXT:    ;;#ASMEND
5817; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
5818; GFX900-NEXT:    ;;#ASMSTART
5819; GFX900-NEXT:    ; def v[2:3]
5820; GFX900-NEXT:    ;;#ASMEND
5821; GFX900-NEXT:    v_perm_b32 v0, v1, v3, s4
5822; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
5823; GFX900-NEXT:    v_mov_b32_e32 v4, 0
5824; GFX900-NEXT:    v_perm_b32 v1, v2, v2, s4
5825; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
5826; GFX900-NEXT:    s_waitcnt vmcnt(0)
5827; GFX900-NEXT:    s_setpc_b64 s[30:31]
5828;
5829; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_4_4:
5830; GFX90A:       ; %bb.0:
5831; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5832; GFX90A-NEXT:    ;;#ASMSTART
5833; GFX90A-NEXT:    ; def v[0:1]
5834; GFX90A-NEXT:    ;;#ASMEND
5835; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
5836; GFX90A-NEXT:    ;;#ASMSTART
5837; GFX90A-NEXT:    ; def v[2:3]
5838; GFX90A-NEXT:    ;;#ASMEND
5839; GFX90A-NEXT:    v_perm_b32 v0, v1, v3, s4
5840; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
5841; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
5842; GFX90A-NEXT:    v_perm_b32 v1, v2, v2, s4
5843; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
5844; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5845; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5846;
5847; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_4_4:
5848; GFX940:       ; %bb.0:
5849; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5850; GFX940-NEXT:    ;;#ASMSTART
5851; GFX940-NEXT:    ; def v[0:1]
5852; GFX940-NEXT:    ;;#ASMEND
5853; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
5854; GFX940-NEXT:    ;;#ASMSTART
5855; GFX940-NEXT:    ; def v[2:3]
5856; GFX940-NEXT:    ;;#ASMEND
5857; GFX940-NEXT:    v_mov_b32_e32 v4, 0
5858; GFX940-NEXT:    v_perm_b32 v0, v1, v3, s2
5859; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
5860; GFX940-NEXT:    v_perm_b32 v1, v2, v2, s2
5861; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
5862; GFX940-NEXT:    s_waitcnt vmcnt(0)
5863; GFX940-NEXT:    s_setpc_b64 s[30:31]
5864  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5865  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5866  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5867  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5868  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 4, i32 4>
5869  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5870  ret void
5871}
5872
; Shuffle mask <5, 3, 4, 4> (spelled out in the function-name suffix).
; Mask indices 3-5 select lanes 0-2 of %extract31 (lanes 0-2 of %vec1), so the
; stored vector is { %vec1[2], %vec1[0], %vec1[1], %vec1[1] }.
; No lane comes from %vec0; the expected output below contains a single
; inline-asm def per target.
5873define void @v_shuffle_v4bf16_v3bf16__5_3_4_4(ptr addrspace(1) inreg %ptr) {
5874; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_4_4:
5875; GFX900:       ; %bb.0:
5876; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5877; GFX900-NEXT:    ;;#ASMSTART
5878; GFX900-NEXT:    ; def v[0:1]
5879; GFX900-NEXT:    ;;#ASMEND
5880; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
5881; GFX900-NEXT:    v_perm_b32 v1, v0, v1, s4
5882; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
5883; GFX900-NEXT:    v_mov_b32_e32 v3, 0
5884; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
5885; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
5886; GFX900-NEXT:    s_waitcnt vmcnt(0)
5887; GFX900-NEXT:    s_setpc_b64 s[30:31]
5888;
5889; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_4_4:
5890; GFX90A:       ; %bb.0:
5891; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5892; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
5893; GFX90A-NEXT:    ;;#ASMSTART
5894; GFX90A-NEXT:    ; def v[0:1]
5895; GFX90A-NEXT:    ;;#ASMEND
5896; GFX90A-NEXT:    v_perm_b32 v2, v0, v1, s4
5897; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
5898; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
5899; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
5900; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
5901; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5902; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5903;
5904; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_4_4:
5905; GFX940:       ; %bb.0:
5906; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5907; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
5908; GFX940-NEXT:    ;;#ASMSTART
5909; GFX940-NEXT:    ; def v[0:1]
5910; GFX940-NEXT:    ;;#ASMEND
5911; GFX940-NEXT:    v_mov_b32_e32 v4, 0
5912; GFX940-NEXT:    v_perm_b32 v2, v0, v1, s2
5913; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
5914; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
5915; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
5916; GFX940-NEXT:    s_waitcnt vmcnt(0)
5917; GFX940-NEXT:    s_setpc_b64 s[30:31]
5918  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5919  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5920  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5921  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5922  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 4, i32 4>
5923  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5924  ret void
5925}
5926
; Shuffle mask <5, 5, 4, 4> (spelled out in the function-name suffix).
; Mask indices 3-5 select lanes 0-2 of %extract31 (lanes 0-2 of %vec1), so the
; stored vector is { %vec1[2], %vec1[2], %vec1[1], %vec1[1] }.
; No lane comes from %vec0; the expected output below contains a single
; inline-asm def per target.
5927define void @v_shuffle_v4bf16_v3bf16__5_5_4_4(ptr addrspace(1) inreg %ptr) {
5928; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_4:
5929; GFX900:       ; %bb.0:
5930; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5931; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
5932; GFX900-NEXT:    ;;#ASMSTART
5933; GFX900-NEXT:    ; def v[0:1]
5934; GFX900-NEXT:    ;;#ASMEND
5935; GFX900-NEXT:    v_perm_b32 v2, v0, v0, s4
5936; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
5937; GFX900-NEXT:    v_mov_b32_e32 v3, 0
5938; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
5939; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
5940; GFX900-NEXT:    s_waitcnt vmcnt(0)
5941; GFX900-NEXT:    s_setpc_b64 s[30:31]
5942;
5943; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_4:
5944; GFX90A:       ; %bb.0:
5945; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5946; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
5947; GFX90A-NEXT:    ;;#ASMSTART
5948; GFX90A-NEXT:    ; def v[0:1]
5949; GFX90A-NEXT:    ;;#ASMEND
5950; GFX90A-NEXT:    v_perm_b32 v3, v0, v0, s4
5951; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
5952; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
5953; GFX90A-NEXT:    v_perm_b32 v2, v1, v1, s4
5954; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
5955; GFX90A-NEXT:    s_waitcnt vmcnt(0)
5956; GFX90A-NEXT:    s_setpc_b64 s[30:31]
5957;
5958; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_4:
5959; GFX940:       ; %bb.0:
5960; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5961; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
5962; GFX940-NEXT:    ;;#ASMSTART
5963; GFX940-NEXT:    ; def v[0:1]
5964; GFX940-NEXT:    ;;#ASMEND
5965; GFX940-NEXT:    v_mov_b32_e32 v4, 0
5966; GFX940-NEXT:    v_perm_b32 v3, v0, v0, s2
5967; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
5968; GFX940-NEXT:    v_perm_b32 v2, v1, v1, s2
5969; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
5970; GFX940-NEXT:    s_waitcnt vmcnt(0)
5971; GFX940-NEXT:    s_setpc_b64 s[30:31]
5972  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
5973  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
5974  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5975  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
5976  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 4>
5977  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
5978  ret void
5979}
5980
; Shuffle mask <5, 5, poison, 4>; the `u` in the function-name suffix marks the
; poison lane.
; Mask indices 3-5 select lanes 0-2 of %extract31 (lanes 0-2 of %vec1), so the
; stored vector is { %vec1[2], %vec1[2], <poison>, %vec1[1] }.
; No lane comes from %vec0; the expected output below contains a single
; inline-asm def per target.
5981define void @v_shuffle_v4bf16_v3bf16__5_5_u_4(ptr addrspace(1) inreg %ptr) {
5982; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_4:
5983; GFX900:       ; %bb.0:
5984; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5985; GFX900-NEXT:    ;;#ASMSTART
5986; GFX900-NEXT:    ; def v[0:1]
5987; GFX900-NEXT:    ;;#ASMEND
5988; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
5989; GFX900-NEXT:    v_mov_b32_e32 v3, 0
5990; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
5991; GFX900-NEXT:    v_mov_b32_e32 v2, v0
5992; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
5993; GFX900-NEXT:    s_waitcnt vmcnt(0)
5994; GFX900-NEXT:    s_setpc_b64 s[30:31]
5995;
5996; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_4:
5997; GFX90A:       ; %bb.0:
5998; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5999; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6000; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
6001; GFX90A-NEXT:    ;;#ASMSTART
6002; GFX90A-NEXT:    ; def v[0:1]
6003; GFX90A-NEXT:    ;;#ASMEND
6004; GFX90A-NEXT:    v_perm_b32 v2, v1, v1, s4
6005; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
6006; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
6007; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6008; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6009;
6010; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_4:
6011; GFX940:       ; %bb.0:
6012; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6013; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6014; GFX940-NEXT:    v_mov_b32_e32 v4, 0
6015; GFX940-NEXT:    ;;#ASMSTART
6016; GFX940-NEXT:    ; def v[0:1]
6017; GFX940-NEXT:    ;;#ASMEND
6018; GFX940-NEXT:    s_nop 0
6019; GFX940-NEXT:    v_perm_b32 v2, v1, v1, s2
6020; GFX940-NEXT:    v_mov_b32_e32 v3, v0
6021; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
6022; GFX940-NEXT:    s_waitcnt vmcnt(0)
6023; GFX940-NEXT:    s_setpc_b64 s[30:31]
6024  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6025  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6026  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6027  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6028  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 4>
6029  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6030  ret void
6031}
6032
; Shuffle mask <5, 5, 0, 4> (spelled out in the function-name suffix).
; Mask indices 0-2 select lanes of %extract3 (lanes 0-2 of %vec0) and indices
; 3-5 select lanes of %extract31 (lanes 0-2 of %vec1), so the stored vector is
; { %vec1[2], %vec1[2], %vec0[0], %vec1[1] }.
6033define void @v_shuffle_v4bf16_v3bf16__5_5_0_4(ptr addrspace(1) inreg %ptr) {
6034; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_4:
6035; GFX900:       ; %bb.0:
6036; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6037; GFX900-NEXT:    ;;#ASMSTART
6038; GFX900-NEXT:    ; def v[0:1]
6039; GFX900-NEXT:    ;;#ASMEND
6040; GFX900-NEXT:    ;;#ASMSTART
6041; GFX900-NEXT:    ; def v[1:2]
6042; GFX900-NEXT:    ;;#ASMEND
6043; GFX900-NEXT:    s_mov_b32 s4, 0xffff
6044; GFX900-NEXT:    v_bfi_b32 v1, s4, v0, v1
6045; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6046; GFX900-NEXT:    v_mov_b32_e32 v3, 0
6047; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
6048; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
6049; GFX900-NEXT:    s_waitcnt vmcnt(0)
6050; GFX900-NEXT:    s_setpc_b64 s[30:31]
6051;
6052; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_4:
6053; GFX90A:       ; %bb.0:
6054; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6055; GFX90A-NEXT:    ;;#ASMSTART
6056; GFX90A-NEXT:    ; def v[0:1]
6057; GFX90A-NEXT:    ;;#ASMEND
6058; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
6059; GFX90A-NEXT:    ;;#ASMSTART
6060; GFX90A-NEXT:    ; def v[2:3]
6061; GFX90A-NEXT:    ;;#ASMEND
6062; GFX90A-NEXT:    v_bfi_b32 v1, s4, v0, v2
6063; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6064; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
6065; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
6066; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
6067; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6068; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6069;
6070; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_4:
6071; GFX940:       ; %bb.0:
6072; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6073; GFX940-NEXT:    ;;#ASMSTART
6074; GFX940-NEXT:    ; def v[0:1]
6075; GFX940-NEXT:    ;;#ASMEND
6076; GFX940-NEXT:    s_mov_b32 s2, 0xffff
6077; GFX940-NEXT:    ;;#ASMSTART
6078; GFX940-NEXT:    ; def v[2:3]
6079; GFX940-NEXT:    ;;#ASMEND
6080; GFX940-NEXT:    v_mov_b32_e32 v4, 0
6081; GFX940-NEXT:    v_bfi_b32 v1, s2, v0, v2
6082; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6083; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
6084; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
6085; GFX940-NEXT:    s_waitcnt vmcnt(0)
6086; GFX940-NEXT:    s_setpc_b64 s[30:31]
6087  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6088  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6089  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6090  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6091  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 4>
6092  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6093  ret void
6094}
6095
; Shuffle mask <5, 5, 1, 4> (spelled out in the function-name suffix).
; Mask indices 0-2 select lanes of %extract3 (lanes 0-2 of %vec0) and indices
; 3-5 select lanes of %extract31 (lanes 0-2 of %vec1), so the stored vector is
; { %vec1[2], %vec1[2], %vec0[1], %vec1[1] }.
6096define void @v_shuffle_v4bf16_v3bf16__5_5_1_4(ptr addrspace(1) inreg %ptr) {
6097; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_4:
6098; GFX900:       ; %bb.0:
6099; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6100; GFX900-NEXT:    ;;#ASMSTART
6101; GFX900-NEXT:    ; def v[0:1]
6102; GFX900-NEXT:    ;;#ASMEND
6103; GFX900-NEXT:    ;;#ASMSTART
6104; GFX900-NEXT:    ; def v[1:2]
6105; GFX900-NEXT:    ;;#ASMEND
6106; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
6107; GFX900-NEXT:    v_perm_b32 v1, v1, v0, s4
6108; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6109; GFX900-NEXT:    v_mov_b32_e32 v3, 0
6110; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
6111; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
6112; GFX900-NEXT:    s_waitcnt vmcnt(0)
6113; GFX900-NEXT:    s_setpc_b64 s[30:31]
6114;
6115; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_4:
6116; GFX90A:       ; %bb.0:
6117; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6118; GFX90A-NEXT:    ;;#ASMSTART
6119; GFX90A-NEXT:    ; def v[0:1]
6120; GFX90A-NEXT:    ;;#ASMEND
6121; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
6122; GFX90A-NEXT:    ;;#ASMSTART
6123; GFX90A-NEXT:    ; def v[2:3]
6124; GFX90A-NEXT:    ;;#ASMEND
6125; GFX90A-NEXT:    v_perm_b32 v1, v2, v0, s4
6126; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6127; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
6128; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
6129; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
6130; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6131; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6132;
6133; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_4:
6134; GFX940:       ; %bb.0:
6135; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6136; GFX940-NEXT:    ;;#ASMSTART
6137; GFX940-NEXT:    ; def v[0:1]
6138; GFX940-NEXT:    ;;#ASMEND
6139; GFX940-NEXT:    s_mov_b32 s2, 0x7060302
6140; GFX940-NEXT:    ;;#ASMSTART
6141; GFX940-NEXT:    ; def v[2:3]
6142; GFX940-NEXT:    ;;#ASMEND
6143; GFX940-NEXT:    v_mov_b32_e32 v4, 0
6144; GFX940-NEXT:    v_perm_b32 v1, v2, v0, s2
6145; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6146; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
6147; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
6148; GFX940-NEXT:    s_waitcnt vmcnt(0)
6149; GFX940-NEXT:    s_setpc_b64 s[30:31]
6150  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6151  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6152  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6153  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6154  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 4>
6155  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6156  ret void
6157}
6158
; Shuffle mask <5, 5, 2, 4> (spelled out in the function-name suffix).
; Mask indices 0-2 select lanes of %extract3 (lanes 0-2 of %vec0) and indices
; 3-5 select lanes of %extract31 (lanes 0-2 of %vec1), so the stored vector is
; { %vec1[2], %vec1[2], %vec0[2], %vec1[1] }.
6159define void @v_shuffle_v4bf16_v3bf16__5_5_2_4(ptr addrspace(1) inreg %ptr) {
6160; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_4:
6161; GFX900:       ; %bb.0:
6162; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6163; GFX900-NEXT:    ;;#ASMSTART
6164; GFX900-NEXT:    ; def v[0:1]
6165; GFX900-NEXT:    ;;#ASMEND
6166; GFX900-NEXT:    s_mov_b32 s4, 0xffff
6167; GFX900-NEXT:    ;;#ASMSTART
6168; GFX900-NEXT:    ; def v[2:3]
6169; GFX900-NEXT:    ;;#ASMEND
6170; GFX900-NEXT:    v_bfi_b32 v1, s4, v1, v2
6171; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6172; GFX900-NEXT:    v_mov_b32_e32 v4, 0
6173; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
6174; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
6175; GFX900-NEXT:    s_waitcnt vmcnt(0)
6176; GFX900-NEXT:    s_setpc_b64 s[30:31]
6177;
6178; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_4:
6179; GFX90A:       ; %bb.0:
6180; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6181; GFX90A-NEXT:    ;;#ASMSTART
6182; GFX90A-NEXT:    ; def v[0:1]
6183; GFX90A-NEXT:    ;;#ASMEND
6184; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
6185; GFX90A-NEXT:    ;;#ASMSTART
6186; GFX90A-NEXT:    ; def v[2:3]
6187; GFX90A-NEXT:    ;;#ASMEND
6188; GFX90A-NEXT:    v_bfi_b32 v1, s4, v1, v2
6189; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6190; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
6191; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
6192; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
6193; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6194; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6195;
6196; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_4:
6197; GFX940:       ; %bb.0:
6198; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6199; GFX940-NEXT:    ;;#ASMSTART
6200; GFX940-NEXT:    ; def v[0:1]
6201; GFX940-NEXT:    ;;#ASMEND
6202; GFX940-NEXT:    s_mov_b32 s2, 0xffff
6203; GFX940-NEXT:    ;;#ASMSTART
6204; GFX940-NEXT:    ; def v[2:3]
6205; GFX940-NEXT:    ;;#ASMEND
6206; GFX940-NEXT:    v_mov_b32_e32 v4, 0
6207; GFX940-NEXT:    v_bfi_b32 v1, s2, v1, v2
6208; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6209; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
6210; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
6211; GFX940-NEXT:    s_waitcnt vmcnt(0)
6212; GFX940-NEXT:    s_setpc_b64 s[30:31]
6213  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6214  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6215  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6216  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6217  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 4>
6218  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6219  ret void
6220}
6221
; Shuffle mask <5, 5, 3, 4> (spelled out in the function-name suffix).
; Mask indices 3-5 select lanes 0-2 of %extract31 (lanes 0-2 of %vec1), so the
; stored vector is { %vec1[2], %vec1[2], %vec1[0], %vec1[1] }.
; No lane comes from %vec0; the expected output below contains a single
; inline-asm def per target.
6222define void @v_shuffle_v4bf16_v3bf16__5_5_3_4(ptr addrspace(1) inreg %ptr) {
6223; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_4:
6224; GFX900:       ; %bb.0:
6225; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6226; GFX900-NEXT:    ;;#ASMSTART
6227; GFX900-NEXT:    ; def v[0:1]
6228; GFX900-NEXT:    ;;#ASMEND
6229; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6230; GFX900-NEXT:    v_mov_b32_e32 v3, 0
6231; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
6232; GFX900-NEXT:    v_mov_b32_e32 v2, v0
6233; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
6234; GFX900-NEXT:    s_waitcnt vmcnt(0)
6235; GFX900-NEXT:    s_setpc_b64 s[30:31]
6236;
6237; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_4:
6238; GFX90A:       ; %bb.0:
6239; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6240; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6241; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
6242; GFX90A-NEXT:    ;;#ASMSTART
6243; GFX90A-NEXT:    ; def v[0:1]
6244; GFX90A-NEXT:    ;;#ASMEND
6245; GFX90A-NEXT:    v_perm_b32 v2, v1, v1, s4
6246; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
6247; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
6248; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6249; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6250;
6251; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_4:
6252; GFX940:       ; %bb.0:
6253; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6254; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6255; GFX940-NEXT:    v_mov_b32_e32 v4, 0
6256; GFX940-NEXT:    ;;#ASMSTART
6257; GFX940-NEXT:    ; def v[0:1]
6258; GFX940-NEXT:    ;;#ASMEND
6259; GFX940-NEXT:    s_nop 0
6260; GFX940-NEXT:    v_perm_b32 v2, v1, v1, s2
6261; GFX940-NEXT:    v_mov_b32_e32 v3, v0
6262; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
6263; GFX940-NEXT:    s_waitcnt vmcnt(0)
6264; GFX940-NEXT:    s_setpc_b64 s[30:31]
6265  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6266  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6267  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6268  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6269  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 4>
6270  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6271  ret void
6272}
6273
; Shuffle mask <poison, 5, 5, 5>; the `u` in the function-name suffix marks the
; poison lane.
; Mask index 5 selects lane 2 of %extract31 (lane 2 of %vec1), so the stored
; vector is { <poison>, %vec1[2], %vec1[2], %vec1[2] }.
; No lane comes from %vec0; the expected output below contains a single
; inline-asm def per target.
6274define void @v_shuffle_v4bf16_v3bf16__u_5_5_5(ptr addrspace(1) inreg %ptr) {
6275; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__u_5_5_5:
6276; GFX900:       ; %bb.0:
6277; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6278; GFX900-NEXT:    ;;#ASMSTART
6279; GFX900-NEXT:    ; def v[0:1]
6280; GFX900-NEXT:    ;;#ASMEND
6281; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6282; GFX900-NEXT:    v_mov_b32_e32 v3, 0
6283; GFX900-NEXT:    v_perm_b32 v2, v1, v1, s4
6284; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
6285; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
6286; GFX900-NEXT:    s_waitcnt vmcnt(0)
6287; GFX900-NEXT:    s_setpc_b64 s[30:31]
6288;
6289; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__u_5_5_5:
6290; GFX90A:       ; %bb.0:
6291; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6292; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6293; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
6294; GFX90A-NEXT:    ;;#ASMSTART
6295; GFX90A-NEXT:    ; def v[0:1]
6296; GFX90A-NEXT:    ;;#ASMEND
6297; GFX90A-NEXT:    v_perm_b32 v3, v1, v1, s4
6298; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
6299; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
6300; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6301; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6302;
6303; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__u_5_5_5:
6304; GFX940:       ; %bb.0:
6305; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6306; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6307; GFX940-NEXT:    v_mov_b32_e32 v4, 0
6308; GFX940-NEXT:    ;;#ASMSTART
6309; GFX940-NEXT:    ; def v[0:1]
6310; GFX940-NEXT:    ;;#ASMEND
6311; GFX940-NEXT:    s_nop 0
6312; GFX940-NEXT:    v_perm_b32 v3, v1, v1, s2
6313; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
6314; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
6315; GFX940-NEXT:    s_waitcnt vmcnt(0)
6316; GFX940-NEXT:    s_setpc_b64 s[30:31]
6317  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6318  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6319  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6320  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6321  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 poison, i32 5, i32 5, i32 5>
6322  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6323  ret void
6324}
6325
; Shuffle mask <0, 5, 5, 5> (spelled out in the function-name suffix).
; Mask indices 0-2 select lanes of %extract3 (lanes 0-2 of %vec0) and indices
; 3-5 select lanes of %extract31 (lanes 0-2 of %vec1), so the stored vector is
; { %vec0[0], %vec1[2], %vec1[2], %vec1[2] }.
6326define void @v_shuffle_v4bf16_v3bf16__0_5_5_5(ptr addrspace(1) inreg %ptr) {
6327; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__0_5_5_5:
6328; GFX900:       ; %bb.0:
6329; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6330; GFX900-NEXT:    ;;#ASMSTART
6331; GFX900-NEXT:    ; def v[0:1]
6332; GFX900-NEXT:    ;;#ASMEND
6333; GFX900-NEXT:    ;;#ASMSTART
6334; GFX900-NEXT:    ; def v[1:2]
6335; GFX900-NEXT:    ;;#ASMEND
6336; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6337; GFX900-NEXT:    v_mov_b32_e32 v3, 0
6338; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
6339; GFX900-NEXT:    v_perm_b32 v1, v2, v2, s4
6340; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
6341; GFX900-NEXT:    s_waitcnt vmcnt(0)
6342; GFX900-NEXT:    s_setpc_b64 s[30:31]
6343;
6344; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__0_5_5_5:
6345; GFX90A:       ; %bb.0:
6346; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6347; GFX90A-NEXT:    ;;#ASMSTART
6348; GFX90A-NEXT:    ; def v[0:1]
6349; GFX90A-NEXT:    ;;#ASMEND
6350; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6351; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
6352; GFX90A-NEXT:    ;;#ASMSTART
6353; GFX90A-NEXT:    ; def v[2:3]
6354; GFX90A-NEXT:    ;;#ASMEND
6355; GFX90A-NEXT:    v_perm_b32 v0, v3, v0, s4
6356; GFX90A-NEXT:    v_perm_b32 v1, v3, v3, s4
6357; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
6358; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6359; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6360;
6361; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__0_5_5_5:
6362; GFX940:       ; %bb.0:
6363; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6364; GFX940-NEXT:    ;;#ASMSTART
6365; GFX940-NEXT:    ; def v[0:1]
6366; GFX940-NEXT:    ;;#ASMEND
6367; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6368; GFX940-NEXT:    v_mov_b32_e32 v4, 0
6369; GFX940-NEXT:    ;;#ASMSTART
6370; GFX940-NEXT:    ; def v[2:3]
6371; GFX940-NEXT:    ;;#ASMEND
6372; GFX940-NEXT:    s_nop 0
6373; GFX940-NEXT:    v_perm_b32 v0, v3, v0, s2
6374; GFX940-NEXT:    v_perm_b32 v1, v3, v3, s2
6375; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
6376; GFX940-NEXT:    s_waitcnt vmcnt(0)
6377; GFX940-NEXT:    s_setpc_b64 s[30:31]
6378  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6379  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6380  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6381  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6382  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 0, i32 5, i32 5, i32 5>
6383  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6384  ret void
6385}
6386
; Shuffle mask <1, 5, 5, 5> (spelled out in the function-name suffix).
; Mask indices 0-2 select lanes of %extract3 (lanes 0-2 of %vec0) and indices
; 3-5 select lanes of %extract31 (lanes 0-2 of %vec1), so the stored vector is
; { %vec0[1], %vec1[2], %vec1[2], %vec1[2] }.
6387define void @v_shuffle_v4bf16_v3bf16__1_5_5_5(ptr addrspace(1) inreg %ptr) {
6388; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__1_5_5_5:
6389; GFX900:       ; %bb.0:
6390; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6391; GFX900-NEXT:    ;;#ASMSTART
6392; GFX900-NEXT:    ; def v[0:1]
6393; GFX900-NEXT:    ;;#ASMEND
6394; GFX900-NEXT:    ;;#ASMSTART
6395; GFX900-NEXT:    ; def v[1:2]
6396; GFX900-NEXT:    ;;#ASMEND
6397; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6398; GFX900-NEXT:    v_mov_b32_e32 v3, 0
6399; GFX900-NEXT:    v_perm_b32 v1, v2, v2, s4
6400; GFX900-NEXT:    v_alignbit_b32 v0, v2, v0, 16
6401; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
6402; GFX900-NEXT:    s_waitcnt vmcnt(0)
6403; GFX900-NEXT:    s_setpc_b64 s[30:31]
6404;
6405; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__1_5_5_5:
6406; GFX90A:       ; %bb.0:
6407; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6408; GFX90A-NEXT:    ;;#ASMSTART
6409; GFX90A-NEXT:    ; def v[0:1]
6410; GFX90A-NEXT:    ;;#ASMEND
6411; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6412; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
6413; GFX90A-NEXT:    ;;#ASMSTART
6414; GFX90A-NEXT:    ; def v[2:3]
6415; GFX90A-NEXT:    ;;#ASMEND
6416; GFX90A-NEXT:    v_perm_b32 v1, v3, v3, s4
6417; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v0, 16
6418; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
6419; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6420; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6421;
6422; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__1_5_5_5:
6423; GFX940:       ; %bb.0:
6424; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6425; GFX940-NEXT:    ;;#ASMSTART
6426; GFX940-NEXT:    ; def v[0:1]
6427; GFX940-NEXT:    ;;#ASMEND
6428; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6429; GFX940-NEXT:    v_mov_b32_e32 v4, 0
6430; GFX940-NEXT:    ;;#ASMSTART
6431; GFX940-NEXT:    ; def v[2:3]
6432; GFX940-NEXT:    ;;#ASMEND
6433; GFX940-NEXT:    s_nop 0
6434; GFX940-NEXT:    v_perm_b32 v1, v3, v3, s2
6435; GFX940-NEXT:    v_alignbit_b32 v0, v3, v0, 16
6436; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
6437; GFX940-NEXT:    s_waitcnt vmcnt(0)
6438; GFX940-NEXT:    s_setpc_b64 s[30:31]
6439  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6440  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6441  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6442  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6443  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 1, i32 5, i32 5, i32 5>
6444  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6445  ret void
6446}
6447
; Shuffle with mask <2, 5, 5, 5>. Mask indices 0-2 select from %extract3 and
; indices 3-5 select from %extract31; each operand is the first three elements
; of a <4 x bfloat> produced by an inline-asm def. Result lane 0 is
; %extract3[2] and lanes 1-3 are all %extract31[2]. The <4 x bfloat> result is
; stored to %ptr with 8-byte alignment.
6448define void @v_shuffle_v4bf16_v3bf16__2_5_5_5(ptr addrspace(1) inreg %ptr) {
6449; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__2_5_5_5:
6450; GFX900:       ; %bb.0:
6451; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6452; GFX900-NEXT:    ;;#ASMSTART
6453; GFX900-NEXT:    ; def v[0:1]
6454; GFX900-NEXT:    ;;#ASMEND
6455; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6456; GFX900-NEXT:    v_mov_b32_e32 v4, 0
6457; GFX900-NEXT:    ;;#ASMSTART
6458; GFX900-NEXT:    ; def v[2:3]
6459; GFX900-NEXT:    ;;#ASMEND
6460; GFX900-NEXT:    v_perm_b32 v0, v3, v1, s4
6461; GFX900-NEXT:    v_perm_b32 v1, v3, v3, s4
6462; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
6463; GFX900-NEXT:    s_waitcnt vmcnt(0)
6464; GFX900-NEXT:    s_setpc_b64 s[30:31]
6465;
6466; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__2_5_5_5:
6467; GFX90A:       ; %bb.0:
6468; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6469; GFX90A-NEXT:    ;;#ASMSTART
6470; GFX90A-NEXT:    ; def v[0:1]
6471; GFX90A-NEXT:    ;;#ASMEND
6472; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6473; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
6474; GFX90A-NEXT:    ;;#ASMSTART
6475; GFX90A-NEXT:    ; def v[2:3]
6476; GFX90A-NEXT:    ;;#ASMEND
6477; GFX90A-NEXT:    v_perm_b32 v0, v3, v1, s4
6478; GFX90A-NEXT:    v_perm_b32 v1, v3, v3, s4
6479; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
6480; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6481; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6482;
6483; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__2_5_5_5:
6484; GFX940:       ; %bb.0:
6485; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6486; GFX940-NEXT:    ;;#ASMSTART
6487; GFX940-NEXT:    ; def v[0:1]
6488; GFX940-NEXT:    ;;#ASMEND
6489; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6490; GFX940-NEXT:    v_mov_b32_e32 v4, 0
6491; GFX940-NEXT:    ;;#ASMSTART
6492; GFX940-NEXT:    ; def v[2:3]
6493; GFX940-NEXT:    ;;#ASMEND
6494; GFX940-NEXT:    s_nop 0
6495; GFX940-NEXT:    v_perm_b32 v0, v3, v1, s2
6496; GFX940-NEXT:    v_perm_b32 v1, v3, v3, s2
6497; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
6498; GFX940-NEXT:    s_waitcnt vmcnt(0)
6499; GFX940-NEXT:    s_setpc_b64 s[30:31]
6500  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6501  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6502  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6503  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6504  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
6505  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6506  ret void
6507}
6508
; Shuffle with mask <3, 5, 5, 5>. Mask indices 0-2 select from %extract3 and
; indices 3-5 select from %extract31. Result lane 0 is %extract31[0] and lanes
; 1-3 are all %extract31[2]. No lane reads %extract3, so %vec0 is unused; the
; expected output below contains only one asm def. The <4 x bfloat> result is
; stored to %ptr with 8-byte alignment.
6509define void @v_shuffle_v4bf16_v3bf16__3_5_5_5(ptr addrspace(1) inreg %ptr) {
6510; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__3_5_5_5:
6511; GFX900:       ; %bb.0:
6512; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6513; GFX900-NEXT:    ;;#ASMSTART
6514; GFX900-NEXT:    ; def v[0:1]
6515; GFX900-NEXT:    ;;#ASMEND
6516; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6517; GFX900-NEXT:    v_mov_b32_e32 v2, 0
6518; GFX900-NEXT:    v_perm_b32 v0, v1, v0, s4
6519; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
6520; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
6521; GFX900-NEXT:    s_waitcnt vmcnt(0)
6522; GFX900-NEXT:    s_setpc_b64 s[30:31]
6523;
6524; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__3_5_5_5:
6525; GFX90A:       ; %bb.0:
6526; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6527; GFX90A-NEXT:    ;;#ASMSTART
6528; GFX90A-NEXT:    ; def v[0:1]
6529; GFX90A-NEXT:    ;;#ASMEND
6530; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6531; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
6532; GFX90A-NEXT:    v_perm_b32 v0, v1, v0, s4
6533; GFX90A-NEXT:    v_perm_b32 v1, v1, v1, s4
6534; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
6535; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6536; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6537;
6538; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__3_5_5_5:
6539; GFX940:       ; %bb.0:
6540; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6541; GFX940-NEXT:    ;;#ASMSTART
6542; GFX940-NEXT:    ; def v[0:1]
6543; GFX940-NEXT:    ;;#ASMEND
6544; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6545; GFX940-NEXT:    v_mov_b32_e32 v2, 0
6546; GFX940-NEXT:    v_perm_b32 v0, v1, v0, s2
6547; GFX940-NEXT:    v_perm_b32 v1, v1, v1, s2
6548; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
6549; GFX940-NEXT:    s_waitcnt vmcnt(0)
6550; GFX940-NEXT:    s_setpc_b64 s[30:31]
6551  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6552  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6553  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6554  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6555  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 3, i32 5, i32 5, i32 5>
6556  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6557  ret void
6558}
6559
; Shuffle with mask <4, 5, 5, 5>. Mask indices 0-2 select from %extract3 and
; indices 3-5 select from %extract31. Result lane 0 is %extract31[1] and lanes
; 1-3 are all %extract31[2]. No lane reads %extract3, so %vec0 is unused; the
; expected output below contains only one asm def. The <4 x bfloat> result is
; stored to %ptr with 8-byte alignment.
6560define void @v_shuffle_v4bf16_v3bf16__4_5_5_5(ptr addrspace(1) inreg %ptr) {
6561; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__4_5_5_5:
6562; GFX900:       ; %bb.0:
6563; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6564; GFX900-NEXT:    ;;#ASMSTART
6565; GFX900-NEXT:    ; def v[0:1]
6566; GFX900-NEXT:    ;;#ASMEND
6567; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6568; GFX900-NEXT:    v_mov_b32_e32 v3, 0
6569; GFX900-NEXT:    v_perm_b32 v2, v1, v1, s4
6570; GFX900-NEXT:    v_alignbit_b32 v1, v1, v0, 16
6571; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
6572; GFX900-NEXT:    s_waitcnt vmcnt(0)
6573; GFX900-NEXT:    s_setpc_b64 s[30:31]
6574;
6575; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__4_5_5_5:
6576; GFX90A:       ; %bb.0:
6577; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6578; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6579; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
6580; GFX90A-NEXT:    ;;#ASMSTART
6581; GFX90A-NEXT:    ; def v[0:1]
6582; GFX90A-NEXT:    ;;#ASMEND
6583; GFX90A-NEXT:    v_perm_b32 v3, v1, v1, s4
6584; GFX90A-NEXT:    v_alignbit_b32 v2, v1, v0, 16
6585; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
6586; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6587; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6588;
6589; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__4_5_5_5:
6590; GFX940:       ; %bb.0:
6591; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6592; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6593; GFX940-NEXT:    v_mov_b32_e32 v4, 0
6594; GFX940-NEXT:    ;;#ASMSTART
6595; GFX940-NEXT:    ; def v[0:1]
6596; GFX940-NEXT:    ;;#ASMEND
6597; GFX940-NEXT:    s_nop 0
6598; GFX940-NEXT:    v_perm_b32 v3, v1, v1, s2
6599; GFX940-NEXT:    v_alignbit_b32 v2, v1, v0, 16
6600; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
6601; GFX940-NEXT:    s_waitcnt vmcnt(0)
6602; GFX940-NEXT:    s_setpc_b64 s[30:31]
6603  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6604  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6605  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6606  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6607  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 5, i32 5, i32 5>
6608  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6609  ret void
6610}
6611
; Shuffle with mask <5, poison, 5, 5>. Mask indices 0-2 select from %extract3
; and indices 3-5 select from %extract31. Result lanes 0, 2 and 3 are all
; %extract31[2]; lane 1 is a poison mask element, so its value is unspecified.
; No lane reads %extract3, so %vec0 is unused; the expected output below
; contains only one asm def. The <4 x bfloat> result is stored to %ptr with
; 8-byte alignment.
6612define void @v_shuffle_v4bf16_v3bf16__5_u_5_5(ptr addrspace(1) inreg %ptr) {
6613; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_u_5_5:
6614; GFX900:       ; %bb.0:
6615; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6616; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6617; GFX900-NEXT:    v_mov_b32_e32 v3, 0
6618; GFX900-NEXT:    ;;#ASMSTART
6619; GFX900-NEXT:    ; def v[0:1]
6620; GFX900-NEXT:    ;;#ASMEND
6621; GFX900-NEXT:    v_perm_b32 v2, v1, v1, s4
6622; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
6623; GFX900-NEXT:    s_waitcnt vmcnt(0)
6624; GFX900-NEXT:    s_setpc_b64 s[30:31]
6625;
6626; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_u_5_5:
6627; GFX90A:       ; %bb.0:
6628; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6629; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6630; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
6631; GFX90A-NEXT:    ;;#ASMSTART
6632; GFX90A-NEXT:    ; def v[0:1]
6633; GFX90A-NEXT:    ;;#ASMEND
6634; GFX90A-NEXT:    v_perm_b32 v3, v1, v1, s4
6635; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
6636; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
6637; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6638; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6639;
6640; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_u_5_5:
6641; GFX940:       ; %bb.0:
6642; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6643; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6644; GFX940-NEXT:    v_mov_b32_e32 v4, 0
6645; GFX940-NEXT:    ;;#ASMSTART
6646; GFX940-NEXT:    ; def v[0:1]
6647; GFX940-NEXT:    ;;#ASMEND
6648; GFX940-NEXT:    s_nop 0
6649; GFX940-NEXT:    v_perm_b32 v3, v1, v1, s2
6650; GFX940-NEXT:    v_mov_b32_e32 v2, v1
6651; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
6652; GFX940-NEXT:    s_waitcnt vmcnt(0)
6653; GFX940-NEXT:    s_setpc_b64 s[30:31]
6654  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6655  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6656  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6657  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6658  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 5, i32 5>
6659  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6660  ret void
6661}
6662
; Shuffle with mask <5, 0, 5, 5>. Mask indices 0-2 select from %extract3 and
; indices 3-5 select from %extract31; each operand is the first three elements
; of a <4 x bfloat> produced by an inline-asm def. Result lane 1 is
; %extract3[0]; lanes 0, 2 and 3 are all %extract31[2]. The <4 x bfloat>
; result is stored to %ptr with 8-byte alignment.
6663define void @v_shuffle_v4bf16_v3bf16__5_0_5_5(ptr addrspace(1) inreg %ptr) {
6664; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_0_5_5:
6665; GFX900:       ; %bb.0:
6666; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6667; GFX900-NEXT:    ;;#ASMSTART
6668; GFX900-NEXT:    ; def v[0:1]
6669; GFX900-NEXT:    ;;#ASMEND
6670; GFX900-NEXT:    ;;#ASMSTART
6671; GFX900-NEXT:    ; def v[1:2]
6672; GFX900-NEXT:    ;;#ASMEND
6673; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6674; GFX900-NEXT:    v_mov_b32_e32 v3, 0
6675; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
6676; GFX900-NEXT:    v_perm_b32 v1, v2, v2, s4
6677; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
6678; GFX900-NEXT:    s_waitcnt vmcnt(0)
6679; GFX900-NEXT:    s_setpc_b64 s[30:31]
6680;
6681; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_0_5_5:
6682; GFX90A:       ; %bb.0:
6683; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6684; GFX90A-NEXT:    ;;#ASMSTART
6685; GFX90A-NEXT:    ; def v[0:1]
6686; GFX90A-NEXT:    ;;#ASMEND
6687; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6688; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
6689; GFX90A-NEXT:    ;;#ASMSTART
6690; GFX90A-NEXT:    ; def v[2:3]
6691; GFX90A-NEXT:    ;;#ASMEND
6692; GFX90A-NEXT:    v_perm_b32 v0, v0, v3, s4
6693; GFX90A-NEXT:    v_perm_b32 v1, v3, v3, s4
6694; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
6695; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6696; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6697;
6698; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_0_5_5:
6699; GFX940:       ; %bb.0:
6700; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6701; GFX940-NEXT:    ;;#ASMSTART
6702; GFX940-NEXT:    ; def v[0:1]
6703; GFX940-NEXT:    ;;#ASMEND
6704; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6705; GFX940-NEXT:    v_mov_b32_e32 v4, 0
6706; GFX940-NEXT:    ;;#ASMSTART
6707; GFX940-NEXT:    ; def v[2:3]
6708; GFX940-NEXT:    ;;#ASMEND
6709; GFX940-NEXT:    s_nop 0
6710; GFX940-NEXT:    v_perm_b32 v0, v0, v3, s2
6711; GFX940-NEXT:    v_perm_b32 v1, v3, v3, s2
6712; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
6713; GFX940-NEXT:    s_waitcnt vmcnt(0)
6714; GFX940-NEXT:    s_setpc_b64 s[30:31]
6715  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6716  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6717  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6718  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6719  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 5, i32 5>
6720  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6721  ret void
6722}
6723
; Shuffle with mask <5, 1, 5, 5>. Mask indices 0-2 select from %extract3 and
; indices 3-5 select from %extract31; each operand is the first three elements
; of a <4 x bfloat> produced by an inline-asm def. Result lane 1 is
; %extract3[1]; lanes 0, 2 and 3 are all %extract31[2]. The <4 x bfloat>
; result is stored to %ptr with 8-byte alignment.
6724define void @v_shuffle_v4bf16_v3bf16__5_1_5_5(ptr addrspace(1) inreg %ptr) {
6725; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_1_5_5:
6726; GFX900:       ; %bb.0:
6727; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6728; GFX900-NEXT:    ;;#ASMSTART
6729; GFX900-NEXT:    ; def v[0:1]
6730; GFX900-NEXT:    ;;#ASMEND
6731; GFX900-NEXT:    s_mov_b32 s4, 0xffff
6732; GFX900-NEXT:    ;;#ASMSTART
6733; GFX900-NEXT:    ; def v[1:2]
6734; GFX900-NEXT:    ;;#ASMEND
6735; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v0
6736; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6737; GFX900-NEXT:    v_mov_b32_e32 v3, 0
6738; GFX900-NEXT:    v_perm_b32 v1, v2, v2, s4
6739; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
6740; GFX900-NEXT:    s_waitcnt vmcnt(0)
6741; GFX900-NEXT:    s_setpc_b64 s[30:31]
6742;
6743; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_1_5_5:
6744; GFX90A:       ; %bb.0:
6745; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6746; GFX90A-NEXT:    ;;#ASMSTART
6747; GFX90A-NEXT:    ; def v[0:1]
6748; GFX90A-NEXT:    ;;#ASMEND
6749; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
6750; GFX90A-NEXT:    ;;#ASMSTART
6751; GFX90A-NEXT:    ; def v[2:3]
6752; GFX90A-NEXT:    ;;#ASMEND
6753; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v0
6754; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6755; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
6756; GFX90A-NEXT:    v_perm_b32 v1, v3, v3, s4
6757; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
6758; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6759; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6760;
6761; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_1_5_5:
6762; GFX940:       ; %bb.0:
6763; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6764; GFX940-NEXT:    ;;#ASMSTART
6765; GFX940-NEXT:    ; def v[0:1]
6766; GFX940-NEXT:    ;;#ASMEND
6767; GFX940-NEXT:    s_mov_b32 s2, 0xffff
6768; GFX940-NEXT:    ;;#ASMSTART
6769; GFX940-NEXT:    ; def v[2:3]
6770; GFX940-NEXT:    ;;#ASMEND
6771; GFX940-NEXT:    v_mov_b32_e32 v4, 0
6772; GFX940-NEXT:    v_bfi_b32 v0, s2, v3, v0
6773; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6774; GFX940-NEXT:    v_perm_b32 v1, v3, v3, s2
6775; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
6776; GFX940-NEXT:    s_waitcnt vmcnt(0)
6777; GFX940-NEXT:    s_setpc_b64 s[30:31]
6778  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6779  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6780  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6781  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6782  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 5, i32 5>
6783  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6784  ret void
6785}
6786
; Shuffle with mask <5, 2, 5, 5>. Mask indices 0-2 select from %extract3 and
; indices 3-5 select from %extract31; each operand is the first three elements
; of a <4 x bfloat> produced by an inline-asm def. Result lane 1 is
; %extract3[2]; lanes 0, 2 and 3 are all %extract31[2]. The <4 x bfloat>
; result is stored to %ptr with 8-byte alignment.
6787define void @v_shuffle_v4bf16_v3bf16__5_2_5_5(ptr addrspace(1) inreg %ptr) {
6788; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_2_5_5:
6789; GFX900:       ; %bb.0:
6790; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6791; GFX900-NEXT:    ;;#ASMSTART
6792; GFX900-NEXT:    ; def v[0:1]
6793; GFX900-NEXT:    ;;#ASMEND
6794; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6795; GFX900-NEXT:    v_mov_b32_e32 v4, 0
6796; GFX900-NEXT:    ;;#ASMSTART
6797; GFX900-NEXT:    ; def v[2:3]
6798; GFX900-NEXT:    ;;#ASMEND
6799; GFX900-NEXT:    v_perm_b32 v0, v1, v3, s4
6800; GFX900-NEXT:    v_perm_b32 v1, v3, v3, s4
6801; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
6802; GFX900-NEXT:    s_waitcnt vmcnt(0)
6803; GFX900-NEXT:    s_setpc_b64 s[30:31]
6804;
6805; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_2_5_5:
6806; GFX90A:       ; %bb.0:
6807; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6808; GFX90A-NEXT:    ;;#ASMSTART
6809; GFX90A-NEXT:    ; def v[0:1]
6810; GFX90A-NEXT:    ;;#ASMEND
6811; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6812; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
6813; GFX90A-NEXT:    ;;#ASMSTART
6814; GFX90A-NEXT:    ; def v[2:3]
6815; GFX90A-NEXT:    ;;#ASMEND
6816; GFX90A-NEXT:    v_perm_b32 v0, v1, v3, s4
6817; GFX90A-NEXT:    v_perm_b32 v1, v3, v3, s4
6818; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
6819; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6820; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6821;
6822; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_2_5_5:
6823; GFX940:       ; %bb.0:
6824; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6825; GFX940-NEXT:    ;;#ASMSTART
6826; GFX940-NEXT:    ; def v[0:1]
6827; GFX940-NEXT:    ;;#ASMEND
6828; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6829; GFX940-NEXT:    v_mov_b32_e32 v4, 0
6830; GFX940-NEXT:    ;;#ASMSTART
6831; GFX940-NEXT:    ; def v[2:3]
6832; GFX940-NEXT:    ;;#ASMEND
6833; GFX940-NEXT:    s_nop 0
6834; GFX940-NEXT:    v_perm_b32 v0, v1, v3, s2
6835; GFX940-NEXT:    v_perm_b32 v1, v3, v3, s2
6836; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
6837; GFX940-NEXT:    s_waitcnt vmcnt(0)
6838; GFX940-NEXT:    s_setpc_b64 s[30:31]
6839  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6840  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6841  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6842  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6843  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 5, i32 5>
6844  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6845  ret void
6846}
6847
; Shuffle with mask <5, 3, 5, 5>. Mask indices 0-2 select from %extract3 and
; indices 3-5 select from %extract31. Result lane 1 is %extract31[0]; lanes 0,
; 2 and 3 are all %extract31[2]. No lane reads %extract3, so %vec0 is unused;
; the expected output below contains only one asm def. The <4 x bfloat> result
; is stored to %ptr with 8-byte alignment.
6848define void @v_shuffle_v4bf16_v3bf16__5_3_5_5(ptr addrspace(1) inreg %ptr) {
6849; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_3_5_5:
6850; GFX900:       ; %bb.0:
6851; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6852; GFX900-NEXT:    ;;#ASMSTART
6853; GFX900-NEXT:    ; def v[0:1]
6854; GFX900-NEXT:    ;;#ASMEND
6855; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6856; GFX900-NEXT:    v_mov_b32_e32 v2, 0
6857; GFX900-NEXT:    v_perm_b32 v0, v0, v1, s4
6858; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
6859; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
6860; GFX900-NEXT:    s_waitcnt vmcnt(0)
6861; GFX900-NEXT:    s_setpc_b64 s[30:31]
6862;
6863; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_3_5_5:
6864; GFX90A:       ; %bb.0:
6865; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6866; GFX90A-NEXT:    ;;#ASMSTART
6867; GFX90A-NEXT:    ; def v[0:1]
6868; GFX90A-NEXT:    ;;#ASMEND
6869; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6870; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
6871; GFX90A-NEXT:    v_perm_b32 v0, v0, v1, s4
6872; GFX90A-NEXT:    v_perm_b32 v1, v1, v1, s4
6873; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
6874; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6875; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6876;
6877; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_3_5_5:
6878; GFX940:       ; %bb.0:
6879; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6880; GFX940-NEXT:    ;;#ASMSTART
6881; GFX940-NEXT:    ; def v[0:1]
6882; GFX940-NEXT:    ;;#ASMEND
6883; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6884; GFX940-NEXT:    v_mov_b32_e32 v2, 0
6885; GFX940-NEXT:    v_perm_b32 v0, v0, v1, s2
6886; GFX940-NEXT:    v_perm_b32 v1, v1, v1, s2
6887; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
6888; GFX940-NEXT:    s_waitcnt vmcnt(0)
6889; GFX940-NEXT:    s_setpc_b64 s[30:31]
6890  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6891  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6892  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6893  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6894  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 5, i32 5>
6895  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6896  ret void
6897}
6898
; Shuffle with mask <5, 4, 5, 5>. Mask indices 0-2 select from %extract3 and
; indices 3-5 select from %extract31. Result lane 1 is %extract31[1]; lanes 0,
; 2 and 3 are all %extract31[2]. No lane reads %extract3, so %vec0 is unused;
; the expected output below contains only one asm def. The <4 x bfloat> result
; is stored to %ptr with 8-byte alignment.
6899define void @v_shuffle_v4bf16_v3bf16__5_4_5_5(ptr addrspace(1) inreg %ptr) {
6900; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_4_5_5:
6901; GFX900:       ; %bb.0:
6902; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6903; GFX900-NEXT:    ;;#ASMSTART
6904; GFX900-NEXT:    ; def v[0:1]
6905; GFX900-NEXT:    ;;#ASMEND
6906; GFX900-NEXT:    s_mov_b32 s4, 0xffff
6907; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v0
6908; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6909; GFX900-NEXT:    v_mov_b32_e32 v2, 0
6910; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
6911; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
6912; GFX900-NEXT:    s_waitcnt vmcnt(0)
6913; GFX900-NEXT:    s_setpc_b64 s[30:31]
6914;
6915; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_4_5_5:
6916; GFX90A:       ; %bb.0:
6917; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6918; GFX90A-NEXT:    ;;#ASMSTART
6919; GFX90A-NEXT:    ; def v[0:1]
6920; GFX90A-NEXT:    ;;#ASMEND
6921; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
6922; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v0
6923; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6924; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
6925; GFX90A-NEXT:    v_perm_b32 v1, v1, v1, s4
6926; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
6927; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6928; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6929;
6930; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_4_5_5:
6931; GFX940:       ; %bb.0:
6932; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6933; GFX940-NEXT:    ;;#ASMSTART
6934; GFX940-NEXT:    ; def v[0:1]
6935; GFX940-NEXT:    ;;#ASMEND
6936; GFX940-NEXT:    s_mov_b32 s2, 0xffff
6937; GFX940-NEXT:    v_bfi_b32 v0, s2, v1, v0
6938; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6939; GFX940-NEXT:    v_mov_b32_e32 v2, 0
6940; GFX940-NEXT:    v_perm_b32 v1, v1, v1, s2
6941; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
6942; GFX940-NEXT:    s_waitcnt vmcnt(0)
6943; GFX940-NEXT:    s_setpc_b64 s[30:31]
6944  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6945  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6946  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6947  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6948  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 5, i32 5>
6949  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
6950  ret void
6951}
6952
; Shuffle with mask <5, 5, poison, 5>. Mask indices 0-2 select from %extract3
; and indices 3-5 select from %extract31. Result lanes 0, 1 and 3 are all
; %extract31[2]; lane 2 is a poison mask element, so its value is unspecified.
; No lane reads %extract3, so %vec0 is unused; the expected output below
; contains only one asm def. The <4 x bfloat> result is stored to %ptr with
; 8-byte alignment.
6953define void @v_shuffle_v4bf16_v3bf16__5_5_u_5(ptr addrspace(1) inreg %ptr) {
6954; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_5:
6955; GFX900:       ; %bb.0:
6956; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6957; GFX900-NEXT:    ;;#ASMSTART
6958; GFX900-NEXT:    ; def v[0:1]
6959; GFX900-NEXT:    ;;#ASMEND
6960; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
6961; GFX900-NEXT:    v_mov_b32_e32 v2, 0
6962; GFX900-NEXT:    v_perm_b32 v0, v1, v1, s4
6963; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
6964; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
6965; GFX900-NEXT:    s_waitcnt vmcnt(0)
6966; GFX900-NEXT:    s_setpc_b64 s[30:31]
6967;
6968; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_5:
6969; GFX90A:       ; %bb.0:
6970; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6971; GFX90A-NEXT:    ;;#ASMSTART
6972; GFX90A-NEXT:    ; def v[0:1]
6973; GFX90A-NEXT:    ;;#ASMEND
6974; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
6975; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
6976; GFX90A-NEXT:    v_perm_b32 v0, v1, v1, s4
6977; GFX90A-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
6978; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[16:17]
6979; GFX90A-NEXT:    s_waitcnt vmcnt(0)
6980; GFX90A-NEXT:    s_setpc_b64 s[30:31]
6981;
6982; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_u_5:
6983; GFX940:       ; %bb.0:
6984; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6985; GFX940-NEXT:    ;;#ASMSTART
6986; GFX940-NEXT:    ; def v[0:1]
6987; GFX940-NEXT:    ;;#ASMEND
6988; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
6989; GFX940-NEXT:    v_mov_b32_e32 v2, 0
6990; GFX940-NEXT:    v_perm_b32 v0, v1, v1, s2
6991; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
6992; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
6993; GFX940-NEXT:    s_waitcnt vmcnt(0)
6994; GFX940-NEXT:    s_setpc_b64 s[30:31]
6995  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
6996  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
6997  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6998  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
6999  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 5>
7000  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
7001  ret void
7002}
7003
; Shuffle with mask <5, 5, 0, 5>. Mask indices 0-2 select from %extract3 and
; indices 3-5 select from %extract31; each operand is the first three elements
; of a <4 x bfloat> produced by an inline-asm def. Result lane 2 is
; %extract3[0]; lanes 0, 1 and 3 are all %extract31[2]. The <4 x bfloat>
; result is stored to %ptr with 8-byte alignment.
7004define void @v_shuffle_v4bf16_v3bf16__5_5_0_5(ptr addrspace(1) inreg %ptr) {
7005; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_5:
7006; GFX900:       ; %bb.0:
7007; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7008; GFX900-NEXT:    ;;#ASMSTART
7009; GFX900-NEXT:    ; def v[0:1]
7010; GFX900-NEXT:    ;;#ASMEND
7011; GFX900-NEXT:    ;;#ASMSTART
7012; GFX900-NEXT:    ; def v[1:2]
7013; GFX900-NEXT:    ;;#ASMEND
7014; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
7015; GFX900-NEXT:    v_mov_b32_e32 v3, 0
7016; GFX900-NEXT:    v_perm_b32 v1, v2, v0, s4
7017; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
7018; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
7019; GFX900-NEXT:    s_waitcnt vmcnt(0)
7020; GFX900-NEXT:    s_setpc_b64 s[30:31]
7021;
7022; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_5:
7023; GFX90A:       ; %bb.0:
7024; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7025; GFX90A-NEXT:    ;;#ASMSTART
7026; GFX90A-NEXT:    ; def v[0:1]
7027; GFX90A-NEXT:    ;;#ASMEND
7028; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
7029; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
7030; GFX90A-NEXT:    ;;#ASMSTART
7031; GFX90A-NEXT:    ; def v[2:3]
7032; GFX90A-NEXT:    ;;#ASMEND
7033; GFX90A-NEXT:    v_perm_b32 v1, v3, v0, s4
7034; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
7035; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
7036; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7037; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7038;
7039; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_0_5:
7040; GFX940:       ; %bb.0:
7041; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7042; GFX940-NEXT:    ;;#ASMSTART
7043; GFX940-NEXT:    ; def v[0:1]
7044; GFX940-NEXT:    ;;#ASMEND
7045; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
7046; GFX940-NEXT:    v_mov_b32_e32 v4, 0
7047; GFX940-NEXT:    ;;#ASMSTART
7048; GFX940-NEXT:    ; def v[2:3]
7049; GFX940-NEXT:    ;;#ASMEND
7050; GFX940-NEXT:    s_nop 0
7051; GFX940-NEXT:    v_perm_b32 v1, v3, v0, s2
7052; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
7053; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
7054; GFX940-NEXT:    s_waitcnt vmcnt(0)
7055; GFX940-NEXT:    s_setpc_b64 s[30:31]
7056  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
7057  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
7058  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7059  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7060  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 5>
7061  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
7062  ret void
7063}
7064
; Shuffle with mask <5, 5, 1, 5>. Mask indices 0-2 select from %extract3 and
; indices 3-5 select from %extract31; each operand is the first three elements
; of a <4 x bfloat> produced by an inline-asm def. Result lane 2 is
; %extract3[1]; lanes 0, 1 and 3 are all %extract31[2]. The <4 x bfloat>
; result is stored to %ptr with 8-byte alignment.
7065define void @v_shuffle_v4bf16_v3bf16__5_5_1_5(ptr addrspace(1) inreg %ptr) {
7066; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_5:
7067; GFX900:       ; %bb.0:
7068; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7069; GFX900-NEXT:    ;;#ASMSTART
7070; GFX900-NEXT:    ; def v[0:1]
7071; GFX900-NEXT:    ;;#ASMEND
7072; GFX900-NEXT:    ;;#ASMSTART
7073; GFX900-NEXT:    ; def v[1:2]
7074; GFX900-NEXT:    ;;#ASMEND
7075; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
7076; GFX900-NEXT:    v_mov_b32_e32 v3, 0
7077; GFX900-NEXT:    v_alignbit_b32 v1, v2, v0, 16
7078; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
7079; GFX900-NEXT:    global_store_dwordx2 v3, v[0:1], s[16:17]
7080; GFX900-NEXT:    s_waitcnt vmcnt(0)
7081; GFX900-NEXT:    s_setpc_b64 s[30:31]
7082;
7083; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_5:
7084; GFX90A:       ; %bb.0:
7085; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7086; GFX90A-NEXT:    ;;#ASMSTART
7087; GFX90A-NEXT:    ; def v[0:1]
7088; GFX90A-NEXT:    ;;#ASMEND
7089; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
7090; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
7091; GFX90A-NEXT:    ;;#ASMSTART
7092; GFX90A-NEXT:    ; def v[2:3]
7093; GFX90A-NEXT:    ;;#ASMEND
7094; GFX90A-NEXT:    v_alignbit_b32 v1, v3, v0, 16
7095; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
7096; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
7097; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7098; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7099;
7100; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_1_5:
7101; GFX940:       ; %bb.0:
7102; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7103; GFX940-NEXT:    ;;#ASMSTART
7104; GFX940-NEXT:    ; def v[0:1]
7105; GFX940-NEXT:    ;;#ASMEND
7106; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
7107; GFX940-NEXT:    v_mov_b32_e32 v4, 0
7108; GFX940-NEXT:    ;;#ASMSTART
7109; GFX940-NEXT:    ; def v[2:3]
7110; GFX940-NEXT:    ;;#ASMEND
7111; GFX940-NEXT:    s_nop 0
7112; GFX940-NEXT:    v_alignbit_b32 v1, v3, v0, 16
7113; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
7114; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
7115; GFX940-NEXT:    s_waitcnt vmcnt(0)
7116; GFX940-NEXT:    s_setpc_b64 s[30:31]
7117  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
7118  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
7119  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7120  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7121  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 5>
7122  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
7123  ret void
7124}
7125
; Shuffle with mask <5, 5, 2, 5>. Mask indices 0-2 select from %extract3 and
; indices 3-5 select from %extract31; each operand is the first three elements
; of a <4 x bfloat> produced by an inline-asm def. Result lane 2 is
; %extract3[2]; lanes 0, 1 and 3 are all %extract31[2]. The <4 x bfloat>
; result is stored to %ptr with 8-byte alignment.
7126define void @v_shuffle_v4bf16_v3bf16__5_5_2_5(ptr addrspace(1) inreg %ptr) {
7127; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_5:
7128; GFX900:       ; %bb.0:
7129; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7130; GFX900-NEXT:    ;;#ASMSTART
7131; GFX900-NEXT:    ; def v[0:1]
7132; GFX900-NEXT:    ;;#ASMEND
7133; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
7134; GFX900-NEXT:    v_mov_b32_e32 v4, 0
7135; GFX900-NEXT:    ;;#ASMSTART
7136; GFX900-NEXT:    ; def v[2:3]
7137; GFX900-NEXT:    ;;#ASMEND
7138; GFX900-NEXT:    v_perm_b32 v1, v3, v1, s4
7139; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
7140; GFX900-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
7141; GFX900-NEXT:    s_waitcnt vmcnt(0)
7142; GFX900-NEXT:    s_setpc_b64 s[30:31]
7143;
7144; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_5:
7145; GFX90A:       ; %bb.0:
7146; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7147; GFX90A-NEXT:    ;;#ASMSTART
7148; GFX90A-NEXT:    ; def v[0:1]
7149; GFX90A-NEXT:    ;;#ASMEND
7150; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
7151; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
7152; GFX90A-NEXT:    ;;#ASMSTART
7153; GFX90A-NEXT:    ; def v[2:3]
7154; GFX90A-NEXT:    ;;#ASMEND
7155; GFX90A-NEXT:    v_perm_b32 v1, v3, v1, s4
7156; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
7157; GFX90A-NEXT:    global_store_dwordx2 v4, v[0:1], s[16:17]
7158; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7159; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7160;
7161; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_2_5:
7162; GFX940:       ; %bb.0:
7163; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7164; GFX940-NEXT:    ;;#ASMSTART
7165; GFX940-NEXT:    ; def v[0:1]
7166; GFX940-NEXT:    ;;#ASMEND
7167; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
7168; GFX940-NEXT:    v_mov_b32_e32 v4, 0
7169; GFX940-NEXT:    ;;#ASMSTART
7170; GFX940-NEXT:    ; def v[2:3]
7171; GFX940-NEXT:    ;;#ASMEND
7172; GFX940-NEXT:    s_nop 0
7173; GFX940-NEXT:    v_perm_b32 v1, v3, v1, s2
7174; GFX940-NEXT:    v_perm_b32 v0, v3, v3, s2
7175; GFX940-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
7176; GFX940-NEXT:    s_waitcnt vmcnt(0)
7177; GFX940-NEXT:    s_setpc_b64 s[30:31]
7178  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
7179  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
7180  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7181  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7182  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 5>
7183  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
7184  ret void
7185}
7186
; Shuffle mask <5, 5, 3, 5>: every mask index is >= 3, so all four result lanes
; come from %extract31 (lanes 0, 1, 3 read its element 2; lane 2 reads its
; element 0). %extract3 / %vec0 is not selected by the mask, and the checked
; output contains a single asm def block. The result is stored to %ptr.
7187define void @v_shuffle_v4bf16_v3bf16__5_5_3_5(ptr addrspace(1) inreg %ptr) {
7188; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_5:
7189; GFX900:       ; %bb.0:
7190; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7191; GFX900-NEXT:    ;;#ASMSTART
7192; GFX900-NEXT:    ; def v[0:1]
7193; GFX900-NEXT:    ;;#ASMEND
7194; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
7195; GFX900-NEXT:    v_mov_b32_e32 v3, 0
7196; GFX900-NEXT:    v_perm_b32 v2, v1, v0, s4
7197; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
7198; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
7199; GFX900-NEXT:    s_waitcnt vmcnt(0)
7200; GFX900-NEXT:    s_setpc_b64 s[30:31]
7201;
7202; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_5:
7203; GFX90A:       ; %bb.0:
7204; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7205; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
7206; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
7207; GFX90A-NEXT:    ;;#ASMSTART
7208; GFX90A-NEXT:    ; def v[0:1]
7209; GFX90A-NEXT:    ;;#ASMEND
7210; GFX90A-NEXT:    v_perm_b32 v3, v1, v0, s4
7211; GFX90A-NEXT:    v_perm_b32 v2, v1, v1, s4
7212; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
7213; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7214; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7215;
7216; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_3_5:
7217; GFX940:       ; %bb.0:
7218; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7219; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
7220; GFX940-NEXT:    v_mov_b32_e32 v4, 0
7221; GFX940-NEXT:    ;;#ASMSTART
7222; GFX940-NEXT:    ; def v[0:1]
7223; GFX940-NEXT:    ;;#ASMEND
7224; GFX940-NEXT:    s_nop 0
7225; GFX940-NEXT:    v_perm_b32 v3, v1, v0, s2
7226; GFX940-NEXT:    v_perm_b32 v2, v1, v1, s2
7227; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
7228; GFX940-NEXT:    s_waitcnt vmcnt(0)
7229; GFX940-NEXT:    s_setpc_b64 s[30:31]
7230  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
7231  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
7232  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7233  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7234  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 5>
7235  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
7236  ret void
7237}
7238
; Shuffle mask <5, 5, 4, 5>: every mask index is >= 3, so all four result lanes
; come from %extract31 (lanes 0, 1, 3 read its element 2; lane 2 reads its
; element 1). %extract3 / %vec0 is not selected by the mask, and the checked
; output contains a single asm def block. The dword holding lanes 2-3 is
; expected to be formed with v_alignbit_b32 by 16 bits.
7239define void @v_shuffle_v4bf16_v3bf16__5_5_4_5(ptr addrspace(1) inreg %ptr) {
7240; GFX900-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_5:
7241; GFX900:       ; %bb.0:
7242; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7243; GFX900-NEXT:    ;;#ASMSTART
7244; GFX900-NEXT:    ; def v[0:1]
7245; GFX900-NEXT:    ;;#ASMEND
7246; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
7247; GFX900-NEXT:    v_mov_b32_e32 v3, 0
7248; GFX900-NEXT:    v_alignbit_b32 v2, v1, v0, 16
7249; GFX900-NEXT:    v_perm_b32 v1, v1, v1, s4
7250; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[16:17]
7251; GFX900-NEXT:    s_waitcnt vmcnt(0)
7252; GFX900-NEXT:    s_setpc_b64 s[30:31]
7253;
7254; GFX90A-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_5:
7255; GFX90A:       ; %bb.0:
7256; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7257; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
7258; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
7259; GFX90A-NEXT:    ;;#ASMSTART
7260; GFX90A-NEXT:    ; def v[0:1]
7261; GFX90A-NEXT:    ;;#ASMEND
7262; GFX90A-NEXT:    v_alignbit_b32 v3, v1, v0, 16
7263; GFX90A-NEXT:    v_perm_b32 v2, v1, v1, s4
7264; GFX90A-NEXT:    global_store_dwordx2 v4, v[2:3], s[16:17]
7265; GFX90A-NEXT:    s_waitcnt vmcnt(0)
7266; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7267;
7268; GFX940-LABEL: v_shuffle_v4bf16_v3bf16__5_5_4_5:
7269; GFX940:       ; %bb.0:
7270; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7271; GFX940-NEXT:    s_mov_b32 s2, 0x5040100
7272; GFX940-NEXT:    v_mov_b32_e32 v4, 0
7273; GFX940-NEXT:    ;;#ASMSTART
7274; GFX940-NEXT:    ; def v[0:1]
7275; GFX940-NEXT:    ;;#ASMEND
7276; GFX940-NEXT:    s_nop 0
7277; GFX940-NEXT:    v_alignbit_b32 v3, v1, v0, 16
7278; GFX940-NEXT:    v_perm_b32 v2, v1, v1, s2
7279; GFX940-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
7280; GFX940-NEXT:    s_waitcnt vmcnt(0)
7281; GFX940-NEXT:    s_setpc_b64 s[30:31]
7282  %vec0 = call <4 x bfloat> asm "; def $0", "=v"()
7283  %vec1 = call <4 x bfloat> asm "; def $0", "=v"()
7284  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7285  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7286  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 5>
7287  store <4 x bfloat> %shuf, ptr addrspace(1) %ptr, align 8
7288  ret void
7289}
7290
; Scalar-constraint variant ("=s" asm def, result consumed by an asm use pinned
; to s[8:9]) with a fully poison shuffle mask. No result lane depends on %vec0,
; and the checked output contains only the asm use block followed by the return.
7291define void @s_shuffle_v4bf16_v3bf16__u_u_u_u() {
7292; GFX9-LABEL: s_shuffle_v4bf16_v3bf16__u_u_u_u:
7293; GFX9:       ; %bb.0:
7294; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7295; GFX9-NEXT:    ;;#ASMSTART
7296; GFX9-NEXT:    ; use s[8:9]
7297; GFX9-NEXT:    ;;#ASMEND
7298; GFX9-NEXT:    s_setpc_b64 s[30:31]
7299  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7300  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7301  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> poison
7302  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
7303  ret void
7304}
7305
; Shuffle mask <0, poison, poison, poison>: result lane 0 reads element 0 of
; %extract3; the other lanes are poison. The result is passed to an asm use
; pinned to s[8:9]. In the checked output the asm def is placed directly in
; s[8:9] with no copy in between (the gfx940 output has an s_nop 0 between the
; two asm blocks).
7306define void @s_shuffle_v4bf16_v3bf16__0_u_u_u() {
7307; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_u_u_u:
7308; GFX900:       ; %bb.0:
7309; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7310; GFX900-NEXT:    ;;#ASMSTART
7311; GFX900-NEXT:    ; def s[8:9]
7312; GFX900-NEXT:    ;;#ASMEND
7313; GFX900-NEXT:    ;;#ASMSTART
7314; GFX900-NEXT:    ; use s[8:9]
7315; GFX900-NEXT:    ;;#ASMEND
7316; GFX900-NEXT:    s_setpc_b64 s[30:31]
7317;
7318; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_u_u_u:
7319; GFX90A:       ; %bb.0:
7320; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7321; GFX90A-NEXT:    ;;#ASMSTART
7322; GFX90A-NEXT:    ; def s[8:9]
7323; GFX90A-NEXT:    ;;#ASMEND
7324; GFX90A-NEXT:    ;;#ASMSTART
7325; GFX90A-NEXT:    ; use s[8:9]
7326; GFX90A-NEXT:    ;;#ASMEND
7327; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7328;
7329; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_u_u_u:
7330; GFX940:       ; %bb.0:
7331; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7332; GFX940-NEXT:    ;;#ASMSTART
7333; GFX940-NEXT:    ; def s[8:9]
7334; GFX940-NEXT:    ;;#ASMEND
7335; GFX940-NEXT:    s_nop 0
7336; GFX940-NEXT:    ;;#ASMSTART
7337; GFX940-NEXT:    ; use s[8:9]
7338; GFX940-NEXT:    ;;#ASMEND
7339; GFX940-NEXT:    s_setpc_b64 s[30:31]
7340  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7341  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7342  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
7343  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
7344  ret void
7345}
7346
; Shuffle mask <1, poison, poison, poison>: result lane 0 reads element 1 of
; %extract3; the other lanes are poison. The result is passed to an asm use
; pinned to s[8:9]. The checked output moves that element into the low half of
; s8 with a 16-bit s_lshr_b32 of the first defined dword.
7347define void @s_shuffle_v4bf16_v3bf16__1_u_u_u() {
7348; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_u_u_u:
7349; GFX900:       ; %bb.0:
7350; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7351; GFX900-NEXT:    ;;#ASMSTART
7352; GFX900-NEXT:    ; def s[4:5]
7353; GFX900-NEXT:    ;;#ASMEND
7354; GFX900-NEXT:    s_lshr_b32 s8, s4, 16
7355; GFX900-NEXT:    ;;#ASMSTART
7356; GFX900-NEXT:    ; use s[8:9]
7357; GFX900-NEXT:    ;;#ASMEND
7358; GFX900-NEXT:    s_setpc_b64 s[30:31]
7359;
7360; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_u_u_u:
7361; GFX90A:       ; %bb.0:
7362; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7363; GFX90A-NEXT:    ;;#ASMSTART
7364; GFX90A-NEXT:    ; def s[4:5]
7365; GFX90A-NEXT:    ;;#ASMEND
7366; GFX90A-NEXT:    s_lshr_b32 s8, s4, 16
7367; GFX90A-NEXT:    ;;#ASMSTART
7368; GFX90A-NEXT:    ; use s[8:9]
7369; GFX90A-NEXT:    ;;#ASMEND
7370; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7371;
7372; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_u_u_u:
7373; GFX940:       ; %bb.0:
7374; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7375; GFX940-NEXT:    ;;#ASMSTART
7376; GFX940-NEXT:    ; def s[0:1]
7377; GFX940-NEXT:    ;;#ASMEND
7378; GFX940-NEXT:    s_lshr_b32 s8, s0, 16
7379; GFX940-NEXT:    ;;#ASMSTART
7380; GFX940-NEXT:    ; use s[8:9]
7381; GFX940-NEXT:    ;;#ASMEND
7382; GFX940-NEXT:    s_setpc_b64 s[30:31]
7383  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7384  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7385  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
7386  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
7387  ret void
7388}
7389
; Shuffle mask <2, poison, poison, poison>: result lane 0 reads element 2 of
; %extract3; the other lanes are poison. The result is passed to an asm use
; pinned to s[8:9]. The checked output copies the second defined dword into s8
; with s_mov_b32.
7390define void @s_shuffle_v4bf16_v3bf16__2_u_u_u() {
7391; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_u_u_u:
7392; GFX900:       ; %bb.0:
7393; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7394; GFX900-NEXT:    ;;#ASMSTART
7395; GFX900-NEXT:    ; def s[4:5]
7396; GFX900-NEXT:    ;;#ASMEND
7397; GFX900-NEXT:    s_mov_b32 s8, s5
7398; GFX900-NEXT:    ;;#ASMSTART
7399; GFX900-NEXT:    ; use s[8:9]
7400; GFX900-NEXT:    ;;#ASMEND
7401; GFX900-NEXT:    s_setpc_b64 s[30:31]
7402;
7403; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_u_u_u:
7404; GFX90A:       ; %bb.0:
7405; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7406; GFX90A-NEXT:    ;;#ASMSTART
7407; GFX90A-NEXT:    ; def s[4:5]
7408; GFX90A-NEXT:    ;;#ASMEND
7409; GFX90A-NEXT:    s_mov_b32 s8, s5
7410; GFX90A-NEXT:    ;;#ASMSTART
7411; GFX90A-NEXT:    ; use s[8:9]
7412; GFX90A-NEXT:    ;;#ASMEND
7413; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7414;
7415; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_u_u_u:
7416; GFX940:       ; %bb.0:
7417; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7418; GFX940-NEXT:    ;;#ASMSTART
7419; GFX940-NEXT:    ; def s[0:1]
7420; GFX940-NEXT:    ;;#ASMEND
7421; GFX940-NEXT:    s_mov_b32 s8, s1
7422; GFX940-NEXT:    ;;#ASMSTART
7423; GFX940-NEXT:    ; use s[8:9]
7424; GFX940-NEXT:    ;;#ASMEND
7425; GFX940-NEXT:    s_setpc_b64 s[30:31]
7426  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7427  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7428  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
7429  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
7430  ret void
7431}
7432
; Shuffle mask <3, poison, poison, poison> with a poison second operand: mask
; index 3 selects element 0 of the second operand, so no result lane depends on
; %vec0. The checked output contains only the asm use of s[8:9] and the return.
7433define void @s_shuffle_v4bf16_v3bf16__3_u_u_u() {
7434; GFX9-LABEL: s_shuffle_v4bf16_v3bf16__3_u_u_u:
7435; GFX9:       ; %bb.0:
7436; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7437; GFX9-NEXT:    ;;#ASMSTART
7438; GFX9-NEXT:    ; use s[8:9]
7439; GFX9-NEXT:    ;;#ASMEND
7440; GFX9-NEXT:    s_setpc_b64 s[30:31]
7441  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7442  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7443  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
7444  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
7445  ret void
7446}
7447
; Shuffle mask <4, poison, poison, poison>: result lane 0 reads element 1 of
; %extract31 (second operand); the other lanes are poison. %extract3 / %vec0 is
; not selected, and the checked output has a single asm def. That element is
; moved into the low half of s8 with a 16-bit s_lshr_b32.
7448define void @s_shuffle_v4bf16_v3bf16__4_u_u_u() {
7449; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_u_u_u:
7450; GFX900:       ; %bb.0:
7451; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7452; GFX900-NEXT:    ;;#ASMSTART
7453; GFX900-NEXT:    ; def s[4:5]
7454; GFX900-NEXT:    ;;#ASMEND
7455; GFX900-NEXT:    s_lshr_b32 s8, s4, 16
7456; GFX900-NEXT:    ;;#ASMSTART
7457; GFX900-NEXT:    ; use s[8:9]
7458; GFX900-NEXT:    ;;#ASMEND
7459; GFX900-NEXT:    s_setpc_b64 s[30:31]
7460;
7461; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_u_u_u:
7462; GFX90A:       ; %bb.0:
7463; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7464; GFX90A-NEXT:    ;;#ASMSTART
7465; GFX90A-NEXT:    ; def s[4:5]
7466; GFX90A-NEXT:    ;;#ASMEND
7467; GFX90A-NEXT:    s_lshr_b32 s8, s4, 16
7468; GFX90A-NEXT:    ;;#ASMSTART
7469; GFX90A-NEXT:    ; use s[8:9]
7470; GFX90A-NEXT:    ;;#ASMEND
7471; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7472;
7473; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_u_u_u:
7474; GFX940:       ; %bb.0:
7475; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7476; GFX940-NEXT:    ;;#ASMSTART
7477; GFX940-NEXT:    ; def s[0:1]
7478; GFX940-NEXT:    ;;#ASMEND
7479; GFX940-NEXT:    s_lshr_b32 s8, s0, 16
7480; GFX940-NEXT:    ;;#ASMSTART
7481; GFX940-NEXT:    ; use s[8:9]
7482; GFX940-NEXT:    ;;#ASMEND
7483; GFX940-NEXT:    s_setpc_b64 s[30:31]
7484  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7485  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
7486  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7487  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7488  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 poison, i32 poison, i32 poison>
7489  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
7490  ret void
7491}
7492
; Shuffle mask <5, poison, poison, poison>: result lane 0 reads element 2 of
; %extract31 (second operand); the other lanes are poison. %extract3 / %vec0 is
; not selected, and the checked output has a single asm def. The second defined
; dword is copied into s8 with s_mov_b32.
7493define void @s_shuffle_v4bf16_v3bf16__5_u_u_u() {
7494; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_u_u:
7495; GFX900:       ; %bb.0:
7496; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7497; GFX900-NEXT:    ;;#ASMSTART
7498; GFX900-NEXT:    ; def s[4:5]
7499; GFX900-NEXT:    ;;#ASMEND
7500; GFX900-NEXT:    s_mov_b32 s8, s5
7501; GFX900-NEXT:    ;;#ASMSTART
7502; GFX900-NEXT:    ; use s[8:9]
7503; GFX900-NEXT:    ;;#ASMEND
7504; GFX900-NEXT:    s_setpc_b64 s[30:31]
7505;
7506; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_u_u:
7507; GFX90A:       ; %bb.0:
7508; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7509; GFX90A-NEXT:    ;;#ASMSTART
7510; GFX90A-NEXT:    ; def s[4:5]
7511; GFX90A-NEXT:    ;;#ASMEND
7512; GFX90A-NEXT:    s_mov_b32 s8, s5
7513; GFX90A-NEXT:    ;;#ASMSTART
7514; GFX90A-NEXT:    ; use s[8:9]
7515; GFX90A-NEXT:    ;;#ASMEND
7516; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7517;
7518; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_u_u:
7519; GFX940:       ; %bb.0:
7520; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7521; GFX940-NEXT:    ;;#ASMSTART
7522; GFX940-NEXT:    ; def s[0:1]
7523; GFX940-NEXT:    ;;#ASMEND
7524; GFX940-NEXT:    s_mov_b32 s8, s1
7525; GFX940-NEXT:    ;;#ASMSTART
7526; GFX940-NEXT:    ; use s[8:9]
7527; GFX940-NEXT:    ;;#ASMEND
7528; GFX940-NEXT:    s_setpc_b64 s[30:31]
7529  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7530  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
7531  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7532  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7533  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 poison, i32 poison>
7534  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
7535  ret void
7536}
7537
; Shuffle mask <5, 0, poison, poison>: lane 0 reads element 2 of %extract31,
; lane 1 reads element 0 of %extract3; lanes 2-3 are poison. Both asm defs are
; selected by the mask. The checked output combines the two halves into s8 with
; a single s_pack_ll_b32_b16.
7538define void @s_shuffle_v4bf16_v3bf16__5_0_u_u() {
7539; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_u_u:
7540; GFX900:       ; %bb.0:
7541; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7542; GFX900-NEXT:    ;;#ASMSTART
7543; GFX900-NEXT:    ; def s[4:5]
7544; GFX900-NEXT:    ;;#ASMEND
7545; GFX900-NEXT:    ;;#ASMSTART
7546; GFX900-NEXT:    ; def s[6:7]
7547; GFX900-NEXT:    ;;#ASMEND
7548; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
7549; GFX900-NEXT:    ;;#ASMSTART
7550; GFX900-NEXT:    ; use s[8:9]
7551; GFX900-NEXT:    ;;#ASMEND
7552; GFX900-NEXT:    s_setpc_b64 s[30:31]
7553;
7554; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_u_u:
7555; GFX90A:       ; %bb.0:
7556; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7557; GFX90A-NEXT:    ;;#ASMSTART
7558; GFX90A-NEXT:    ; def s[4:5]
7559; GFX90A-NEXT:    ;;#ASMEND
7560; GFX90A-NEXT:    ;;#ASMSTART
7561; GFX90A-NEXT:    ; def s[6:7]
7562; GFX90A-NEXT:    ;;#ASMEND
7563; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
7564; GFX90A-NEXT:    ;;#ASMSTART
7565; GFX90A-NEXT:    ; use s[8:9]
7566; GFX90A-NEXT:    ;;#ASMEND
7567; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7568;
7569; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_u_u:
7570; GFX940:       ; %bb.0:
7571; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7572; GFX940-NEXT:    ;;#ASMSTART
7573; GFX940-NEXT:    ; def s[0:1]
7574; GFX940-NEXT:    ;;#ASMEND
7575; GFX940-NEXT:    ;;#ASMSTART
7576; GFX940-NEXT:    ; def s[2:3]
7577; GFX940-NEXT:    ;;#ASMEND
7578; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
7579; GFX940-NEXT:    ;;#ASMSTART
7580; GFX940-NEXT:    ; use s[8:9]
7581; GFX940-NEXT:    ;;#ASMEND
7582; GFX940-NEXT:    s_setpc_b64 s[30:31]
7583  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7584  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
7585  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7586  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7587  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 poison, i32 poison>
7588  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
7589  ret void
7590}
7591
; Shuffle mask <5, 1, poison, poison>: lane 0 reads element 2 of %extract31,
; lane 1 reads element 1 of %extract3; lanes 2-3 are poison. The checked output
; first shifts the first def's low dword right by 16 (s_lshr_b32), then combines
; the halves into s8 with s_pack_ll_b32_b16.
7592define void @s_shuffle_v4bf16_v3bf16__5_1_u_u() {
7593; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_u_u:
7594; GFX900:       ; %bb.0:
7595; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7596; GFX900-NEXT:    ;;#ASMSTART
7597; GFX900-NEXT:    ; def s[4:5]
7598; GFX900-NEXT:    ;;#ASMEND
7599; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
7600; GFX900-NEXT:    ;;#ASMSTART
7601; GFX900-NEXT:    ; def s[6:7]
7602; GFX900-NEXT:    ;;#ASMEND
7603; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
7604; GFX900-NEXT:    ;;#ASMSTART
7605; GFX900-NEXT:    ; use s[8:9]
7606; GFX900-NEXT:    ;;#ASMEND
7607; GFX900-NEXT:    s_setpc_b64 s[30:31]
7608;
7609; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_u_u:
7610; GFX90A:       ; %bb.0:
7611; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7612; GFX90A-NEXT:    ;;#ASMSTART
7613; GFX90A-NEXT:    ; def s[4:5]
7614; GFX90A-NEXT:    ;;#ASMEND
7615; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
7616; GFX90A-NEXT:    ;;#ASMSTART
7617; GFX90A-NEXT:    ; def s[6:7]
7618; GFX90A-NEXT:    ;;#ASMEND
7619; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
7620; GFX90A-NEXT:    ;;#ASMSTART
7621; GFX90A-NEXT:    ; use s[8:9]
7622; GFX90A-NEXT:    ;;#ASMEND
7623; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7624;
7625; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_u_u:
7626; GFX940:       ; %bb.0:
7627; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7628; GFX940-NEXT:    ;;#ASMSTART
7629; GFX940-NEXT:    ; def s[0:1]
7630; GFX940-NEXT:    ;;#ASMEND
7631; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
7632; GFX940-NEXT:    ;;#ASMSTART
7633; GFX940-NEXT:    ; def s[2:3]
7634; GFX940-NEXT:    ;;#ASMEND
7635; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
7636; GFX940-NEXT:    ;;#ASMSTART
7637; GFX940-NEXT:    ; use s[8:9]
7638; GFX940-NEXT:    ;;#ASMEND
7639; GFX940-NEXT:    s_setpc_b64 s[30:31]
7640  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7641  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
7642  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7643  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7644  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 poison, i32 poison>
7645  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
7646  ret void
7647}
7648
; Shuffle mask <5, 2, poison, poison>: lane 0 reads element 2 of %extract31,
; lane 1 reads element 2 of %extract3; lanes 2-3 are poison. The checked output
; packs the low halves of the two defs' second dwords into s8 with
; s_pack_ll_b32_b16.
7649define void @s_shuffle_v4bf16_v3bf16__5_2_u_u() {
7650; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_u_u:
7651; GFX900:       ; %bb.0:
7652; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7653; GFX900-NEXT:    ;;#ASMSTART
7654; GFX900-NEXT:    ; def s[4:5]
7655; GFX900-NEXT:    ;;#ASMEND
7656; GFX900-NEXT:    ;;#ASMSTART
7657; GFX900-NEXT:    ; def s[6:7]
7658; GFX900-NEXT:    ;;#ASMEND
7659; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
7660; GFX900-NEXT:    ;;#ASMSTART
7661; GFX900-NEXT:    ; use s[8:9]
7662; GFX900-NEXT:    ;;#ASMEND
7663; GFX900-NEXT:    s_setpc_b64 s[30:31]
7664;
7665; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_u_u:
7666; GFX90A:       ; %bb.0:
7667; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7668; GFX90A-NEXT:    ;;#ASMSTART
7669; GFX90A-NEXT:    ; def s[4:5]
7670; GFX90A-NEXT:    ;;#ASMEND
7671; GFX90A-NEXT:    ;;#ASMSTART
7672; GFX90A-NEXT:    ; def s[6:7]
7673; GFX90A-NEXT:    ;;#ASMEND
7674; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
7675; GFX90A-NEXT:    ;;#ASMSTART
7676; GFX90A-NEXT:    ; use s[8:9]
7677; GFX90A-NEXT:    ;;#ASMEND
7678; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7679;
7680; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_u_u:
7681; GFX940:       ; %bb.0:
7682; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7683; GFX940-NEXT:    ;;#ASMSTART
7684; GFX940-NEXT:    ; def s[0:1]
7685; GFX940-NEXT:    ;;#ASMEND
7686; GFX940-NEXT:    ;;#ASMSTART
7687; GFX940-NEXT:    ; def s[2:3]
7688; GFX940-NEXT:    ;;#ASMEND
7689; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s1
7690; GFX940-NEXT:    ;;#ASMSTART
7691; GFX940-NEXT:    ; use s[8:9]
7692; GFX940-NEXT:    ;;#ASMEND
7693; GFX940-NEXT:    s_setpc_b64 s[30:31]
7694  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7695  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
7696  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7697  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7698  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 poison, i32 poison>
7699  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
7700  ret void
7701}
7702
; Shuffle mask <5, 3, poison, poison>: both defined lanes come from %extract31
; (lane 0 reads its element 2, lane 1 reads its element 0); lanes 2-3 are
; poison. %extract3 / %vec0 is not selected, and the checked output has a single
; asm def whose two dwords are combined into s8 with s_pack_ll_b32_b16.
7703define void @s_shuffle_v4bf16_v3bf16__5_3_u_u() {
7704; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_u_u:
7705; GFX900:       ; %bb.0:
7706; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7707; GFX900-NEXT:    ;;#ASMSTART
7708; GFX900-NEXT:    ; def s[4:5]
7709; GFX900-NEXT:    ;;#ASMEND
7710; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
7711; GFX900-NEXT:    ;;#ASMSTART
7712; GFX900-NEXT:    ; use s[8:9]
7713; GFX900-NEXT:    ;;#ASMEND
7714; GFX900-NEXT:    s_setpc_b64 s[30:31]
7715;
7716; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_u_u:
7717; GFX90A:       ; %bb.0:
7718; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7719; GFX90A-NEXT:    ;;#ASMSTART
7720; GFX90A-NEXT:    ; def s[4:5]
7721; GFX90A-NEXT:    ;;#ASMEND
7722; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
7723; GFX90A-NEXT:    ;;#ASMSTART
7724; GFX90A-NEXT:    ; use s[8:9]
7725; GFX90A-NEXT:    ;;#ASMEND
7726; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7727;
7728; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_u_u:
7729; GFX940:       ; %bb.0:
7730; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7731; GFX940-NEXT:    ;;#ASMSTART
7732; GFX940-NEXT:    ; def s[0:1]
7733; GFX940-NEXT:    ;;#ASMEND
7734; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
7735; GFX940-NEXT:    ;;#ASMSTART
7736; GFX940-NEXT:    ; use s[8:9]
7737; GFX940-NEXT:    ;;#ASMEND
7738; GFX940-NEXT:    s_setpc_b64 s[30:31]
7739  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7740  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
7741  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7742  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7743  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 poison, i32 poison>
7744  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
7745  ret void
7746}
7747
; Shuffle mask <5, 4, poison, poison>: both defined lanes come from %extract31
; (lane 0 reads its element 2, lane 1 reads its element 1); lanes 2-3 are
; poison. %extract3 / %vec0 is not selected, and the checked output has a single
; asm def. The low dword is shifted right by 16 (s_lshr_b32) and then combined
; with the second dword into s8 by s_pack_ll_b32_b16.
7748define void @s_shuffle_v4bf16_v3bf16__5_4_u_u() {
7749; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_u_u:
7750; GFX900:       ; %bb.0:
7751; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7752; GFX900-NEXT:    ;;#ASMSTART
7753; GFX900-NEXT:    ; def s[4:5]
7754; GFX900-NEXT:    ;;#ASMEND
7755; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
7756; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
7757; GFX900-NEXT:    ;;#ASMSTART
7758; GFX900-NEXT:    ; use s[8:9]
7759; GFX900-NEXT:    ;;#ASMEND
7760; GFX900-NEXT:    s_setpc_b64 s[30:31]
7761;
7762; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_u_u:
7763; GFX90A:       ; %bb.0:
7764; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7765; GFX90A-NEXT:    ;;#ASMSTART
7766; GFX90A-NEXT:    ; def s[4:5]
7767; GFX90A-NEXT:    ;;#ASMEND
7768; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
7769; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
7770; GFX90A-NEXT:    ;;#ASMSTART
7771; GFX90A-NEXT:    ; use s[8:9]
7772; GFX90A-NEXT:    ;;#ASMEND
7773; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7774;
7775; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_u_u:
7776; GFX940:       ; %bb.0:
7777; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7778; GFX940-NEXT:    ;;#ASMSTART
7779; GFX940-NEXT:    ; def s[0:1]
7780; GFX940-NEXT:    ;;#ASMEND
7781; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
7782; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
7783; GFX940-NEXT:    ;;#ASMSTART
7784; GFX940-NEXT:    ; use s[8:9]
7785; GFX940-NEXT:    ;;#ASMEND
7786; GFX940-NEXT:    s_setpc_b64 s[30:31]
7787  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7788  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
7789  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7790  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7791  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 poison, i32 poison>
7792  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
7793  ret void
7794}
7795
; Shuffle mask <5, 5, poison, poison>: lanes 0 and 1 both read element 2 of
; %extract31; lanes 2-3 are poison. %extract3 / %vec0 is not selected, and the
; checked output has a single asm def. The element is duplicated into both
; halves of s8 by s_pack_ll_b32_b16 with the same source register twice.
7796define void @s_shuffle_v4bf16_v3bf16__5_5_u_u() {
7797; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_u:
7798; GFX900:       ; %bb.0:
7799; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7800; GFX900-NEXT:    ;;#ASMSTART
7801; GFX900-NEXT:    ; def s[4:5]
7802; GFX900-NEXT:    ;;#ASMEND
7803; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
7804; GFX900-NEXT:    ;;#ASMSTART
7805; GFX900-NEXT:    ; use s[8:9]
7806; GFX900-NEXT:    ;;#ASMEND
7807; GFX900-NEXT:    s_setpc_b64 s[30:31]
7808;
7809; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_u:
7810; GFX90A:       ; %bb.0:
7811; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7812; GFX90A-NEXT:    ;;#ASMSTART
7813; GFX90A-NEXT:    ; def s[4:5]
7814; GFX90A-NEXT:    ;;#ASMEND
7815; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
7816; GFX90A-NEXT:    ;;#ASMSTART
7817; GFX90A-NEXT:    ; use s[8:9]
7818; GFX90A-NEXT:    ;;#ASMEND
7819; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7820;
7821; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_u:
7822; GFX940:       ; %bb.0:
7823; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7824; GFX940-NEXT:    ;;#ASMSTART
7825; GFX940-NEXT:    ; def s[0:1]
7826; GFX940-NEXT:    ;;#ASMEND
7827; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
7828; GFX940-NEXT:    ;;#ASMSTART
7829; GFX940-NEXT:    ; use s[8:9]
7830; GFX940-NEXT:    ;;#ASMEND
7831; GFX940-NEXT:    s_setpc_b64 s[30:31]
7832  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7833  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
7834  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7835  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7836  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 poison>
7837  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
7838  ret void
7839}
7840
; Shuffle mask <5, 5, 0, poison>: lanes 0 and 1 both read element 2 of
; %extract31, lane 2 reads element 0 of %extract3, lane 3 is poison. The checked
; output forms s8 with s_pack_ll_b32_b16 (same source twice) and copies the
; first def's low dword into s9 with s_mov_b32.
7841define void @s_shuffle_v4bf16_v3bf16__5_5_0_u() {
7842; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_u:
7843; GFX900:       ; %bb.0:
7844; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7845; GFX900-NEXT:    ;;#ASMSTART
7846; GFX900-NEXT:    ; def s[4:5]
7847; GFX900-NEXT:    ;;#ASMEND
7848; GFX900-NEXT:    ;;#ASMSTART
7849; GFX900-NEXT:    ; def s[6:7]
7850; GFX900-NEXT:    ;;#ASMEND
7851; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
7852; GFX900-NEXT:    s_mov_b32 s9, s4
7853; GFX900-NEXT:    ;;#ASMSTART
7854; GFX900-NEXT:    ; use s[8:9]
7855; GFX900-NEXT:    ;;#ASMEND
7856; GFX900-NEXT:    s_setpc_b64 s[30:31]
7857;
7858; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_u:
7859; GFX90A:       ; %bb.0:
7860; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7861; GFX90A-NEXT:    ;;#ASMSTART
7862; GFX90A-NEXT:    ; def s[4:5]
7863; GFX90A-NEXT:    ;;#ASMEND
7864; GFX90A-NEXT:    ;;#ASMSTART
7865; GFX90A-NEXT:    ; def s[6:7]
7866; GFX90A-NEXT:    ;;#ASMEND
7867; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
7868; GFX90A-NEXT:    s_mov_b32 s9, s4
7869; GFX90A-NEXT:    ;;#ASMSTART
7870; GFX90A-NEXT:    ; use s[8:9]
7871; GFX90A-NEXT:    ;;#ASMEND
7872; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7873;
7874; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_u:
7875; GFX940:       ; %bb.0:
7876; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7877; GFX940-NEXT:    ;;#ASMSTART
7878; GFX940-NEXT:    ; def s[0:1]
7879; GFX940-NEXT:    ;;#ASMEND
7880; GFX940-NEXT:    ;;#ASMSTART
7881; GFX940-NEXT:    ; def s[2:3]
7882; GFX940-NEXT:    ;;#ASMEND
7883; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
7884; GFX940-NEXT:    s_mov_b32 s9, s0
7885; GFX940-NEXT:    ;;#ASMSTART
7886; GFX940-NEXT:    ; use s[8:9]
7887; GFX940-NEXT:    ;;#ASMEND
7888; GFX940-NEXT:    s_setpc_b64 s[30:31]
7889  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7890  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
7891  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7892  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7893  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 poison>
7894  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
7895  ret void
7896}
7897
; Shuffle mask <5, 5, 1, poison>: lanes 0 and 1 both read element 2 of
; %extract31, lane 2 reads element 1 of %extract3, lane 3 is poison. The checked
; output forms s9 by shifting the first def's low dword right by 16
; (s_lshr_b32) and s8 with s_pack_ll_b32_b16 (same source twice).
7898define void @s_shuffle_v4bf16_v3bf16__5_5_1_u() {
7899; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_u:
7900; GFX900:       ; %bb.0:
7901; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7902; GFX900-NEXT:    ;;#ASMSTART
7903; GFX900-NEXT:    ; def s[4:5]
7904; GFX900-NEXT:    ;;#ASMEND
7905; GFX900-NEXT:    ;;#ASMSTART
7906; GFX900-NEXT:    ; def s[6:7]
7907; GFX900-NEXT:    ;;#ASMEND
7908; GFX900-NEXT:    s_lshr_b32 s9, s4, 16
7909; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
7910; GFX900-NEXT:    ;;#ASMSTART
7911; GFX900-NEXT:    ; use s[8:9]
7912; GFX900-NEXT:    ;;#ASMEND
7913; GFX900-NEXT:    s_setpc_b64 s[30:31]
7914;
7915; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_u:
7916; GFX90A:       ; %bb.0:
7917; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7918; GFX90A-NEXT:    ;;#ASMSTART
7919; GFX90A-NEXT:    ; def s[4:5]
7920; GFX90A-NEXT:    ;;#ASMEND
7921; GFX90A-NEXT:    ;;#ASMSTART
7922; GFX90A-NEXT:    ; def s[6:7]
7923; GFX90A-NEXT:    ;;#ASMEND
7924; GFX90A-NEXT:    s_lshr_b32 s9, s4, 16
7925; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
7926; GFX90A-NEXT:    ;;#ASMSTART
7927; GFX90A-NEXT:    ; use s[8:9]
7928; GFX90A-NEXT:    ;;#ASMEND
7929; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7930;
7931; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_u:
7932; GFX940:       ; %bb.0:
7933; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7934; GFX940-NEXT:    ;;#ASMSTART
7935; GFX940-NEXT:    ; def s[0:1]
7936; GFX940-NEXT:    ;;#ASMEND
7937; GFX940-NEXT:    ;;#ASMSTART
7938; GFX940-NEXT:    ; def s[2:3]
7939; GFX940-NEXT:    ;;#ASMEND
7940; GFX940-NEXT:    s_lshr_b32 s9, s0, 16
7941; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
7942; GFX940-NEXT:    ;;#ASMSTART
7943; GFX940-NEXT:    ; use s[8:9]
7944; GFX940-NEXT:    ;;#ASMEND
7945; GFX940-NEXT:    s_setpc_b64 s[30:31]
7946  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
7947  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
7948  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7949  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
7950  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 poison>
7951  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
7952  ret void
7953}
7954
; Scalar-register ("s" constraint) test: shuffle two <3 x bfloat> operands into
; a <4 x bfloat> using mask <5, 5, 2, poison>.
; Mask indices 0-2 select from %extract3 (first three lanes of %vec0) and
; indices 3-5 select from %extract31 (first three lanes of %vec1); a poison
; index leaves that result lane unconstrained.
; In the checked output the first def lands directly in s[8:9] (the register
; pair the consumer is pinned to) and only s8 is rewritten afterwards.
7955define void @s_shuffle_v4bf16_v3bf16__5_5_2_u() {
7956; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_u:
7957; GFX900:       ; %bb.0:
7958; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7959; GFX900-NEXT:    ;;#ASMSTART
7960; GFX900-NEXT:    ; def s[8:9]
7961; GFX900-NEXT:    ;;#ASMEND
7962; GFX900-NEXT:    ;;#ASMSTART
7963; GFX900-NEXT:    ; def s[4:5]
7964; GFX900-NEXT:    ;;#ASMEND
7965; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
7966; GFX900-NEXT:    ;;#ASMSTART
7967; GFX900-NEXT:    ; use s[8:9]
7968; GFX900-NEXT:    ;;#ASMEND
7969; GFX900-NEXT:    s_setpc_b64 s[30:31]
7970;
7971; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_u:
7972; GFX90A:       ; %bb.0:
7973; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7974; GFX90A-NEXT:    ;;#ASMSTART
7975; GFX90A-NEXT:    ; def s[8:9]
7976; GFX90A-NEXT:    ;;#ASMEND
7977; GFX90A-NEXT:    ;;#ASMSTART
7978; GFX90A-NEXT:    ; def s[4:5]
7979; GFX90A-NEXT:    ;;#ASMEND
7980; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
7981; GFX90A-NEXT:    ;;#ASMSTART
7982; GFX90A-NEXT:    ; use s[8:9]
7983; GFX90A-NEXT:    ;;#ASMEND
7984; GFX90A-NEXT:    s_setpc_b64 s[30:31]
7985;
7986; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_u:
7987; GFX940:       ; %bb.0:
7988; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7989; GFX940-NEXT:    ;;#ASMSTART
7990; GFX940-NEXT:    ; def s[8:9]
7991; GFX940-NEXT:    ;;#ASMEND
7992; GFX940-NEXT:    ;;#ASMSTART
7993; GFX940-NEXT:    ; def s[0:1]
7994; GFX940-NEXT:    ;;#ASMEND
7995; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
7996; GFX940-NEXT:    ;;#ASMSTART
7997; GFX940-NEXT:    ; use s[8:9]
7998; GFX940-NEXT:    ;;#ASMEND
7999; GFX940-NEXT:    s_setpc_b64 s[30:31]
8000  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8001  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
8002  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8003  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8004  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 poison>
8005  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8006  ret void
8007}
8008
; Scalar-register ("s" constraint) test: shuffle two <3 x bfloat> operands into
; a <4 x bfloat> using mask <5, 5, 3, poison>.
; Mask indices 0-2 select from %extract3 (first three lanes of %vec0) and
; indices 3-5 select from %extract31 (first three lanes of %vec1); a poison
; index leaves that result lane unconstrained.
; Every non-poison index here is >= 3, so only %vec1 contributes. The checked
; output contains a single "def" block, presumably because the unused %vec0
; asm (not marked sideeffect) is removed.
8009define void @s_shuffle_v4bf16_v3bf16__5_5_3_u() {
8010; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_u:
8011; GFX900:       ; %bb.0:
8012; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8013; GFX900-NEXT:    ;;#ASMSTART
8014; GFX900-NEXT:    ; def s[4:5]
8015; GFX900-NEXT:    ;;#ASMEND
8016; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
8017; GFX900-NEXT:    s_mov_b32 s9, s4
8018; GFX900-NEXT:    ;;#ASMSTART
8019; GFX900-NEXT:    ; use s[8:9]
8020; GFX900-NEXT:    ;;#ASMEND
8021; GFX900-NEXT:    s_setpc_b64 s[30:31]
8022;
8023; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_u:
8024; GFX90A:       ; %bb.0:
8025; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8026; GFX90A-NEXT:    ;;#ASMSTART
8027; GFX90A-NEXT:    ; def s[4:5]
8028; GFX90A-NEXT:    ;;#ASMEND
8029; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
8030; GFX90A-NEXT:    s_mov_b32 s9, s4
8031; GFX90A-NEXT:    ;;#ASMSTART
8032; GFX90A-NEXT:    ; use s[8:9]
8033; GFX90A-NEXT:    ;;#ASMEND
8034; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8035;
8036; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_u:
8037; GFX940:       ; %bb.0:
8038; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8039; GFX940-NEXT:    ;;#ASMSTART
8040; GFX940-NEXT:    ; def s[0:1]
8041; GFX940-NEXT:    ;;#ASMEND
8042; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
8043; GFX940-NEXT:    s_mov_b32 s9, s0
8044; GFX940-NEXT:    ;;#ASMSTART
8045; GFX940-NEXT:    ; use s[8:9]
8046; GFX940-NEXT:    ;;#ASMEND
8047; GFX940-NEXT:    s_setpc_b64 s[30:31]
8048  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8049  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
8050  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8051  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8052  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 poison>
8053  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8054  ret void
8055}
8056
; Scalar-register ("s" constraint) test: shuffle two <3 x bfloat> operands into
; a <4 x bfloat> using mask <5, 5, 4, poison>.
; Mask indices 0-2 select from %extract3 (first three lanes of %vec0) and
; indices 3-5 select from %extract31 (first three lanes of %vec1); a poison
; index leaves that result lane unconstrained.
; Every non-poison index here is >= 3, so only %vec1 contributes. The checked
; output contains a single "def" block, presumably because the unused %vec0
; asm (not marked sideeffect) is removed.
8057define void @s_shuffle_v4bf16_v3bf16__5_5_4_u() {
8058; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_u:
8059; GFX900:       ; %bb.0:
8060; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8061; GFX900-NEXT:    ;;#ASMSTART
8062; GFX900-NEXT:    ; def s[4:5]
8063; GFX900-NEXT:    ;;#ASMEND
8064; GFX900-NEXT:    s_lshr_b32 s9, s4, 16
8065; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
8066; GFX900-NEXT:    ;;#ASMSTART
8067; GFX900-NEXT:    ; use s[8:9]
8068; GFX900-NEXT:    ;;#ASMEND
8069; GFX900-NEXT:    s_setpc_b64 s[30:31]
8070;
8071; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_u:
8072; GFX90A:       ; %bb.0:
8073; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8074; GFX90A-NEXT:    ;;#ASMSTART
8075; GFX90A-NEXT:    ; def s[4:5]
8076; GFX90A-NEXT:    ;;#ASMEND
8077; GFX90A-NEXT:    s_lshr_b32 s9, s4, 16
8078; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
8079; GFX90A-NEXT:    ;;#ASMSTART
8080; GFX90A-NEXT:    ; use s[8:9]
8081; GFX90A-NEXT:    ;;#ASMEND
8082; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8083;
8084; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_u:
8085; GFX940:       ; %bb.0:
8086; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8087; GFX940-NEXT:    ;;#ASMSTART
8088; GFX940-NEXT:    ; def s[0:1]
8089; GFX940-NEXT:    ;;#ASMEND
8090; GFX940-NEXT:    s_lshr_b32 s9, s0, 16
8091; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
8092; GFX940-NEXT:    ;;#ASMSTART
8093; GFX940-NEXT:    ; use s[8:9]
8094; GFX940-NEXT:    ;;#ASMEND
8095; GFX940-NEXT:    s_setpc_b64 s[30:31]
8096  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8097  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
8098  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8099  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8100  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 poison>
8101  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8102  ret void
8103}
8104
; Scalar-register ("s" constraint) test: shuffle two <3 x bfloat> operands into
; a <4 x bfloat> using mask <5, 5, 5, poison>.
; Mask indices 0-2 select from %extract3 (first three lanes of %vec0) and
; indices 3-5 select from %extract31 (first three lanes of %vec1); a poison
; index leaves that result lane unconstrained.
; Only lane 2 of %vec1 is referenced. The checks below use the shared GFX9
; prefix, i.e. one expected sequence covers all three RUN lines.
8105define void @s_shuffle_v4bf16_v3bf16__5_5_5_u() {
8106; GFX9-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_u:
8107; GFX9:       ; %bb.0:
8108; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8109; GFX9-NEXT:    ;;#ASMSTART
8110; GFX9-NEXT:    ; def s[8:9]
8111; GFX9-NEXT:    ;;#ASMEND
8112; GFX9-NEXT:    s_pack_ll_b32_b16 s8, s9, s9
8113; GFX9-NEXT:    ;;#ASMSTART
8114; GFX9-NEXT:    ; use s[8:9]
8115; GFX9-NEXT:    ;;#ASMEND
8116; GFX9-NEXT:    s_setpc_b64 s[30:31]
8117  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8118  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
8119  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8120  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8121  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 poison>
8122  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8123  ret void
8124}
8125
; Scalar-register ("s" constraint) test: shuffle two <3 x bfloat> operands into
; a <4 x bfloat> using mask <5, 5, 5, 0>.
; Mask indices 0-2 select from %extract3 (first three lanes of %vec0) and
; indices 3-5 select from %extract31 (first three lanes of %vec1).
; Result lanes 0-2 repeat lane 2 of %vec1; result lane 3 is lane 0 of %vec0.
; The shuffled value is consumed by inline asm pinned to s[8:9].
8126define void @s_shuffle_v4bf16_v3bf16__5_5_5_0() {
8127; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_0:
8128; GFX900:       ; %bb.0:
8129; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8130; GFX900-NEXT:    ;;#ASMSTART
8131; GFX900-NEXT:    ; def s[4:5]
8132; GFX900-NEXT:    ;;#ASMEND
8133; GFX900-NEXT:    ;;#ASMSTART
8134; GFX900-NEXT:    ; def s[6:7]
8135; GFX900-NEXT:    ;;#ASMEND
8136; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s7, s4
8137; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
8138; GFX900-NEXT:    ;;#ASMSTART
8139; GFX900-NEXT:    ; use s[8:9]
8140; GFX900-NEXT:    ;;#ASMEND
8141; GFX900-NEXT:    s_setpc_b64 s[30:31]
8142;
8143; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_0:
8144; GFX90A:       ; %bb.0:
8145; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8146; GFX90A-NEXT:    ;;#ASMSTART
8147; GFX90A-NEXT:    ; def s[4:5]
8148; GFX90A-NEXT:    ;;#ASMEND
8149; GFX90A-NEXT:    ;;#ASMSTART
8150; GFX90A-NEXT:    ; def s[6:7]
8151; GFX90A-NEXT:    ;;#ASMEND
8152; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s7, s4
8153; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
8154; GFX90A-NEXT:    ;;#ASMSTART
8155; GFX90A-NEXT:    ; use s[8:9]
8156; GFX90A-NEXT:    ;;#ASMEND
8157; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8158;
8159; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_0:
8160; GFX940:       ; %bb.0:
8161; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8162; GFX940-NEXT:    ;;#ASMSTART
8163; GFX940-NEXT:    ; def s[0:1]
8164; GFX940-NEXT:    ;;#ASMEND
8165; GFX940-NEXT:    ;;#ASMSTART
8166; GFX940-NEXT:    ; def s[2:3]
8167; GFX940-NEXT:    ;;#ASMEND
8168; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s3, s0
8169; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
8170; GFX940-NEXT:    ;;#ASMSTART
8171; GFX940-NEXT:    ; use s[8:9]
8172; GFX940-NEXT:    ;;#ASMEND
8173; GFX940-NEXT:    s_setpc_b64 s[30:31]
8174  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8175  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
8176  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8177  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8178  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 0>
8179  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8180  ret void
8181}
8182
; Scalar-register ("s" constraint) test: shuffle two <3 x bfloat> operands into
; a <4 x bfloat> using mask <5, 5, 5, 1>.
; Mask indices 0-2 select from %extract3 (first three lanes of %vec0) and
; indices 3-5 select from %extract31 (first three lanes of %vec1).
; Result lanes 0-2 repeat lane 2 of %vec1; result lane 3 is lane 1 of %vec0.
; Note the checked output places the 16-bit right shift of the first def's
; low dword between the two "def" blocks.
8183define void @s_shuffle_v4bf16_v3bf16__5_5_5_1() {
8184; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_1:
8185; GFX900:       ; %bb.0:
8186; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8187; GFX900-NEXT:    ;;#ASMSTART
8188; GFX900-NEXT:    ; def s[4:5]
8189; GFX900-NEXT:    ;;#ASMEND
8190; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
8191; GFX900-NEXT:    ;;#ASMSTART
8192; GFX900-NEXT:    ; def s[6:7]
8193; GFX900-NEXT:    ;;#ASMEND
8194; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s7, s4
8195; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
8196; GFX900-NEXT:    ;;#ASMSTART
8197; GFX900-NEXT:    ; use s[8:9]
8198; GFX900-NEXT:    ;;#ASMEND
8199; GFX900-NEXT:    s_setpc_b64 s[30:31]
8200;
8201; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_1:
8202; GFX90A:       ; %bb.0:
8203; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8204; GFX90A-NEXT:    ;;#ASMSTART
8205; GFX90A-NEXT:    ; def s[4:5]
8206; GFX90A-NEXT:    ;;#ASMEND
8207; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
8208; GFX90A-NEXT:    ;;#ASMSTART
8209; GFX90A-NEXT:    ; def s[6:7]
8210; GFX90A-NEXT:    ;;#ASMEND
8211; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s7, s4
8212; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
8213; GFX90A-NEXT:    ;;#ASMSTART
8214; GFX90A-NEXT:    ; use s[8:9]
8215; GFX90A-NEXT:    ;;#ASMEND
8216; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8217;
8218; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_1:
8219; GFX940:       ; %bb.0:
8220; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8221; GFX940-NEXT:    ;;#ASMSTART
8222; GFX940-NEXT:    ; def s[0:1]
8223; GFX940-NEXT:    ;;#ASMEND
8224; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
8225; GFX940-NEXT:    ;;#ASMSTART
8226; GFX940-NEXT:    ; def s[2:3]
8227; GFX940-NEXT:    ;;#ASMEND
8228; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s3, s0
8229; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
8230; GFX940-NEXT:    ;;#ASMSTART
8231; GFX940-NEXT:    ; use s[8:9]
8232; GFX940-NEXT:    ;;#ASMEND
8233; GFX940-NEXT:    s_setpc_b64 s[30:31]
8234  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8235  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
8236  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8237  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8238  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 1>
8239  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8240  ret void
8241}
8242
; Scalar-register ("s" constraint) test: shuffle two <3 x bfloat> operands into
; a <4 x bfloat> using mask <5, 5, 5, 2>.
; Mask indices 0-2 select from %extract3 (first three lanes of %vec0) and
; indices 3-5 select from %extract31 (first three lanes of %vec1).
; Result lanes 0-2 repeat lane 2 of %vec1; result lane 3 is lane 2 of %vec0.
; The shuffled value is consumed by inline asm pinned to s[8:9].
8243define void @s_shuffle_v4bf16_v3bf16__5_5_5_2() {
8244; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_2:
8245; GFX900:       ; %bb.0:
8246; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8247; GFX900-NEXT:    ;;#ASMSTART
8248; GFX900-NEXT:    ; def s[4:5]
8249; GFX900-NEXT:    ;;#ASMEND
8250; GFX900-NEXT:    ;;#ASMSTART
8251; GFX900-NEXT:    ; def s[6:7]
8252; GFX900-NEXT:    ;;#ASMEND
8253; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s7, s5
8254; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
8255; GFX900-NEXT:    ;;#ASMSTART
8256; GFX900-NEXT:    ; use s[8:9]
8257; GFX900-NEXT:    ;;#ASMEND
8258; GFX900-NEXT:    s_setpc_b64 s[30:31]
8259;
8260; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_2:
8261; GFX90A:       ; %bb.0:
8262; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8263; GFX90A-NEXT:    ;;#ASMSTART
8264; GFX90A-NEXT:    ; def s[4:5]
8265; GFX90A-NEXT:    ;;#ASMEND
8266; GFX90A-NEXT:    ;;#ASMSTART
8267; GFX90A-NEXT:    ; def s[6:7]
8268; GFX90A-NEXT:    ;;#ASMEND
8269; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s7, s5
8270; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
8271; GFX90A-NEXT:    ;;#ASMSTART
8272; GFX90A-NEXT:    ; use s[8:9]
8273; GFX90A-NEXT:    ;;#ASMEND
8274; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8275;
8276; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_2:
8277; GFX940:       ; %bb.0:
8278; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8279; GFX940-NEXT:    ;;#ASMSTART
8280; GFX940-NEXT:    ; def s[0:1]
8281; GFX940-NEXT:    ;;#ASMEND
8282; GFX940-NEXT:    ;;#ASMSTART
8283; GFX940-NEXT:    ; def s[2:3]
8284; GFX940-NEXT:    ;;#ASMEND
8285; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s3, s1
8286; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
8287; GFX940-NEXT:    ;;#ASMSTART
8288; GFX940-NEXT:    ; use s[8:9]
8289; GFX940-NEXT:    ;;#ASMEND
8290; GFX940-NEXT:    s_setpc_b64 s[30:31]
8291  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8292  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
8293  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8294  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8295  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 2>
8296  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8297  ret void
8298}
8299
; Scalar-register ("s" constraint) test: shuffle two <3 x bfloat> operands into
; a <4 x bfloat> using mask <5, 5, 5, 3>.
; Mask indices 0-2 select from %extract3 (first three lanes of %vec0) and
; indices 3-5 select from %extract31 (first three lanes of %vec1).
; Every index here is >= 3, so only %vec1 contributes (its lanes 2 and 0). The
; checked output contains a single "def" block, presumably because the unused
; %vec0 asm (not marked sideeffect) is removed.
8300define void @s_shuffle_v4bf16_v3bf16__5_5_5_3() {
8301; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_3:
8302; GFX900:       ; %bb.0:
8303; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8304; GFX900-NEXT:    ;;#ASMSTART
8305; GFX900-NEXT:    ; def s[4:5]
8306; GFX900-NEXT:    ;;#ASMEND
8307; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
8308; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
8309; GFX900-NEXT:    ;;#ASMSTART
8310; GFX900-NEXT:    ; use s[8:9]
8311; GFX900-NEXT:    ;;#ASMEND
8312; GFX900-NEXT:    s_setpc_b64 s[30:31]
8313;
8314; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_3:
8315; GFX90A:       ; %bb.0:
8316; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8317; GFX90A-NEXT:    ;;#ASMSTART
8318; GFX90A-NEXT:    ; def s[4:5]
8319; GFX90A-NEXT:    ;;#ASMEND
8320; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
8321; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
8322; GFX90A-NEXT:    ;;#ASMSTART
8323; GFX90A-NEXT:    ; use s[8:9]
8324; GFX90A-NEXT:    ;;#ASMEND
8325; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8326;
8327; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_3:
8328; GFX940:       ; %bb.0:
8329; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8330; GFX940-NEXT:    ;;#ASMSTART
8331; GFX940-NEXT:    ; def s[0:1]
8332; GFX940-NEXT:    ;;#ASMEND
8333; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s0
8334; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
8335; GFX940-NEXT:    ;;#ASMSTART
8336; GFX940-NEXT:    ; use s[8:9]
8337; GFX940-NEXT:    ;;#ASMEND
8338; GFX940-NEXT:    s_setpc_b64 s[30:31]
8339  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8340  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
8341  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8342  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8343  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 3>
8344  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8345  ret void
8346}
8347
; Scalar-register ("s" constraint) test: shuffle two <3 x bfloat> operands into
; a <4 x bfloat> using mask <5, 5, 5, 4>.
; Mask indices 0-2 select from %extract3 (first three lanes of %vec0) and
; indices 3-5 select from %extract31 (first three lanes of %vec1).
; Every index here is >= 3, so only %vec1 contributes (its lanes 2 and 1). The
; checked output contains a single "def" block, presumably because the unused
; %vec0 asm (not marked sideeffect) is removed.
8348define void @s_shuffle_v4bf16_v3bf16__5_5_5_4() {
8349; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_4:
8350; GFX900:       ; %bb.0:
8351; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8352; GFX900-NEXT:    ;;#ASMSTART
8353; GFX900-NEXT:    ; def s[4:5]
8354; GFX900-NEXT:    ;;#ASMEND
8355; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
8356; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
8357; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
8358; GFX900-NEXT:    ;;#ASMSTART
8359; GFX900-NEXT:    ; use s[8:9]
8360; GFX900-NEXT:    ;;#ASMEND
8361; GFX900-NEXT:    s_setpc_b64 s[30:31]
8362;
8363; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_4:
8364; GFX90A:       ; %bb.0:
8365; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8366; GFX90A-NEXT:    ;;#ASMSTART
8367; GFX90A-NEXT:    ; def s[4:5]
8368; GFX90A-NEXT:    ;;#ASMEND
8369; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
8370; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
8371; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
8372; GFX90A-NEXT:    ;;#ASMSTART
8373; GFX90A-NEXT:    ; use s[8:9]
8374; GFX90A-NEXT:    ;;#ASMEND
8375; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8376;
8377; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_4:
8378; GFX940:       ; %bb.0:
8379; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8380; GFX940-NEXT:    ;;#ASMSTART
8381; GFX940-NEXT:    ; def s[0:1]
8382; GFX940-NEXT:    ;;#ASMEND
8383; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
8384; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s0
8385; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
8386; GFX940-NEXT:    ;;#ASMSTART
8387; GFX940-NEXT:    ; use s[8:9]
8388; GFX940-NEXT:    ;;#ASMEND
8389; GFX940-NEXT:    s_setpc_b64 s[30:31]
8390  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8391  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
8392  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8393  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8394  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 4>
8395  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8396  ret void
8397}
8398
; Scalar-register ("s" constraint) test: shuffle two <3 x bfloat> operands into
; a <4 x bfloat> using mask <5, 5, 5, 5>.
; Mask indices 0-2 select from %extract3 (first three lanes of %vec0) and
; indices 3-5 select from %extract31 (first three lanes of %vec1).
; All four result lanes repeat lane 2 of %vec1. In the checked output the
; packed dword is built once in s8 and then copied to s9; only one "def" block
; appears, presumably because the unused %vec0 asm is removed.
8399define void @s_shuffle_v4bf16_v3bf16__5_5_5_5() {
8400; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_5:
8401; GFX900:       ; %bb.0:
8402; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8403; GFX900-NEXT:    ;;#ASMSTART
8404; GFX900-NEXT:    ; def s[4:5]
8405; GFX900-NEXT:    ;;#ASMEND
8406; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
8407; GFX900-NEXT:    s_mov_b32 s9, s8
8408; GFX900-NEXT:    ;;#ASMSTART
8409; GFX900-NEXT:    ; use s[8:9]
8410; GFX900-NEXT:    ;;#ASMEND
8411; GFX900-NEXT:    s_setpc_b64 s[30:31]
8412;
8413; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_5:
8414; GFX90A:       ; %bb.0:
8415; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8416; GFX90A-NEXT:    ;;#ASMSTART
8417; GFX90A-NEXT:    ; def s[4:5]
8418; GFX90A-NEXT:    ;;#ASMEND
8419; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
8420; GFX90A-NEXT:    s_mov_b32 s9, s8
8421; GFX90A-NEXT:    ;;#ASMSTART
8422; GFX90A-NEXT:    ; use s[8:9]
8423; GFX90A-NEXT:    ;;#ASMEND
8424; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8425;
8426; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_5_5:
8427; GFX940:       ; %bb.0:
8428; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8429; GFX940-NEXT:    ;;#ASMSTART
8430; GFX940-NEXT:    ; def s[0:1]
8431; GFX940-NEXT:    ;;#ASMEND
8432; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
8433; GFX940-NEXT:    s_mov_b32 s9, s8
8434; GFX940-NEXT:    ;;#ASMSTART
8435; GFX940-NEXT:    ; use s[8:9]
8436; GFX940-NEXT:    ;;#ASMEND
8437; GFX940-NEXT:    s_setpc_b64 s[30:31]
8438  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8439  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
8440  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8441  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8442  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
8443  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8444  ret void
8445}
8446
; Scalar-register ("s" constraint) test with a single live input: shuffle
; %extract3 (first three lanes of %vec0) against a poison second operand into
; a <4 x bfloat> using mask <poison, 0, 0, 0>.
; Result lane 0 is unconstrained; lanes 1-3 all repeat lane 0 of %vec0.
; The shuffled value is consumed by inline asm pinned to s[8:9].
8447define void @s_shuffle_v4bf16_v3bf16__u_0_0_0() {
8448; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__u_0_0_0:
8449; GFX900:       ; %bb.0:
8450; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8451; GFX900-NEXT:    ;;#ASMSTART
8452; GFX900-NEXT:    ; def s[4:5]
8453; GFX900-NEXT:    ;;#ASMEND
8454; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8455; GFX900-NEXT:    s_lshl_b32 s8, s4, 16
8456; GFX900-NEXT:    ;;#ASMSTART
8457; GFX900-NEXT:    ; use s[8:9]
8458; GFX900-NEXT:    ;;#ASMEND
8459; GFX900-NEXT:    s_setpc_b64 s[30:31]
8460;
8461; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__u_0_0_0:
8462; GFX90A:       ; %bb.0:
8463; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8464; GFX90A-NEXT:    ;;#ASMSTART
8465; GFX90A-NEXT:    ; def s[4:5]
8466; GFX90A-NEXT:    ;;#ASMEND
8467; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8468; GFX90A-NEXT:    s_lshl_b32 s8, s4, 16
8469; GFX90A-NEXT:    ;;#ASMSTART
8470; GFX90A-NEXT:    ; use s[8:9]
8471; GFX90A-NEXT:    ;;#ASMEND
8472; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8473;
8474; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__u_0_0_0:
8475; GFX940:       ; %bb.0:
8476; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8477; GFX940-NEXT:    ;;#ASMSTART
8478; GFX940-NEXT:    ; def s[0:1]
8479; GFX940-NEXT:    ;;#ASMEND
8480; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
8481; GFX940-NEXT:    s_lshl_b32 s8, s0, 16
8482; GFX940-NEXT:    ;;#ASMSTART
8483; GFX940-NEXT:    ; use s[8:9]
8484; GFX940-NEXT:    ;;#ASMEND
8485; GFX940-NEXT:    s_setpc_b64 s[30:31]
8486  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8487  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8488  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 0, i32 0, i32 0>
8489  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8490  ret void
8491}
8492
; Scalar-register ("s" constraint) test with a single live input: shuffle
; %extract3 (first three lanes of %vec0) against a poison second operand into
; a <4 x bfloat> using an all-zero mask (written as zeroinitializer).
; All four result lanes repeat lane 0 of %vec0. In the checked output the
; packed dword is built once in s8 and then copied to s9.
8493define void @s_shuffle_v4bf16_v3bf16__0_0_0_0() {
8494; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_0_0_0:
8495; GFX900:       ; %bb.0:
8496; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8497; GFX900-NEXT:    ;;#ASMSTART
8498; GFX900-NEXT:    ; def s[4:5]
8499; GFX900-NEXT:    ;;#ASMEND
8500; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
8501; GFX900-NEXT:    s_mov_b32 s9, s8
8502; GFX900-NEXT:    ;;#ASMSTART
8503; GFX900-NEXT:    ; use s[8:9]
8504; GFX900-NEXT:    ;;#ASMEND
8505; GFX900-NEXT:    s_setpc_b64 s[30:31]
8506;
8507; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_0_0_0:
8508; GFX90A:       ; %bb.0:
8509; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8510; GFX90A-NEXT:    ;;#ASMSTART
8511; GFX90A-NEXT:    ; def s[4:5]
8512; GFX90A-NEXT:    ;;#ASMEND
8513; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
8514; GFX90A-NEXT:    s_mov_b32 s9, s8
8515; GFX90A-NEXT:    ;;#ASMSTART
8516; GFX90A-NEXT:    ; use s[8:9]
8517; GFX90A-NEXT:    ;;#ASMEND
8518; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8519;
8520; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_0_0_0:
8521; GFX940:       ; %bb.0:
8522; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8523; GFX940-NEXT:    ;;#ASMSTART
8524; GFX940-NEXT:    ; def s[0:1]
8525; GFX940-NEXT:    ;;#ASMEND
8526; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
8527; GFX940-NEXT:    s_mov_b32 s9, s8
8528; GFX940-NEXT:    ;;#ASMSTART
8529; GFX940-NEXT:    ; use s[8:9]
8530; GFX940-NEXT:    ;;#ASMEND
8531; GFX940-NEXT:    s_setpc_b64 s[30:31]
8532  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8533  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8534  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> zeroinitializer
8535  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8536  ret void
8537}
8538
; Scalar-register ("s" constraint) test with a single live input: shuffle
; %extract3 (first three lanes of %vec0) against a poison second operand into
; a <4 x bfloat> using mask <1, 0, 0, 0>.
; Result lane 0 is lane 1 of %vec0; lanes 1-3 repeat lane 0 of %vec0.
; The shuffled value is consumed by inline asm pinned to s[8:9].
8539define void @s_shuffle_v4bf16_v3bf16__1_0_0_0() {
8540; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_0_0_0:
8541; GFX900:       ; %bb.0:
8542; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8543; GFX900-NEXT:    ;;#ASMSTART
8544; GFX900-NEXT:    ; def s[4:5]
8545; GFX900-NEXT:    ;;#ASMEND
8546; GFX900-NEXT:    s_lshr_b32 s5, s4, 16
8547; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
8548; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8549; GFX900-NEXT:    ;;#ASMSTART
8550; GFX900-NEXT:    ; use s[8:9]
8551; GFX900-NEXT:    ;;#ASMEND
8552; GFX900-NEXT:    s_setpc_b64 s[30:31]
8553;
8554; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_0_0_0:
8555; GFX90A:       ; %bb.0:
8556; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8557; GFX90A-NEXT:    ;;#ASMSTART
8558; GFX90A-NEXT:    ; def s[4:5]
8559; GFX90A-NEXT:    ;;#ASMEND
8560; GFX90A-NEXT:    s_lshr_b32 s5, s4, 16
8561; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
8562; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8563; GFX90A-NEXT:    ;;#ASMSTART
8564; GFX90A-NEXT:    ; use s[8:9]
8565; GFX90A-NEXT:    ;;#ASMEND
8566; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8567;
8568; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_0_0_0:
8569; GFX940:       ; %bb.0:
8570; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8571; GFX940-NEXT:    ;;#ASMSTART
8572; GFX940-NEXT:    ; def s[0:1]
8573; GFX940-NEXT:    ;;#ASMEND
8574; GFX940-NEXT:    s_lshr_b32 s1, s0, 16
8575; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
8576; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
8577; GFX940-NEXT:    ;;#ASMSTART
8578; GFX940-NEXT:    ; use s[8:9]
8579; GFX940-NEXT:    ;;#ASMEND
8580; GFX940-NEXT:    s_setpc_b64 s[30:31]
8581  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8582  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8583  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
8584  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8585  ret void
8586}
8587
; Scalar-register ("s" constraint) test with a single live input: shuffle
; %extract3 (first three lanes of %vec0) against a poison second operand into
; a <4 x bfloat> using mask <2, 0, 0, 0>.
; Result lane 0 is lane 2 of %vec0; lanes 1-3 repeat lane 0 of %vec0.
; The shuffled value is consumed by inline asm pinned to s[8:9].
8588define void @s_shuffle_v4bf16_v3bf16__2_0_0_0() {
8589; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_0_0_0:
8590; GFX900:       ; %bb.0:
8591; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8592; GFX900-NEXT:    ;;#ASMSTART
8593; GFX900-NEXT:    ; def s[4:5]
8594; GFX900-NEXT:    ;;#ASMEND
8595; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
8596; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8597; GFX900-NEXT:    ;;#ASMSTART
8598; GFX900-NEXT:    ; use s[8:9]
8599; GFX900-NEXT:    ;;#ASMEND
8600; GFX900-NEXT:    s_setpc_b64 s[30:31]
8601;
8602; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_0_0_0:
8603; GFX90A:       ; %bb.0:
8604; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8605; GFX90A-NEXT:    ;;#ASMSTART
8606; GFX90A-NEXT:    ; def s[4:5]
8607; GFX90A-NEXT:    ;;#ASMEND
8608; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
8609; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8610; GFX90A-NEXT:    ;;#ASMSTART
8611; GFX90A-NEXT:    ; use s[8:9]
8612; GFX90A-NEXT:    ;;#ASMEND
8613; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8614;
8615; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_0_0_0:
8616; GFX940:       ; %bb.0:
8617; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8618; GFX940-NEXT:    ;;#ASMSTART
8619; GFX940-NEXT:    ; def s[0:1]
8620; GFX940-NEXT:    ;;#ASMEND
8621; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
8622; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
8623; GFX940-NEXT:    ;;#ASMSTART
8624; GFX940-NEXT:    ; use s[8:9]
8625; GFX940-NEXT:    ;;#ASMEND
8626; GFX940-NEXT:    s_setpc_b64 s[30:31]
8627  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8628  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8629  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
8630  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8631  ret void
8632}
8633
; Scalar-register ("s" constraint) test with a single live input: shuffle
; %extract3 (first three lanes of %vec0) against a poison second operand into
; a <4 x bfloat> using mask <3, 0, 0, 0>.
; Index 3 addresses lane 0 of the poison second operand, so result lane 0 is
; unconstrained; lanes 1-3 repeat lane 0 of %vec0. The expected instructions
; are the same as those checked for the __u_0_0_0 variant of this test.
8634define void @s_shuffle_v4bf16_v3bf16__3_0_0_0() {
8635; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__3_0_0_0:
8636; GFX900:       ; %bb.0:
8637; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8638; GFX900-NEXT:    ;;#ASMSTART
8639; GFX900-NEXT:    ; def s[4:5]
8640; GFX900-NEXT:    ;;#ASMEND
8641; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8642; GFX900-NEXT:    s_lshl_b32 s8, s4, 16
8643; GFX900-NEXT:    ;;#ASMSTART
8644; GFX900-NEXT:    ; use s[8:9]
8645; GFX900-NEXT:    ;;#ASMEND
8646; GFX900-NEXT:    s_setpc_b64 s[30:31]
8647;
8648; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__3_0_0_0:
8649; GFX90A:       ; %bb.0:
8650; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8651; GFX90A-NEXT:    ;;#ASMSTART
8652; GFX90A-NEXT:    ; def s[4:5]
8653; GFX90A-NEXT:    ;;#ASMEND
8654; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8655; GFX90A-NEXT:    s_lshl_b32 s8, s4, 16
8656; GFX90A-NEXT:    ;;#ASMSTART
8657; GFX90A-NEXT:    ; use s[8:9]
8658; GFX90A-NEXT:    ;;#ASMEND
8659; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8660;
8661; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__3_0_0_0:
8662; GFX940:       ; %bb.0:
8663; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8664; GFX940-NEXT:    ;;#ASMSTART
8665; GFX940-NEXT:    ; def s[0:1]
8666; GFX940-NEXT:    ;;#ASMEND
8667; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
8668; GFX940-NEXT:    s_lshl_b32 s8, s0, 16
8669; GFX940-NEXT:    ;;#ASMSTART
8670; GFX940-NEXT:    ; use s[8:9]
8671; GFX940-NEXT:    ;;#ASMEND
8672; GFX940-NEXT:    s_setpc_b64 s[30:31]
8673  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8674  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8675  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
8676  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8677  ret void
8678}
8679
; Scalar-register shuffle with mask <4, 0, 0, 0> over two source vectors.
; %vec0 and %vec1 both come from inline asm with an "=s" constraint and are
; each narrowed to their first three lanes; the result is handed to inline asm
; pinned to s[8:9].
; The expected output below is autogenerated by utils/update_llc_test_checks.py;
; regenerate it rather than editing it by hand.
8680define void @s_shuffle_v4bf16_v3bf16__4_0_0_0() {
8681; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_0_0_0:
8682; GFX900:       ; %bb.0:
8683; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8684; GFX900-NEXT:    ;;#ASMSTART
8685; GFX900-NEXT:    ; def s[4:5]
8686; GFX900-NEXT:    ;;#ASMEND
8687; GFX900-NEXT:    ;;#ASMSTART
8688; GFX900-NEXT:    ; def s[6:7]
8689; GFX900-NEXT:    ;;#ASMEND
8690; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
8691; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
8692; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8693; GFX900-NEXT:    ;;#ASMSTART
8694; GFX900-NEXT:    ; use s[8:9]
8695; GFX900-NEXT:    ;;#ASMEND
8696; GFX900-NEXT:    s_setpc_b64 s[30:31]
8697;
8698; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_0_0_0:
8699; GFX90A:       ; %bb.0:
8700; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8701; GFX90A-NEXT:    ;;#ASMSTART
8702; GFX90A-NEXT:    ; def s[4:5]
8703; GFX90A-NEXT:    ;;#ASMEND
8704; GFX90A-NEXT:    ;;#ASMSTART
8705; GFX90A-NEXT:    ; def s[6:7]
8706; GFX90A-NEXT:    ;;#ASMEND
8707; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
8708; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
8709; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8710; GFX90A-NEXT:    ;;#ASMSTART
8711; GFX90A-NEXT:    ; use s[8:9]
8712; GFX90A-NEXT:    ;;#ASMEND
8713; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8714;
8715; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_0_0_0:
8716; GFX940:       ; %bb.0:
8717; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8718; GFX940-NEXT:    ;;#ASMSTART
8719; GFX940-NEXT:    ; def s[0:1]
8720; GFX940-NEXT:    ;;#ASMEND
8721; GFX940-NEXT:    ;;#ASMSTART
8722; GFX940-NEXT:    ; def s[2:3]
8723; GFX940-NEXT:    ;;#ASMEND
8724; GFX940-NEXT:    s_lshr_b32 s1, s2, 16
8725; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
8726; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
8727; GFX940-NEXT:    ;;#ASMSTART
8728; GFX940-NEXT:    ; use s[8:9]
8729; GFX940-NEXT:    ;;#ASMEND
8730; GFX940-NEXT:    s_setpc_b64 s[30:31]
8731  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8732  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
8733  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8734  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
; Mask indices 0-2 pick lanes of %extract3 and 3-5 pick lanes of %extract31:
; result lane 0 is lane 1 of %extract31, lanes 1-3 are lane 0 of %extract3.
8735  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 0, i32 0, i32 0>
8736  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8737  ret void
8738}
8739
; Scalar-register shuffle with mask <5, 0, 0, 0> over two source vectors.
; %vec0 and %vec1 both come from inline asm with an "=s" constraint and are
; each narrowed to their first three lanes; the result is handed to inline asm
; pinned to s[8:9].
; The expected output below is autogenerated by utils/update_llc_test_checks.py;
; regenerate it rather than editing it by hand.
8740define void @s_shuffle_v4bf16_v3bf16__5_0_0_0() {
8741; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_0_0:
8742; GFX900:       ; %bb.0:
8743; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8744; GFX900-NEXT:    ;;#ASMSTART
8745; GFX900-NEXT:    ; def s[4:5]
8746; GFX900-NEXT:    ;;#ASMEND
8747; GFX900-NEXT:    ;;#ASMSTART
8748; GFX900-NEXT:    ; def s[6:7]
8749; GFX900-NEXT:    ;;#ASMEND
8750; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
8751; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8752; GFX900-NEXT:    ;;#ASMSTART
8753; GFX900-NEXT:    ; use s[8:9]
8754; GFX900-NEXT:    ;;#ASMEND
8755; GFX900-NEXT:    s_setpc_b64 s[30:31]
8756;
8757; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_0_0:
8758; GFX90A:       ; %bb.0:
8759; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8760; GFX90A-NEXT:    ;;#ASMSTART
8761; GFX90A-NEXT:    ; def s[4:5]
8762; GFX90A-NEXT:    ;;#ASMEND
8763; GFX90A-NEXT:    ;;#ASMSTART
8764; GFX90A-NEXT:    ; def s[6:7]
8765; GFX90A-NEXT:    ;;#ASMEND
8766; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
8767; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8768; GFX90A-NEXT:    ;;#ASMSTART
8769; GFX90A-NEXT:    ; use s[8:9]
8770; GFX90A-NEXT:    ;;#ASMEND
8771; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8772;
8773; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_0_0:
8774; GFX940:       ; %bb.0:
8775; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8776; GFX940-NEXT:    ;;#ASMSTART
8777; GFX940-NEXT:    ; def s[0:1]
8778; GFX940-NEXT:    ;;#ASMEND
8779; GFX940-NEXT:    ;;#ASMSTART
8780; GFX940-NEXT:    ; def s[2:3]
8781; GFX940-NEXT:    ;;#ASMEND
8782; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
8783; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
8784; GFX940-NEXT:    ;;#ASMSTART
8785; GFX940-NEXT:    ; use s[8:9]
8786; GFX940-NEXT:    ;;#ASMEND
8787; GFX940-NEXT:    s_setpc_b64 s[30:31]
8788  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8789  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
8790  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8791  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
; Mask indices 0-2 pick lanes of %extract3 and 3-5 pick lanes of %extract31:
; result lane 0 is lane 2 of %extract31, lanes 1-3 are lane 0 of %extract3.
8792  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 0, i32 0>
8793  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8794  ret void
8795}
8796
; Scalar-register shuffle with mask <5, poison, 0, 0> over two source vectors.
; %vec0 and %vec1 both come from inline asm with an "=s" constraint and are
; each narrowed to their first three lanes; the result is handed to inline asm
; pinned to s[8:9].
; The expected output below is autogenerated by utils/update_llc_test_checks.py;
; regenerate it rather than editing it by hand.
8797define void @s_shuffle_v4bf16_v3bf16__5_u_0_0() {
8798; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_0_0:
8799; GFX900:       ; %bb.0:
8800; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8801; GFX900-NEXT:    ;;#ASMSTART
8802; GFX900-NEXT:    ; def s[4:5]
8803; GFX900-NEXT:    ;;#ASMEND
8804; GFX900-NEXT:    ;;#ASMSTART
8805; GFX900-NEXT:    ; def s[6:7]
8806; GFX900-NEXT:    ;;#ASMEND
8807; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8808; GFX900-NEXT:    s_mov_b32 s8, s7
8809; GFX900-NEXT:    ;;#ASMSTART
8810; GFX900-NEXT:    ; use s[8:9]
8811; GFX900-NEXT:    ;;#ASMEND
8812; GFX900-NEXT:    s_setpc_b64 s[30:31]
8813;
8814; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_0_0:
8815; GFX90A:       ; %bb.0:
8816; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8817; GFX90A-NEXT:    ;;#ASMSTART
8818; GFX90A-NEXT:    ; def s[4:5]
8819; GFX90A-NEXT:    ;;#ASMEND
8820; GFX90A-NEXT:    ;;#ASMSTART
8821; GFX90A-NEXT:    ; def s[6:7]
8822; GFX90A-NEXT:    ;;#ASMEND
8823; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8824; GFX90A-NEXT:    s_mov_b32 s8, s7
8825; GFX90A-NEXT:    ;;#ASMSTART
8826; GFX90A-NEXT:    ; use s[8:9]
8827; GFX90A-NEXT:    ;;#ASMEND
8828; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8829;
8830; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_0_0:
8831; GFX940:       ; %bb.0:
8832; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8833; GFX940-NEXT:    ;;#ASMSTART
8834; GFX940-NEXT:    ; def s[0:1]
8835; GFX940-NEXT:    ;;#ASMEND
8836; GFX940-NEXT:    ;;#ASMSTART
8837; GFX940-NEXT:    ; def s[2:3]
8838; GFX940-NEXT:    ;;#ASMEND
8839; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
8840; GFX940-NEXT:    s_mov_b32 s8, s3
8841; GFX940-NEXT:    ;;#ASMSTART
8842; GFX940-NEXT:    ; use s[8:9]
8843; GFX940-NEXT:    ;;#ASMEND
8844; GFX940-NEXT:    s_setpc_b64 s[30:31]
8845  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8846  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
8847  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8848  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
; Result lane 0 is lane 2 of %extract31, lane 1 is left unspecified (poison
; mask element), and lanes 2-3 are lane 0 of %extract3. With lane 1 free, the
; expected code forms s8 with a plain register copy instead of a pack.
8849  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 0, i32 0>
8850  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8851  ret void
8852}
8853
; Scalar-register shuffle with mask <5, 1, 0, 0> over two source vectors.
; %vec0 and %vec1 both come from inline asm with an "=s" constraint and are
; each narrowed to their first three lanes; the result is handed to inline asm
; pinned to s[8:9].
; The expected output below is autogenerated by utils/update_llc_test_checks.py;
; regenerate it rather than editing it by hand.
8854define void @s_shuffle_v4bf16_v3bf16__5_1_0_0() {
8855; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_0_0:
8856; GFX900:       ; %bb.0:
8857; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8858; GFX900-NEXT:    ;;#ASMSTART
8859; GFX900-NEXT:    ; def s[4:5]
8860; GFX900-NEXT:    ;;#ASMEND
8861; GFX900-NEXT:    s_lshr_b32 s5, s4, 16
8862; GFX900-NEXT:    ;;#ASMSTART
8863; GFX900-NEXT:    ; def s[6:7]
8864; GFX900-NEXT:    ;;#ASMEND
8865; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
8866; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8867; GFX900-NEXT:    ;;#ASMSTART
8868; GFX900-NEXT:    ; use s[8:9]
8869; GFX900-NEXT:    ;;#ASMEND
8870; GFX900-NEXT:    s_setpc_b64 s[30:31]
8871;
8872; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_0_0:
8873; GFX90A:       ; %bb.0:
8874; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8875; GFX90A-NEXT:    ;;#ASMSTART
8876; GFX90A-NEXT:    ; def s[4:5]
8877; GFX90A-NEXT:    ;;#ASMEND
8878; GFX90A-NEXT:    s_lshr_b32 s5, s4, 16
8879; GFX90A-NEXT:    ;;#ASMSTART
8880; GFX90A-NEXT:    ; def s[6:7]
8881; GFX90A-NEXT:    ;;#ASMEND
8882; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
8883; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8884; GFX90A-NEXT:    ;;#ASMSTART
8885; GFX90A-NEXT:    ; use s[8:9]
8886; GFX90A-NEXT:    ;;#ASMEND
8887; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8888;
8889; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_0_0:
8890; GFX940:       ; %bb.0:
8891; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8892; GFX940-NEXT:    ;;#ASMSTART
8893; GFX940-NEXT:    ; def s[0:1]
8894; GFX940-NEXT:    ;;#ASMEND
8895; GFX940-NEXT:    s_lshr_b32 s1, s0, 16
8896; GFX940-NEXT:    ;;#ASMSTART
8897; GFX940-NEXT:    ; def s[2:3]
8898; GFX940-NEXT:    ;;#ASMEND
8899; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s1
8900; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
8901; GFX940-NEXT:    ;;#ASMSTART
8902; GFX940-NEXT:    ; use s[8:9]
8903; GFX940-NEXT:    ;;#ASMEND
8904; GFX940-NEXT:    s_setpc_b64 s[30:31]
8905  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8906  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
8907  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8908  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
; Mask indices 0-2 pick lanes of %extract3 and 3-5 pick lanes of %extract31:
; result lane 0 is lane 2 of %extract31, lane 1 is lane 1 of %extract3, and
; lanes 2-3 are lane 0 of %extract3.
8909  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 0, i32 0>
8910  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8911  ret void
8912}
8913
; Scalar-register shuffle with mask <5, 2, 0, 0> over two source vectors.
; %vec0 and %vec1 both come from inline asm with an "=s" constraint and are
; each narrowed to their first three lanes; the result is handed to inline asm
; pinned to s[8:9].
; The expected output below is autogenerated by utils/update_llc_test_checks.py;
; regenerate it rather than editing it by hand.
8914define void @s_shuffle_v4bf16_v3bf16__5_2_0_0() {
8915; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_0_0:
8916; GFX900:       ; %bb.0:
8917; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8918; GFX900-NEXT:    ;;#ASMSTART
8919; GFX900-NEXT:    ; def s[4:5]
8920; GFX900-NEXT:    ;;#ASMEND
8921; GFX900-NEXT:    ;;#ASMSTART
8922; GFX900-NEXT:    ; def s[6:7]
8923; GFX900-NEXT:    ;;#ASMEND
8924; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
8925; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8926; GFX900-NEXT:    ;;#ASMSTART
8927; GFX900-NEXT:    ; use s[8:9]
8928; GFX900-NEXT:    ;;#ASMEND
8929; GFX900-NEXT:    s_setpc_b64 s[30:31]
8930;
8931; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_0_0:
8932; GFX90A:       ; %bb.0:
8933; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8934; GFX90A-NEXT:    ;;#ASMSTART
8935; GFX90A-NEXT:    ; def s[4:5]
8936; GFX90A-NEXT:    ;;#ASMEND
8937; GFX90A-NEXT:    ;;#ASMSTART
8938; GFX90A-NEXT:    ; def s[6:7]
8939; GFX90A-NEXT:    ;;#ASMEND
8940; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
8941; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8942; GFX90A-NEXT:    ;;#ASMSTART
8943; GFX90A-NEXT:    ; use s[8:9]
8944; GFX90A-NEXT:    ;;#ASMEND
8945; GFX90A-NEXT:    s_setpc_b64 s[30:31]
8946;
8947; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_0_0:
8948; GFX940:       ; %bb.0:
8949; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8950; GFX940-NEXT:    ;;#ASMSTART
8951; GFX940-NEXT:    ; def s[0:1]
8952; GFX940-NEXT:    ;;#ASMEND
8953; GFX940-NEXT:    ;;#ASMSTART
8954; GFX940-NEXT:    ; def s[2:3]
8955; GFX940-NEXT:    ;;#ASMEND
8956; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s1
8957; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
8958; GFX940-NEXT:    ;;#ASMSTART
8959; GFX940-NEXT:    ; use s[8:9]
8960; GFX940-NEXT:    ;;#ASMEND
8961; GFX940-NEXT:    s_setpc_b64 s[30:31]
8962  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
8963  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
8964  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
8965  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
; Mask indices 0-2 pick lanes of %extract3 and 3-5 pick lanes of %extract31:
; result lane 0 is lane 2 of %extract31, lane 1 is lane 2 of %extract3, and
; lanes 2-3 are lane 0 of %extract3.
8966  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 0, i32 0>
8967  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
8968  ret void
8969}
8970
; Scalar-register shuffle with mask <5, 3, 0, 0> over two source vectors.
; %vec0 and %vec1 both come from inline asm with an "=s" constraint and are
; each narrowed to their first three lanes; the result is handed to inline asm
; pinned to s[8:9].
; The expected output below is autogenerated by utils/update_llc_test_checks.py;
; regenerate it rather than editing it by hand.
8971define void @s_shuffle_v4bf16_v3bf16__5_3_0_0() {
8972; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_0_0:
8973; GFX900:       ; %bb.0:
8974; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8975; GFX900-NEXT:    ;;#ASMSTART
8976; GFX900-NEXT:    ; def s[4:5]
8977; GFX900-NEXT:    ;;#ASMEND
8978; GFX900-NEXT:    ;;#ASMSTART
8979; GFX900-NEXT:    ; def s[6:7]
8980; GFX900-NEXT:    ;;#ASMEND
8981; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s6
8982; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8983; GFX900-NEXT:    ;;#ASMSTART
8984; GFX900-NEXT:    ; use s[8:9]
8985; GFX900-NEXT:    ;;#ASMEND
8986; GFX900-NEXT:    s_setpc_b64 s[30:31]
8987;
8988; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_0_0:
8989; GFX90A:       ; %bb.0:
8990; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8991; GFX90A-NEXT:    ;;#ASMSTART
8992; GFX90A-NEXT:    ; def s[4:5]
8993; GFX90A-NEXT:    ;;#ASMEND
8994; GFX90A-NEXT:    ;;#ASMSTART
8995; GFX90A-NEXT:    ; def s[6:7]
8996; GFX90A-NEXT:    ;;#ASMEND
8997; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s6
8998; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
8999; GFX90A-NEXT:    ;;#ASMSTART
9000; GFX90A-NEXT:    ; use s[8:9]
9001; GFX90A-NEXT:    ;;#ASMEND
9002; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9003;
9004; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_0_0:
9005; GFX940:       ; %bb.0:
9006; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9007; GFX940-NEXT:    ;;#ASMSTART
9008; GFX940-NEXT:    ; def s[0:1]
9009; GFX940-NEXT:    ;;#ASMEND
9010; GFX940-NEXT:    ;;#ASMSTART
9011; GFX940-NEXT:    ; def s[2:3]
9012; GFX940-NEXT:    ;;#ASMEND
9013; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s2
9014; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
9015; GFX940-NEXT:    ;;#ASMSTART
9016; GFX940-NEXT:    ; use s[8:9]
9017; GFX940-NEXT:    ;;#ASMEND
9018; GFX940-NEXT:    s_setpc_b64 s[30:31]
9019  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9020  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
9021  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9022  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
; Mask indices 0-2 pick lanes of %extract3 and 3-5 pick lanes of %extract31:
; result lane 0 is lane 2 of %extract31, lane 1 is lane 0 of %extract31, and
; lanes 2-3 are lane 0 of %extract3.
9023  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 0, i32 0>
9024  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9025  ret void
9026}
9027
; Scalar-register shuffle with mask <5, 4, 0, 0> over two source vectors.
; %vec0 and %vec1 both come from inline asm with an "=s" constraint and are
; each narrowed to their first three lanes; the result is handed to inline asm
; pinned to s[8:9].
; The expected output below is autogenerated by utils/update_llc_test_checks.py;
; regenerate it rather than editing it by hand.
9028define void @s_shuffle_v4bf16_v3bf16__5_4_0_0() {
9029; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_0_0:
9030; GFX900:       ; %bb.0:
9031; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9032; GFX900-NEXT:    ;;#ASMSTART
9033; GFX900-NEXT:    ; def s[4:5]
9034; GFX900-NEXT:    ;;#ASMEND
9035; GFX900-NEXT:    ;;#ASMSTART
9036; GFX900-NEXT:    ; def s[6:7]
9037; GFX900-NEXT:    ;;#ASMEND
9038; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
9039; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
9040; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9041; GFX900-NEXT:    ;;#ASMSTART
9042; GFX900-NEXT:    ; use s[8:9]
9043; GFX900-NEXT:    ;;#ASMEND
9044; GFX900-NEXT:    s_setpc_b64 s[30:31]
9045;
9046; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_0_0:
9047; GFX90A:       ; %bb.0:
9048; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9049; GFX90A-NEXT:    ;;#ASMSTART
9050; GFX90A-NEXT:    ; def s[4:5]
9051; GFX90A-NEXT:    ;;#ASMEND
9052; GFX90A-NEXT:    ;;#ASMSTART
9053; GFX90A-NEXT:    ; def s[6:7]
9054; GFX90A-NEXT:    ;;#ASMEND
9055; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
9056; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
9057; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9058; GFX90A-NEXT:    ;;#ASMSTART
9059; GFX90A-NEXT:    ; use s[8:9]
9060; GFX90A-NEXT:    ;;#ASMEND
9061; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9062;
9063; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_0_0:
9064; GFX940:       ; %bb.0:
9065; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9066; GFX940-NEXT:    ;;#ASMSTART
9067; GFX940-NEXT:    ; def s[0:1]
9068; GFX940-NEXT:    ;;#ASMEND
9069; GFX940-NEXT:    ;;#ASMSTART
9070; GFX940-NEXT:    ; def s[2:3]
9071; GFX940-NEXT:    ;;#ASMEND
9072; GFX940-NEXT:    s_lshr_b32 s1, s2, 16
9073; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s1
9074; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
9075; GFX940-NEXT:    ;;#ASMSTART
9076; GFX940-NEXT:    ; use s[8:9]
9077; GFX940-NEXT:    ;;#ASMEND
9078; GFX940-NEXT:    s_setpc_b64 s[30:31]
9079  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9080  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
9081  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9082  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
; Mask indices 0-2 pick lanes of %extract3 and 3-5 pick lanes of %extract31:
; result lane 0 is lane 2 of %extract31, lane 1 is lane 1 of %extract31, and
; lanes 2-3 are lane 0 of %extract3.
9083  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 0, i32 0>
9084  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9085  ret void
9086}
9087
; Scalar-register shuffle with mask <5, 5, 0, 0> over two source vectors.
; %vec0 and %vec1 both come from inline asm with an "=s" constraint and are
; each narrowed to their first three lanes; the result is handed to inline asm
; pinned to s[8:9].
; The expected output below is autogenerated by utils/update_llc_test_checks.py;
; regenerate it rather than editing it by hand.
9088define void @s_shuffle_v4bf16_v3bf16__5_5_0_0() {
9089; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_0:
9090; GFX900:       ; %bb.0:
9091; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9092; GFX900-NEXT:    ;;#ASMSTART
9093; GFX900-NEXT:    ; def s[4:5]
9094; GFX900-NEXT:    ;;#ASMEND
9095; GFX900-NEXT:    ;;#ASMSTART
9096; GFX900-NEXT:    ; def s[6:7]
9097; GFX900-NEXT:    ;;#ASMEND
9098; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9099; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
9100; GFX900-NEXT:    ;;#ASMSTART
9101; GFX900-NEXT:    ; use s[8:9]
9102; GFX900-NEXT:    ;;#ASMEND
9103; GFX900-NEXT:    s_setpc_b64 s[30:31]
9104;
9105; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_0:
9106; GFX90A:       ; %bb.0:
9107; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9108; GFX90A-NEXT:    ;;#ASMSTART
9109; GFX90A-NEXT:    ; def s[4:5]
9110; GFX90A-NEXT:    ;;#ASMEND
9111; GFX90A-NEXT:    ;;#ASMSTART
9112; GFX90A-NEXT:    ; def s[6:7]
9113; GFX90A-NEXT:    ;;#ASMEND
9114; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9115; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
9116; GFX90A-NEXT:    ;;#ASMSTART
9117; GFX90A-NEXT:    ; use s[8:9]
9118; GFX90A-NEXT:    ;;#ASMEND
9119; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9120;
9121; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_0:
9122; GFX940:       ; %bb.0:
9123; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9124; GFX940-NEXT:    ;;#ASMSTART
9125; GFX940-NEXT:    ; def s[0:1]
9126; GFX940-NEXT:    ;;#ASMEND
9127; GFX940-NEXT:    ;;#ASMSTART
9128; GFX940-NEXT:    ; def s[2:3]
9129; GFX940-NEXT:    ;;#ASMEND
9130; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
9131; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
9132; GFX940-NEXT:    ;;#ASMSTART
9133; GFX940-NEXT:    ; use s[8:9]
9134; GFX940-NEXT:    ;;#ASMEND
9135; GFX940-NEXT:    s_setpc_b64 s[30:31]
9136  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9137  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
9138  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9139  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
; Mask indices 0-2 pick lanes of %extract3 and 3-5 pick lanes of %extract31:
; result lanes 0-1 are lane 2 of %extract31, lanes 2-3 are lane 0 of %extract3.
9140  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 0>
9141  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9142  ret void
9143}
9144
; Scalar-register shuffle with mask <5, 5, poison, 0> over two source vectors.
; %vec0 and %vec1 both come from inline asm with an "=s" constraint and are
; each narrowed to their first three lanes; the result is handed to inline asm
; pinned to s[8:9].
; The expected output below is autogenerated by utils/update_llc_test_checks.py;
; regenerate it rather than editing it by hand.
9145define void @s_shuffle_v4bf16_v3bf16__5_5_u_0() {
9146; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_0:
9147; GFX900:       ; %bb.0:
9148; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9149; GFX900-NEXT:    ;;#ASMSTART
9150; GFX900-NEXT:    ; def s[4:5]
9151; GFX900-NEXT:    ;;#ASMEND
9152; GFX900-NEXT:    ;;#ASMSTART
9153; GFX900-NEXT:    ; def s[6:7]
9154; GFX900-NEXT:    ;;#ASMEND
9155; GFX900-NEXT:    s_lshl_b32 s9, s4, 16
9156; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
9157; GFX900-NEXT:    ;;#ASMSTART
9158; GFX900-NEXT:    ; use s[8:9]
9159; GFX900-NEXT:    ;;#ASMEND
9160; GFX900-NEXT:    s_setpc_b64 s[30:31]
9161;
9162; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_0:
9163; GFX90A:       ; %bb.0:
9164; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9165; GFX90A-NEXT:    ;;#ASMSTART
9166; GFX90A-NEXT:    ; def s[4:5]
9167; GFX90A-NEXT:    ;;#ASMEND
9168; GFX90A-NEXT:    ;;#ASMSTART
9169; GFX90A-NEXT:    ; def s[6:7]
9170; GFX90A-NEXT:    ;;#ASMEND
9171; GFX90A-NEXT:    s_lshl_b32 s9, s4, 16
9172; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
9173; GFX90A-NEXT:    ;;#ASMSTART
9174; GFX90A-NEXT:    ; use s[8:9]
9175; GFX90A-NEXT:    ;;#ASMEND
9176; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9177;
9178; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_0:
9179; GFX940:       ; %bb.0:
9180; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9181; GFX940-NEXT:    ;;#ASMSTART
9182; GFX940-NEXT:    ; def s[0:1]
9183; GFX940-NEXT:    ;;#ASMEND
9184; GFX940-NEXT:    ;;#ASMSTART
9185; GFX940-NEXT:    ; def s[2:3]
9186; GFX940-NEXT:    ;;#ASMEND
9187; GFX940-NEXT:    s_lshl_b32 s9, s0, 16
9188; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
9189; GFX940-NEXT:    ;;#ASMSTART
9190; GFX940-NEXT:    ; use s[8:9]
9191; GFX940-NEXT:    ;;#ASMEND
9192; GFX940-NEXT:    s_setpc_b64 s[30:31]
9193  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9194  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
9195  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9196  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
; Result lanes 0-1 are lane 2 of %extract31, lane 2 is left unspecified
; (poison mask element), and lane 3 is lane 0 of %extract3. With lane 2 free,
; the expected code forms s9 with a 16-bit left shift instead of a pack.
9197  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 0>
9198  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9199  ret void
9200}
9201
; Scalar-register shuffle with mask <5, 5, 1, 0> over two source vectors.
; %vec0 and %vec1 both come from inline asm with an "=s" constraint and are
; each narrowed to their first three lanes; the result is handed to inline asm
; pinned to s[8:9].
; The expected output below is autogenerated by utils/update_llc_test_checks.py;
; regenerate it rather than editing it by hand.
9202define void @s_shuffle_v4bf16_v3bf16__5_5_1_0() {
9203; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_0:
9204; GFX900:       ; %bb.0:
9205; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9206; GFX900-NEXT:    ;;#ASMSTART
9207; GFX900-NEXT:    ; def s[4:5]
9208; GFX900-NEXT:    ;;#ASMEND
9209; GFX900-NEXT:    s_lshr_b32 s5, s4, 16
9210; GFX900-NEXT:    ;;#ASMSTART
9211; GFX900-NEXT:    ; def s[6:7]
9212; GFX900-NEXT:    ;;#ASMEND
9213; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
9214; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
9215; GFX900-NEXT:    ;;#ASMSTART
9216; GFX900-NEXT:    ; use s[8:9]
9217; GFX900-NEXT:    ;;#ASMEND
9218; GFX900-NEXT:    s_setpc_b64 s[30:31]
9219;
9220; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_0:
9221; GFX90A:       ; %bb.0:
9222; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9223; GFX90A-NEXT:    ;;#ASMSTART
9224; GFX90A-NEXT:    ; def s[4:5]
9225; GFX90A-NEXT:    ;;#ASMEND
9226; GFX90A-NEXT:    s_lshr_b32 s5, s4, 16
9227; GFX90A-NEXT:    ;;#ASMSTART
9228; GFX90A-NEXT:    ; def s[6:7]
9229; GFX90A-NEXT:    ;;#ASMEND
9230; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
9231; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
9232; GFX90A-NEXT:    ;;#ASMSTART
9233; GFX90A-NEXT:    ; use s[8:9]
9234; GFX90A-NEXT:    ;;#ASMEND
9235; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9236;
9237; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_0:
9238; GFX940:       ; %bb.0:
9239; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9240; GFX940-NEXT:    ;;#ASMSTART
9241; GFX940-NEXT:    ; def s[0:1]
9242; GFX940-NEXT:    ;;#ASMEND
9243; GFX940-NEXT:    s_lshr_b32 s1, s0, 16
9244; GFX940-NEXT:    ;;#ASMSTART
9245; GFX940-NEXT:    ; def s[2:3]
9246; GFX940-NEXT:    ;;#ASMEND
9247; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s0
9248; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
9249; GFX940-NEXT:    ;;#ASMSTART
9250; GFX940-NEXT:    ; use s[8:9]
9251; GFX940-NEXT:    ;;#ASMEND
9252; GFX940-NEXT:    s_setpc_b64 s[30:31]
9253  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9254  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
9255  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9256  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
; Mask indices 0-2 pick lanes of %extract3 and 3-5 pick lanes of %extract31:
; result lanes 0-1 are lane 2 of %extract31, lane 2 is lane 1 of %extract3,
; and lane 3 is lane 0 of %extract3.
9257  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 0>
9258  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9259  ret void
9260}
9261
; Scalar-register shuffle with mask <5, 5, 2, 0> over two source vectors.
; %vec0 and %vec1 both come from inline asm with an "=s" constraint and are
; each narrowed to their first three lanes; the result is handed to inline asm
; pinned to s[8:9].
; The expected output below is autogenerated by utils/update_llc_test_checks.py;
; regenerate it rather than editing it by hand.
9262define void @s_shuffle_v4bf16_v3bf16__5_5_2_0() {
9263; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_0:
9264; GFX900:       ; %bb.0:
9265; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9266; GFX900-NEXT:    ;;#ASMSTART
9267; GFX900-NEXT:    ; def s[4:5]
9268; GFX900-NEXT:    ;;#ASMEND
9269; GFX900-NEXT:    ;;#ASMSTART
9270; GFX900-NEXT:    ; def s[6:7]
9271; GFX900-NEXT:    ;;#ASMEND
9272; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
9273; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
9274; GFX900-NEXT:    ;;#ASMSTART
9275; GFX900-NEXT:    ; use s[8:9]
9276; GFX900-NEXT:    ;;#ASMEND
9277; GFX900-NEXT:    s_setpc_b64 s[30:31]
9278;
9279; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_0:
9280; GFX90A:       ; %bb.0:
9281; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9282; GFX90A-NEXT:    ;;#ASMSTART
9283; GFX90A-NEXT:    ; def s[4:5]
9284; GFX90A-NEXT:    ;;#ASMEND
9285; GFX90A-NEXT:    ;;#ASMSTART
9286; GFX90A-NEXT:    ; def s[6:7]
9287; GFX90A-NEXT:    ;;#ASMEND
9288; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
9289; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
9290; GFX90A-NEXT:    ;;#ASMSTART
9291; GFX90A-NEXT:    ; use s[8:9]
9292; GFX90A-NEXT:    ;;#ASMEND
9293; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9294;
9295; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_0:
9296; GFX940:       ; %bb.0:
9297; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9298; GFX940-NEXT:    ;;#ASMSTART
9299; GFX940-NEXT:    ; def s[0:1]
9300; GFX940-NEXT:    ;;#ASMEND
9301; GFX940-NEXT:    ;;#ASMSTART
9302; GFX940-NEXT:    ; def s[2:3]
9303; GFX940-NEXT:    ;;#ASMEND
9304; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s0
9305; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
9306; GFX940-NEXT:    ;;#ASMSTART
9307; GFX940-NEXT:    ; use s[8:9]
9308; GFX940-NEXT:    ;;#ASMEND
9309; GFX940-NEXT:    s_setpc_b64 s[30:31]
9310  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9311  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
9312  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9313  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
; Mask indices 0-2 pick lanes of %extract3 and 3-5 pick lanes of %extract31:
; result lanes 0-1 are lane 2 of %extract31, lane 2 is lane 2 of %extract3,
; and lane 3 is lane 0 of %extract3.
9314  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 0>
9315  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9316  ret void
9317}
9318
; Scalar-register shuffle with mask <5, 5, 3, 0> over two source vectors.
; %vec0 and %vec1 both come from inline asm with an "=s" constraint and are
; each narrowed to their first three lanes; the result is handed to inline asm
; pinned to s[8:9].
; The expected output below is autogenerated by utils/update_llc_test_checks.py;
; regenerate it rather than editing it by hand.
9319define void @s_shuffle_v4bf16_v3bf16__5_5_3_0() {
9320; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_0:
9321; GFX900:       ; %bb.0:
9322; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9323; GFX900-NEXT:    ;;#ASMSTART
9324; GFX900-NEXT:    ; def s[4:5]
9325; GFX900-NEXT:    ;;#ASMEND
9326; GFX900-NEXT:    ;;#ASMSTART
9327; GFX900-NEXT:    ; def s[6:7]
9328; GFX900-NEXT:    ;;#ASMEND
9329; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s6, s4
9330; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
9331; GFX900-NEXT:    ;;#ASMSTART
9332; GFX900-NEXT:    ; use s[8:9]
9333; GFX900-NEXT:    ;;#ASMEND
9334; GFX900-NEXT:    s_setpc_b64 s[30:31]
9335;
9336; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_0:
9337; GFX90A:       ; %bb.0:
9338; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9339; GFX90A-NEXT:    ;;#ASMSTART
9340; GFX90A-NEXT:    ; def s[4:5]
9341; GFX90A-NEXT:    ;;#ASMEND
9342; GFX90A-NEXT:    ;;#ASMSTART
9343; GFX90A-NEXT:    ; def s[6:7]
9344; GFX90A-NEXT:    ;;#ASMEND
9345; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s6, s4
9346; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
9347; GFX90A-NEXT:    ;;#ASMSTART
9348; GFX90A-NEXT:    ; use s[8:9]
9349; GFX90A-NEXT:    ;;#ASMEND
9350; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9351;
9352; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_0:
9353; GFX940:       ; %bb.0:
9354; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9355; GFX940-NEXT:    ;;#ASMSTART
9356; GFX940-NEXT:    ; def s[0:1]
9357; GFX940-NEXT:    ;;#ASMEND
9358; GFX940-NEXT:    ;;#ASMSTART
9359; GFX940-NEXT:    ; def s[2:3]
9360; GFX940-NEXT:    ;;#ASMEND
9361; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s2, s0
9362; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
9363; GFX940-NEXT:    ;;#ASMSTART
9364; GFX940-NEXT:    ; use s[8:9]
9365; GFX940-NEXT:    ;;#ASMEND
9366; GFX940-NEXT:    s_setpc_b64 s[30:31]
9367  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9368  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
9369  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9370  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
; Mask indices 0-2 pick lanes of %extract3 and 3-5 pick lanes of %extract31:
; result lanes 0-1 are lane 2 of %extract31, lane 2 is lane 0 of %extract31,
; and lane 3 is lane 0 of %extract3.
9371  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 0>
9372  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9373  ret void
9374}
9375
; Scalar-register shuffle with mask <5, 5, 4, 0> over two source vectors.
; %vec0 and %vec1 both come from inline asm with an "=s" constraint and are
; each narrowed to their first three lanes; the result is handed to inline asm
; pinned to s[8:9].
; The expected output below is autogenerated by utils/update_llc_test_checks.py;
; regenerate it rather than editing it by hand.
9376define void @s_shuffle_v4bf16_v3bf16__5_5_4_0() {
9377; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_0:
9378; GFX900:       ; %bb.0:
9379; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9380; GFX900-NEXT:    ;;#ASMSTART
9381; GFX900-NEXT:    ; def s[4:5]
9382; GFX900-NEXT:    ;;#ASMEND
9383; GFX900-NEXT:    ;;#ASMSTART
9384; GFX900-NEXT:    ; def s[6:7]
9385; GFX900-NEXT:    ;;#ASMEND
9386; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
9387; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
9388; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
9389; GFX900-NEXT:    ;;#ASMSTART
9390; GFX900-NEXT:    ; use s[8:9]
9391; GFX900-NEXT:    ;;#ASMEND
9392; GFX900-NEXT:    s_setpc_b64 s[30:31]
9393;
9394; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_0:
9395; GFX90A:       ; %bb.0:
9396; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9397; GFX90A-NEXT:    ;;#ASMSTART
9398; GFX90A-NEXT:    ; def s[4:5]
9399; GFX90A-NEXT:    ;;#ASMEND
9400; GFX90A-NEXT:    ;;#ASMSTART
9401; GFX90A-NEXT:    ; def s[6:7]
9402; GFX90A-NEXT:    ;;#ASMEND
9403; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
9404; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
9405; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
9406; GFX90A-NEXT:    ;;#ASMSTART
9407; GFX90A-NEXT:    ; use s[8:9]
9408; GFX90A-NEXT:    ;;#ASMEND
9409; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9410;
9411; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_0:
9412; GFX940:       ; %bb.0:
9413; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9414; GFX940-NEXT:    ;;#ASMSTART
9415; GFX940-NEXT:    ; def s[0:1]
9416; GFX940-NEXT:    ;;#ASMEND
9417; GFX940-NEXT:    ;;#ASMSTART
9418; GFX940-NEXT:    ; def s[2:3]
9419; GFX940-NEXT:    ;;#ASMEND
9420; GFX940-NEXT:    s_lshr_b32 s1, s2, 16
9421; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s0
9422; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
9423; GFX940-NEXT:    ;;#ASMSTART
9424; GFX940-NEXT:    ; use s[8:9]
9425; GFX940-NEXT:    ;;#ASMEND
9426; GFX940-NEXT:    s_setpc_b64 s[30:31]
9427  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9428  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
9429  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9430  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
; Mask indices 0-2 pick lanes of %extract3 and 3-5 pick lanes of %extract31:
; result lanes 0-1 are lane 2 of %extract31, lane 2 is lane 1 of %extract31,
; and lane 3 is lane 0 of %extract3.
9431  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 0>
9432  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9433  ret void
9434}
9435
; Scalar-register shuffle with mask <poison, 1, 1, 1> over a single source
; vector. %vec0 comes from inline asm with an "=s" constraint and is narrowed
; to its first three lanes; the result is handed to inline asm pinned to
; s[8:9].
; The expected output below is autogenerated by utils/update_llc_test_checks.py;
; regenerate it rather than editing it by hand.
9436define void @s_shuffle_v4bf16_v3bf16__u_1_1_1() {
9437; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__u_1_1_1:
9438; GFX900:       ; %bb.0:
9439; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9440; GFX900-NEXT:    ;;#ASMSTART
9441; GFX900-NEXT:    ; def s[8:9]
9442; GFX900-NEXT:    ;;#ASMEND
9443; GFX900-NEXT:    s_lshr_b32 s4, s8, 16
9444; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9445; GFX900-NEXT:    ;;#ASMSTART
9446; GFX900-NEXT:    ; use s[8:9]
9447; GFX900-NEXT:    ;;#ASMEND
9448; GFX900-NEXT:    s_setpc_b64 s[30:31]
9449;
9450; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__u_1_1_1:
9451; GFX90A:       ; %bb.0:
9452; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9453; GFX90A-NEXT:    ;;#ASMSTART
9454; GFX90A-NEXT:    ; def s[8:9]
9455; GFX90A-NEXT:    ;;#ASMEND
9456; GFX90A-NEXT:    s_lshr_b32 s4, s8, 16
9457; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9458; GFX90A-NEXT:    ;;#ASMSTART
9459; GFX90A-NEXT:    ; use s[8:9]
9460; GFX90A-NEXT:    ;;#ASMEND
9461; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9462;
9463; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__u_1_1_1:
9464; GFX940:       ; %bb.0:
9465; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9466; GFX940-NEXT:    ;;#ASMSTART
9467; GFX940-NEXT:    ; def s[8:9]
9468; GFX940-NEXT:    ;;#ASMEND
9469; GFX940-NEXT:    s_lshr_b32 s0, s8, 16
9470; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
9471; GFX940-NEXT:    ;;#ASMSTART
9472; GFX940-NEXT:    ; use s[8:9]
9473; GFX940-NEXT:    ;;#ASMEND
9474; GFX940-NEXT:    s_setpc_b64 s[30:31]
9475  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9476  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
; Result lane 0 is left unspecified (poison mask element) and lanes 1-3 are
; all lane 1 of %extract3. The expected code leaves s8 as defined by the asm
; and only rebuilds s9.
9477  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 1, i32 1, i32 1>
9478  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9479  ret void
9480}
9481
; SGPR variant, mask <0, 1, 1, 1>: result elements 0 and 1 keep elements 0 and
; 1 of the <3 x bfloat> slice of %vec0, and result elements 2 and 3 both read
; element 1. The checks expect the def in s[8:9], s8 left untouched, and s9
; rebuilt with s_lshr_b32 by 16 followed by s_pack_ll_b32_b16.
9482define void @s_shuffle_v4bf16_v3bf16__0_1_1_1() {
9483; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_1_1_1:
9484; GFX900:       ; %bb.0:
9485; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9486; GFX900-NEXT:    ;;#ASMSTART
9487; GFX900-NEXT:    ; def s[8:9]
9488; GFX900-NEXT:    ;;#ASMEND
9489; GFX900-NEXT:    s_lshr_b32 s4, s8, 16
9490; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9491; GFX900-NEXT:    ;;#ASMSTART
9492; GFX900-NEXT:    ; use s[8:9]
9493; GFX900-NEXT:    ;;#ASMEND
9494; GFX900-NEXT:    s_setpc_b64 s[30:31]
9495;
9496; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_1_1_1:
9497; GFX90A:       ; %bb.0:
9498; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9499; GFX90A-NEXT:    ;;#ASMSTART
9500; GFX90A-NEXT:    ; def s[8:9]
9501; GFX90A-NEXT:    ;;#ASMEND
9502; GFX90A-NEXT:    s_lshr_b32 s4, s8, 16
9503; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9504; GFX90A-NEXT:    ;;#ASMSTART
9505; GFX90A-NEXT:    ; use s[8:9]
9506; GFX90A-NEXT:    ;;#ASMEND
9507; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9508;
9509; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_1_1_1:
9510; GFX940:       ; %bb.0:
9511; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9512; GFX940-NEXT:    ;;#ASMSTART
9513; GFX940-NEXT:    ; def s[8:9]
9514; GFX940-NEXT:    ;;#ASMEND
9515; GFX940-NEXT:    s_lshr_b32 s0, s8, 16
9516; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
9517; GFX940-NEXT:    ;;#ASMSTART
9518; GFX940-NEXT:    ; use s[8:9]
9519; GFX940-NEXT:    ;;#ASMEND
9520; GFX940-NEXT:    s_setpc_b64 s[30:31]
9521  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9522  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9523  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
9524  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9525  ret void
9526}
9527
; SGPR variant, mask <1, 1, 1, 1>: every result element reads element 1 of
; the <3 x bfloat> slice of %vec0 (a splat). The checks expect one
; s_lshr_b32 by 16, one s_pack_ll_b32_b16 into s8, and an s_mov_b32 that
; copies s8 into s9 rather than a second pack.
9528define void @s_shuffle_v4bf16_v3bf16__1_1_1_1() {
9529; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_1_1_1:
9530; GFX900:       ; %bb.0:
9531; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9532; GFX900-NEXT:    ;;#ASMSTART
9533; GFX900-NEXT:    ; def s[4:5]
9534; GFX900-NEXT:    ;;#ASMEND
9535; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
9536; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
9537; GFX900-NEXT:    s_mov_b32 s9, s8
9538; GFX900-NEXT:    ;;#ASMSTART
9539; GFX900-NEXT:    ; use s[8:9]
9540; GFX900-NEXT:    ;;#ASMEND
9541; GFX900-NEXT:    s_setpc_b64 s[30:31]
9542;
9543; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_1_1_1:
9544; GFX90A:       ; %bb.0:
9545; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9546; GFX90A-NEXT:    ;;#ASMSTART
9547; GFX90A-NEXT:    ; def s[4:5]
9548; GFX90A-NEXT:    ;;#ASMEND
9549; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
9550; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
9551; GFX90A-NEXT:    s_mov_b32 s9, s8
9552; GFX90A-NEXT:    ;;#ASMSTART
9553; GFX90A-NEXT:    ; use s[8:9]
9554; GFX90A-NEXT:    ;;#ASMEND
9555; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9556;
9557; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_1_1_1:
9558; GFX940:       ; %bb.0:
9559; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9560; GFX940-NEXT:    ;;#ASMSTART
9561; GFX940-NEXT:    ; def s[0:1]
9562; GFX940-NEXT:    ;;#ASMEND
9563; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
9564; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
9565; GFX940-NEXT:    s_mov_b32 s9, s8
9566; GFX940-NEXT:    ;;#ASMSTART
9567; GFX940-NEXT:    ; use s[8:9]
9568; GFX940-NEXT:    ;;#ASMEND
9569; GFX940-NEXT:    s_setpc_b64 s[30:31]
9570  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9571  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9572  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
9573  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9574  ret void
9575}
9576
; SGPR variant, mask <2, 1, 1, 1>: result element 0 reads element 2 of the
; <3 x bfloat> slice of %vec0, and result elements 1-3 read element 1.
; The checks expect a single s_lshr_b32 by 16 on the first def'd register,
; then two s_pack_ll_b32_b16: s8 from the second def'd register and the
; shifted value, s9 from the shifted value paired with itself.
9577define void @s_shuffle_v4bf16_v3bf16__2_1_1_1() {
9578; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_1_1_1:
9579; GFX900:       ; %bb.0:
9580; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9581; GFX900-NEXT:    ;;#ASMSTART
9582; GFX900-NEXT:    ; def s[4:5]
9583; GFX900-NEXT:    ;;#ASMEND
9584; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
9585; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
9586; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9587; GFX900-NEXT:    ;;#ASMSTART
9588; GFX900-NEXT:    ; use s[8:9]
9589; GFX900-NEXT:    ;;#ASMEND
9590; GFX900-NEXT:    s_setpc_b64 s[30:31]
9591;
9592; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_1_1_1:
9593; GFX90A:       ; %bb.0:
9594; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9595; GFX90A-NEXT:    ;;#ASMSTART
9596; GFX90A-NEXT:    ; def s[4:5]
9597; GFX90A-NEXT:    ;;#ASMEND
9598; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
9599; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
9600; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9601; GFX90A-NEXT:    ;;#ASMSTART
9602; GFX90A-NEXT:    ; use s[8:9]
9603; GFX90A-NEXT:    ;;#ASMEND
9604; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9605;
9606; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_1_1_1:
9607; GFX940:       ; %bb.0:
9608; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9609; GFX940-NEXT:    ;;#ASMSTART
9610; GFX940-NEXT:    ; def s[0:1]
9611; GFX940-NEXT:    ;;#ASMEND
9612; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
9613; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
9614; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
9615; GFX940-NEXT:    ;;#ASMSTART
9616; GFX940-NEXT:    ; use s[8:9]
9617; GFX940-NEXT:    ;;#ASMEND
9618; GFX940-NEXT:    s_setpc_b64 s[30:31]
9619  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9620  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9621  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
9622  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9623  ret void
9624}
9625
; SGPR variant, mask <3, 1, 1, 1>: result elements 1-3 read element 1 of the
; <3 x bfloat> slice of %vec0. Index 3 addresses the first element of the
; second shuffle operand, which is poison in this test, so result element 0
; carries no defined value. The checks expect the def in s[8:9] with only s9
; rebuilt (s_lshr_b32 by 16, then s_pack_ll_b32_b16).
9626define void @s_shuffle_v4bf16_v3bf16__3_1_1_1() {
9627; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__3_1_1_1:
9628; GFX900:       ; %bb.0:
9629; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9630; GFX900-NEXT:    ;;#ASMSTART
9631; GFX900-NEXT:    ; def s[8:9]
9632; GFX900-NEXT:    ;;#ASMEND
9633; GFX900-NEXT:    s_lshr_b32 s4, s8, 16
9634; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9635; GFX900-NEXT:    ;;#ASMSTART
9636; GFX900-NEXT:    ; use s[8:9]
9637; GFX900-NEXT:    ;;#ASMEND
9638; GFX900-NEXT:    s_setpc_b64 s[30:31]
9639;
9640; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__3_1_1_1:
9641; GFX90A:       ; %bb.0:
9642; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9643; GFX90A-NEXT:    ;;#ASMSTART
9644; GFX90A-NEXT:    ; def s[8:9]
9645; GFX90A-NEXT:    ;;#ASMEND
9646; GFX90A-NEXT:    s_lshr_b32 s4, s8, 16
9647; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9648; GFX90A-NEXT:    ;;#ASMSTART
9649; GFX90A-NEXT:    ; use s[8:9]
9650; GFX90A-NEXT:    ;;#ASMEND
9651; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9652;
9653; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__3_1_1_1:
9654; GFX940:       ; %bb.0:
9655; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9656; GFX940-NEXT:    ;;#ASMSTART
9657; GFX940-NEXT:    ; def s[8:9]
9658; GFX940-NEXT:    ;;#ASMEND
9659; GFX940-NEXT:    s_lshr_b32 s0, s8, 16
9660; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
9661; GFX940-NEXT:    ;;#ASMSTART
9662; GFX940-NEXT:    ; use s[8:9]
9663; GFX940-NEXT:    ;;#ASMEND
9664; GFX940-NEXT:    s_setpc_b64 s[30:31]
9665  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9666  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9667  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
9668  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9669  ret void
9670}
9671
; SGPR variant with two inputs, mask <4, 1, 1, 1>: index 4 selects element 1
; of %extract31 (the slice of %vec1); result elements 1-3 read element 1 of
; %extract3 (the slice of %vec0). The checks expect both defs to be emitted
; first, then one s_lshr_b32 by 16 per input, then two s_pack_ll_b32_b16
; writing s8 and s9.
9672define void @s_shuffle_v4bf16_v3bf16__4_1_1_1() {
9673; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_1_1_1:
9674; GFX900:       ; %bb.0:
9675; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9676; GFX900-NEXT:    ;;#ASMSTART
9677; GFX900-NEXT:    ; def s[4:5]
9678; GFX900-NEXT:    ;;#ASMEND
9679; GFX900-NEXT:    ;;#ASMSTART
9680; GFX900-NEXT:    ; def s[6:7]
9681; GFX900-NEXT:    ;;#ASMEND
9682; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
9683; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
9684; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
9685; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9686; GFX900-NEXT:    ;;#ASMSTART
9687; GFX900-NEXT:    ; use s[8:9]
9688; GFX900-NEXT:    ;;#ASMEND
9689; GFX900-NEXT:    s_setpc_b64 s[30:31]
9690;
9691; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_1_1_1:
9692; GFX90A:       ; %bb.0:
9693; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9694; GFX90A-NEXT:    ;;#ASMSTART
9695; GFX90A-NEXT:    ; def s[4:5]
9696; GFX90A-NEXT:    ;;#ASMEND
9697; GFX90A-NEXT:    ;;#ASMSTART
9698; GFX90A-NEXT:    ; def s[6:7]
9699; GFX90A-NEXT:    ;;#ASMEND
9700; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
9701; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
9702; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
9703; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9704; GFX90A-NEXT:    ;;#ASMSTART
9705; GFX90A-NEXT:    ; use s[8:9]
9706; GFX90A-NEXT:    ;;#ASMEND
9707; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9708;
9709; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_1_1_1:
9710; GFX940:       ; %bb.0:
9711; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9712; GFX940-NEXT:    ;;#ASMSTART
9713; GFX940-NEXT:    ; def s[0:1]
9714; GFX940-NEXT:    ;;#ASMEND
9715; GFX940-NEXT:    ;;#ASMSTART
9716; GFX940-NEXT:    ; def s[2:3]
9717; GFX940-NEXT:    ;;#ASMEND
9718; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
9719; GFX940-NEXT:    s_lshr_b32 s1, s2, 16
9720; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
9721; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
9722; GFX940-NEXT:    ;;#ASMSTART
9723; GFX940-NEXT:    ; use s[8:9]
9724; GFX940-NEXT:    ;;#ASMEND
9725; GFX940-NEXT:    s_setpc_b64 s[30:31]
9726  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9727  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
9728  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9729  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9730  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 1, i32 1, i32 1>
9731  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9732  ret void
9733}
9734
; SGPR variant with two inputs, mask <5, 1, 1, 1>: index 5 selects element 2
; of %extract31 (the slice of %vec1); result elements 1-3 read element 1 of
; %extract3 (the slice of %vec0). The checks expect the s_lshr_b32 on the
; first input to sit between the two defs, and the second register of the
; second def to feed s_pack_ll_b32_b16 directly, with no shift of its own.
9735define void @s_shuffle_v4bf16_v3bf16__5_1_1_1() {
9736; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_1_1:
9737; GFX900:       ; %bb.0:
9738; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9739; GFX900-NEXT:    ;;#ASMSTART
9740; GFX900-NEXT:    ; def s[4:5]
9741; GFX900-NEXT:    ;;#ASMEND
9742; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
9743; GFX900-NEXT:    ;;#ASMSTART
9744; GFX900-NEXT:    ; def s[6:7]
9745; GFX900-NEXT:    ;;#ASMEND
9746; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
9747; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9748; GFX900-NEXT:    ;;#ASMSTART
9749; GFX900-NEXT:    ; use s[8:9]
9750; GFX900-NEXT:    ;;#ASMEND
9751; GFX900-NEXT:    s_setpc_b64 s[30:31]
9752;
9753; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_1_1:
9754; GFX90A:       ; %bb.0:
9755; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9756; GFX90A-NEXT:    ;;#ASMSTART
9757; GFX90A-NEXT:    ; def s[4:5]
9758; GFX90A-NEXT:    ;;#ASMEND
9759; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
9760; GFX90A-NEXT:    ;;#ASMSTART
9761; GFX90A-NEXT:    ; def s[6:7]
9762; GFX90A-NEXT:    ;;#ASMEND
9763; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
9764; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9765; GFX90A-NEXT:    ;;#ASMSTART
9766; GFX90A-NEXT:    ; use s[8:9]
9767; GFX90A-NEXT:    ;;#ASMEND
9768; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9769;
9770; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_1_1:
9771; GFX940:       ; %bb.0:
9772; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9773; GFX940-NEXT:    ;;#ASMSTART
9774; GFX940-NEXT:    ; def s[0:1]
9775; GFX940-NEXT:    ;;#ASMEND
9776; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
9777; GFX940-NEXT:    ;;#ASMSTART
9778; GFX940-NEXT:    ; def s[2:3]
9779; GFX940-NEXT:    ;;#ASMEND
9780; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
9781; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
9782; GFX940-NEXT:    ;;#ASMSTART
9783; GFX940-NEXT:    ; use s[8:9]
9784; GFX940-NEXT:    ;;#ASMEND
9785; GFX940-NEXT:    s_setpc_b64 s[30:31]
9786  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9787  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
9788  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9789  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9790  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 1, i32 1>
9791  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9792  ret void
9793}
9794
; SGPR variant with two inputs, mask <5, poison, 1, 1>: result element 0 reads
; element 2 of %extract31 (the slice of %vec1), result element 1 is poison,
; and result elements 2 and 3 read element 1 of %extract3 (the slice of
; %vec0). Because result element 1 is unconstrained, the checks expect s8 to
; be a plain s_mov_b32 copy of the second def's second register; only s9
; needs s_lshr_b32 plus s_pack_ll_b32_b16.
9795define void @s_shuffle_v4bf16_v3bf16__5_u_1_1() {
9796; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_1_1:
9797; GFX900:       ; %bb.0:
9798; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9799; GFX900-NEXT:    ;;#ASMSTART
9800; GFX900-NEXT:    ; def s[4:5]
9801; GFX900-NEXT:    ;;#ASMEND
9802; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
9803; GFX900-NEXT:    ;;#ASMSTART
9804; GFX900-NEXT:    ; def s[6:7]
9805; GFX900-NEXT:    ;;#ASMEND
9806; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9807; GFX900-NEXT:    s_mov_b32 s8, s7
9808; GFX900-NEXT:    ;;#ASMSTART
9809; GFX900-NEXT:    ; use s[8:9]
9810; GFX900-NEXT:    ;;#ASMEND
9811; GFX900-NEXT:    s_setpc_b64 s[30:31]
9812;
9813; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_1_1:
9814; GFX90A:       ; %bb.0:
9815; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9816; GFX90A-NEXT:    ;;#ASMSTART
9817; GFX90A-NEXT:    ; def s[4:5]
9818; GFX90A-NEXT:    ;;#ASMEND
9819; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
9820; GFX90A-NEXT:    ;;#ASMSTART
9821; GFX90A-NEXT:    ; def s[6:7]
9822; GFX90A-NEXT:    ;;#ASMEND
9823; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9824; GFX90A-NEXT:    s_mov_b32 s8, s7
9825; GFX90A-NEXT:    ;;#ASMSTART
9826; GFX90A-NEXT:    ; use s[8:9]
9827; GFX90A-NEXT:    ;;#ASMEND
9828; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9829;
9830; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_1_1:
9831; GFX940:       ; %bb.0:
9832; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9833; GFX940-NEXT:    ;;#ASMSTART
9834; GFX940-NEXT:    ; def s[0:1]
9835; GFX940-NEXT:    ;;#ASMEND
9836; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
9837; GFX940-NEXT:    ;;#ASMSTART
9838; GFX940-NEXT:    ; def s[2:3]
9839; GFX940-NEXT:    ;;#ASMEND
9840; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
9841; GFX940-NEXT:    s_mov_b32 s8, s3
9842; GFX940-NEXT:    ;;#ASMSTART
9843; GFX940-NEXT:    ; use s[8:9]
9844; GFX940-NEXT:    ;;#ASMEND
9845; GFX940-NEXT:    s_setpc_b64 s[30:31]
9846  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9847  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
9848  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9849  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9850  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 1, i32 1>
9851  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9852  ret void
9853}
9854
; SGPR variant with two inputs, mask <5, 0, 1, 1>: result element 0 reads
; element 2 of %extract31 (the slice of %vec1); result elements 1-3 read
; elements 0, 1 and 1 of %extract3 (the slice of %vec0). The checks expect s8
; to be packed from the unshifted first register of the first def before that
; register is overwritten by s_lshr_b32, so the order of those two
; instructions matters.
9855define void @s_shuffle_v4bf16_v3bf16__5_0_1_1() {
9856; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_1_1:
9857; GFX900:       ; %bb.0:
9858; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9859; GFX900-NEXT:    ;;#ASMSTART
9860; GFX900-NEXT:    ; def s[4:5]
9861; GFX900-NEXT:    ;;#ASMEND
9862; GFX900-NEXT:    ;;#ASMSTART
9863; GFX900-NEXT:    ; def s[6:7]
9864; GFX900-NEXT:    ;;#ASMEND
9865; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
9866; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
9867; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9868; GFX900-NEXT:    ;;#ASMSTART
9869; GFX900-NEXT:    ; use s[8:9]
9870; GFX900-NEXT:    ;;#ASMEND
9871; GFX900-NEXT:    s_setpc_b64 s[30:31]
9872;
9873; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_1_1:
9874; GFX90A:       ; %bb.0:
9875; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9876; GFX90A-NEXT:    ;;#ASMSTART
9877; GFX90A-NEXT:    ; def s[4:5]
9878; GFX90A-NEXT:    ;;#ASMEND
9879; GFX90A-NEXT:    ;;#ASMSTART
9880; GFX90A-NEXT:    ; def s[6:7]
9881; GFX90A-NEXT:    ;;#ASMEND
9882; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
9883; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
9884; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9885; GFX90A-NEXT:    ;;#ASMSTART
9886; GFX90A-NEXT:    ; use s[8:9]
9887; GFX90A-NEXT:    ;;#ASMEND
9888; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9889;
9890; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_1_1:
9891; GFX940:       ; %bb.0:
9892; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9893; GFX940-NEXT:    ;;#ASMSTART
9894; GFX940-NEXT:    ; def s[0:1]
9895; GFX940-NEXT:    ;;#ASMEND
9896; GFX940-NEXT:    ;;#ASMSTART
9897; GFX940-NEXT:    ; def s[2:3]
9898; GFX940-NEXT:    ;;#ASMEND
9899; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
9900; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
9901; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
9902; GFX940-NEXT:    ;;#ASMSTART
9903; GFX940-NEXT:    ; use s[8:9]
9904; GFX940-NEXT:    ;;#ASMEND
9905; GFX940-NEXT:    s_setpc_b64 s[30:31]
9906  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9907  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
9908  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9909  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9910  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 1, i32 1>
9911  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9912  ret void
9913}
9914
; SGPR variant with two inputs, mask <5, 2, 1, 1>: result element 0 reads
; element 2 of %extract31 (the slice of %vec1); result elements 1-3 read
; elements 2, 1 and 1 of %extract3 (the slice of %vec0). The checks expect s8
; to be packed from the second register of each def, and s9 from the shifted
; first register of the first def paired with itself.
9915define void @s_shuffle_v4bf16_v3bf16__5_2_1_1() {
9916; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_1_1:
9917; GFX900:       ; %bb.0:
9918; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9919; GFX900-NEXT:    ;;#ASMSTART
9920; GFX900-NEXT:    ; def s[4:5]
9921; GFX900-NEXT:    ;;#ASMEND
9922; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
9923; GFX900-NEXT:    ;;#ASMSTART
9924; GFX900-NEXT:    ; def s[6:7]
9925; GFX900-NEXT:    ;;#ASMEND
9926; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
9927; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9928; GFX900-NEXT:    ;;#ASMSTART
9929; GFX900-NEXT:    ; use s[8:9]
9930; GFX900-NEXT:    ;;#ASMEND
9931; GFX900-NEXT:    s_setpc_b64 s[30:31]
9932;
9933; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_1_1:
9934; GFX90A:       ; %bb.0:
9935; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9936; GFX90A-NEXT:    ;;#ASMSTART
9937; GFX90A-NEXT:    ; def s[4:5]
9938; GFX90A-NEXT:    ;;#ASMEND
9939; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
9940; GFX90A-NEXT:    ;;#ASMSTART
9941; GFX90A-NEXT:    ; def s[6:7]
9942; GFX90A-NEXT:    ;;#ASMEND
9943; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
9944; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9945; GFX90A-NEXT:    ;;#ASMSTART
9946; GFX90A-NEXT:    ; use s[8:9]
9947; GFX90A-NEXT:    ;;#ASMEND
9948; GFX90A-NEXT:    s_setpc_b64 s[30:31]
9949;
9950; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_1_1:
9951; GFX940:       ; %bb.0:
9952; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9953; GFX940-NEXT:    ;;#ASMSTART
9954; GFX940-NEXT:    ; def s[0:1]
9955; GFX940-NEXT:    ;;#ASMEND
9956; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
9957; GFX940-NEXT:    ;;#ASMSTART
9958; GFX940-NEXT:    ; def s[2:3]
9959; GFX940-NEXT:    ;;#ASMEND
9960; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s1
9961; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
9962; GFX940-NEXT:    ;;#ASMSTART
9963; GFX940-NEXT:    ; use s[8:9]
9964; GFX940-NEXT:    ;;#ASMEND
9965; GFX940-NEXT:    s_setpc_b64 s[30:31]
9966  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
9967  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
9968  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9969  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
9970  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 1, i32 1>
9971  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
9972  ret void
9973}
9974
; SGPR variant with two inputs, mask <5, 3, 1, 1>: result elements 0 and 1
; read elements 2 and 0 of %extract31 (the slice of %vec1); result elements 2
; and 3 read element 1 of %extract3 (the slice of %vec0). The checks expect
; s8 to be packed from the two registers of the second def with no shift, and
; s9 from the shifted first register of the first def paired with itself.
9975define void @s_shuffle_v4bf16_v3bf16__5_3_1_1() {
9976; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_1_1:
9977; GFX900:       ; %bb.0:
9978; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9979; GFX900-NEXT:    ;;#ASMSTART
9980; GFX900-NEXT:    ; def s[4:5]
9981; GFX900-NEXT:    ;;#ASMEND
9982; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
9983; GFX900-NEXT:    ;;#ASMSTART
9984; GFX900-NEXT:    ; def s[6:7]
9985; GFX900-NEXT:    ;;#ASMEND
9986; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s6
9987; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
9988; GFX900-NEXT:    ;;#ASMSTART
9989; GFX900-NEXT:    ; use s[8:9]
9990; GFX900-NEXT:    ;;#ASMEND
9991; GFX900-NEXT:    s_setpc_b64 s[30:31]
9992;
9993; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_1_1:
9994; GFX90A:       ; %bb.0:
9995; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9996; GFX90A-NEXT:    ;;#ASMSTART
9997; GFX90A-NEXT:    ; def s[4:5]
9998; GFX90A-NEXT:    ;;#ASMEND
9999; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
10000; GFX90A-NEXT:    ;;#ASMSTART
10001; GFX90A-NEXT:    ; def s[6:7]
10002; GFX90A-NEXT:    ;;#ASMEND
10003; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s6
10004; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
10005; GFX90A-NEXT:    ;;#ASMSTART
10006; GFX90A-NEXT:    ; use s[8:9]
10007; GFX90A-NEXT:    ;;#ASMEND
10008; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10009;
10010; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_1_1:
10011; GFX940:       ; %bb.0:
10012; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10013; GFX940-NEXT:    ;;#ASMSTART
10014; GFX940-NEXT:    ; def s[0:1]
10015; GFX940-NEXT:    ;;#ASMEND
10016; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
10017; GFX940-NEXT:    ;;#ASMSTART
10018; GFX940-NEXT:    ; def s[2:3]
10019; GFX940-NEXT:    ;;#ASMEND
10020; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s2
10021; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
10022; GFX940-NEXT:    ;;#ASMSTART
10023; GFX940-NEXT:    ; use s[8:9]
10024; GFX940-NEXT:    ;;#ASMEND
10025; GFX940-NEXT:    s_setpc_b64 s[30:31]
10026  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10027  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
10028  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10029  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10030  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 1, i32 1>
10031  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10032  ret void
10033}
10034
; SGPR variant with two inputs, mask <5, 4, 1, 1>: result elements 0 and 1
; read elements 2 and 1 of %extract31 (the slice of %vec1); result elements 2
; and 3 read element 1 of %extract3 (the slice of %vec0). The checks expect
; both defs first, then an s_lshr_b32 by 16 on the first register of each
; def, then two s_pack_ll_b32_b16 writing s8 and s9.
10035define void @s_shuffle_v4bf16_v3bf16__5_4_1_1() {
10036; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_1_1:
10037; GFX900:       ; %bb.0:
10038; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10039; GFX900-NEXT:    ;;#ASMSTART
10040; GFX900-NEXT:    ; def s[4:5]
10041; GFX900-NEXT:    ;;#ASMEND
10042; GFX900-NEXT:    ;;#ASMSTART
10043; GFX900-NEXT:    ; def s[6:7]
10044; GFX900-NEXT:    ;;#ASMEND
10045; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
10046; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
10047; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
10048; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
10049; GFX900-NEXT:    ;;#ASMSTART
10050; GFX900-NEXT:    ; use s[8:9]
10051; GFX900-NEXT:    ;;#ASMEND
10052; GFX900-NEXT:    s_setpc_b64 s[30:31]
10053;
10054; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_1_1:
10055; GFX90A:       ; %bb.0:
10056; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10057; GFX90A-NEXT:    ;;#ASMSTART
10058; GFX90A-NEXT:    ; def s[4:5]
10059; GFX90A-NEXT:    ;;#ASMEND
10060; GFX90A-NEXT:    ;;#ASMSTART
10061; GFX90A-NEXT:    ; def s[6:7]
10062; GFX90A-NEXT:    ;;#ASMEND
10063; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
10064; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
10065; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
10066; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
10067; GFX90A-NEXT:    ;;#ASMSTART
10068; GFX90A-NEXT:    ; use s[8:9]
10069; GFX90A-NEXT:    ;;#ASMEND
10070; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10071;
10072; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_1_1:
10073; GFX940:       ; %bb.0:
10074; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10075; GFX940-NEXT:    ;;#ASMSTART
10076; GFX940-NEXT:    ; def s[0:1]
10077; GFX940-NEXT:    ;;#ASMEND
10078; GFX940-NEXT:    ;;#ASMSTART
10079; GFX940-NEXT:    ; def s[2:3]
10080; GFX940-NEXT:    ;;#ASMEND
10081; GFX940-NEXT:    s_lshr_b32 s1, s2, 16
10082; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
10083; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s1
10084; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
10085; GFX940-NEXT:    ;;#ASMSTART
10086; GFX940-NEXT:    ; use s[8:9]
10087; GFX940-NEXT:    ;;#ASMEND
10088; GFX940-NEXT:    s_setpc_b64 s[30:31]
10089  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10090  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
10091  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10092  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10093  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 1, i32 1>
10094  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10095  ret void
10096}
10097
; SGPR variant with two inputs, mask <5, 5, 1, 1>: result elements 0 and 1
; both read element 2 of %extract31 (the slice of %vec1); result elements 2
; and 3 both read element 1 of %extract3 (the slice of %vec0). The checks
; expect each output register to be an s_pack_ll_b32_b16 of one source with
; itself: s8 from the second def's second register, s9 from the shifted first
; register of the first def.
10098define void @s_shuffle_v4bf16_v3bf16__5_5_1_1() {
10099; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_1:
10100; GFX900:       ; %bb.0:
10101; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10102; GFX900-NEXT:    ;;#ASMSTART
10103; GFX900-NEXT:    ; def s[4:5]
10104; GFX900-NEXT:    ;;#ASMEND
10105; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
10106; GFX900-NEXT:    ;;#ASMSTART
10107; GFX900-NEXT:    ; def s[6:7]
10108; GFX900-NEXT:    ;;#ASMEND
10109; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
10110; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
10111; GFX900-NEXT:    ;;#ASMSTART
10112; GFX900-NEXT:    ; use s[8:9]
10113; GFX900-NEXT:    ;;#ASMEND
10114; GFX900-NEXT:    s_setpc_b64 s[30:31]
10115;
10116; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_1:
10117; GFX90A:       ; %bb.0:
10118; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10119; GFX90A-NEXT:    ;;#ASMSTART
10120; GFX90A-NEXT:    ; def s[4:5]
10121; GFX90A-NEXT:    ;;#ASMEND
10122; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
10123; GFX90A-NEXT:    ;;#ASMSTART
10124; GFX90A-NEXT:    ; def s[6:7]
10125; GFX90A-NEXT:    ;;#ASMEND
10126; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
10127; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
10128; GFX90A-NEXT:    ;;#ASMSTART
10129; GFX90A-NEXT:    ; use s[8:9]
10130; GFX90A-NEXT:    ;;#ASMEND
10131; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10132;
10133; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_1:
10134; GFX940:       ; %bb.0:
10135; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10136; GFX940-NEXT:    ;;#ASMSTART
10137; GFX940-NEXT:    ; def s[0:1]
10138; GFX940-NEXT:    ;;#ASMEND
10139; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
10140; GFX940-NEXT:    ;;#ASMSTART
10141; GFX940-NEXT:    ; def s[2:3]
10142; GFX940-NEXT:    ;;#ASMEND
10143; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
10144; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
10145; GFX940-NEXT:    ;;#ASMSTART
10146; GFX940-NEXT:    ; use s[8:9]
10147; GFX940-NEXT:    ;;#ASMEND
10148; GFX940-NEXT:    s_setpc_b64 s[30:31]
10149  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10150  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
10151  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10152  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10153  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 1>
10154  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10155  ret void
10156}
10157
; SGPR variant with two inputs, mask <5, 5, poison, 1>: result elements 0 and
; 1 both read element 2 of %extract31 (the slice of %vec1), result element 2
; is poison, and result element 3 reads element 1 of %extract3 (the slice of
; %vec0). Because result element 2 is unconstrained, the checks expect no
; shift at all: s9 is a plain s_mov_b32 copy of the first def's first
; register, and s8 is an s_pack_ll_b32_b16 of the second def's second
; register with itself.
10158define void @s_shuffle_v4bf16_v3bf16__5_5_u_1() {
10159; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_1:
10160; GFX900:       ; %bb.0:
10161; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10162; GFX900-NEXT:    ;;#ASMSTART
10163; GFX900-NEXT:    ; def s[4:5]
10164; GFX900-NEXT:    ;;#ASMEND
10165; GFX900-NEXT:    ;;#ASMSTART
10166; GFX900-NEXT:    ; def s[6:7]
10167; GFX900-NEXT:    ;;#ASMEND
10168; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
10169; GFX900-NEXT:    s_mov_b32 s9, s4
10170; GFX900-NEXT:    ;;#ASMSTART
10171; GFX900-NEXT:    ; use s[8:9]
10172; GFX900-NEXT:    ;;#ASMEND
10173; GFX900-NEXT:    s_setpc_b64 s[30:31]
10174;
10175; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_1:
10176; GFX90A:       ; %bb.0:
10177; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10178; GFX90A-NEXT:    ;;#ASMSTART
10179; GFX90A-NEXT:    ; def s[4:5]
10180; GFX90A-NEXT:    ;;#ASMEND
10181; GFX90A-NEXT:    ;;#ASMSTART
10182; GFX90A-NEXT:    ; def s[6:7]
10183; GFX90A-NEXT:    ;;#ASMEND
10184; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
10185; GFX90A-NEXT:    s_mov_b32 s9, s4
10186; GFX90A-NEXT:    ;;#ASMSTART
10187; GFX90A-NEXT:    ; use s[8:9]
10188; GFX90A-NEXT:    ;;#ASMEND
10189; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10190;
10191; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_1:
10192; GFX940:       ; %bb.0:
10193; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10194; GFX940-NEXT:    ;;#ASMSTART
10195; GFX940-NEXT:    ; def s[0:1]
10196; GFX940-NEXT:    ;;#ASMEND
10197; GFX940-NEXT:    ;;#ASMSTART
10198; GFX940-NEXT:    ; def s[2:3]
10199; GFX940-NEXT:    ;;#ASMEND
10200; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
10201; GFX940-NEXT:    s_mov_b32 s9, s0
10202; GFX940-NEXT:    ;;#ASMSTART
10203; GFX940-NEXT:    ; use s[8:9]
10204; GFX940-NEXT:    ;;#ASMEND
10205; GFX940-NEXT:    s_setpc_b64 s[30:31]
10206  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10207  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
10208  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10209  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10210  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 1>
10211  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10212  ret void
10213}
10214
; SGPR shuffle, mask <5, 5, 0, 1>. Both inputs are inline-asm "=s" defs of
; <4 x bfloat> narrowed to their first three elements. Indices 3-5 address the
; second shuffle operand, so lanes 0-1 replicate element 2 of %vec1, while
; lanes 2-3 are elements 0-1 of %vec0. The "use" asm pins the result to s[8:9].
; The GFX900/GFX90A/GFX940 checks expect one s_pack_ll_b32_b16 (for s8) and
; one s_mov_b32 (for s9).
10215define void @s_shuffle_v4bf16_v3bf16__5_5_0_1() {
10216; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_1:
10217; GFX900:       ; %bb.0:
10218; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10219; GFX900-NEXT:    ;;#ASMSTART
10220; GFX900-NEXT:    ; def s[4:5]
10221; GFX900-NEXT:    ;;#ASMEND
10222; GFX900-NEXT:    ;;#ASMSTART
10223; GFX900-NEXT:    ; def s[6:7]
10224; GFX900-NEXT:    ;;#ASMEND
10225; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
10226; GFX900-NEXT:    s_mov_b32 s9, s4
10227; GFX900-NEXT:    ;;#ASMSTART
10228; GFX900-NEXT:    ; use s[8:9]
10229; GFX900-NEXT:    ;;#ASMEND
10230; GFX900-NEXT:    s_setpc_b64 s[30:31]
10231;
10232; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_1:
10233; GFX90A:       ; %bb.0:
10234; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10235; GFX90A-NEXT:    ;;#ASMSTART
10236; GFX90A-NEXT:    ; def s[4:5]
10237; GFX90A-NEXT:    ;;#ASMEND
10238; GFX90A-NEXT:    ;;#ASMSTART
10239; GFX90A-NEXT:    ; def s[6:7]
10240; GFX90A-NEXT:    ;;#ASMEND
10241; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
10242; GFX90A-NEXT:    s_mov_b32 s9, s4
10243; GFX90A-NEXT:    ;;#ASMSTART
10244; GFX90A-NEXT:    ; use s[8:9]
10245; GFX90A-NEXT:    ;;#ASMEND
10246; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10247;
10248; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_1:
10249; GFX940:       ; %bb.0:
10250; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10251; GFX940-NEXT:    ;;#ASMSTART
10252; GFX940-NEXT:    ; def s[0:1]
10253; GFX940-NEXT:    ;;#ASMEND
10254; GFX940-NEXT:    ;;#ASMSTART
10255; GFX940-NEXT:    ; def s[2:3]
10256; GFX940-NEXT:    ;;#ASMEND
10257; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
10258; GFX940-NEXT:    s_mov_b32 s9, s0
10259; GFX940-NEXT:    ;;#ASMSTART
10260; GFX940-NEXT:    ; use s[8:9]
10261; GFX940-NEXT:    ;;#ASMEND
10262; GFX940-NEXT:    s_setpc_b64 s[30:31]
10263  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10264  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
10265  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10266  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10267  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 1>
10268  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10269  ret void
10270}
10271
; SGPR shuffle, mask <5, 5, 2, 1>. Both inputs are inline-asm "=s" defs of
; <4 x bfloat> narrowed to their first three elements. Lanes 0-1 replicate
; element 2 of %vec1 (indices 3-5 address the second operand); lane 2 is
; element 2 and lane 3 is element 1 of %vec0. The result is pinned to s[8:9].
; The checks expect an s_lshr_b32 by 16 followed by two s_pack_ll_b32_b16.
10272define void @s_shuffle_v4bf16_v3bf16__5_5_2_1() {
10273; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_1:
10274; GFX900:       ; %bb.0:
10275; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10276; GFX900-NEXT:    ;;#ASMSTART
10277; GFX900-NEXT:    ; def s[4:5]
10278; GFX900-NEXT:    ;;#ASMEND
10279; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
10280; GFX900-NEXT:    ;;#ASMSTART
10281; GFX900-NEXT:    ; def s[6:7]
10282; GFX900-NEXT:    ;;#ASMEND
10283; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
10284; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
10285; GFX900-NEXT:    ;;#ASMSTART
10286; GFX900-NEXT:    ; use s[8:9]
10287; GFX900-NEXT:    ;;#ASMEND
10288; GFX900-NEXT:    s_setpc_b64 s[30:31]
10289;
10290; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_1:
10291; GFX90A:       ; %bb.0:
10292; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10293; GFX90A-NEXT:    ;;#ASMSTART
10294; GFX90A-NEXT:    ; def s[4:5]
10295; GFX90A-NEXT:    ;;#ASMEND
10296; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
10297; GFX90A-NEXT:    ;;#ASMSTART
10298; GFX90A-NEXT:    ; def s[6:7]
10299; GFX90A-NEXT:    ;;#ASMEND
10300; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
10301; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
10302; GFX90A-NEXT:    ;;#ASMSTART
10303; GFX90A-NEXT:    ; use s[8:9]
10304; GFX90A-NEXT:    ;;#ASMEND
10305; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10306;
10307; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_1:
10308; GFX940:       ; %bb.0:
10309; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10310; GFX940-NEXT:    ;;#ASMSTART
10311; GFX940-NEXT:    ; def s[0:1]
10312; GFX940-NEXT:    ;;#ASMEND
10313; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
10314; GFX940-NEXT:    ;;#ASMSTART
10315; GFX940-NEXT:    ; def s[2:3]
10316; GFX940-NEXT:    ;;#ASMEND
10317; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s0
10318; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
10319; GFX940-NEXT:    ;;#ASMSTART
10320; GFX940-NEXT:    ; use s[8:9]
10321; GFX940-NEXT:    ;;#ASMEND
10322; GFX940-NEXT:    s_setpc_b64 s[30:31]
10323  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10324  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
10325  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10326  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10327  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 1>
10328  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10329  ret void
10330}
10331
; SGPR shuffle, mask <5, 5, 3, 1>. Both inputs are inline-asm "=s" defs of
; <4 x bfloat> narrowed to their first three elements. Lanes 0-1 replicate
; element 2 of %vec1, lane 2 is element 0 of %vec1 (index 3), and lane 3 is
; element 1 of %vec0. The result is pinned to s[8:9] by the "use" asm.
; The checks expect an s_lshr_b32 by 16 followed by two s_pack_ll_b32_b16.
10332define void @s_shuffle_v4bf16_v3bf16__5_5_3_1() {
10333; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_1:
10334; GFX900:       ; %bb.0:
10335; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10336; GFX900-NEXT:    ;;#ASMSTART
10337; GFX900-NEXT:    ; def s[4:5]
10338; GFX900-NEXT:    ;;#ASMEND
10339; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
10340; GFX900-NEXT:    ;;#ASMSTART
10341; GFX900-NEXT:    ; def s[6:7]
10342; GFX900-NEXT:    ;;#ASMEND
10343; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s6, s4
10344; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
10345; GFX900-NEXT:    ;;#ASMSTART
10346; GFX900-NEXT:    ; use s[8:9]
10347; GFX900-NEXT:    ;;#ASMEND
10348; GFX900-NEXT:    s_setpc_b64 s[30:31]
10349;
10350; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_1:
10351; GFX90A:       ; %bb.0:
10352; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10353; GFX90A-NEXT:    ;;#ASMSTART
10354; GFX90A-NEXT:    ; def s[4:5]
10355; GFX90A-NEXT:    ;;#ASMEND
10356; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
10357; GFX90A-NEXT:    ;;#ASMSTART
10358; GFX90A-NEXT:    ; def s[6:7]
10359; GFX90A-NEXT:    ;;#ASMEND
10360; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s6, s4
10361; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
10362; GFX90A-NEXT:    ;;#ASMSTART
10363; GFX90A-NEXT:    ; use s[8:9]
10364; GFX90A-NEXT:    ;;#ASMEND
10365; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10366;
10367; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_1:
10368; GFX940:       ; %bb.0:
10369; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10370; GFX940-NEXT:    ;;#ASMSTART
10371; GFX940-NEXT:    ; def s[0:1]
10372; GFX940-NEXT:    ;;#ASMEND
10373; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
10374; GFX940-NEXT:    ;;#ASMSTART
10375; GFX940-NEXT:    ; def s[2:3]
10376; GFX940-NEXT:    ;;#ASMEND
10377; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s2, s0
10378; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
10379; GFX940-NEXT:    ;;#ASMSTART
10380; GFX940-NEXT:    ; use s[8:9]
10381; GFX940-NEXT:    ;;#ASMEND
10382; GFX940-NEXT:    s_setpc_b64 s[30:31]
10383  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10384  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
10385  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10386  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10387  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 1>
10388  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10389  ret void
10390}
10391
; SGPR shuffle, mask <5, 5, 4, 1>. Both inputs are inline-asm "=s" defs of
; <4 x bfloat> narrowed to their first three elements. Lanes 0-1 replicate
; element 2 of %vec1, lane 2 is element 1 of %vec1 (index 4), and lane 3 is
; element 1 of %vec0. The result is pinned to s[8:9] by the "use" asm.
; The checks expect two s_lshr_b32 by 16 followed by two s_pack_ll_b32_b16.
10392define void @s_shuffle_v4bf16_v3bf16__5_5_4_1() {
10393; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_1:
10394; GFX900:       ; %bb.0:
10395; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10396; GFX900-NEXT:    ;;#ASMSTART
10397; GFX900-NEXT:    ; def s[4:5]
10398; GFX900-NEXT:    ;;#ASMEND
10399; GFX900-NEXT:    ;;#ASMSTART
10400; GFX900-NEXT:    ; def s[6:7]
10401; GFX900-NEXT:    ;;#ASMEND
10402; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
10403; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
10404; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
10405; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
10406; GFX900-NEXT:    ;;#ASMSTART
10407; GFX900-NEXT:    ; use s[8:9]
10408; GFX900-NEXT:    ;;#ASMEND
10409; GFX900-NEXT:    s_setpc_b64 s[30:31]
10410;
10411; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_1:
10412; GFX90A:       ; %bb.0:
10413; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10414; GFX90A-NEXT:    ;;#ASMSTART
10415; GFX90A-NEXT:    ; def s[4:5]
10416; GFX90A-NEXT:    ;;#ASMEND
10417; GFX90A-NEXT:    ;;#ASMSTART
10418; GFX90A-NEXT:    ; def s[6:7]
10419; GFX90A-NEXT:    ;;#ASMEND
10420; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
10421; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
10422; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
10423; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
10424; GFX90A-NEXT:    ;;#ASMSTART
10425; GFX90A-NEXT:    ; use s[8:9]
10426; GFX90A-NEXT:    ;;#ASMEND
10427; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10428;
10429; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_1:
10430; GFX940:       ; %bb.0:
10431; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10432; GFX940-NEXT:    ;;#ASMSTART
10433; GFX940-NEXT:    ; def s[0:1]
10434; GFX940-NEXT:    ;;#ASMEND
10435; GFX940-NEXT:    ;;#ASMSTART
10436; GFX940-NEXT:    ; def s[2:3]
10437; GFX940-NEXT:    ;;#ASMEND
10438; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
10439; GFX940-NEXT:    s_lshr_b32 s1, s2, 16
10440; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s0
10441; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
10442; GFX940-NEXT:    ;;#ASMSTART
10443; GFX940-NEXT:    ; use s[8:9]
10444; GFX940-NEXT:    ;;#ASMEND
10445; GFX940-NEXT:    s_setpc_b64 s[30:31]
10446  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10447  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
10448  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10449  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10450  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 1>
10451  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10452  ret void
10453}
10454
; SGPR shuffle, mask <poison, 2, 2, 2>, single-input form: %vec0 is an
; inline-asm "=s" def of <4 x bfloat> narrowed to its first three elements and
; the second shuffle operand is poison. Lane 0 is unspecified; lanes 1-3
; replicate element 2 of %vec0. The result is pinned to s[8:9] by the "use" asm.
; The checks expect one s_pack_ll_b32_b16 (for s9) and one s_lshl_b32 by 16
; (for s8).
10455define void @s_shuffle_v4bf16_v3bf16__u_2_2_2() {
10456; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__u_2_2_2:
10457; GFX900:       ; %bb.0:
10458; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10459; GFX900-NEXT:    ;;#ASMSTART
10460; GFX900-NEXT:    ; def s[4:5]
10461; GFX900-NEXT:    ;;#ASMEND
10462; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10463; GFX900-NEXT:    s_lshl_b32 s8, s5, 16
10464; GFX900-NEXT:    ;;#ASMSTART
10465; GFX900-NEXT:    ; use s[8:9]
10466; GFX900-NEXT:    ;;#ASMEND
10467; GFX900-NEXT:    s_setpc_b64 s[30:31]
10468;
10469; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__u_2_2_2:
10470; GFX90A:       ; %bb.0:
10471; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10472; GFX90A-NEXT:    ;;#ASMSTART
10473; GFX90A-NEXT:    ; def s[4:5]
10474; GFX90A-NEXT:    ;;#ASMEND
10475; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10476; GFX90A-NEXT:    s_lshl_b32 s8, s5, 16
10477; GFX90A-NEXT:    ;;#ASMSTART
10478; GFX90A-NEXT:    ; use s[8:9]
10479; GFX90A-NEXT:    ;;#ASMEND
10480; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10481;
10482; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__u_2_2_2:
10483; GFX940:       ; %bb.0:
10484; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10485; GFX940-NEXT:    ;;#ASMSTART
10486; GFX940-NEXT:    ; def s[0:1]
10487; GFX940-NEXT:    ;;#ASMEND
10488; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
10489; GFX940-NEXT:    s_lshl_b32 s8, s1, 16
10490; GFX940-NEXT:    ;;#ASMSTART
10491; GFX940-NEXT:    ; use s[8:9]
10492; GFX940-NEXT:    ;;#ASMEND
10493; GFX940-NEXT:    s_setpc_b64 s[30:31]
10494  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10495  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10496  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 2, i32 2, i32 2>
10497  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10498  ret void
10499}
10500
; SGPR shuffle, mask <0, 2, 2, 2>, single-input form: %vec0 is an inline-asm
; "=s" def of <4 x bfloat> narrowed to its first three elements and the second
; shuffle operand is poison. Lane 0 is element 0; lanes 1-3 replicate
; element 2. The result is pinned to s[8:9] by the "use" asm.
; The checks expect two s_pack_ll_b32_b16, one per result dword.
10501define void @s_shuffle_v4bf16_v3bf16__0_2_2_2() {
10502; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_2_2_2:
10503; GFX900:       ; %bb.0:
10504; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10505; GFX900-NEXT:    ;;#ASMSTART
10506; GFX900-NEXT:    ; def s[4:5]
10507; GFX900-NEXT:    ;;#ASMEND
10508; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
10509; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10510; GFX900-NEXT:    ;;#ASMSTART
10511; GFX900-NEXT:    ; use s[8:9]
10512; GFX900-NEXT:    ;;#ASMEND
10513; GFX900-NEXT:    s_setpc_b64 s[30:31]
10514;
10515; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_2_2_2:
10516; GFX90A:       ; %bb.0:
10517; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10518; GFX90A-NEXT:    ;;#ASMSTART
10519; GFX90A-NEXT:    ; def s[4:5]
10520; GFX90A-NEXT:    ;;#ASMEND
10521; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
10522; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10523; GFX90A-NEXT:    ;;#ASMSTART
10524; GFX90A-NEXT:    ; use s[8:9]
10525; GFX90A-NEXT:    ;;#ASMEND
10526; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10527;
10528; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_2_2_2:
10529; GFX940:       ; %bb.0:
10530; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10531; GFX940-NEXT:    ;;#ASMSTART
10532; GFX940-NEXT:    ; def s[0:1]
10533; GFX940-NEXT:    ;;#ASMEND
10534; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
10535; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
10536; GFX940-NEXT:    ;;#ASMSTART
10537; GFX940-NEXT:    ; use s[8:9]
10538; GFX940-NEXT:    ;;#ASMEND
10539; GFX940-NEXT:    s_setpc_b64 s[30:31]
10540  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10541  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10542  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 2, i32 2, i32 2>
10543  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10544  ret void
10545}
10546
; SGPR shuffle, mask <1, 2, 2, 2>, single-input form: %vec0 is an inline-asm
; "=s" def of <4 x bfloat> narrowed to its first three elements and the second
; shuffle operand is poison. Lane 0 is element 1; lanes 1-3 replicate
; element 2. The result is pinned to s[8:9] by the "use" asm.
; The checks expect an s_lshr_b32 by 16 followed by two s_pack_ll_b32_b16.
10547define void @s_shuffle_v4bf16_v3bf16__1_2_2_2() {
10548; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_2_2_2:
10549; GFX900:       ; %bb.0:
10550; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10551; GFX900-NEXT:    ;;#ASMSTART
10552; GFX900-NEXT:    ; def s[4:5]
10553; GFX900-NEXT:    ;;#ASMEND
10554; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
10555; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
10556; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10557; GFX900-NEXT:    ;;#ASMSTART
10558; GFX900-NEXT:    ; use s[8:9]
10559; GFX900-NEXT:    ;;#ASMEND
10560; GFX900-NEXT:    s_setpc_b64 s[30:31]
10561;
10562; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_2_2_2:
10563; GFX90A:       ; %bb.0:
10564; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10565; GFX90A-NEXT:    ;;#ASMSTART
10566; GFX90A-NEXT:    ; def s[4:5]
10567; GFX90A-NEXT:    ;;#ASMEND
10568; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
10569; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
10570; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10571; GFX90A-NEXT:    ;;#ASMSTART
10572; GFX90A-NEXT:    ; use s[8:9]
10573; GFX90A-NEXT:    ;;#ASMEND
10574; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10575;
10576; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_2_2_2:
10577; GFX940:       ; %bb.0:
10578; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10579; GFX940-NEXT:    ;;#ASMSTART
10580; GFX940-NEXT:    ; def s[0:1]
10581; GFX940-NEXT:    ;;#ASMEND
10582; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
10583; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
10584; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
10585; GFX940-NEXT:    ;;#ASMSTART
10586; GFX940-NEXT:    ; use s[8:9]
10587; GFX940-NEXT:    ;;#ASMEND
10588; GFX940-NEXT:    s_setpc_b64 s[30:31]
10589  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10590  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10591  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 2, i32 2, i32 2>
10592  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10593  ret void
10594}
10595
; SGPR shuffle, mask <2, 2, 2, 2>, single-input form: %vec0 is an inline-asm
; "=s" def of <4 x bfloat> narrowed to its first three elements and the second
; shuffle operand is poison. All four lanes replicate element 2 (a splat).
; The result is pinned to s[8:9] by the "use" asm.
; The checks expect one s_pack_ll_b32_b16 into s8, then s9 copied from s8.
10596define void @s_shuffle_v4bf16_v3bf16__2_2_2_2() {
10597; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_2_2_2:
10598; GFX900:       ; %bb.0:
10599; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10600; GFX900-NEXT:    ;;#ASMSTART
10601; GFX900-NEXT:    ; def s[4:5]
10602; GFX900-NEXT:    ;;#ASMEND
10603; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
10604; GFX900-NEXT:    s_mov_b32 s9, s8
10605; GFX900-NEXT:    ;;#ASMSTART
10606; GFX900-NEXT:    ; use s[8:9]
10607; GFX900-NEXT:    ;;#ASMEND
10608; GFX900-NEXT:    s_setpc_b64 s[30:31]
10609;
10610; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_2_2_2:
10611; GFX90A:       ; %bb.0:
10612; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10613; GFX90A-NEXT:    ;;#ASMSTART
10614; GFX90A-NEXT:    ; def s[4:5]
10615; GFX90A-NEXT:    ;;#ASMEND
10616; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
10617; GFX90A-NEXT:    s_mov_b32 s9, s8
10618; GFX90A-NEXT:    ;;#ASMSTART
10619; GFX90A-NEXT:    ; use s[8:9]
10620; GFX90A-NEXT:    ;;#ASMEND
10621; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10622;
10623; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_2_2_2:
10624; GFX940:       ; %bb.0:
10625; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10626; GFX940-NEXT:    ;;#ASMSTART
10627; GFX940-NEXT:    ; def s[0:1]
10628; GFX940-NEXT:    ;;#ASMEND
10629; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
10630; GFX940-NEXT:    s_mov_b32 s9, s8
10631; GFX940-NEXT:    ;;#ASMSTART
10632; GFX940-NEXT:    ; use s[8:9]
10633; GFX940-NEXT:    ;;#ASMEND
10634; GFX940-NEXT:    s_setpc_b64 s[30:31]
10635  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10636  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10637  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
10638  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10639  ret void
10640}
10641
; SGPR shuffle, mask <3, 2, 2, 2>, single-input form: %vec0 is an inline-asm
; "=s" def of <4 x bfloat> narrowed to its first three elements and the second
; shuffle operand is poison. Index 3 addresses element 0 of that poison
; operand, so lane 0 carries no defined value; lanes 1-3 replicate element 2
; of %vec0. The result is pinned to s[8:9] by the "use" asm.
; The checks expect the same sequence as the <poison, 2, 2, 2> variant
; (s_shuffle_v4bf16_v3bf16__u_2_2_2): one s_pack_ll_b32_b16 and one s_lshl_b32.
10642define void @s_shuffle_v4bf16_v3bf16__3_2_2_2() {
10643; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__3_2_2_2:
10644; GFX900:       ; %bb.0:
10645; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10646; GFX900-NEXT:    ;;#ASMSTART
10647; GFX900-NEXT:    ; def s[4:5]
10648; GFX900-NEXT:    ;;#ASMEND
10649; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10650; GFX900-NEXT:    s_lshl_b32 s8, s5, 16
10651; GFX900-NEXT:    ;;#ASMSTART
10652; GFX900-NEXT:    ; use s[8:9]
10653; GFX900-NEXT:    ;;#ASMEND
10654; GFX900-NEXT:    s_setpc_b64 s[30:31]
10655;
10656; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__3_2_2_2:
10657; GFX90A:       ; %bb.0:
10658; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10659; GFX90A-NEXT:    ;;#ASMSTART
10660; GFX90A-NEXT:    ; def s[4:5]
10661; GFX90A-NEXT:    ;;#ASMEND
10662; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10663; GFX90A-NEXT:    s_lshl_b32 s8, s5, 16
10664; GFX90A-NEXT:    ;;#ASMSTART
10665; GFX90A-NEXT:    ; use s[8:9]
10666; GFX90A-NEXT:    ;;#ASMEND
10667; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10668;
10669; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__3_2_2_2:
10670; GFX940:       ; %bb.0:
10671; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10672; GFX940-NEXT:    ;;#ASMSTART
10673; GFX940-NEXT:    ; def s[0:1]
10674; GFX940-NEXT:    ;;#ASMEND
10675; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
10676; GFX940-NEXT:    s_lshl_b32 s8, s1, 16
10677; GFX940-NEXT:    ;;#ASMSTART
10678; GFX940-NEXT:    ; use s[8:9]
10679; GFX940-NEXT:    ;;#ASMEND
10680; GFX940-NEXT:    s_setpc_b64 s[30:31]
10681  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10682  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10683  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 2, i32 2, i32 2>
10684  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10685  ret void
10686}
10687
; SGPR shuffle, mask <4, 2, 2, 2>. Both inputs are inline-asm "=s" defs of
; <4 x bfloat> narrowed to their first three elements. Lane 0 is element 1 of
; %vec1 (index 4 addresses the second operand); lanes 1-3 replicate element 2
; of %vec0. The result is pinned to s[8:9] by the "use" asm.
; The checks expect an s_lshr_b32 by 16 followed by two s_pack_ll_b32_b16.
10688define void @s_shuffle_v4bf16_v3bf16__4_2_2_2() {
10689; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_2_2_2:
10690; GFX900:       ; %bb.0:
10691; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10692; GFX900-NEXT:    ;;#ASMSTART
10693; GFX900-NEXT:    ; def s[4:5]
10694; GFX900-NEXT:    ;;#ASMEND
10695; GFX900-NEXT:    ;;#ASMSTART
10696; GFX900-NEXT:    ; def s[6:7]
10697; GFX900-NEXT:    ;;#ASMEND
10698; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
10699; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
10700; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10701; GFX900-NEXT:    ;;#ASMSTART
10702; GFX900-NEXT:    ; use s[8:9]
10703; GFX900-NEXT:    ;;#ASMEND
10704; GFX900-NEXT:    s_setpc_b64 s[30:31]
10705;
10706; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_2_2_2:
10707; GFX90A:       ; %bb.0:
10708; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10709; GFX90A-NEXT:    ;;#ASMSTART
10710; GFX90A-NEXT:    ; def s[4:5]
10711; GFX90A-NEXT:    ;;#ASMEND
10712; GFX90A-NEXT:    ;;#ASMSTART
10713; GFX90A-NEXT:    ; def s[6:7]
10714; GFX90A-NEXT:    ;;#ASMEND
10715; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
10716; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
10717; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10718; GFX90A-NEXT:    ;;#ASMSTART
10719; GFX90A-NEXT:    ; use s[8:9]
10720; GFX90A-NEXT:    ;;#ASMEND
10721; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10722;
10723; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_2_2_2:
10724; GFX940:       ; %bb.0:
10725; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10726; GFX940-NEXT:    ;;#ASMSTART
10727; GFX940-NEXT:    ; def s[0:1]
10728; GFX940-NEXT:    ;;#ASMEND
10729; GFX940-NEXT:    ;;#ASMSTART
10730; GFX940-NEXT:    ; def s[2:3]
10731; GFX940-NEXT:    ;;#ASMEND
10732; GFX940-NEXT:    s_lshr_b32 s0, s2, 16
10733; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
10734; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
10735; GFX940-NEXT:    ;;#ASMSTART
10736; GFX940-NEXT:    ; use s[8:9]
10737; GFX940-NEXT:    ;;#ASMEND
10738; GFX940-NEXT:    s_setpc_b64 s[30:31]
10739  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10740  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
10741  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10742  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10743  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 2, i32 2, i32 2>
10744  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10745  ret void
10746}
10747
; SGPR shuffle, mask <5, 2, 2, 2>. Both inputs are inline-asm "=s" defs of
; <4 x bfloat> narrowed to their first three elements. Lane 0 is element 2 of
; %vec1 (index 5 addresses the second operand); lanes 1-3 replicate element 2
; of %vec0. The result is pinned to s[8:9] by the "use" asm.
; The checks expect two s_pack_ll_b32_b16, one per result dword.
10748define void @s_shuffle_v4bf16_v3bf16__5_2_2_2() {
10749; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_2_2:
10750; GFX900:       ; %bb.0:
10751; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10752; GFX900-NEXT:    ;;#ASMSTART
10753; GFX900-NEXT:    ; def s[4:5]
10754; GFX900-NEXT:    ;;#ASMEND
10755; GFX900-NEXT:    ;;#ASMSTART
10756; GFX900-NEXT:    ; def s[6:7]
10757; GFX900-NEXT:    ;;#ASMEND
10758; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
10759; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10760; GFX900-NEXT:    ;;#ASMSTART
10761; GFX900-NEXT:    ; use s[8:9]
10762; GFX900-NEXT:    ;;#ASMEND
10763; GFX900-NEXT:    s_setpc_b64 s[30:31]
10764;
10765; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_2_2:
10766; GFX90A:       ; %bb.0:
10767; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10768; GFX90A-NEXT:    ;;#ASMSTART
10769; GFX90A-NEXT:    ; def s[4:5]
10770; GFX90A-NEXT:    ;;#ASMEND
10771; GFX90A-NEXT:    ;;#ASMSTART
10772; GFX90A-NEXT:    ; def s[6:7]
10773; GFX90A-NEXT:    ;;#ASMEND
10774; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
10775; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10776; GFX90A-NEXT:    ;;#ASMSTART
10777; GFX90A-NEXT:    ; use s[8:9]
10778; GFX90A-NEXT:    ;;#ASMEND
10779; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10780;
10781; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_2_2:
10782; GFX940:       ; %bb.0:
10783; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10784; GFX940-NEXT:    ;;#ASMSTART
10785; GFX940-NEXT:    ; def s[0:1]
10786; GFX940-NEXT:    ;;#ASMEND
10787; GFX940-NEXT:    ;;#ASMSTART
10788; GFX940-NEXT:    ; def s[2:3]
10789; GFX940-NEXT:    ;;#ASMEND
10790; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s1
10791; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
10792; GFX940-NEXT:    ;;#ASMSTART
10793; GFX940-NEXT:    ; use s[8:9]
10794; GFX940-NEXT:    ;;#ASMEND
10795; GFX940-NEXT:    s_setpc_b64 s[30:31]
10796  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10797  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
10798  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10799  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10800  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 2, i32 2>
10801  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10802  ret void
10803}
10804
; SGPR shuffle, mask <5, poison, 2, 2>. Both inputs are inline-asm "=s" defs
; of <4 x bfloat> narrowed to their first three elements. Lane 0 is element 2
; of %vec1 (index 5 addresses the second operand), lane 1 is unspecified, and
; lanes 2-3 replicate element 2 of %vec0. The result is pinned to s[8:9].
; The checks expect one s_pack_ll_b32_b16 (for s9) and a plain s_mov_b32
; (for s8).
10805define void @s_shuffle_v4bf16_v3bf16__5_u_2_2() {
10806; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_2_2:
10807; GFX900:       ; %bb.0:
10808; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10809; GFX900-NEXT:    ;;#ASMSTART
10810; GFX900-NEXT:    ; def s[4:5]
10811; GFX900-NEXT:    ;;#ASMEND
10812; GFX900-NEXT:    ;;#ASMSTART
10813; GFX900-NEXT:    ; def s[6:7]
10814; GFX900-NEXT:    ;;#ASMEND
10815; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10816; GFX900-NEXT:    s_mov_b32 s8, s7
10817; GFX900-NEXT:    ;;#ASMSTART
10818; GFX900-NEXT:    ; use s[8:9]
10819; GFX900-NEXT:    ;;#ASMEND
10820; GFX900-NEXT:    s_setpc_b64 s[30:31]
10821;
10822; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_2_2:
10823; GFX90A:       ; %bb.0:
10824; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10825; GFX90A-NEXT:    ;;#ASMSTART
10826; GFX90A-NEXT:    ; def s[4:5]
10827; GFX90A-NEXT:    ;;#ASMEND
10828; GFX90A-NEXT:    ;;#ASMSTART
10829; GFX90A-NEXT:    ; def s[6:7]
10830; GFX90A-NEXT:    ;;#ASMEND
10831; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10832; GFX90A-NEXT:    s_mov_b32 s8, s7
10833; GFX90A-NEXT:    ;;#ASMSTART
10834; GFX90A-NEXT:    ; use s[8:9]
10835; GFX90A-NEXT:    ;;#ASMEND
10836; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10837;
10838; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_2_2:
10839; GFX940:       ; %bb.0:
10840; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10841; GFX940-NEXT:    ;;#ASMSTART
10842; GFX940-NEXT:    ; def s[0:1]
10843; GFX940-NEXT:    ;;#ASMEND
10844; GFX940-NEXT:    ;;#ASMSTART
10845; GFX940-NEXT:    ; def s[2:3]
10846; GFX940-NEXT:    ;;#ASMEND
10847; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
10848; GFX940-NEXT:    s_mov_b32 s8, s3
10849; GFX940-NEXT:    ;;#ASMSTART
10850; GFX940-NEXT:    ; use s[8:9]
10851; GFX940-NEXT:    ;;#ASMEND
10852; GFX940-NEXT:    s_setpc_b64 s[30:31]
10853  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10854  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
10855  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10856  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10857  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 2, i32 2>
10858  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10859  ret void
10860}
10861
; SGPR shuffle, mask <5, 0, 2, 2>. Both inputs are inline-asm "=s" defs of
; <4 x bfloat> narrowed to their first three elements. Lane 0 is element 2 of
; %vec1 (index 5 addresses the second operand), lane 1 is element 0 of %vec0,
; and lanes 2-3 replicate element 2 of %vec0. The result is pinned to s[8:9].
; The checks expect two s_pack_ll_b32_b16, one per result dword.
10862define void @s_shuffle_v4bf16_v3bf16__5_0_2_2() {
10863; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_2_2:
10864; GFX900:       ; %bb.0:
10865; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10866; GFX900-NEXT:    ;;#ASMSTART
10867; GFX900-NEXT:    ; def s[4:5]
10868; GFX900-NEXT:    ;;#ASMEND
10869; GFX900-NEXT:    ;;#ASMSTART
10870; GFX900-NEXT:    ; def s[6:7]
10871; GFX900-NEXT:    ;;#ASMEND
10872; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
10873; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10874; GFX900-NEXT:    ;;#ASMSTART
10875; GFX900-NEXT:    ; use s[8:9]
10876; GFX900-NEXT:    ;;#ASMEND
10877; GFX900-NEXT:    s_setpc_b64 s[30:31]
10878;
10879; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_2_2:
10880; GFX90A:       ; %bb.0:
10881; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10882; GFX90A-NEXT:    ;;#ASMSTART
10883; GFX90A-NEXT:    ; def s[4:5]
10884; GFX90A-NEXT:    ;;#ASMEND
10885; GFX90A-NEXT:    ;;#ASMSTART
10886; GFX90A-NEXT:    ; def s[6:7]
10887; GFX90A-NEXT:    ;;#ASMEND
10888; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
10889; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10890; GFX90A-NEXT:    ;;#ASMSTART
10891; GFX90A-NEXT:    ; use s[8:9]
10892; GFX90A-NEXT:    ;;#ASMEND
10893; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10894;
10895; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_2_2:
10896; GFX940:       ; %bb.0:
10897; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10898; GFX940-NEXT:    ;;#ASMSTART
10899; GFX940-NEXT:    ; def s[0:1]
10900; GFX940-NEXT:    ;;#ASMEND
10901; GFX940-NEXT:    ;;#ASMSTART
10902; GFX940-NEXT:    ; def s[2:3]
10903; GFX940-NEXT:    ;;#ASMEND
10904; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
10905; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
10906; GFX940-NEXT:    ;;#ASMSTART
10907; GFX940-NEXT:    ; use s[8:9]
10908; GFX940-NEXT:    ;;#ASMEND
10909; GFX940-NEXT:    s_setpc_b64 s[30:31]
10910  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10911  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
10912  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10913  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10914  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 2, i32 2>
10915  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10916  ret void
10917}
10918
; SGPR shuffle, mask <5, 1, 2, 2>. Both inputs are inline-asm "=s" defs of
; <4 x bfloat> narrowed to their first three elements. Lane 0 is element 2 of
; %vec1 (index 5 addresses the second operand), lane 1 is element 1 of %vec0,
; and lanes 2-3 replicate element 2 of %vec0. The result is pinned to s[8:9].
; The checks expect an s_lshr_b32 by 16 followed by two s_pack_ll_b32_b16.
10919define void @s_shuffle_v4bf16_v3bf16__5_1_2_2() {
10920; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_2_2:
10921; GFX900:       ; %bb.0:
10922; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10923; GFX900-NEXT:    ;;#ASMSTART
10924; GFX900-NEXT:    ; def s[4:5]
10925; GFX900-NEXT:    ;;#ASMEND
10926; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
10927; GFX900-NEXT:    ;;#ASMSTART
10928; GFX900-NEXT:    ; def s[6:7]
10929; GFX900-NEXT:    ;;#ASMEND
10930; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
10931; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10932; GFX900-NEXT:    ;;#ASMSTART
10933; GFX900-NEXT:    ; use s[8:9]
10934; GFX900-NEXT:    ;;#ASMEND
10935; GFX900-NEXT:    s_setpc_b64 s[30:31]
10936;
10937; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_2_2:
10938; GFX90A:       ; %bb.0:
10939; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10940; GFX90A-NEXT:    ;;#ASMSTART
10941; GFX90A-NEXT:    ; def s[4:5]
10942; GFX90A-NEXT:    ;;#ASMEND
10943; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
10944; GFX90A-NEXT:    ;;#ASMSTART
10945; GFX90A-NEXT:    ; def s[6:7]
10946; GFX90A-NEXT:    ;;#ASMEND
10947; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
10948; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10949; GFX90A-NEXT:    ;;#ASMSTART
10950; GFX90A-NEXT:    ; use s[8:9]
10951; GFX90A-NEXT:    ;;#ASMEND
10952; GFX90A-NEXT:    s_setpc_b64 s[30:31]
10953;
10954; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_2_2:
10955; GFX940:       ; %bb.0:
10956; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10957; GFX940-NEXT:    ;;#ASMSTART
10958; GFX940-NEXT:    ; def s[0:1]
10959; GFX940-NEXT:    ;;#ASMEND
10960; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
10961; GFX940-NEXT:    ;;#ASMSTART
10962; GFX940-NEXT:    ; def s[2:3]
10963; GFX940-NEXT:    ;;#ASMEND
10964; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
10965; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
10966; GFX940-NEXT:    ;;#ASMSTART
10967; GFX940-NEXT:    ; use s[8:9]
10968; GFX940-NEXT:    ;;#ASMEND
10969; GFX940-NEXT:    s_setpc_b64 s[30:31]
10970  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
10971  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
10972  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10973  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
10974  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 2, i32 2>
10975  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
10976  ret void
10977}
10978
; Scalar-register ("=s") shuffle of two <3 x bfloat> inputs into <4 x bfloat>
; with mask <5, 3, 2, 2> (the mask is repeated in the function name).
; Mask indices 0-2 select from %extract3 (elements 0-2 of %vec0) and 3-5 from
; %extract31 (elements 0-2 of %vec1), so the result is
; { vec1[2], vec1[0], vec0[2], vec0[2] }. The result is pinned to s[8:9].
10979define void @s_shuffle_v4bf16_v3bf16__5_3_2_2() {
10980; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_2_2:
10981; GFX900:       ; %bb.0:
10982; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10983; GFX900-NEXT:    ;;#ASMSTART
10984; GFX900-NEXT:    ; def s[4:5]
10985; GFX900-NEXT:    ;;#ASMEND
10986; GFX900-NEXT:    ;;#ASMSTART
10987; GFX900-NEXT:    ; def s[6:7]
10988; GFX900-NEXT:    ;;#ASMEND
10989; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s6
10990; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
10991; GFX900-NEXT:    ;;#ASMSTART
10992; GFX900-NEXT:    ; use s[8:9]
10993; GFX900-NEXT:    ;;#ASMEND
10994; GFX900-NEXT:    s_setpc_b64 s[30:31]
10995;
10996; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_2_2:
10997; GFX90A:       ; %bb.0:
10998; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10999; GFX90A-NEXT:    ;;#ASMSTART
11000; GFX90A-NEXT:    ; def s[4:5]
11001; GFX90A-NEXT:    ;;#ASMEND
11002; GFX90A-NEXT:    ;;#ASMSTART
11003; GFX90A-NEXT:    ; def s[6:7]
11004; GFX90A-NEXT:    ;;#ASMEND
11005; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s6
11006; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
11007; GFX90A-NEXT:    ;;#ASMSTART
11008; GFX90A-NEXT:    ; use s[8:9]
11009; GFX90A-NEXT:    ;;#ASMEND
11010; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11011;
11012; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_2_2:
11013; GFX940:       ; %bb.0:
11014; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11015; GFX940-NEXT:    ;;#ASMSTART
11016; GFX940-NEXT:    ; def s[0:1]
11017; GFX940-NEXT:    ;;#ASMEND
11018; GFX940-NEXT:    ;;#ASMSTART
11019; GFX940-NEXT:    ; def s[2:3]
11020; GFX940-NEXT:    ;;#ASMEND
11021; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s2
11022; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
11023; GFX940-NEXT:    ;;#ASMSTART
11024; GFX940-NEXT:    ; use s[8:9]
11025; GFX940-NEXT:    ;;#ASMEND
11026; GFX940-NEXT:    s_setpc_b64 s[30:31]
11027  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11028  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
11029  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11030  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11031  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 2, i32 2>
11032  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11033  ret void
11034}
11035
; Scalar-register ("=s") shuffle of two <3 x bfloat> inputs into <4 x bfloat>
; with mask <5, 4, 2, 2> (the mask is repeated in the function name).
; Mask indices 0-2 select from %extract3 (elements 0-2 of %vec0) and 3-5 from
; %extract31 (elements 0-2 of %vec1), so the result is
; { vec1[2], vec1[1], vec0[2], vec0[2] }. The result is pinned to s[8:9].
11036define void @s_shuffle_v4bf16_v3bf16__5_4_2_2() {
11037; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_2_2:
11038; GFX900:       ; %bb.0:
11039; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11040; GFX900-NEXT:    ;;#ASMSTART
11041; GFX900-NEXT:    ; def s[4:5]
11042; GFX900-NEXT:    ;;#ASMEND
11043; GFX900-NEXT:    ;;#ASMSTART
11044; GFX900-NEXT:    ; def s[6:7]
11045; GFX900-NEXT:    ;;#ASMEND
11046; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
11047; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
11048; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
11049; GFX900-NEXT:    ;;#ASMSTART
11050; GFX900-NEXT:    ; use s[8:9]
11051; GFX900-NEXT:    ;;#ASMEND
11052; GFX900-NEXT:    s_setpc_b64 s[30:31]
11053;
11054; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_2_2:
11055; GFX90A:       ; %bb.0:
11056; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11057; GFX90A-NEXT:    ;;#ASMSTART
11058; GFX90A-NEXT:    ; def s[4:5]
11059; GFX90A-NEXT:    ;;#ASMEND
11060; GFX90A-NEXT:    ;;#ASMSTART
11061; GFX90A-NEXT:    ; def s[6:7]
11062; GFX90A-NEXT:    ;;#ASMEND
11063; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
11064; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
11065; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
11066; GFX90A-NEXT:    ;;#ASMSTART
11067; GFX90A-NEXT:    ; use s[8:9]
11068; GFX90A-NEXT:    ;;#ASMEND
11069; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11070;
11071; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_2_2:
11072; GFX940:       ; %bb.0:
11073; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11074; GFX940-NEXT:    ;;#ASMSTART
11075; GFX940-NEXT:    ; def s[0:1]
11076; GFX940-NEXT:    ;;#ASMEND
11077; GFX940-NEXT:    ;;#ASMSTART
11078; GFX940-NEXT:    ; def s[2:3]
11079; GFX940-NEXT:    ;;#ASMEND
11080; GFX940-NEXT:    s_lshr_b32 s0, s2, 16
11081; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
11082; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
11083; GFX940-NEXT:    ;;#ASMSTART
11084; GFX940-NEXT:    ; use s[8:9]
11085; GFX940-NEXT:    ;;#ASMEND
11086; GFX940-NEXT:    s_setpc_b64 s[30:31]
11087  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11088  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
11089  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11090  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11091  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 2, i32 2>
11092  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11093  ret void
11094}
11095
; Scalar-register ("=s") shuffle of two <3 x bfloat> inputs into <4 x bfloat>
; with mask <5, 5, 2, 2> (the mask is repeated in the function name).
; Mask indices 0-2 select from %extract3 (elements 0-2 of %vec0) and 3-5 from
; %extract31 (elements 0-2 of %vec1), so the result is
; { vec1[2], vec1[2], vec0[2], vec0[2] }. The result is pinned to s[8:9].
11096define void @s_shuffle_v4bf16_v3bf16__5_5_2_2() {
11097; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_2:
11098; GFX900:       ; %bb.0:
11099; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11100; GFX900-NEXT:    ;;#ASMSTART
11101; GFX900-NEXT:    ; def s[4:5]
11102; GFX900-NEXT:    ;;#ASMEND
11103; GFX900-NEXT:    ;;#ASMSTART
11104; GFX900-NEXT:    ; def s[6:7]
11105; GFX900-NEXT:    ;;#ASMEND
11106; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
11107; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
11108; GFX900-NEXT:    ;;#ASMSTART
11109; GFX900-NEXT:    ; use s[8:9]
11110; GFX900-NEXT:    ;;#ASMEND
11111; GFX900-NEXT:    s_setpc_b64 s[30:31]
11112;
11113; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_2:
11114; GFX90A:       ; %bb.0:
11115; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11116; GFX90A-NEXT:    ;;#ASMSTART
11117; GFX90A-NEXT:    ; def s[4:5]
11118; GFX90A-NEXT:    ;;#ASMEND
11119; GFX90A-NEXT:    ;;#ASMSTART
11120; GFX90A-NEXT:    ; def s[6:7]
11121; GFX90A-NEXT:    ;;#ASMEND
11122; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
11123; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
11124; GFX90A-NEXT:    ;;#ASMSTART
11125; GFX90A-NEXT:    ; use s[8:9]
11126; GFX90A-NEXT:    ;;#ASMEND
11127; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11128;
11129; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_2:
11130; GFX940:       ; %bb.0:
11131; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11132; GFX940-NEXT:    ;;#ASMSTART
11133; GFX940-NEXT:    ; def s[0:1]
11134; GFX940-NEXT:    ;;#ASMEND
11135; GFX940-NEXT:    ;;#ASMSTART
11136; GFX940-NEXT:    ; def s[2:3]
11137; GFX940-NEXT:    ;;#ASMEND
11138; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
11139; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
11140; GFX940-NEXT:    ;;#ASMSTART
11141; GFX940-NEXT:    ; use s[8:9]
11142; GFX940-NEXT:    ;;#ASMEND
11143; GFX940-NEXT:    s_setpc_b64 s[30:31]
11144  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11145  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
11146  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11147  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11148  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 2>
11149  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11150  ret void
11151}
11152
; Scalar-register ("=s") shuffle of two <3 x bfloat> inputs into <4 x bfloat>
; with mask <5, 5, poison, 2> ("u" in the function name marks the poison lane).
; Mask indices 0-2 select from %extract3 (elements 0-2 of %vec0) and 3-5 from
; %extract31 (elements 0-2 of %vec1), so the result is
; { vec1[2], vec1[2], poison, vec0[2] }. The result is pinned to s[8:9].
; The expected output builds s9 with a 16-bit left shift instead of a pack.
11153define void @s_shuffle_v4bf16_v3bf16__5_5_u_2() {
11154; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_2:
11155; GFX900:       ; %bb.0:
11156; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11157; GFX900-NEXT:    ;;#ASMSTART
11158; GFX900-NEXT:    ; def s[4:5]
11159; GFX900-NEXT:    ;;#ASMEND
11160; GFX900-NEXT:    ;;#ASMSTART
11161; GFX900-NEXT:    ; def s[6:7]
11162; GFX900-NEXT:    ;;#ASMEND
11163; GFX900-NEXT:    s_lshl_b32 s9, s5, 16
11164; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
11165; GFX900-NEXT:    ;;#ASMSTART
11166; GFX900-NEXT:    ; use s[8:9]
11167; GFX900-NEXT:    ;;#ASMEND
11168; GFX900-NEXT:    s_setpc_b64 s[30:31]
11169;
11170; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_2:
11171; GFX90A:       ; %bb.0:
11172; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11173; GFX90A-NEXT:    ;;#ASMSTART
11174; GFX90A-NEXT:    ; def s[4:5]
11175; GFX90A-NEXT:    ;;#ASMEND
11176; GFX90A-NEXT:    ;;#ASMSTART
11177; GFX90A-NEXT:    ; def s[6:7]
11178; GFX90A-NEXT:    ;;#ASMEND
11179; GFX90A-NEXT:    s_lshl_b32 s9, s5, 16
11180; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
11181; GFX90A-NEXT:    ;;#ASMSTART
11182; GFX90A-NEXT:    ; use s[8:9]
11183; GFX90A-NEXT:    ;;#ASMEND
11184; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11185;
11186; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_2:
11187; GFX940:       ; %bb.0:
11188; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11189; GFX940-NEXT:    ;;#ASMSTART
11190; GFX940-NEXT:    ; def s[0:1]
11191; GFX940-NEXT:    ;;#ASMEND
11192; GFX940-NEXT:    ;;#ASMSTART
11193; GFX940-NEXT:    ; def s[2:3]
11194; GFX940-NEXT:    ;;#ASMEND
11195; GFX940-NEXT:    s_lshl_b32 s9, s1, 16
11196; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
11197; GFX940-NEXT:    ;;#ASMSTART
11198; GFX940-NEXT:    ; use s[8:9]
11199; GFX940-NEXT:    ;;#ASMEND
11200; GFX940-NEXT:    s_setpc_b64 s[30:31]
11201  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11202  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
11203  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11204  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11205  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 2>
11206  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11207  ret void
11208}
11209
; Scalar-register ("=s") shuffle of two <3 x bfloat> inputs into <4 x bfloat>
; with mask <5, 5, 0, 2> (the mask is repeated in the function name).
; Mask indices 0-2 select from %extract3 (elements 0-2 of %vec0) and 3-5 from
; %extract31 (elements 0-2 of %vec1), so the result is
; { vec1[2], vec1[2], vec0[0], vec0[2] }. The result is pinned to s[8:9].
11210define void @s_shuffle_v4bf16_v3bf16__5_5_0_2() {
11211; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_2:
11212; GFX900:       ; %bb.0:
11213; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11214; GFX900-NEXT:    ;;#ASMSTART
11215; GFX900-NEXT:    ; def s[4:5]
11216; GFX900-NEXT:    ;;#ASMEND
11217; GFX900-NEXT:    ;;#ASMSTART
11218; GFX900-NEXT:    ; def s[6:7]
11219; GFX900-NEXT:    ;;#ASMEND
11220; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s5
11221; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
11222; GFX900-NEXT:    ;;#ASMSTART
11223; GFX900-NEXT:    ; use s[8:9]
11224; GFX900-NEXT:    ;;#ASMEND
11225; GFX900-NEXT:    s_setpc_b64 s[30:31]
11226;
11227; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_2:
11228; GFX90A:       ; %bb.0:
11229; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11230; GFX90A-NEXT:    ;;#ASMSTART
11231; GFX90A-NEXT:    ; def s[4:5]
11232; GFX90A-NEXT:    ;;#ASMEND
11233; GFX90A-NEXT:    ;;#ASMSTART
11234; GFX90A-NEXT:    ; def s[6:7]
11235; GFX90A-NEXT:    ;;#ASMEND
11236; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s5
11237; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
11238; GFX90A-NEXT:    ;;#ASMSTART
11239; GFX90A-NEXT:    ; use s[8:9]
11240; GFX90A-NEXT:    ;;#ASMEND
11241; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11242;
11243; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_2:
11244; GFX940:       ; %bb.0:
11245; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11246; GFX940-NEXT:    ;;#ASMSTART
11247; GFX940-NEXT:    ; def s[0:1]
11248; GFX940-NEXT:    ;;#ASMEND
11249; GFX940-NEXT:    ;;#ASMSTART
11250; GFX940-NEXT:    ; def s[2:3]
11251; GFX940-NEXT:    ;;#ASMEND
11252; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s1
11253; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
11254; GFX940-NEXT:    ;;#ASMSTART
11255; GFX940-NEXT:    ; use s[8:9]
11256; GFX940-NEXT:    ;;#ASMEND
11257; GFX940-NEXT:    s_setpc_b64 s[30:31]
11258  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11259  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
11260  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11261  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11262  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 2>
11263  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11264  ret void
11265}
11266
; Scalar-register ("=s") shuffle of two <3 x bfloat> inputs into <4 x bfloat>
; with mask <5, 5, 1, 2> (the mask is repeated in the function name).
; Mask indices 0-2 select from %extract3 (elements 0-2 of %vec0) and 3-5 from
; %extract31 (elements 0-2 of %vec1), so the result is
; { vec1[2], vec1[2], vec0[1], vec0[2] }. The result is pinned to s[8:9].
11267define void @s_shuffle_v4bf16_v3bf16__5_5_1_2() {
11268; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_2:
11269; GFX900:       ; %bb.0:
11270; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11271; GFX900-NEXT:    ;;#ASMSTART
11272; GFX900-NEXT:    ; def s[4:5]
11273; GFX900-NEXT:    ;;#ASMEND
11274; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
11275; GFX900-NEXT:    ;;#ASMSTART
11276; GFX900-NEXT:    ; def s[6:7]
11277; GFX900-NEXT:    ;;#ASMEND
11278; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s5
11279; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
11280; GFX900-NEXT:    ;;#ASMSTART
11281; GFX900-NEXT:    ; use s[8:9]
11282; GFX900-NEXT:    ;;#ASMEND
11283; GFX900-NEXT:    s_setpc_b64 s[30:31]
11284;
11285; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_2:
11286; GFX90A:       ; %bb.0:
11287; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11288; GFX90A-NEXT:    ;;#ASMSTART
11289; GFX90A-NEXT:    ; def s[4:5]
11290; GFX90A-NEXT:    ;;#ASMEND
11291; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
11292; GFX90A-NEXT:    ;;#ASMSTART
11293; GFX90A-NEXT:    ; def s[6:7]
11294; GFX90A-NEXT:    ;;#ASMEND
11295; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s5
11296; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
11297; GFX90A-NEXT:    ;;#ASMSTART
11298; GFX90A-NEXT:    ; use s[8:9]
11299; GFX90A-NEXT:    ;;#ASMEND
11300; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11301;
11302; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_2:
11303; GFX940:       ; %bb.0:
11304; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11305; GFX940-NEXT:    ;;#ASMSTART
11306; GFX940-NEXT:    ; def s[0:1]
11307; GFX940-NEXT:    ;;#ASMEND
11308; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
11309; GFX940-NEXT:    ;;#ASMSTART
11310; GFX940-NEXT:    ; def s[2:3]
11311; GFX940-NEXT:    ;;#ASMEND
11312; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s1
11313; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
11314; GFX940-NEXT:    ;;#ASMSTART
11315; GFX940-NEXT:    ; use s[8:9]
11316; GFX940-NEXT:    ;;#ASMEND
11317; GFX940-NEXT:    s_setpc_b64 s[30:31]
11318  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11319  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
11320  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11321  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11322  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 2>
11323  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11324  ret void
11325}
11326
; Scalar-register ("=s") shuffle of two <3 x bfloat> inputs into <4 x bfloat>
; with mask <5, 5, 3, 2> (the mask is repeated in the function name).
; Mask indices 0-2 select from %extract3 (elements 0-2 of %vec0) and 3-5 from
; %extract31 (elements 0-2 of %vec1), so the result is
; { vec1[2], vec1[2], vec1[0], vec0[2] }. The result is pinned to s[8:9].
11327define void @s_shuffle_v4bf16_v3bf16__5_5_3_2() {
11328; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_2:
11329; GFX900:       ; %bb.0:
11330; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11331; GFX900-NEXT:    ;;#ASMSTART
11332; GFX900-NEXT:    ; def s[4:5]
11333; GFX900-NEXT:    ;;#ASMEND
11334; GFX900-NEXT:    ;;#ASMSTART
11335; GFX900-NEXT:    ; def s[6:7]
11336; GFX900-NEXT:    ;;#ASMEND
11337; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s6, s5
11338; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
11339; GFX900-NEXT:    ;;#ASMSTART
11340; GFX900-NEXT:    ; use s[8:9]
11341; GFX900-NEXT:    ;;#ASMEND
11342; GFX900-NEXT:    s_setpc_b64 s[30:31]
11343;
11344; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_2:
11345; GFX90A:       ; %bb.0:
11346; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11347; GFX90A-NEXT:    ;;#ASMSTART
11348; GFX90A-NEXT:    ; def s[4:5]
11349; GFX90A-NEXT:    ;;#ASMEND
11350; GFX90A-NEXT:    ;;#ASMSTART
11351; GFX90A-NEXT:    ; def s[6:7]
11352; GFX90A-NEXT:    ;;#ASMEND
11353; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s6, s5
11354; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
11355; GFX90A-NEXT:    ;;#ASMSTART
11356; GFX90A-NEXT:    ; use s[8:9]
11357; GFX90A-NEXT:    ;;#ASMEND
11358; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11359;
11360; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_2:
11361; GFX940:       ; %bb.0:
11362; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11363; GFX940-NEXT:    ;;#ASMSTART
11364; GFX940-NEXT:    ; def s[0:1]
11365; GFX940-NEXT:    ;;#ASMEND
11366; GFX940-NEXT:    ;;#ASMSTART
11367; GFX940-NEXT:    ; def s[2:3]
11368; GFX940-NEXT:    ;;#ASMEND
11369; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s2, s1
11370; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
11371; GFX940-NEXT:    ;;#ASMSTART
11372; GFX940-NEXT:    ; use s[8:9]
11373; GFX940-NEXT:    ;;#ASMEND
11374; GFX940-NEXT:    s_setpc_b64 s[30:31]
11375  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11376  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
11377  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11378  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11379  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 2>
11380  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11381  ret void
11382}
11383
; Scalar-register ("=s") shuffle of two <3 x bfloat> inputs into <4 x bfloat>
; with mask <5, 5, 4, 2> (the mask is repeated in the function name).
; Mask indices 0-2 select from %extract3 (elements 0-2 of %vec0) and 3-5 from
; %extract31 (elements 0-2 of %vec1), so the result is
; { vec1[2], vec1[2], vec1[1], vec0[2] }. The result is pinned to s[8:9].
11384define void @s_shuffle_v4bf16_v3bf16__5_5_4_2() {
11385; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_2:
11386; GFX900:       ; %bb.0:
11387; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11388; GFX900-NEXT:    ;;#ASMSTART
11389; GFX900-NEXT:    ; def s[4:5]
11390; GFX900-NEXT:    ;;#ASMEND
11391; GFX900-NEXT:    ;;#ASMSTART
11392; GFX900-NEXT:    ; def s[6:7]
11393; GFX900-NEXT:    ;;#ASMEND
11394; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
11395; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s5
11396; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
11397; GFX900-NEXT:    ;;#ASMSTART
11398; GFX900-NEXT:    ; use s[8:9]
11399; GFX900-NEXT:    ;;#ASMEND
11400; GFX900-NEXT:    s_setpc_b64 s[30:31]
11401;
11402; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_2:
11403; GFX90A:       ; %bb.0:
11404; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11405; GFX90A-NEXT:    ;;#ASMSTART
11406; GFX90A-NEXT:    ; def s[4:5]
11407; GFX90A-NEXT:    ;;#ASMEND
11408; GFX90A-NEXT:    ;;#ASMSTART
11409; GFX90A-NEXT:    ; def s[6:7]
11410; GFX90A-NEXT:    ;;#ASMEND
11411; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
11412; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s5
11413; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
11414; GFX90A-NEXT:    ;;#ASMSTART
11415; GFX90A-NEXT:    ; use s[8:9]
11416; GFX90A-NEXT:    ;;#ASMEND
11417; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11418;
11419; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_2:
11420; GFX940:       ; %bb.0:
11421; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11422; GFX940-NEXT:    ;;#ASMSTART
11423; GFX940-NEXT:    ; def s[0:1]
11424; GFX940-NEXT:    ;;#ASMEND
11425; GFX940-NEXT:    ;;#ASMSTART
11426; GFX940-NEXT:    ; def s[2:3]
11427; GFX940-NEXT:    ;;#ASMEND
11428; GFX940-NEXT:    s_lshr_b32 s0, s2, 16
11429; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s1
11430; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
11431; GFX940-NEXT:    ;;#ASMSTART
11432; GFX940-NEXT:    ; use s[8:9]
11433; GFX940-NEXT:    ;;#ASMEND
11434; GFX940-NEXT:    s_setpc_b64 s[30:31]
11435  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11436  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
11437  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11438  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11439  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 2>
11440  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11441  ret void
11442}
11443
; Scalar-register ("=s") shuffle with mask <poison, 3, 3, 3> ("u" in the
; function name marks the poison lane). The second shuffle operand is poison,
; and index 3 selects its element 0, so no result lane reads %extract3.
; The expected output therefore contains no def, only the use of s[8:9].
11444define void @s_shuffle_v4bf16_v3bf16__u_3_3_3() {
11445; GFX9-LABEL: s_shuffle_v4bf16_v3bf16__u_3_3_3:
11446; GFX9:       ; %bb.0:
11447; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11448; GFX9-NEXT:    ;;#ASMSTART
11449; GFX9-NEXT:    ; use s[8:9]
11450; GFX9-NEXT:    ;;#ASMEND
11451; GFX9-NEXT:    s_setpc_b64 s[30:31]
11452  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11453  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11454  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 poison, i32 3, i32 3, i32 3>
11455  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11456  ret void
11457}
11458
; Scalar-register ("=s") shuffle with mask <0, 3, 3, 3> and a poison second
; operand: lane 0 is element 0 of %vec0 (via %extract3); lanes 1-3 select
; element 0 of the poison operand. The expected output defines s[8:9]
; directly and emits no shift or pack instructions before the use.
11459define void @s_shuffle_v4bf16_v3bf16__0_3_3_3() {
11460; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_3_3_3:
11461; GFX900:       ; %bb.0:
11462; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11463; GFX900-NEXT:    ;;#ASMSTART
11464; GFX900-NEXT:    ; def s[8:9]
11465; GFX900-NEXT:    ;;#ASMEND
11466; GFX900-NEXT:    ;;#ASMSTART
11467; GFX900-NEXT:    ; use s[8:9]
11468; GFX900-NEXT:    ;;#ASMEND
11469; GFX900-NEXT:    s_setpc_b64 s[30:31]
11470;
11471; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_3_3_3:
11472; GFX90A:       ; %bb.0:
11473; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11474; GFX90A-NEXT:    ;;#ASMSTART
11475; GFX90A-NEXT:    ; def s[8:9]
11476; GFX90A-NEXT:    ;;#ASMEND
11477; GFX90A-NEXT:    ;;#ASMSTART
11478; GFX90A-NEXT:    ; use s[8:9]
11479; GFX90A-NEXT:    ;;#ASMEND
11480; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11481;
11482; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_3_3_3:
11483; GFX940:       ; %bb.0:
11484; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11485; GFX940-NEXT:    ;;#ASMSTART
11486; GFX940-NEXT:    ; def s[8:9]
11487; GFX940-NEXT:    ;;#ASMEND
11488; GFX940-NEXT:    s_nop 0
11489; GFX940-NEXT:    ;;#ASMSTART
11490; GFX940-NEXT:    ; use s[8:9]
11491; GFX940-NEXT:    ;;#ASMEND
11492; GFX940-NEXT:    s_setpc_b64 s[30:31]
11493  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11494  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11495  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 0, i32 3, i32 3, i32 3>
11496  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11497  ret void
11498}
11499
; Scalar-register ("=s") shuffle with mask <1, 3, 3, 3> and a poison second
; operand: lane 0 is element 1 of %vec0 (via %extract3); lanes 1-3 select
; element 0 of the poison operand. The expected output is a single 16-bit
; right shift into s8 ahead of the use of s[8:9].
11500define void @s_shuffle_v4bf16_v3bf16__1_3_3_3() {
11501; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_3_3_3:
11502; GFX900:       ; %bb.0:
11503; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11504; GFX900-NEXT:    ;;#ASMSTART
11505; GFX900-NEXT:    ; def s[4:5]
11506; GFX900-NEXT:    ;;#ASMEND
11507; GFX900-NEXT:    s_lshr_b32 s8, s4, 16
11508; GFX900-NEXT:    ;;#ASMSTART
11509; GFX900-NEXT:    ; use s[8:9]
11510; GFX900-NEXT:    ;;#ASMEND
11511; GFX900-NEXT:    s_setpc_b64 s[30:31]
11512;
11513; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_3_3_3:
11514; GFX90A:       ; %bb.0:
11515; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11516; GFX90A-NEXT:    ;;#ASMSTART
11517; GFX90A-NEXT:    ; def s[4:5]
11518; GFX90A-NEXT:    ;;#ASMEND
11519; GFX90A-NEXT:    s_lshr_b32 s8, s4, 16
11520; GFX90A-NEXT:    ;;#ASMSTART
11521; GFX90A-NEXT:    ; use s[8:9]
11522; GFX90A-NEXT:    ;;#ASMEND
11523; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11524;
11525; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_3_3_3:
11526; GFX940:       ; %bb.0:
11527; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11528; GFX940-NEXT:    ;;#ASMSTART
11529; GFX940-NEXT:    ; def s[0:1]
11530; GFX940-NEXT:    ;;#ASMEND
11531; GFX940-NEXT:    s_lshr_b32 s8, s0, 16
11532; GFX940-NEXT:    ;;#ASMSTART
11533; GFX940-NEXT:    ; use s[8:9]
11534; GFX940-NEXT:    ;;#ASMEND
11535; GFX940-NEXT:    s_setpc_b64 s[30:31]
11536  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11537  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11538  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 1, i32 3, i32 3, i32 3>
11539  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11540  ret void
11541}
11542
; Scalar-register ("=s") shuffle with mask <2, 3, 3, 3> and a poison second
; operand: lane 0 is element 2 of %vec0 (via %extract3); lanes 1-3 select
; element 0 of the poison operand. The expected output is a single s_mov_b32
; into s8 ahead of the use of s[8:9].
11543define void @s_shuffle_v4bf16_v3bf16__2_3_3_3() {
11544; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_3_3_3:
11545; GFX900:       ; %bb.0:
11546; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11547; GFX900-NEXT:    ;;#ASMSTART
11548; GFX900-NEXT:    ; def s[4:5]
11549; GFX900-NEXT:    ;;#ASMEND
11550; GFX900-NEXT:    s_mov_b32 s8, s5
11551; GFX900-NEXT:    ;;#ASMSTART
11552; GFX900-NEXT:    ; use s[8:9]
11553; GFX900-NEXT:    ;;#ASMEND
11554; GFX900-NEXT:    s_setpc_b64 s[30:31]
11555;
11556; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_3_3_3:
11557; GFX90A:       ; %bb.0:
11558; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11559; GFX90A-NEXT:    ;;#ASMSTART
11560; GFX90A-NEXT:    ; def s[4:5]
11561; GFX90A-NEXT:    ;;#ASMEND
11562; GFX90A-NEXT:    s_mov_b32 s8, s5
11563; GFX90A-NEXT:    ;;#ASMSTART
11564; GFX90A-NEXT:    ; use s[8:9]
11565; GFX90A-NEXT:    ;;#ASMEND
11566; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11567;
11568; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_3_3_3:
11569; GFX940:       ; %bb.0:
11570; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11571; GFX940-NEXT:    ;;#ASMSTART
11572; GFX940-NEXT:    ; def s[0:1]
11573; GFX940-NEXT:    ;;#ASMEND
11574; GFX940-NEXT:    s_mov_b32 s8, s1
11575; GFX940-NEXT:    ;;#ASMSTART
11576; GFX940-NEXT:    ; use s[8:9]
11577; GFX940-NEXT:    ;;#ASMEND
11578; GFX940-NEXT:    s_setpc_b64 s[30:31]
11579  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11580  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11581  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
11582  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11583  ret void
11584}
11585
; Scalar-register ("=s") shuffle with mask <3, 3, 3, 3> and a poison second
; operand: every lane selects element 0 of the poison operand, so no lane
; reads %extract3. The expected output contains no def, only the use of
; s[8:9].
11586define void @s_shuffle_v4bf16_v3bf16__3_3_3_3() {
11587; GFX9-LABEL: s_shuffle_v4bf16_v3bf16__3_3_3_3:
11588; GFX9:       ; %bb.0:
11589; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11590; GFX9-NEXT:    ;;#ASMSTART
11591; GFX9-NEXT:    ; use s[8:9]
11592; GFX9-NEXT:    ;;#ASMEND
11593; GFX9-NEXT:    s_setpc_b64 s[30:31]
11594  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11595  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11596  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
11597  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11598  ret void
11599}
11600
; Scalar-register ("=s") shuffle of two <3 x bfloat> inputs into <4 x bfloat>
; with mask <4, 3, 3, 3> (the mask is repeated in the function name).
; Every index is in the 3-5 range, so only %extract31 (elements 0-2 of %vec1)
; is read: the result is { vec1[1], vec1[0], vec1[0], vec1[0] }.
; %vec0 is not referenced by the mask, and the expected output contains a
; single def. The result is pinned to s[8:9].
11601define void @s_shuffle_v4bf16_v3bf16__4_3_3_3() {
11602; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_3_3_3:
11603; GFX900:       ; %bb.0:
11604; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11605; GFX900-NEXT:    ;;#ASMSTART
11606; GFX900-NEXT:    ; def s[4:5]
11607; GFX900-NEXT:    ;;#ASMEND
11608; GFX900-NEXT:    s_lshr_b32 s5, s4, 16
11609; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
11610; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
11611; GFX900-NEXT:    ;;#ASMSTART
11612; GFX900-NEXT:    ; use s[8:9]
11613; GFX900-NEXT:    ;;#ASMEND
11614; GFX900-NEXT:    s_setpc_b64 s[30:31]
11615;
11616; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_3_3_3:
11617; GFX90A:       ; %bb.0:
11618; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11619; GFX90A-NEXT:    ;;#ASMSTART
11620; GFX90A-NEXT:    ; def s[4:5]
11621; GFX90A-NEXT:    ;;#ASMEND
11622; GFX90A-NEXT:    s_lshr_b32 s5, s4, 16
11623; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
11624; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
11625; GFX90A-NEXT:    ;;#ASMSTART
11626; GFX90A-NEXT:    ; use s[8:9]
11627; GFX90A-NEXT:    ;;#ASMEND
11628; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11629;
11630; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_3_3_3:
11631; GFX940:       ; %bb.0:
11632; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11633; GFX940-NEXT:    ;;#ASMSTART
11634; GFX940-NEXT:    ; def s[0:1]
11635; GFX940-NEXT:    ;;#ASMEND
11636; GFX940-NEXT:    s_lshr_b32 s1, s0, 16
11637; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
11638; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
11639; GFX940-NEXT:    ;;#ASMSTART
11640; GFX940-NEXT:    ; use s[8:9]
11641; GFX940-NEXT:    ;;#ASMEND
11642; GFX940-NEXT:    s_setpc_b64 s[30:31]
11643  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11644  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
11645  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11646  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11647  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 3, i32 3, i32 3>
11648  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11649  ret void
11650}
11651
; Scalar-register ("=s") shuffle of two <3 x bfloat> inputs into <4 x bfloat>
; with mask <5, 3, 3, 3> (the mask is repeated in the function name).
; Every index is in the 3-5 range, so only %extract31 (elements 0-2 of %vec1)
; is read: the result is { vec1[2], vec1[0], vec1[0], vec1[0] }.
; %vec0 is not referenced by the mask, and the expected output contains a
; single def. The result is pinned to s[8:9].
11652define void @s_shuffle_v4bf16_v3bf16__5_3_3_3() {
11653; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_3_3:
11654; GFX900:       ; %bb.0:
11655; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11656; GFX900-NEXT:    ;;#ASMSTART
11657; GFX900-NEXT:    ; def s[4:5]
11658; GFX900-NEXT:    ;;#ASMEND
11659; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
11660; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
11661; GFX900-NEXT:    ;;#ASMSTART
11662; GFX900-NEXT:    ; use s[8:9]
11663; GFX900-NEXT:    ;;#ASMEND
11664; GFX900-NEXT:    s_setpc_b64 s[30:31]
11665;
11666; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_3_3:
11667; GFX90A:       ; %bb.0:
11668; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11669; GFX90A-NEXT:    ;;#ASMSTART
11670; GFX90A-NEXT:    ; def s[4:5]
11671; GFX90A-NEXT:    ;;#ASMEND
11672; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
11673; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
11674; GFX90A-NEXT:    ;;#ASMSTART
11675; GFX90A-NEXT:    ; use s[8:9]
11676; GFX90A-NEXT:    ;;#ASMEND
11677; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11678;
11679; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_3_3:
11680; GFX940:       ; %bb.0:
11681; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11682; GFX940-NEXT:    ;;#ASMSTART
11683; GFX940-NEXT:    ; def s[0:1]
11684; GFX940-NEXT:    ;;#ASMEND
11685; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
11686; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
11687; GFX940-NEXT:    ;;#ASMSTART
11688; GFX940-NEXT:    ; use s[8:9]
11689; GFX940-NEXT:    ;;#ASMEND
11690; GFX940-NEXT:    s_setpc_b64 s[30:31]
11691  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11692  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
11693  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11694  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11695  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 3, i32 3>
11696  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11697  ret void
11698}
11699
; Scalar-register ("=s") shuffle of two <3 x bfloat> inputs into <4 x bfloat>
; with mask <5, poison, 3, 3> ("u" in the function name marks the poison lane).
; Every defined index is in the 3-5 range, so only %extract31 (elements 0-2 of
; %vec1) is read: the result is { vec1[2], poison, vec1[0], vec1[0] }.
; %vec0 is not referenced by the mask, and the expected output contains a
; single def. The result is pinned to s[8:9].
11700define void @s_shuffle_v4bf16_v3bf16__5_u_3_3() {
11701; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_3_3:
11702; GFX900:       ; %bb.0:
11703; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11704; GFX900-NEXT:    ;;#ASMSTART
11705; GFX900-NEXT:    ; def s[4:5]
11706; GFX900-NEXT:    ;;#ASMEND
11707; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
11708; GFX900-NEXT:    s_mov_b32 s8, s5
11709; GFX900-NEXT:    ;;#ASMSTART
11710; GFX900-NEXT:    ; use s[8:9]
11711; GFX900-NEXT:    ;;#ASMEND
11712; GFX900-NEXT:    s_setpc_b64 s[30:31]
11713;
11714; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_3_3:
11715; GFX90A:       ; %bb.0:
11716; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11717; GFX90A-NEXT:    ;;#ASMSTART
11718; GFX90A-NEXT:    ; def s[4:5]
11719; GFX90A-NEXT:    ;;#ASMEND
11720; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
11721; GFX90A-NEXT:    s_mov_b32 s8, s5
11722; GFX90A-NEXT:    ;;#ASMSTART
11723; GFX90A-NEXT:    ; use s[8:9]
11724; GFX90A-NEXT:    ;;#ASMEND
11725; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11726;
11727; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_3_3:
11728; GFX940:       ; %bb.0:
11729; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11730; GFX940-NEXT:    ;;#ASMSTART
11731; GFX940-NEXT:    ; def s[0:1]
11732; GFX940-NEXT:    ;;#ASMEND
11733; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
11734; GFX940-NEXT:    s_mov_b32 s8, s1
11735; GFX940-NEXT:    ;;#ASMSTART
11736; GFX940-NEXT:    ; use s[8:9]
11737; GFX940-NEXT:    ;;#ASMEND
11738; GFX940-NEXT:    s_setpc_b64 s[30:31]
11739  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11740  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
11741  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11742  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11743  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 3, i32 3>
11744  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11745  ret void
11746}
11747
; Shuffle of two <3 x bfloat> inputs into <4 x bfloat> with mask
; <5, 0, 3, 3>: lane 0 = %vec1[2], lane 1 = %vec0[0], lanes 2-3 = %vec1[0].
; Both inputs are read, and the expected output below contains two "def"
; blocks. The assertions are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing them by hand.
11748define void @s_shuffle_v4bf16_v3bf16__5_0_3_3() {
11749; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_3_3:
11750; GFX900:       ; %bb.0:
11751; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11752; GFX900-NEXT:    ;;#ASMSTART
11753; GFX900-NEXT:    ; def s[4:5]
11754; GFX900-NEXT:    ;;#ASMEND
11755; GFX900-NEXT:    ;;#ASMSTART
11756; GFX900-NEXT:    ; def s[6:7]
11757; GFX900-NEXT:    ;;#ASMEND
11758; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
11759; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s6, s6
11760; GFX900-NEXT:    ;;#ASMSTART
11761; GFX900-NEXT:    ; use s[8:9]
11762; GFX900-NEXT:    ;;#ASMEND
11763; GFX900-NEXT:    s_setpc_b64 s[30:31]
11764;
11765; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_3_3:
11766; GFX90A:       ; %bb.0:
11767; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11768; GFX90A-NEXT:    ;;#ASMSTART
11769; GFX90A-NEXT:    ; def s[4:5]
11770; GFX90A-NEXT:    ;;#ASMEND
11771; GFX90A-NEXT:    ;;#ASMSTART
11772; GFX90A-NEXT:    ; def s[6:7]
11773; GFX90A-NEXT:    ;;#ASMEND
11774; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
11775; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s6, s6
11776; GFX90A-NEXT:    ;;#ASMSTART
11777; GFX90A-NEXT:    ; use s[8:9]
11778; GFX90A-NEXT:    ;;#ASMEND
11779; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11780;
11781; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_3_3:
11782; GFX940:       ; %bb.0:
11783; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11784; GFX940-NEXT:    ;;#ASMSTART
11785; GFX940-NEXT:    ; def s[0:1]
11786; GFX940-NEXT:    ;;#ASMEND
11787; GFX940-NEXT:    ;;#ASMSTART
11788; GFX940-NEXT:    ; def s[2:3]
11789; GFX940-NEXT:    ;;#ASMEND
11790; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
11791; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s2, s2
11792; GFX940-NEXT:    ;;#ASMSTART
11793; GFX940-NEXT:    ; use s[8:9]
11794; GFX940-NEXT:    ;;#ASMEND
11795; GFX940-NEXT:    s_setpc_b64 s[30:31]
11796  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11797  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
11798  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11799  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 (%vec0), 3-5 from %extract31 (%vec1).
11800  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 3, i32 3>
11801  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11802  ret void
11803}
11804
; Shuffle of two <3 x bfloat> inputs into <4 x bfloat> with mask
; <5, 1, 3, 3>: lane 0 = %vec1[2], lane 1 = %vec0[1], lanes 2-3 = %vec1[0].
; Both inputs are read; the expected output shifts the first def's low
; register right by 16 before packing (presumably to reach the odd element
; %vec0[1]). The assertions are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing them by hand.
11805define void @s_shuffle_v4bf16_v3bf16__5_1_3_3() {
11806; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_3_3:
11807; GFX900:       ; %bb.0:
11808; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11809; GFX900-NEXT:    ;;#ASMSTART
11810; GFX900-NEXT:    ; def s[4:5]
11811; GFX900-NEXT:    ;;#ASMEND
11812; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
11813; GFX900-NEXT:    ;;#ASMSTART
11814; GFX900-NEXT:    ; def s[6:7]
11815; GFX900-NEXT:    ;;#ASMEND
11816; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
11817; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s6, s6
11818; GFX900-NEXT:    ;;#ASMSTART
11819; GFX900-NEXT:    ; use s[8:9]
11820; GFX900-NEXT:    ;;#ASMEND
11821; GFX900-NEXT:    s_setpc_b64 s[30:31]
11822;
11823; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_3_3:
11824; GFX90A:       ; %bb.0:
11825; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11826; GFX90A-NEXT:    ;;#ASMSTART
11827; GFX90A-NEXT:    ; def s[4:5]
11828; GFX90A-NEXT:    ;;#ASMEND
11829; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
11830; GFX90A-NEXT:    ;;#ASMSTART
11831; GFX90A-NEXT:    ; def s[6:7]
11832; GFX90A-NEXT:    ;;#ASMEND
11833; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
11834; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s6, s6
11835; GFX90A-NEXT:    ;;#ASMSTART
11836; GFX90A-NEXT:    ; use s[8:9]
11837; GFX90A-NEXT:    ;;#ASMEND
11838; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11839;
11840; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_3_3:
11841; GFX940:       ; %bb.0:
11842; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11843; GFX940-NEXT:    ;;#ASMSTART
11844; GFX940-NEXT:    ; def s[0:1]
11845; GFX940-NEXT:    ;;#ASMEND
11846; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
11847; GFX940-NEXT:    ;;#ASMSTART
11848; GFX940-NEXT:    ; def s[2:3]
11849; GFX940-NEXT:    ;;#ASMEND
11850; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
11851; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s2, s2
11852; GFX940-NEXT:    ;;#ASMSTART
11853; GFX940-NEXT:    ; use s[8:9]
11854; GFX940-NEXT:    ;;#ASMEND
11855; GFX940-NEXT:    s_setpc_b64 s[30:31]
11856  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11857  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
11858  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11859  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 (%vec0), 3-5 from %extract31 (%vec1).
11860  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 3, i32 3>
11861  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11862  ret void
11863}
11864
; Shuffle of two <3 x bfloat> inputs into <4 x bfloat> with mask
; <5, 2, 3, 3>: lane 0 = %vec1[2], lane 1 = %vec0[2], lanes 2-3 = %vec1[0].
; Both inputs are read, and the expected output below contains two "def"
; blocks. The assertions are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing them by hand.
11865define void @s_shuffle_v4bf16_v3bf16__5_2_3_3() {
11866; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_3_3:
11867; GFX900:       ; %bb.0:
11868; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11869; GFX900-NEXT:    ;;#ASMSTART
11870; GFX900-NEXT:    ; def s[4:5]
11871; GFX900-NEXT:    ;;#ASMEND
11872; GFX900-NEXT:    ;;#ASMSTART
11873; GFX900-NEXT:    ; def s[6:7]
11874; GFX900-NEXT:    ;;#ASMEND
11875; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
11876; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s6, s6
11877; GFX900-NEXT:    ;;#ASMSTART
11878; GFX900-NEXT:    ; use s[8:9]
11879; GFX900-NEXT:    ;;#ASMEND
11880; GFX900-NEXT:    s_setpc_b64 s[30:31]
11881;
11882; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_3_3:
11883; GFX90A:       ; %bb.0:
11884; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11885; GFX90A-NEXT:    ;;#ASMSTART
11886; GFX90A-NEXT:    ; def s[4:5]
11887; GFX90A-NEXT:    ;;#ASMEND
11888; GFX90A-NEXT:    ;;#ASMSTART
11889; GFX90A-NEXT:    ; def s[6:7]
11890; GFX90A-NEXT:    ;;#ASMEND
11891; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
11892; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s6, s6
11893; GFX90A-NEXT:    ;;#ASMSTART
11894; GFX90A-NEXT:    ; use s[8:9]
11895; GFX90A-NEXT:    ;;#ASMEND
11896; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11897;
11898; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_3_3:
11899; GFX940:       ; %bb.0:
11900; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11901; GFX940-NEXT:    ;;#ASMSTART
11902; GFX940-NEXT:    ; def s[0:1]
11903; GFX940-NEXT:    ;;#ASMEND
11904; GFX940-NEXT:    ;;#ASMSTART
11905; GFX940-NEXT:    ; def s[2:3]
11906; GFX940-NEXT:    ;;#ASMEND
11907; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s1
11908; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s2, s2
11909; GFX940-NEXT:    ;;#ASMSTART
11910; GFX940-NEXT:    ; use s[8:9]
11911; GFX940-NEXT:    ;;#ASMEND
11912; GFX940-NEXT:    s_setpc_b64 s[30:31]
11913  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11914  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
11915  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11916  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 (%vec0), 3-5 from %extract31 (%vec1).
11917  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 3, i32 3>
11918  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11919  ret void
11920}
11921
; Shuffle of two <3 x bfloat> inputs into <4 x bfloat> with mask
; <5, 4, 3, 3>: lane 0 = %vec1[2], lane 1 = %vec1[1], lanes 2-3 = %vec1[0].
; No lane reads %vec0, and the expected output below contains a single "def".
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them rather than editing them by hand.
11922define void @s_shuffle_v4bf16_v3bf16__5_4_3_3() {
11923; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_3_3:
11924; GFX900:       ; %bb.0:
11925; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11926; GFX900-NEXT:    ;;#ASMSTART
11927; GFX900-NEXT:    ; def s[4:5]
11928; GFX900-NEXT:    ;;#ASMEND
11929; GFX900-NEXT:    s_lshr_b32 s6, s4, 16
11930; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s6
11931; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
11932; GFX900-NEXT:    ;;#ASMSTART
11933; GFX900-NEXT:    ; use s[8:9]
11934; GFX900-NEXT:    ;;#ASMEND
11935; GFX900-NEXT:    s_setpc_b64 s[30:31]
11936;
11937; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_3_3:
11938; GFX90A:       ; %bb.0:
11939; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11940; GFX90A-NEXT:    ;;#ASMSTART
11941; GFX90A-NEXT:    ; def s[4:5]
11942; GFX90A-NEXT:    ;;#ASMEND
11943; GFX90A-NEXT:    s_lshr_b32 s6, s4, 16
11944; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s6
11945; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
11946; GFX90A-NEXT:    ;;#ASMSTART
11947; GFX90A-NEXT:    ; use s[8:9]
11948; GFX90A-NEXT:    ;;#ASMEND
11949; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11950;
11951; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_3_3:
11952; GFX940:       ; %bb.0:
11953; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11954; GFX940-NEXT:    ;;#ASMSTART
11955; GFX940-NEXT:    ; def s[0:1]
11956; GFX940-NEXT:    ;;#ASMEND
11957; GFX940-NEXT:    s_lshr_b32 s2, s0, 16
11958; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s2
11959; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
11960; GFX940-NEXT:    ;;#ASMSTART
11961; GFX940-NEXT:    ; use s[8:9]
11962; GFX940-NEXT:    ;;#ASMEND
11963; GFX940-NEXT:    s_setpc_b64 s[30:31]
11964  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
11965  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
11966  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
11967  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 (%vec0), 3-5 from %extract31 (%vec1).
11968  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 3, i32 3>
11969  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
11970  ret void
11971}
11972
; Shuffle of two <3 x bfloat> inputs into <4 x bfloat> with mask
; <5, 5, 3, 3>: lanes 0-1 = %vec1[2], lanes 2-3 = %vec1[0].
; No lane reads %vec0, and the expected output below contains a single "def".
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them rather than editing them by hand.
11973define void @s_shuffle_v4bf16_v3bf16__5_5_3_3() {
11974; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_3:
11975; GFX900:       ; %bb.0:
11976; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11977; GFX900-NEXT:    ;;#ASMSTART
11978; GFX900-NEXT:    ; def s[4:5]
11979; GFX900-NEXT:    ;;#ASMEND
11980; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
11981; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
11982; GFX900-NEXT:    ;;#ASMSTART
11983; GFX900-NEXT:    ; use s[8:9]
11984; GFX900-NEXT:    ;;#ASMEND
11985; GFX900-NEXT:    s_setpc_b64 s[30:31]
11986;
11987; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_3:
11988; GFX90A:       ; %bb.0:
11989; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11990; GFX90A-NEXT:    ;;#ASMSTART
11991; GFX90A-NEXT:    ; def s[4:5]
11992; GFX90A-NEXT:    ;;#ASMEND
11993; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
11994; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
11995; GFX90A-NEXT:    ;;#ASMSTART
11996; GFX90A-NEXT:    ; use s[8:9]
11997; GFX90A-NEXT:    ;;#ASMEND
11998; GFX90A-NEXT:    s_setpc_b64 s[30:31]
11999;
12000; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_3:
12001; GFX940:       ; %bb.0:
12002; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12003; GFX940-NEXT:    ;;#ASMSTART
12004; GFX940-NEXT:    ; def s[0:1]
12005; GFX940-NEXT:    ;;#ASMEND
12006; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
12007; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
12008; GFX940-NEXT:    ;;#ASMSTART
12009; GFX940-NEXT:    ; use s[8:9]
12010; GFX940-NEXT:    ;;#ASMEND
12011; GFX940-NEXT:    s_setpc_b64 s[30:31]
12012  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12013  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12014  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12015  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 (%vec0), 3-5 from %extract31 (%vec1).
12016  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 3>
12017  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12018  ret void
12019}
12020
; Shuffle of two <3 x bfloat> inputs into <4 x bfloat> with mask
; <5, 5, poison, 3>: lanes 0-1 = %vec1[2], lane 2 is unspecified, and
; lane 3 = %vec1[0]. No lane reads %vec0, and the expected output below
; contains a single "def". The assertions are autogenerated (see the NOTE at
; the top of the file); regenerate them rather than editing them by hand.
12021define void @s_shuffle_v4bf16_v3bf16__5_5_u_3() {
12022; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_3:
12023; GFX900:       ; %bb.0:
12024; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12025; GFX900-NEXT:    ;;#ASMSTART
12026; GFX900-NEXT:    ; def s[4:5]
12027; GFX900-NEXT:    ;;#ASMEND
12028; GFX900-NEXT:    s_lshl_b32 s9, s4, 16
12029; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
12030; GFX900-NEXT:    ;;#ASMSTART
12031; GFX900-NEXT:    ; use s[8:9]
12032; GFX900-NEXT:    ;;#ASMEND
12033; GFX900-NEXT:    s_setpc_b64 s[30:31]
12034;
12035; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_3:
12036; GFX90A:       ; %bb.0:
12037; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12038; GFX90A-NEXT:    ;;#ASMSTART
12039; GFX90A-NEXT:    ; def s[4:5]
12040; GFX90A-NEXT:    ;;#ASMEND
12041; GFX90A-NEXT:    s_lshl_b32 s9, s4, 16
12042; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
12043; GFX90A-NEXT:    ;;#ASMSTART
12044; GFX90A-NEXT:    ; use s[8:9]
12045; GFX90A-NEXT:    ;;#ASMEND
12046; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12047;
12048; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_3:
12049; GFX940:       ; %bb.0:
12050; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12051; GFX940-NEXT:    ;;#ASMSTART
12052; GFX940-NEXT:    ; def s[0:1]
12053; GFX940-NEXT:    ;;#ASMEND
12054; GFX940-NEXT:    s_lshl_b32 s9, s0, 16
12055; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
12056; GFX940-NEXT:    ;;#ASMSTART
12057; GFX940-NEXT:    ; use s[8:9]
12058; GFX940-NEXT:    ;;#ASMEND
12059; GFX940-NEXT:    s_setpc_b64 s[30:31]
12060  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12061  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12062  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12063  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 (%vec0), 3-5 from %extract31 (%vec1).
12064  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 3>
12065  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12066  ret void
12067}
12068
; Shuffle of two <3 x bfloat> inputs into <4 x bfloat> with mask
; <5, 5, 0, 3>: lanes 0-1 = %vec1[2], lane 2 = %vec0[0], lane 3 = %vec1[0].
; Both inputs are read, and the expected output below contains two "def"
; blocks. The assertions are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing them by hand.
12069define void @s_shuffle_v4bf16_v3bf16__5_5_0_3() {
12070; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_3:
12071; GFX900:       ; %bb.0:
12072; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12073; GFX900-NEXT:    ;;#ASMSTART
12074; GFX900-NEXT:    ; def s[4:5]
12075; GFX900-NEXT:    ;;#ASMEND
12076; GFX900-NEXT:    ;;#ASMSTART
12077; GFX900-NEXT:    ; def s[6:7]
12078; GFX900-NEXT:    ;;#ASMEND
12079; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s6
12080; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
12081; GFX900-NEXT:    ;;#ASMSTART
12082; GFX900-NEXT:    ; use s[8:9]
12083; GFX900-NEXT:    ;;#ASMEND
12084; GFX900-NEXT:    s_setpc_b64 s[30:31]
12085;
12086; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_3:
12087; GFX90A:       ; %bb.0:
12088; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12089; GFX90A-NEXT:    ;;#ASMSTART
12090; GFX90A-NEXT:    ; def s[4:5]
12091; GFX90A-NEXT:    ;;#ASMEND
12092; GFX90A-NEXT:    ;;#ASMSTART
12093; GFX90A-NEXT:    ; def s[6:7]
12094; GFX90A-NEXT:    ;;#ASMEND
12095; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s6
12096; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
12097; GFX90A-NEXT:    ;;#ASMSTART
12098; GFX90A-NEXT:    ; use s[8:9]
12099; GFX90A-NEXT:    ;;#ASMEND
12100; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12101;
12102; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_3:
12103; GFX940:       ; %bb.0:
12104; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12105; GFX940-NEXT:    ;;#ASMSTART
12106; GFX940-NEXT:    ; def s[0:1]
12107; GFX940-NEXT:    ;;#ASMEND
12108; GFX940-NEXT:    ;;#ASMSTART
12109; GFX940-NEXT:    ; def s[2:3]
12110; GFX940-NEXT:    ;;#ASMEND
12111; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s2
12112; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
12113; GFX940-NEXT:    ;;#ASMSTART
12114; GFX940-NEXT:    ; use s[8:9]
12115; GFX940-NEXT:    ;;#ASMEND
12116; GFX940-NEXT:    s_setpc_b64 s[30:31]
12117  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12118  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12119  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12120  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 (%vec0), 3-5 from %extract31 (%vec1).
12121  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 3>
12122  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12123  ret void
12124}
12125
; Shuffle of two <3 x bfloat> inputs into <4 x bfloat> with mask
; <5, 5, 1, 3>: lanes 0-1 = %vec1[2], lane 2 = %vec0[1], lane 3 = %vec1[0].
; Both inputs are read; the expected output shifts the first def's low
; register right by 16 before packing (presumably to reach the odd element
; %vec0[1]). The assertions are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing them by hand.
12126define void @s_shuffle_v4bf16_v3bf16__5_5_1_3() {
12127; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_3:
12128; GFX900:       ; %bb.0:
12129; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12130; GFX900-NEXT:    ;;#ASMSTART
12131; GFX900-NEXT:    ; def s[4:5]
12132; GFX900-NEXT:    ;;#ASMEND
12133; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
12134; GFX900-NEXT:    ;;#ASMSTART
12135; GFX900-NEXT:    ; def s[6:7]
12136; GFX900-NEXT:    ;;#ASMEND
12137; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s6
12138; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
12139; GFX900-NEXT:    ;;#ASMSTART
12140; GFX900-NEXT:    ; use s[8:9]
12141; GFX900-NEXT:    ;;#ASMEND
12142; GFX900-NEXT:    s_setpc_b64 s[30:31]
12143;
12144; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_3:
12145; GFX90A:       ; %bb.0:
12146; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12147; GFX90A-NEXT:    ;;#ASMSTART
12148; GFX90A-NEXT:    ; def s[4:5]
12149; GFX90A-NEXT:    ;;#ASMEND
12150; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
12151; GFX90A-NEXT:    ;;#ASMSTART
12152; GFX90A-NEXT:    ; def s[6:7]
12153; GFX90A-NEXT:    ;;#ASMEND
12154; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s6
12155; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
12156; GFX90A-NEXT:    ;;#ASMSTART
12157; GFX90A-NEXT:    ; use s[8:9]
12158; GFX90A-NEXT:    ;;#ASMEND
12159; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12160;
12161; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_3:
12162; GFX940:       ; %bb.0:
12163; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12164; GFX940-NEXT:    ;;#ASMSTART
12165; GFX940-NEXT:    ; def s[0:1]
12166; GFX940-NEXT:    ;;#ASMEND
12167; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
12168; GFX940-NEXT:    ;;#ASMSTART
12169; GFX940-NEXT:    ; def s[2:3]
12170; GFX940-NEXT:    ;;#ASMEND
12171; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s2
12172; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
12173; GFX940-NEXT:    ;;#ASMSTART
12174; GFX940-NEXT:    ; use s[8:9]
12175; GFX940-NEXT:    ;;#ASMEND
12176; GFX940-NEXT:    s_setpc_b64 s[30:31]
12177  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12178  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12179  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12180  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 (%vec0), 3-5 from %extract31 (%vec1).
12181  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 3>
12182  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12183  ret void
12184}
12185
; Shuffle of two <3 x bfloat> inputs into <4 x bfloat> with mask
; <5, 5, 2, 3>: lanes 0-1 = %vec1[2], lane 2 = %vec0[2], lane 3 = %vec1[0].
; Both inputs are read, and the expected output below contains two "def"
; blocks. The assertions are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing them by hand.
12186define void @s_shuffle_v4bf16_v3bf16__5_5_2_3() {
12187; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_3:
12188; GFX900:       ; %bb.0:
12189; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12190; GFX900-NEXT:    ;;#ASMSTART
12191; GFX900-NEXT:    ; def s[4:5]
12192; GFX900-NEXT:    ;;#ASMEND
12193; GFX900-NEXT:    ;;#ASMSTART
12194; GFX900-NEXT:    ; def s[6:7]
12195; GFX900-NEXT:    ;;#ASMEND
12196; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s6
12197; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
12198; GFX900-NEXT:    ;;#ASMSTART
12199; GFX900-NEXT:    ; use s[8:9]
12200; GFX900-NEXT:    ;;#ASMEND
12201; GFX900-NEXT:    s_setpc_b64 s[30:31]
12202;
12203; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_3:
12204; GFX90A:       ; %bb.0:
12205; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12206; GFX90A-NEXT:    ;;#ASMSTART
12207; GFX90A-NEXT:    ; def s[4:5]
12208; GFX90A-NEXT:    ;;#ASMEND
12209; GFX90A-NEXT:    ;;#ASMSTART
12210; GFX90A-NEXT:    ; def s[6:7]
12211; GFX90A-NEXT:    ;;#ASMEND
12212; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s6
12213; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
12214; GFX90A-NEXT:    ;;#ASMSTART
12215; GFX90A-NEXT:    ; use s[8:9]
12216; GFX90A-NEXT:    ;;#ASMEND
12217; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12218;
12219; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_3:
12220; GFX940:       ; %bb.0:
12221; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12222; GFX940-NEXT:    ;;#ASMSTART
12223; GFX940-NEXT:    ; def s[0:1]
12224; GFX940-NEXT:    ;;#ASMEND
12225; GFX940-NEXT:    ;;#ASMSTART
12226; GFX940-NEXT:    ; def s[2:3]
12227; GFX940-NEXT:    ;;#ASMEND
12228; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s2
12229; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
12230; GFX940-NEXT:    ;;#ASMSTART
12231; GFX940-NEXT:    ; use s[8:9]
12232; GFX940-NEXT:    ;;#ASMEND
12233; GFX940-NEXT:    s_setpc_b64 s[30:31]
12234  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12235  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12236  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12237  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 (%vec0), 3-5 from %extract31 (%vec1).
12238  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
12239  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12240  ret void
12241}
12242
; Shuffle of two <3 x bfloat> inputs into <4 x bfloat> with mask
; <5, 5, 4, 3>: lanes 0-1 = %vec1[2], lane 2 = %vec1[1], lane 3 = %vec1[0].
; No lane reads %vec0, and the expected output below contains a single "def".
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them rather than editing them by hand.
12243define void @s_shuffle_v4bf16_v3bf16__5_5_4_3() {
12244; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_3:
12245; GFX900:       ; %bb.0:
12246; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12247; GFX900-NEXT:    ;;#ASMSTART
12248; GFX900-NEXT:    ; def s[4:5]
12249; GFX900-NEXT:    ;;#ASMEND
12250; GFX900-NEXT:    s_lshr_b32 s6, s4, 16
12251; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s6, s4
12252; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
12253; GFX900-NEXT:    ;;#ASMSTART
12254; GFX900-NEXT:    ; use s[8:9]
12255; GFX900-NEXT:    ;;#ASMEND
12256; GFX900-NEXT:    s_setpc_b64 s[30:31]
12257;
12258; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_3:
12259; GFX90A:       ; %bb.0:
12260; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12261; GFX90A-NEXT:    ;;#ASMSTART
12262; GFX90A-NEXT:    ; def s[4:5]
12263; GFX90A-NEXT:    ;;#ASMEND
12264; GFX90A-NEXT:    s_lshr_b32 s6, s4, 16
12265; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s6, s4
12266; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
12267; GFX90A-NEXT:    ;;#ASMSTART
12268; GFX90A-NEXT:    ; use s[8:9]
12269; GFX90A-NEXT:    ;;#ASMEND
12270; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12271;
12272; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_3:
12273; GFX940:       ; %bb.0:
12274; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12275; GFX940-NEXT:    ;;#ASMSTART
12276; GFX940-NEXT:    ; def s[0:1]
12277; GFX940-NEXT:    ;;#ASMEND
12278; GFX940-NEXT:    s_lshr_b32 s2, s0, 16
12279; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s2, s0
12280; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
12281; GFX940-NEXT:    ;;#ASMSTART
12282; GFX940-NEXT:    ; use s[8:9]
12283; GFX940-NEXT:    ;;#ASMEND
12284; GFX940-NEXT:    s_setpc_b64 s[30:31]
12285  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12286  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12287  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12288  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 (%vec0), 3-5 from %extract31 (%vec1).
12289  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 3>
12290  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12291  ret void
12292}
12293
; Shuffle of two <3 x bfloat> inputs into <4 x bfloat> with mask
; <poison, 4, 4, 4>: lane 0 is unspecified, lanes 1-3 = %vec1[1].
; No lane reads %vec0; the expected output contains a single "def", placed
; directly in s[8:9], the register pair the "use" constraint requires.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them rather than editing them by hand.
12294define void @s_shuffle_v4bf16_v3bf16__u_4_4_4() {
12295; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__u_4_4_4:
12296; GFX900:       ; %bb.0:
12297; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12298; GFX900-NEXT:    ;;#ASMSTART
12299; GFX900-NEXT:    ; def s[8:9]
12300; GFX900-NEXT:    ;;#ASMEND
12301; GFX900-NEXT:    s_lshr_b32 s4, s8, 16
12302; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12303; GFX900-NEXT:    ;;#ASMSTART
12304; GFX900-NEXT:    ; use s[8:9]
12305; GFX900-NEXT:    ;;#ASMEND
12306; GFX900-NEXT:    s_setpc_b64 s[30:31]
12307;
12308; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__u_4_4_4:
12309; GFX90A:       ; %bb.0:
12310; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12311; GFX90A-NEXT:    ;;#ASMSTART
12312; GFX90A-NEXT:    ; def s[8:9]
12313; GFX90A-NEXT:    ;;#ASMEND
12314; GFX90A-NEXT:    s_lshr_b32 s4, s8, 16
12315; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12316; GFX90A-NEXT:    ;;#ASMSTART
12317; GFX90A-NEXT:    ; use s[8:9]
12318; GFX90A-NEXT:    ;;#ASMEND
12319; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12320;
12321; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__u_4_4_4:
12322; GFX940:       ; %bb.0:
12323; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12324; GFX940-NEXT:    ;;#ASMSTART
12325; GFX940-NEXT:    ; def s[8:9]
12326; GFX940-NEXT:    ;;#ASMEND
12327; GFX940-NEXT:    s_lshr_b32 s0, s8, 16
12328; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
12329; GFX940-NEXT:    ;;#ASMSTART
12330; GFX940-NEXT:    ; use s[8:9]
12331; GFX940-NEXT:    ;;#ASMEND
12332; GFX940-NEXT:    s_setpc_b64 s[30:31]
12333  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12334  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12335  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12336  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 (%vec0), 3-5 from %extract31 (%vec1).
12337  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 poison, i32 4, i32 4, i32 4>
12338  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12339  ret void
12340}
12341
; Shuffle of two <3 x bfloat> inputs into <4 x bfloat> with mask
; <0, 4, 4, 4>: lane 0 = %vec0[0], lanes 1-3 = %vec1[1].
; Both inputs are read; the expected output shifts the second def's low
; register right by 16 before packing (presumably to reach the odd element
; %vec1[1]). The assertions are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing them by hand.
12342define void @s_shuffle_v4bf16_v3bf16__0_4_4_4() {
12343; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_4_4_4:
12344; GFX900:       ; %bb.0:
12345; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12346; GFX900-NEXT:    ;;#ASMSTART
12347; GFX900-NEXT:    ; def s[4:5]
12348; GFX900-NEXT:    ;;#ASMEND
12349; GFX900-NEXT:    ;;#ASMSTART
12350; GFX900-NEXT:    ; def s[6:7]
12351; GFX900-NEXT:    ;;#ASMEND
12352; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
12353; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
12354; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
12355; GFX900-NEXT:    ;;#ASMSTART
12356; GFX900-NEXT:    ; use s[8:9]
12357; GFX900-NEXT:    ;;#ASMEND
12358; GFX900-NEXT:    s_setpc_b64 s[30:31]
12359;
12360; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_4_4_4:
12361; GFX90A:       ; %bb.0:
12362; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12363; GFX90A-NEXT:    ;;#ASMSTART
12364; GFX90A-NEXT:    ; def s[4:5]
12365; GFX90A-NEXT:    ;;#ASMEND
12366; GFX90A-NEXT:    ;;#ASMSTART
12367; GFX90A-NEXT:    ; def s[6:7]
12368; GFX90A-NEXT:    ;;#ASMEND
12369; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
12370; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
12371; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
12372; GFX90A-NEXT:    ;;#ASMSTART
12373; GFX90A-NEXT:    ; use s[8:9]
12374; GFX90A-NEXT:    ;;#ASMEND
12375; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12376;
12377; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_4_4_4:
12378; GFX940:       ; %bb.0:
12379; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12380; GFX940-NEXT:    ;;#ASMSTART
12381; GFX940-NEXT:    ; def s[0:1]
12382; GFX940-NEXT:    ;;#ASMEND
12383; GFX940-NEXT:    ;;#ASMSTART
12384; GFX940-NEXT:    ; def s[2:3]
12385; GFX940-NEXT:    ;;#ASMEND
12386; GFX940-NEXT:    s_lshr_b32 s1, s2, 16
12387; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
12388; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
12389; GFX940-NEXT:    ;;#ASMSTART
12390; GFX940-NEXT:    ; use s[8:9]
12391; GFX940-NEXT:    ;;#ASMEND
12392; GFX940-NEXT:    s_setpc_b64 s[30:31]
12393  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12394  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12395  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12396  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 (%vec0), 3-5 from %extract31 (%vec1).
12397  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
12398  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12399  ret void
12400}
12401
; Shuffle of two <3 x bfloat> inputs into <4 x bfloat> with mask
; <1, 4, 4, 4>: lane 0 = %vec0[1], lanes 1-3 = %vec1[1].
; Both inputs are read; the expected output shifts the low register of each
; def right by 16 before packing (presumably to reach the odd elements
; %vec0[1] and %vec1[1]). The assertions are autogenerated (see the NOTE at
; the top of the file); regenerate them rather than editing them by hand.
12402define void @s_shuffle_v4bf16_v3bf16__1_4_4_4() {
12403; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_4_4_4:
12404; GFX900:       ; %bb.0:
12405; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12406; GFX900-NEXT:    ;;#ASMSTART
12407; GFX900-NEXT:    ; def s[4:5]
12408; GFX900-NEXT:    ;;#ASMEND
12409; GFX900-NEXT:    ;;#ASMSTART
12410; GFX900-NEXT:    ; def s[6:7]
12411; GFX900-NEXT:    ;;#ASMEND
12412; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
12413; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
12414; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
12415; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
12416; GFX900-NEXT:    ;;#ASMSTART
12417; GFX900-NEXT:    ; use s[8:9]
12418; GFX900-NEXT:    ;;#ASMEND
12419; GFX900-NEXT:    s_setpc_b64 s[30:31]
12420;
12421; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_4_4_4:
12422; GFX90A:       ; %bb.0:
12423; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12424; GFX90A-NEXT:    ;;#ASMSTART
12425; GFX90A-NEXT:    ; def s[4:5]
12426; GFX90A-NEXT:    ;;#ASMEND
12427; GFX90A-NEXT:    ;;#ASMSTART
12428; GFX90A-NEXT:    ; def s[6:7]
12429; GFX90A-NEXT:    ;;#ASMEND
12430; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
12431; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
12432; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
12433; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
12434; GFX90A-NEXT:    ;;#ASMSTART
12435; GFX90A-NEXT:    ; use s[8:9]
12436; GFX90A-NEXT:    ;;#ASMEND
12437; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12438;
12439; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_4_4_4:
12440; GFX940:       ; %bb.0:
12441; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12442; GFX940-NEXT:    ;;#ASMSTART
12443; GFX940-NEXT:    ; def s[0:1]
12444; GFX940-NEXT:    ;;#ASMEND
12445; GFX940-NEXT:    ;;#ASMSTART
12446; GFX940-NEXT:    ; def s[2:3]
12447; GFX940-NEXT:    ;;#ASMEND
12448; GFX940-NEXT:    s_lshr_b32 s1, s2, 16
12449; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
12450; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
12451; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
12452; GFX940-NEXT:    ;;#ASMSTART
12453; GFX940-NEXT:    ; use s[8:9]
12454; GFX940-NEXT:    ;;#ASMEND
12455; GFX940-NEXT:    s_setpc_b64 s[30:31]
12456  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12457  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12458  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12459  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 (%vec0), 3-5 from %extract31 (%vec1).
12460  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 1, i32 4, i32 4, i32 4>
12461  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12462  ret void
12463}
12464
; Scalar-register shuffle test, mask <2, 4, 4, 4>.
; Mask indices 0-2 select lanes of %extract3 (from %vec0) and 3-5 select lanes
; of %extract31 (from %vec1), so the result is
; <vec0[2], vec1[1], vec1[1], vec1[1]>, consumed by a side-effecting inline asm
; pinned to s[8:9].
; In the autogenerated expectations only the first register of the second def
; is shifted right by 16; the second register of the first def feeds
; s_pack_ll_b32_b16 directly.
12465define void @s_shuffle_v4bf16_v3bf16__2_4_4_4() {
12466; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_4_4_4:
12467; GFX900:       ; %bb.0:
12468; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12469; GFX900-NEXT:    ;;#ASMSTART
12470; GFX900-NEXT:    ; def s[4:5]
12471; GFX900-NEXT:    ;;#ASMEND
12472; GFX900-NEXT:    ;;#ASMSTART
12473; GFX900-NEXT:    ; def s[6:7]
12474; GFX900-NEXT:    ;;#ASMEND
12475; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
12476; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
12477; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12478; GFX900-NEXT:    ;;#ASMSTART
12479; GFX900-NEXT:    ; use s[8:9]
12480; GFX900-NEXT:    ;;#ASMEND
12481; GFX900-NEXT:    s_setpc_b64 s[30:31]
12482;
12483; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_4_4_4:
12484; GFX90A:       ; %bb.0:
12485; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12486; GFX90A-NEXT:    ;;#ASMSTART
12487; GFX90A-NEXT:    ; def s[4:5]
12488; GFX90A-NEXT:    ;;#ASMEND
12489; GFX90A-NEXT:    ;;#ASMSTART
12490; GFX90A-NEXT:    ; def s[6:7]
12491; GFX90A-NEXT:    ;;#ASMEND
12492; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
12493; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
12494; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12495; GFX90A-NEXT:    ;;#ASMSTART
12496; GFX90A-NEXT:    ; use s[8:9]
12497; GFX90A-NEXT:    ;;#ASMEND
12498; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12499;
12500; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_4_4_4:
12501; GFX940:       ; %bb.0:
12502; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12503; GFX940-NEXT:    ;;#ASMSTART
12504; GFX940-NEXT:    ; def s[0:1]
12505; GFX940-NEXT:    ;;#ASMEND
12506; GFX940-NEXT:    ;;#ASMSTART
12507; GFX940-NEXT:    ; def s[2:3]
12508; GFX940-NEXT:    ;;#ASMEND
12509; GFX940-NEXT:    s_lshr_b32 s0, s2, 16
12510; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
12511; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
12512; GFX940-NEXT:    ;;#ASMSTART
12513; GFX940-NEXT:    ; use s[8:9]
12514; GFX940-NEXT:    ;;#ASMEND
12515; GFX940-NEXT:    s_setpc_b64 s[30:31]
12516  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12517  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12518  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12519  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12520  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 2, i32 4, i32 4, i32 4>
12521  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12522  ret void
12523}
12524
; Scalar-register shuffle test, mask <3, 4, 4, 4>.
; Mask indices 3-5 select lanes 0-2 of the second operand %extract31 (from
; %vec1), so the result is <vec1[0], vec1[1], vec1[1], vec1[1]>; no lane of
; %vec0 is referenced. The result is consumed by a side-effecting inline asm
; pinned to s[8:9].
; The autogenerated expectations contain a single "; def", placed directly in
; s[8:9]; s8 is left untouched and only s9 is rebuilt from s8 shifted right
; by 16.
12525define void @s_shuffle_v4bf16_v3bf16__3_4_4_4() {
12526; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__3_4_4_4:
12527; GFX900:       ; %bb.0:
12528; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12529; GFX900-NEXT:    ;;#ASMSTART
12530; GFX900-NEXT:    ; def s[8:9]
12531; GFX900-NEXT:    ;;#ASMEND
12532; GFX900-NEXT:    s_lshr_b32 s4, s8, 16
12533; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12534; GFX900-NEXT:    ;;#ASMSTART
12535; GFX900-NEXT:    ; use s[8:9]
12536; GFX900-NEXT:    ;;#ASMEND
12537; GFX900-NEXT:    s_setpc_b64 s[30:31]
12538;
12539; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__3_4_4_4:
12540; GFX90A:       ; %bb.0:
12541; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12542; GFX90A-NEXT:    ;;#ASMSTART
12543; GFX90A-NEXT:    ; def s[8:9]
12544; GFX90A-NEXT:    ;;#ASMEND
12545; GFX90A-NEXT:    s_lshr_b32 s4, s8, 16
12546; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12547; GFX90A-NEXT:    ;;#ASMSTART
12548; GFX90A-NEXT:    ; use s[8:9]
12549; GFX90A-NEXT:    ;;#ASMEND
12550; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12551;
12552; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__3_4_4_4:
12553; GFX940:       ; %bb.0:
12554; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12555; GFX940-NEXT:    ;;#ASMSTART
12556; GFX940-NEXT:    ; def s[8:9]
12557; GFX940-NEXT:    ;;#ASMEND
12558; GFX940-NEXT:    s_lshr_b32 s0, s8, 16
12559; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
12560; GFX940-NEXT:    ;;#ASMSTART
12561; GFX940-NEXT:    ; use s[8:9]
12562; GFX940-NEXT:    ;;#ASMEND
12563; GFX940-NEXT:    s_setpc_b64 s[30:31]
12564  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12565  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12566  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12567  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12568  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 3, i32 4, i32 4, i32 4>
12569  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12570  ret void
12571}
12572
; Scalar-register shuffle test, mask <4, 4, 4, 4>.
; Index 4 selects lane 1 of the second operand %extract31 (from %vec1), so the
; result is that single lane repeated in all four positions; no lane of %vec0
; is referenced. The result is consumed by a side-effecting inline asm pinned
; to s[8:9].
; The autogenerated expectations contain a single "; def"; s8 is built with
; one shift plus s_pack_ll_b32_b16 and then copied into s9 with s_mov_b32.
12573define void @s_shuffle_v4bf16_v3bf16__4_4_4_4() {
12574; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_4_4_4:
12575; GFX900:       ; %bb.0:
12576; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12577; GFX900-NEXT:    ;;#ASMSTART
12578; GFX900-NEXT:    ; def s[4:5]
12579; GFX900-NEXT:    ;;#ASMEND
12580; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
12581; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
12582; GFX900-NEXT:    s_mov_b32 s9, s8
12583; GFX900-NEXT:    ;;#ASMSTART
12584; GFX900-NEXT:    ; use s[8:9]
12585; GFX900-NEXT:    ;;#ASMEND
12586; GFX900-NEXT:    s_setpc_b64 s[30:31]
12587;
12588; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_4_4_4:
12589; GFX90A:       ; %bb.0:
12590; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12591; GFX90A-NEXT:    ;;#ASMSTART
12592; GFX90A-NEXT:    ; def s[4:5]
12593; GFX90A-NEXT:    ;;#ASMEND
12594; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
12595; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
12596; GFX90A-NEXT:    s_mov_b32 s9, s8
12597; GFX90A-NEXT:    ;;#ASMSTART
12598; GFX90A-NEXT:    ; use s[8:9]
12599; GFX90A-NEXT:    ;;#ASMEND
12600; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12601;
12602; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_4_4_4:
12603; GFX940:       ; %bb.0:
12604; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12605; GFX940-NEXT:    ;;#ASMSTART
12606; GFX940-NEXT:    ; def s[0:1]
12607; GFX940-NEXT:    ;;#ASMEND
12608; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
12609; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
12610; GFX940-NEXT:    s_mov_b32 s9, s8
12611; GFX940-NEXT:    ;;#ASMSTART
12612; GFX940-NEXT:    ; use s[8:9]
12613; GFX940-NEXT:    ;;#ASMEND
12614; GFX940-NEXT:    s_setpc_b64 s[30:31]
12615  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12616  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12617  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12618  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12619  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
12620  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12621  ret void
12622}
12623
; Scalar-register shuffle test, mask <5, 4, 4, 4>.
; Mask indices 3-5 select lanes 0-2 of the second operand %extract31 (from
; %vec1), so the result is <vec1[2], vec1[1], vec1[1], vec1[1]>; no lane of
; %vec0 is referenced. The result is consumed by a side-effecting inline asm
; pinned to s[8:9].
; The autogenerated expectations contain a single "; def"; its first register
; is shifted right by 16 and both outputs are formed with s_pack_ll_b32_b16.
12624define void @s_shuffle_v4bf16_v3bf16__5_4_4_4() {
12625; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_4_4:
12626; GFX900:       ; %bb.0:
12627; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12628; GFX900-NEXT:    ;;#ASMSTART
12629; GFX900-NEXT:    ; def s[4:5]
12630; GFX900-NEXT:    ;;#ASMEND
12631; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
12632; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
12633; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12634; GFX900-NEXT:    ;;#ASMSTART
12635; GFX900-NEXT:    ; use s[8:9]
12636; GFX900-NEXT:    ;;#ASMEND
12637; GFX900-NEXT:    s_setpc_b64 s[30:31]
12638;
12639; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_4_4:
12640; GFX90A:       ; %bb.0:
12641; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12642; GFX90A-NEXT:    ;;#ASMSTART
12643; GFX90A-NEXT:    ; def s[4:5]
12644; GFX90A-NEXT:    ;;#ASMEND
12645; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
12646; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
12647; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12648; GFX90A-NEXT:    ;;#ASMSTART
12649; GFX90A-NEXT:    ; use s[8:9]
12650; GFX90A-NEXT:    ;;#ASMEND
12651; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12652;
12653; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_4_4:
12654; GFX940:       ; %bb.0:
12655; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12656; GFX940-NEXT:    ;;#ASMSTART
12657; GFX940-NEXT:    ; def s[0:1]
12658; GFX940-NEXT:    ;;#ASMEND
12659; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
12660; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
12661; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
12662; GFX940-NEXT:    ;;#ASMSTART
12663; GFX940-NEXT:    ; use s[8:9]
12664; GFX940-NEXT:    ;;#ASMEND
12665; GFX940-NEXT:    s_setpc_b64 s[30:31]
12666  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12667  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12668  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12669  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12670  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 4, i32 4>
12671  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12672  ret void
12673}
12674
; Scalar-register shuffle test, mask <5, poison, 4, 4> (the "u" in the name).
; Mask indices 3-5 select lanes 0-2 of the second operand %extract31 (from
; %vec1), so the result is <vec1[2], poison, vec1[1], vec1[1]>; no lane of
; %vec0 is referenced. The result is consumed by a side-effecting inline asm
; pinned to s[8:9].
; The autogenerated expectations contain a single "; def"; with result lane 1
; unconstrained, s8 is a plain s_mov_b32 of the def's second register, while
; s9 still needs a shift and s_pack_ll_b32_b16.
12675define void @s_shuffle_v4bf16_v3bf16__5_u_4_4() {
12676; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_4_4:
12677; GFX900:       ; %bb.0:
12678; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12679; GFX900-NEXT:    ;;#ASMSTART
12680; GFX900-NEXT:    ; def s[4:5]
12681; GFX900-NEXT:    ;;#ASMEND
12682; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
12683; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12684; GFX900-NEXT:    s_mov_b32 s8, s5
12685; GFX900-NEXT:    ;;#ASMSTART
12686; GFX900-NEXT:    ; use s[8:9]
12687; GFX900-NEXT:    ;;#ASMEND
12688; GFX900-NEXT:    s_setpc_b64 s[30:31]
12689;
12690; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_4_4:
12691; GFX90A:       ; %bb.0:
12692; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12693; GFX90A-NEXT:    ;;#ASMSTART
12694; GFX90A-NEXT:    ; def s[4:5]
12695; GFX90A-NEXT:    ;;#ASMEND
12696; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
12697; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12698; GFX90A-NEXT:    s_mov_b32 s8, s5
12699; GFX90A-NEXT:    ;;#ASMSTART
12700; GFX90A-NEXT:    ; use s[8:9]
12701; GFX90A-NEXT:    ;;#ASMEND
12702; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12703;
12704; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_4_4:
12705; GFX940:       ; %bb.0:
12706; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12707; GFX940-NEXT:    ;;#ASMSTART
12708; GFX940-NEXT:    ; def s[0:1]
12709; GFX940-NEXT:    ;;#ASMEND
12710; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
12711; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
12712; GFX940-NEXT:    s_mov_b32 s8, s1
12713; GFX940-NEXT:    ;;#ASMSTART
12714; GFX940-NEXT:    ; use s[8:9]
12715; GFX940-NEXT:    ;;#ASMEND
12716; GFX940-NEXT:    s_setpc_b64 s[30:31]
12717  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12718  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12719  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12720  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12721  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 4, i32 4>
12722  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12723  ret void
12724}
12725
; Scalar-register shuffle test, mask <5, 0, 4, 4>.
; Mask indices 0-2 select lanes of %extract3 (from %vec0) and 3-5 select lanes
; of %extract31 (from %vec1), so the result is
; <vec1[2], vec0[0], vec1[1], vec1[1]>, consumed by a side-effecting inline asm
; pinned to s[8:9].
; In the autogenerated expectations both defs are kept; s8 is packed straight
; from the two source registers with no shift, and s9 is packed from the
; second def's first register shifted right by 16.
12726define void @s_shuffle_v4bf16_v3bf16__5_0_4_4() {
12727; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_4_4:
12728; GFX900:       ; %bb.0:
12729; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12730; GFX900-NEXT:    ;;#ASMSTART
12731; GFX900-NEXT:    ; def s[4:5]
12732; GFX900-NEXT:    ;;#ASMEND
12733; GFX900-NEXT:    ;;#ASMSTART
12734; GFX900-NEXT:    ; def s[6:7]
12735; GFX900-NEXT:    ;;#ASMEND
12736; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
12737; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
12738; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12739; GFX900-NEXT:    ;;#ASMSTART
12740; GFX900-NEXT:    ; use s[8:9]
12741; GFX900-NEXT:    ;;#ASMEND
12742; GFX900-NEXT:    s_setpc_b64 s[30:31]
12743;
12744; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_4_4:
12745; GFX90A:       ; %bb.0:
12746; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12747; GFX90A-NEXT:    ;;#ASMSTART
12748; GFX90A-NEXT:    ; def s[4:5]
12749; GFX90A-NEXT:    ;;#ASMEND
12750; GFX90A-NEXT:    ;;#ASMSTART
12751; GFX90A-NEXT:    ; def s[6:7]
12752; GFX90A-NEXT:    ;;#ASMEND
12753; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
12754; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
12755; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12756; GFX90A-NEXT:    ;;#ASMSTART
12757; GFX90A-NEXT:    ; use s[8:9]
12758; GFX90A-NEXT:    ;;#ASMEND
12759; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12760;
12761; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_4_4:
12762; GFX940:       ; %bb.0:
12763; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12764; GFX940-NEXT:    ;;#ASMSTART
12765; GFX940-NEXT:    ; def s[0:1]
12766; GFX940-NEXT:    ;;#ASMEND
12767; GFX940-NEXT:    ;;#ASMSTART
12768; GFX940-NEXT:    ; def s[2:3]
12769; GFX940-NEXT:    ;;#ASMEND
12770; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
12771; GFX940-NEXT:    s_lshr_b32 s0, s2, 16
12772; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
12773; GFX940-NEXT:    ;;#ASMSTART
12774; GFX940-NEXT:    ; use s[8:9]
12775; GFX940-NEXT:    ;;#ASMEND
12776; GFX940-NEXT:    s_setpc_b64 s[30:31]
12777  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12778  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12779  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12780  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12781  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 4, i32 4>
12782  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12783  ret void
12784}
12785
; Scalar-register shuffle test, mask <5, 1, 4, 4>.
; Mask indices 0-2 select lanes of %extract3 (from %vec0) and 3-5 select lanes
; of %extract31 (from %vec1), so the result is
; <vec1[2], vec0[1], vec1[1], vec1[1]>, consumed by a side-effecting inline asm
; pinned to s[8:9].
; In the autogenerated expectations both defs are kept, and the shift of the
; first def's register is emitted between the two asm blocks rather than
; after them.
12786define void @s_shuffle_v4bf16_v3bf16__5_1_4_4() {
12787; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_4_4:
12788; GFX900:       ; %bb.0:
12789; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12790; GFX900-NEXT:    ;;#ASMSTART
12791; GFX900-NEXT:    ; def s[4:5]
12792; GFX900-NEXT:    ;;#ASMEND
12793; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
12794; GFX900-NEXT:    ;;#ASMSTART
12795; GFX900-NEXT:    ; def s[6:7]
12796; GFX900-NEXT:    ;;#ASMEND
12797; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
12798; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
12799; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12800; GFX900-NEXT:    ;;#ASMSTART
12801; GFX900-NEXT:    ; use s[8:9]
12802; GFX900-NEXT:    ;;#ASMEND
12803; GFX900-NEXT:    s_setpc_b64 s[30:31]
12804;
12805; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_4_4:
12806; GFX90A:       ; %bb.0:
12807; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12808; GFX90A-NEXT:    ;;#ASMSTART
12809; GFX90A-NEXT:    ; def s[4:5]
12810; GFX90A-NEXT:    ;;#ASMEND
12811; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
12812; GFX90A-NEXT:    ;;#ASMSTART
12813; GFX90A-NEXT:    ; def s[6:7]
12814; GFX90A-NEXT:    ;;#ASMEND
12815; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
12816; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
12817; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12818; GFX90A-NEXT:    ;;#ASMSTART
12819; GFX90A-NEXT:    ; use s[8:9]
12820; GFX90A-NEXT:    ;;#ASMEND
12821; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12822;
12823; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_4_4:
12824; GFX940:       ; %bb.0:
12825; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12826; GFX940-NEXT:    ;;#ASMSTART
12827; GFX940-NEXT:    ; def s[0:1]
12828; GFX940-NEXT:    ;;#ASMEND
12829; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
12830; GFX940-NEXT:    ;;#ASMSTART
12831; GFX940-NEXT:    ; def s[2:3]
12832; GFX940-NEXT:    ;;#ASMEND
12833; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
12834; GFX940-NEXT:    s_lshr_b32 s0, s2, 16
12835; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
12836; GFX940-NEXT:    ;;#ASMSTART
12837; GFX940-NEXT:    ; use s[8:9]
12838; GFX940-NEXT:    ;;#ASMEND
12839; GFX940-NEXT:    s_setpc_b64 s[30:31]
12840  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12841  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12842  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12843  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12844  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 4, i32 4>
12845  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12846  ret void
12847}
12848
; Scalar-register shuffle test, mask <5, 2, 4, 4>.
; Mask indices 0-2 select lanes of %extract3 (from %vec0) and 3-5 select lanes
; of %extract31 (from %vec1), so the result is
; <vec1[2], vec0[2], vec1[1], vec1[1]>, consumed by a side-effecting inline asm
; pinned to s[8:9].
; In the autogenerated expectations both defs are kept; s8 is packed from the
; second register of each def, and s9 from the second def's first register
; shifted right by 16.
12849define void @s_shuffle_v4bf16_v3bf16__5_2_4_4() {
12850; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_4_4:
12851; GFX900:       ; %bb.0:
12852; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12853; GFX900-NEXT:    ;;#ASMSTART
12854; GFX900-NEXT:    ; def s[4:5]
12855; GFX900-NEXT:    ;;#ASMEND
12856; GFX900-NEXT:    ;;#ASMSTART
12857; GFX900-NEXT:    ; def s[6:7]
12858; GFX900-NEXT:    ;;#ASMEND
12859; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
12860; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
12861; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12862; GFX900-NEXT:    ;;#ASMSTART
12863; GFX900-NEXT:    ; use s[8:9]
12864; GFX900-NEXT:    ;;#ASMEND
12865; GFX900-NEXT:    s_setpc_b64 s[30:31]
12866;
12867; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_4_4:
12868; GFX90A:       ; %bb.0:
12869; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12870; GFX90A-NEXT:    ;;#ASMSTART
12871; GFX90A-NEXT:    ; def s[4:5]
12872; GFX90A-NEXT:    ;;#ASMEND
12873; GFX90A-NEXT:    ;;#ASMSTART
12874; GFX90A-NEXT:    ; def s[6:7]
12875; GFX90A-NEXT:    ;;#ASMEND
12876; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
12877; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
12878; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12879; GFX90A-NEXT:    ;;#ASMSTART
12880; GFX90A-NEXT:    ; use s[8:9]
12881; GFX90A-NEXT:    ;;#ASMEND
12882; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12883;
12884; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_4_4:
12885; GFX940:       ; %bb.0:
12886; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12887; GFX940-NEXT:    ;;#ASMSTART
12888; GFX940-NEXT:    ; def s[0:1]
12889; GFX940-NEXT:    ;;#ASMEND
12890; GFX940-NEXT:    ;;#ASMSTART
12891; GFX940-NEXT:    ; def s[2:3]
12892; GFX940-NEXT:    ;;#ASMEND
12893; GFX940-NEXT:    s_lshr_b32 s0, s2, 16
12894; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s1
12895; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
12896; GFX940-NEXT:    ;;#ASMSTART
12897; GFX940-NEXT:    ; use s[8:9]
12898; GFX940-NEXT:    ;;#ASMEND
12899; GFX940-NEXT:    s_setpc_b64 s[30:31]
12900  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12901  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12902  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12903  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12904  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 4, i32 4>
12905  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12906  ret void
12907}
12908
; Scalar-register shuffle test, mask <5, 3, 4, 4>.
; Mask indices 3-5 select lanes 0-2 of the second operand %extract31 (from
; %vec1), so the result is <vec1[2], vec1[0], vec1[1], vec1[1]>; no lane of
; %vec0 is referenced. The result is consumed by a side-effecting inline asm
; pinned to s[8:9].
; The autogenerated expectations contain a single "; def"; s8 is packed from
; its two registers before the first one is shifted right by 16 to form s9.
12909define void @s_shuffle_v4bf16_v3bf16__5_3_4_4() {
12910; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_4_4:
12911; GFX900:       ; %bb.0:
12912; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12913; GFX900-NEXT:    ;;#ASMSTART
12914; GFX900-NEXT:    ; def s[4:5]
12915; GFX900-NEXT:    ;;#ASMEND
12916; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
12917; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
12918; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12919; GFX900-NEXT:    ;;#ASMSTART
12920; GFX900-NEXT:    ; use s[8:9]
12921; GFX900-NEXT:    ;;#ASMEND
12922; GFX900-NEXT:    s_setpc_b64 s[30:31]
12923;
12924; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_4_4:
12925; GFX90A:       ; %bb.0:
12926; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12927; GFX90A-NEXT:    ;;#ASMSTART
12928; GFX90A-NEXT:    ; def s[4:5]
12929; GFX90A-NEXT:    ;;#ASMEND
12930; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
12931; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
12932; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12933; GFX90A-NEXT:    ;;#ASMSTART
12934; GFX90A-NEXT:    ; use s[8:9]
12935; GFX90A-NEXT:    ;;#ASMEND
12936; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12937;
12938; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_4_4:
12939; GFX940:       ; %bb.0:
12940; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12941; GFX940-NEXT:    ;;#ASMSTART
12942; GFX940-NEXT:    ; def s[0:1]
12943; GFX940-NEXT:    ;;#ASMEND
12944; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
12945; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
12946; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
12947; GFX940-NEXT:    ;;#ASMSTART
12948; GFX940-NEXT:    ; use s[8:9]
12949; GFX940-NEXT:    ;;#ASMEND
12950; GFX940-NEXT:    s_setpc_b64 s[30:31]
12951  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
12952  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
12953  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12954  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
12955  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 4, i32 4>
12956  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
12957  ret void
12958}
12959
; Scalar-register shuffle test, mask <5, 5, 4, 4>.
; Mask indices 3-5 select lanes 0-2 of the second operand %extract31 (from
; %vec1), so the result is <vec1[2], vec1[2], vec1[1], vec1[1]>; no lane of
; %vec0 is referenced. The result is consumed by a side-effecting inline asm
; pinned to s[8:9].
; The autogenerated expectations contain a single "; def"; each output
; register is an s_pack_ll_b32_b16 of one value with itself (s9 from the
; shifted first register, s8 from the second register).
12960define void @s_shuffle_v4bf16_v3bf16__5_5_4_4() {
12961; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_4:
12962; GFX900:       ; %bb.0:
12963; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12964; GFX900-NEXT:    ;;#ASMSTART
12965; GFX900-NEXT:    ; def s[4:5]
12966; GFX900-NEXT:    ;;#ASMEND
12967; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
12968; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12969; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
12970; GFX900-NEXT:    ;;#ASMSTART
12971; GFX900-NEXT:    ; use s[8:9]
12972; GFX900-NEXT:    ;;#ASMEND
12973; GFX900-NEXT:    s_setpc_b64 s[30:31]
12974;
12975; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_4:
12976; GFX90A:       ; %bb.0:
12977; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12978; GFX90A-NEXT:    ;;#ASMSTART
12979; GFX90A-NEXT:    ; def s[4:5]
12980; GFX90A-NEXT:    ;;#ASMEND
12981; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
12982; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s4
12983; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
12984; GFX90A-NEXT:    ;;#ASMSTART
12985; GFX90A-NEXT:    ; use s[8:9]
12986; GFX90A-NEXT:    ;;#ASMEND
12987; GFX90A-NEXT:    s_setpc_b64 s[30:31]
12988;
12989; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_4:
12990; GFX940:       ; %bb.0:
12991; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12992; GFX940-NEXT:    ;;#ASMSTART
12993; GFX940-NEXT:    ; def s[0:1]
12994; GFX940-NEXT:    ;;#ASMEND
12995; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
12996; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s0
12997; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
12998; GFX940-NEXT:    ;;#ASMSTART
12999; GFX940-NEXT:    ; use s[8:9]
13000; GFX940-NEXT:    ;;#ASMEND
13001; GFX940-NEXT:    s_setpc_b64 s[30:31]
13002  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13003  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13004  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13005  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13006  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 4>
13007  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13008  ret void
13009}
13010
; Scalar-register shuffle test, mask <5, 5, poison, 4> (the "u" in the name).
; Mask indices 3-5 select lanes 0-2 of the second operand %extract31 (from
; %vec1), so the result is <vec1[2], vec1[2], poison, vec1[1]>; no lane of
; %vec0 is referenced. The result is consumed by a side-effecting inline asm
; pinned to s[8:9].
; The autogenerated expectations contain a single "; def" and no shift: s8 is
; an s_pack_ll_b32_b16 of the second register with itself, and with result
; lane 2 unconstrained s9 is a plain s_mov_b32 of the first register.
13011define void @s_shuffle_v4bf16_v3bf16__5_5_u_4() {
13012; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_4:
13013; GFX900:       ; %bb.0:
13014; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13015; GFX900-NEXT:    ;;#ASMSTART
13016; GFX900-NEXT:    ; def s[4:5]
13017; GFX900-NEXT:    ;;#ASMEND
13018; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
13019; GFX900-NEXT:    s_mov_b32 s9, s4
13020; GFX900-NEXT:    ;;#ASMSTART
13021; GFX900-NEXT:    ; use s[8:9]
13022; GFX900-NEXT:    ;;#ASMEND
13023; GFX900-NEXT:    s_setpc_b64 s[30:31]
13024;
13025; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_4:
13026; GFX90A:       ; %bb.0:
13027; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13028; GFX90A-NEXT:    ;;#ASMSTART
13029; GFX90A-NEXT:    ; def s[4:5]
13030; GFX90A-NEXT:    ;;#ASMEND
13031; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
13032; GFX90A-NEXT:    s_mov_b32 s9, s4
13033; GFX90A-NEXT:    ;;#ASMSTART
13034; GFX90A-NEXT:    ; use s[8:9]
13035; GFX90A-NEXT:    ;;#ASMEND
13036; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13037;
13038; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_4:
13039; GFX940:       ; %bb.0:
13040; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13041; GFX940-NEXT:    ;;#ASMSTART
13042; GFX940-NEXT:    ; def s[0:1]
13043; GFX940-NEXT:    ;;#ASMEND
13044; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
13045; GFX940-NEXT:    s_mov_b32 s9, s0
13046; GFX940-NEXT:    ;;#ASMSTART
13047; GFX940-NEXT:    ; use s[8:9]
13048; GFX940-NEXT:    ;;#ASMEND
13049; GFX940-NEXT:    s_setpc_b64 s[30:31]
13050  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13051  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13052  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13053  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13054  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 4>
13055  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13056  ret void
13057}
13058
; Scalar-register shuffle test, mask <5, 5, 0, 4>.
; Mask indices 0-2 select lanes of %extract3 (from %vec0) and 3-5 select lanes
; of %extract31 (from %vec1), so the result is
; <vec1[2], vec1[2], vec0[0], vec1[1]>, consumed by a side-effecting inline asm
; pinned to s[8:9].
; In the autogenerated expectations both defs are kept; only the second def's
; first register is shifted right by 16, s9 packs the unshifted first def with
; it, and s8 packs the second def's second register with itself.
13059define void @s_shuffle_v4bf16_v3bf16__5_5_0_4() {
13060; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_4:
13061; GFX900:       ; %bb.0:
13062; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13063; GFX900-NEXT:    ;;#ASMSTART
13064; GFX900-NEXT:    ; def s[4:5]
13065; GFX900-NEXT:    ;;#ASMEND
13066; GFX900-NEXT:    ;;#ASMSTART
13067; GFX900-NEXT:    ; def s[6:7]
13068; GFX900-NEXT:    ;;#ASMEND
13069; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
13070; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s5
13071; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
13072; GFX900-NEXT:    ;;#ASMSTART
13073; GFX900-NEXT:    ; use s[8:9]
13074; GFX900-NEXT:    ;;#ASMEND
13075; GFX900-NEXT:    s_setpc_b64 s[30:31]
13076;
13077; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_4:
13078; GFX90A:       ; %bb.0:
13079; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13080; GFX90A-NEXT:    ;;#ASMSTART
13081; GFX90A-NEXT:    ; def s[4:5]
13082; GFX90A-NEXT:    ;;#ASMEND
13083; GFX90A-NEXT:    ;;#ASMSTART
13084; GFX90A-NEXT:    ; def s[6:7]
13085; GFX90A-NEXT:    ;;#ASMEND
13086; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
13087; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s5
13088; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
13089; GFX90A-NEXT:    ;;#ASMSTART
13090; GFX90A-NEXT:    ; use s[8:9]
13091; GFX90A-NEXT:    ;;#ASMEND
13092; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13093;
13094; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_4:
13095; GFX940:       ; %bb.0:
13096; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13097; GFX940-NEXT:    ;;#ASMSTART
13098; GFX940-NEXT:    ; def s[0:1]
13099; GFX940-NEXT:    ;;#ASMEND
13100; GFX940-NEXT:    ;;#ASMSTART
13101; GFX940-NEXT:    ; def s[2:3]
13102; GFX940-NEXT:    ;;#ASMEND
13103; GFX940-NEXT:    s_lshr_b32 s1, s2, 16
13104; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s1
13105; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
13106; GFX940-NEXT:    ;;#ASMSTART
13107; GFX940-NEXT:    ; use s[8:9]
13108; GFX940-NEXT:    ;;#ASMEND
13109; GFX940-NEXT:    s_setpc_b64 s[30:31]
13110  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13111  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13112  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13113  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13114  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 4>
13115  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13116  ret void
13117}
13118
; Scalar-register shuffle test, mask <5, 5, 1, 4>.
; Mask indices 0-2 select lanes of %extract3 (from %vec0) and 3-5 select lanes
; of %extract31 (from %vec1), so the result is
; <vec1[2], vec1[2], vec0[1], vec1[1]>, consumed by a side-effecting inline asm
; pinned to s[8:9].
; In the autogenerated expectations both defs are kept; the first register of
; each def is shifted right by 16 before s9 is packed from the pair, and s8
; packs the second def's second register with itself.
13119define void @s_shuffle_v4bf16_v3bf16__5_5_1_4() {
13120; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_4:
13121; GFX900:       ; %bb.0:
13122; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13123; GFX900-NEXT:    ;;#ASMSTART
13124; GFX900-NEXT:    ; def s[4:5]
13125; GFX900-NEXT:    ;;#ASMEND
13126; GFX900-NEXT:    ;;#ASMSTART
13127; GFX900-NEXT:    ; def s[6:7]
13128; GFX900-NEXT:    ;;#ASMEND
13129; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
13130; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
13131; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s5
13132; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
13133; GFX900-NEXT:    ;;#ASMSTART
13134; GFX900-NEXT:    ; use s[8:9]
13135; GFX900-NEXT:    ;;#ASMEND
13136; GFX900-NEXT:    s_setpc_b64 s[30:31]
13137;
13138; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_4:
13139; GFX90A:       ; %bb.0:
13140; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13141; GFX90A-NEXT:    ;;#ASMSTART
13142; GFX90A-NEXT:    ; def s[4:5]
13143; GFX90A-NEXT:    ;;#ASMEND
13144; GFX90A-NEXT:    ;;#ASMSTART
13145; GFX90A-NEXT:    ; def s[6:7]
13146; GFX90A-NEXT:    ;;#ASMEND
13147; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
13148; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
13149; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s5
13150; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
13151; GFX90A-NEXT:    ;;#ASMSTART
13152; GFX90A-NEXT:    ; use s[8:9]
13153; GFX90A-NEXT:    ;;#ASMEND
13154; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13155;
13156; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_4:
13157; GFX940:       ; %bb.0:
13158; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13159; GFX940-NEXT:    ;;#ASMSTART
13160; GFX940-NEXT:    ; def s[0:1]
13161; GFX940-NEXT:    ;;#ASMEND
13162; GFX940-NEXT:    ;;#ASMSTART
13163; GFX940-NEXT:    ; def s[2:3]
13164; GFX940-NEXT:    ;;#ASMEND
13165; GFX940-NEXT:    s_lshr_b32 s1, s2, 16
13166; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
13167; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s1
13168; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
13169; GFX940-NEXT:    ;;#ASMSTART
13170; GFX940-NEXT:    ; use s[8:9]
13171; GFX940-NEXT:    ;;#ASMEND
13172; GFX940-NEXT:    s_setpc_b64 s[30:31]
13173  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13174  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13175  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13176  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13177  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 4>
13178  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13179  ret void
13180}
13181
; Scalar-register ("s" constraint) shuffle of two <3 x bfloat> values into a
; <4 x bfloat>, mask <5, 5, 2, 4>. Mask indices 0-2 select from %extract3
; (elements 0-2 of %vec0) and 3-5 select from %extract31 (elements 0-2 of
; %vec1), so the result is { vec1[2], vec1[2], vec0[2], vec1[1] }.
; The result is handed to the "use" asm in the fixed pair s[8:9].
13182define void @s_shuffle_v4bf16_v3bf16__5_5_2_4() {
13183; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_4:
13184; GFX900:       ; %bb.0:
13185; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13186; GFX900-NEXT:    ;;#ASMSTART
13187; GFX900-NEXT:    ; def s[4:5]
13188; GFX900-NEXT:    ;;#ASMEND
13189; GFX900-NEXT:    ;;#ASMSTART
13190; GFX900-NEXT:    ; def s[6:7]
13191; GFX900-NEXT:    ;;#ASMEND
13192; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
13193; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
13194; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
13195; GFX900-NEXT:    ;;#ASMSTART
13196; GFX900-NEXT:    ; use s[8:9]
13197; GFX900-NEXT:    ;;#ASMEND
13198; GFX900-NEXT:    s_setpc_b64 s[30:31]
13199;
13200; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_4:
13201; GFX90A:       ; %bb.0:
13202; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13203; GFX90A-NEXT:    ;;#ASMSTART
13204; GFX90A-NEXT:    ; def s[4:5]
13205; GFX90A-NEXT:    ;;#ASMEND
13206; GFX90A-NEXT:    ;;#ASMSTART
13207; GFX90A-NEXT:    ; def s[6:7]
13208; GFX90A-NEXT:    ;;#ASMEND
13209; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
13210; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s4
13211; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
13212; GFX90A-NEXT:    ;;#ASMSTART
13213; GFX90A-NEXT:    ; use s[8:9]
13214; GFX90A-NEXT:    ;;#ASMEND
13215; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13216;
13217; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_4:
13218; GFX940:       ; %bb.0:
13219; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13220; GFX940-NEXT:    ;;#ASMSTART
13221; GFX940-NEXT:    ; def s[0:1]
13222; GFX940-NEXT:    ;;#ASMEND
13223; GFX940-NEXT:    ;;#ASMSTART
13224; GFX940-NEXT:    ; def s[2:3]
13225; GFX940-NEXT:    ;;#ASMEND
13226; GFX940-NEXT:    s_lshr_b32 s0, s2, 16
13227; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s0
13228; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
13229; GFX940-NEXT:    ;;#ASMSTART
13230; GFX940-NEXT:    ; use s[8:9]
13231; GFX940-NEXT:    ;;#ASMEND
13232; GFX940-NEXT:    s_setpc_b64 s[30:31]
13233  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13234  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13235  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13236  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13237  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 4>
13238  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13239  ret void
13240}
13241
; Scalar-register ("s" constraint) shuffle of two <3 x bfloat> values into a
; <4 x bfloat>, mask <5, 5, 3, 4>. Mask indices 0-2 select from %extract3
; (elements 0-2 of %vec0) and 3-5 select from %extract31 (elements 0-2 of
; %vec1), so the result is { vec1[2], vec1[2], vec1[0], vec1[1] }.
; No lane reads %vec0; the expected output contains a single "def", which is
; consistent with the unused (non-sideeffect) %vec0 asm being dropped.
13242define void @s_shuffle_v4bf16_v3bf16__5_5_3_4() {
13243; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_4:
13244; GFX900:       ; %bb.0:
13245; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13246; GFX900-NEXT:    ;;#ASMSTART
13247; GFX900-NEXT:    ; def s[4:5]
13248; GFX900-NEXT:    ;;#ASMEND
13249; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
13250; GFX900-NEXT:    s_mov_b32 s9, s4
13251; GFX900-NEXT:    ;;#ASMSTART
13252; GFX900-NEXT:    ; use s[8:9]
13253; GFX900-NEXT:    ;;#ASMEND
13254; GFX900-NEXT:    s_setpc_b64 s[30:31]
13255;
13256; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_4:
13257; GFX90A:       ; %bb.0:
13258; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13259; GFX90A-NEXT:    ;;#ASMSTART
13260; GFX90A-NEXT:    ; def s[4:5]
13261; GFX90A-NEXT:    ;;#ASMEND
13262; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
13263; GFX90A-NEXT:    s_mov_b32 s9, s4
13264; GFX90A-NEXT:    ;;#ASMSTART
13265; GFX90A-NEXT:    ; use s[8:9]
13266; GFX90A-NEXT:    ;;#ASMEND
13267; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13268;
13269; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_4:
13270; GFX940:       ; %bb.0:
13271; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13272; GFX940-NEXT:    ;;#ASMSTART
13273; GFX940-NEXT:    ; def s[0:1]
13274; GFX940-NEXT:    ;;#ASMEND
13275; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
13276; GFX940-NEXT:    s_mov_b32 s9, s0
13277; GFX940-NEXT:    ;;#ASMSTART
13278; GFX940-NEXT:    ; use s[8:9]
13279; GFX940-NEXT:    ;;#ASMEND
13280; GFX940-NEXT:    s_setpc_b64 s[30:31]
13281  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13282  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13283  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13284  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13285  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 4>
13286  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13287  ret void
13288}
13289
; Scalar-register ("s" constraint) shuffle of two <3 x bfloat> values into a
; <4 x bfloat>, mask <poison, 5, 5, 5>. Mask indices 0-2 select from %extract3
; (elements 0-2 of %vec0) and 3-5 select from %extract31 (elements 0-2 of
; %vec1), so the result is { poison, vec1[2], vec1[2], vec1[2] }.
; Lane 0 is unconstrained: the expected code only shifts vec1[2] into the
; upper half of s8. No lane reads %vec0, and a single "def" is expected.
13290define void @s_shuffle_v4bf16_v3bf16__u_5_5_5() {
13291; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__u_5_5_5:
13292; GFX900:       ; %bb.0:
13293; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13294; GFX900-NEXT:    ;;#ASMSTART
13295; GFX900-NEXT:    ; def s[4:5]
13296; GFX900-NEXT:    ;;#ASMEND
13297; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
13298; GFX900-NEXT:    s_lshl_b32 s8, s5, 16
13299; GFX900-NEXT:    ;;#ASMSTART
13300; GFX900-NEXT:    ; use s[8:9]
13301; GFX900-NEXT:    ;;#ASMEND
13302; GFX900-NEXT:    s_setpc_b64 s[30:31]
13303;
13304; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__u_5_5_5:
13305; GFX90A:       ; %bb.0:
13306; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13307; GFX90A-NEXT:    ;;#ASMSTART
13308; GFX90A-NEXT:    ; def s[4:5]
13309; GFX90A-NEXT:    ;;#ASMEND
13310; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
13311; GFX90A-NEXT:    s_lshl_b32 s8, s5, 16
13312; GFX90A-NEXT:    ;;#ASMSTART
13313; GFX90A-NEXT:    ; use s[8:9]
13314; GFX90A-NEXT:    ;;#ASMEND
13315; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13316;
13317; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__u_5_5_5:
13318; GFX940:       ; %bb.0:
13319; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13320; GFX940-NEXT:    ;;#ASMSTART
13321; GFX940-NEXT:    ; def s[0:1]
13322; GFX940-NEXT:    ;;#ASMEND
13323; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
13324; GFX940-NEXT:    s_lshl_b32 s8, s1, 16
13325; GFX940-NEXT:    ;;#ASMSTART
13326; GFX940-NEXT:    ; use s[8:9]
13327; GFX940-NEXT:    ;;#ASMEND
13328; GFX940-NEXT:    s_setpc_b64 s[30:31]
13329  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13330  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13331  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13332  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13333  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 poison, i32 5, i32 5, i32 5>
13334  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13335  ret void
13336}
13337
; Scalar-register ("s" constraint) shuffle of two <3 x bfloat> values into a
; <4 x bfloat>, mask <0, 5, 5, 5>. Mask indices 0-2 select from %extract3
; (elements 0-2 of %vec0) and 3-5 select from %extract31 (elements 0-2 of
; %vec1), so the result is { vec0[0], vec1[2], vec1[2], vec1[2] }.
; Both asm-defined inputs are read; the result goes to the "use" asm in s[8:9].
13338define void @s_shuffle_v4bf16_v3bf16__0_5_5_5() {
13339; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__0_5_5_5:
13340; GFX900:       ; %bb.0:
13341; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13342; GFX900-NEXT:    ;;#ASMSTART
13343; GFX900-NEXT:    ; def s[4:5]
13344; GFX900-NEXT:    ;;#ASMEND
13345; GFX900-NEXT:    ;;#ASMSTART
13346; GFX900-NEXT:    ; def s[6:7]
13347; GFX900-NEXT:    ;;#ASMEND
13348; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
13349; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s7, s7
13350; GFX900-NEXT:    ;;#ASMSTART
13351; GFX900-NEXT:    ; use s[8:9]
13352; GFX900-NEXT:    ;;#ASMEND
13353; GFX900-NEXT:    s_setpc_b64 s[30:31]
13354;
13355; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__0_5_5_5:
13356; GFX90A:       ; %bb.0:
13357; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13358; GFX90A-NEXT:    ;;#ASMSTART
13359; GFX90A-NEXT:    ; def s[4:5]
13360; GFX90A-NEXT:    ;;#ASMEND
13361; GFX90A-NEXT:    ;;#ASMSTART
13362; GFX90A-NEXT:    ; def s[6:7]
13363; GFX90A-NEXT:    ;;#ASMEND
13364; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
13365; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s7, s7
13366; GFX90A-NEXT:    ;;#ASMSTART
13367; GFX90A-NEXT:    ; use s[8:9]
13368; GFX90A-NEXT:    ;;#ASMEND
13369; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13370;
13371; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__0_5_5_5:
13372; GFX940:       ; %bb.0:
13373; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13374; GFX940-NEXT:    ;;#ASMSTART
13375; GFX940-NEXT:    ; def s[0:1]
13376; GFX940-NEXT:    ;;#ASMEND
13377; GFX940-NEXT:    ;;#ASMSTART
13378; GFX940-NEXT:    ; def s[2:3]
13379; GFX940-NEXT:    ;;#ASMEND
13380; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
13381; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s3, s3
13382; GFX940-NEXT:    ;;#ASMSTART
13383; GFX940-NEXT:    ; use s[8:9]
13384; GFX940-NEXT:    ;;#ASMEND
13385; GFX940-NEXT:    s_setpc_b64 s[30:31]
13386  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13387  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13388  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13389  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13390  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 0, i32 5, i32 5, i32 5>
13391  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13392  ret void
13393}
13394
; Scalar-register ("s" constraint) shuffle of two <3 x bfloat> values into a
; <4 x bfloat>, mask <1, 5, 5, 5>. Mask indices 0-2 select from %extract3
; (elements 0-2 of %vec0) and 3-5 select from %extract31 (elements 0-2 of
; %vec1), so the result is { vec0[1], vec1[2], vec1[2], vec1[2] }.
; The expected code shifts the first dword of %vec0 right by 16 to bring the
; odd-indexed element vec0[1] into the low half before packing.
13395define void @s_shuffle_v4bf16_v3bf16__1_5_5_5() {
13396; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__1_5_5_5:
13397; GFX900:       ; %bb.0:
13398; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13399; GFX900-NEXT:    ;;#ASMSTART
13400; GFX900-NEXT:    ; def s[4:5]
13401; GFX900-NEXT:    ;;#ASMEND
13402; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
13403; GFX900-NEXT:    ;;#ASMSTART
13404; GFX900-NEXT:    ; def s[6:7]
13405; GFX900-NEXT:    ;;#ASMEND
13406; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
13407; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s7, s7
13408; GFX900-NEXT:    ;;#ASMSTART
13409; GFX900-NEXT:    ; use s[8:9]
13410; GFX900-NEXT:    ;;#ASMEND
13411; GFX900-NEXT:    s_setpc_b64 s[30:31]
13412;
13413; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__1_5_5_5:
13414; GFX90A:       ; %bb.0:
13415; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13416; GFX90A-NEXT:    ;;#ASMSTART
13417; GFX90A-NEXT:    ; def s[4:5]
13418; GFX90A-NEXT:    ;;#ASMEND
13419; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
13420; GFX90A-NEXT:    ;;#ASMSTART
13421; GFX90A-NEXT:    ; def s[6:7]
13422; GFX90A-NEXT:    ;;#ASMEND
13423; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
13424; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s7, s7
13425; GFX90A-NEXT:    ;;#ASMSTART
13426; GFX90A-NEXT:    ; use s[8:9]
13427; GFX90A-NEXT:    ;;#ASMEND
13428; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13429;
13430; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__1_5_5_5:
13431; GFX940:       ; %bb.0:
13432; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13433; GFX940-NEXT:    ;;#ASMSTART
13434; GFX940-NEXT:    ; def s[0:1]
13435; GFX940-NEXT:    ;;#ASMEND
13436; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
13437; GFX940-NEXT:    ;;#ASMSTART
13438; GFX940-NEXT:    ; def s[2:3]
13439; GFX940-NEXT:    ;;#ASMEND
13440; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
13441; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s3, s3
13442; GFX940-NEXT:    ;;#ASMSTART
13443; GFX940-NEXT:    ; use s[8:9]
13444; GFX940-NEXT:    ;;#ASMEND
13445; GFX940-NEXT:    s_setpc_b64 s[30:31]
13446  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13447  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13448  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13449  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13450  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 1, i32 5, i32 5, i32 5>
13451  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13452  ret void
13453}
13454
; Scalar-register ("s" constraint) shuffle of two <3 x bfloat> values into a
; <4 x bfloat>, mask <2, 5, 5, 5>. Mask indices 0-2 select from %extract3
; (elements 0-2 of %vec0) and 3-5 select from %extract31 (elements 0-2 of
; %vec1), so the result is { vec0[2], vec1[2], vec1[2], vec1[2] }.
; Both asm-defined inputs are read; the result goes to the "use" asm in s[8:9].
13455define void @s_shuffle_v4bf16_v3bf16__2_5_5_5() {
13456; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__2_5_5_5:
13457; GFX900:       ; %bb.0:
13458; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13459; GFX900-NEXT:    ;;#ASMSTART
13460; GFX900-NEXT:    ; def s[4:5]
13461; GFX900-NEXT:    ;;#ASMEND
13462; GFX900-NEXT:    ;;#ASMSTART
13463; GFX900-NEXT:    ; def s[6:7]
13464; GFX900-NEXT:    ;;#ASMEND
13465; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s7
13466; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s7, s7
13467; GFX900-NEXT:    ;;#ASMSTART
13468; GFX900-NEXT:    ; use s[8:9]
13469; GFX900-NEXT:    ;;#ASMEND
13470; GFX900-NEXT:    s_setpc_b64 s[30:31]
13471;
13472; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__2_5_5_5:
13473; GFX90A:       ; %bb.0:
13474; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13475; GFX90A-NEXT:    ;;#ASMSTART
13476; GFX90A-NEXT:    ; def s[4:5]
13477; GFX90A-NEXT:    ;;#ASMEND
13478; GFX90A-NEXT:    ;;#ASMSTART
13479; GFX90A-NEXT:    ; def s[6:7]
13480; GFX90A-NEXT:    ;;#ASMEND
13481; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s7
13482; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s7, s7
13483; GFX90A-NEXT:    ;;#ASMSTART
13484; GFX90A-NEXT:    ; use s[8:9]
13485; GFX90A-NEXT:    ;;#ASMEND
13486; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13487;
13488; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__2_5_5_5:
13489; GFX940:       ; %bb.0:
13490; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13491; GFX940-NEXT:    ;;#ASMSTART
13492; GFX940-NEXT:    ; def s[0:1]
13493; GFX940-NEXT:    ;;#ASMEND
13494; GFX940-NEXT:    ;;#ASMSTART
13495; GFX940-NEXT:    ; def s[2:3]
13496; GFX940-NEXT:    ;;#ASMEND
13497; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s3
13498; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s3, s3
13499; GFX940-NEXT:    ;;#ASMSTART
13500; GFX940-NEXT:    ; use s[8:9]
13501; GFX940-NEXT:    ;;#ASMEND
13502; GFX940-NEXT:    s_setpc_b64 s[30:31]
13503  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13504  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13505  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13506  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13507  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
13508  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13509  ret void
13510}
13511
; Scalar-register ("s" constraint) shuffle of two <3 x bfloat> values into a
; <4 x bfloat>, mask <3, 5, 5, 5>. Mask indices 0-2 select from %extract3
; (elements 0-2 of %vec0) and 3-5 select from %extract31 (elements 0-2 of
; %vec1), so the result is { vec1[0], vec1[2], vec1[2], vec1[2] }.
; No lane reads %vec0; the expected output contains a single "def", which is
; consistent with the unused (non-sideeffect) %vec0 asm being dropped.
13512define void @s_shuffle_v4bf16_v3bf16__3_5_5_5() {
13513; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__3_5_5_5:
13514; GFX900:       ; %bb.0:
13515; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13516; GFX900-NEXT:    ;;#ASMSTART
13517; GFX900-NEXT:    ; def s[4:5]
13518; GFX900-NEXT:    ;;#ASMEND
13519; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
13520; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
13521; GFX900-NEXT:    ;;#ASMSTART
13522; GFX900-NEXT:    ; use s[8:9]
13523; GFX900-NEXT:    ;;#ASMEND
13524; GFX900-NEXT:    s_setpc_b64 s[30:31]
13525;
13526; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__3_5_5_5:
13527; GFX90A:       ; %bb.0:
13528; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13529; GFX90A-NEXT:    ;;#ASMSTART
13530; GFX90A-NEXT:    ; def s[4:5]
13531; GFX90A-NEXT:    ;;#ASMEND
13532; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
13533; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
13534; GFX90A-NEXT:    ;;#ASMSTART
13535; GFX90A-NEXT:    ; use s[8:9]
13536; GFX90A-NEXT:    ;;#ASMEND
13537; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13538;
13539; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__3_5_5_5:
13540; GFX940:       ; %bb.0:
13541; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13542; GFX940-NEXT:    ;;#ASMSTART
13543; GFX940-NEXT:    ; def s[0:1]
13544; GFX940-NEXT:    ;;#ASMEND
13545; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
13546; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
13547; GFX940-NEXT:    ;;#ASMSTART
13548; GFX940-NEXT:    ; use s[8:9]
13549; GFX940-NEXT:    ;;#ASMEND
13550; GFX940-NEXT:    s_setpc_b64 s[30:31]
13551  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13552  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13553  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13554  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13555  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 3, i32 5, i32 5, i32 5>
13556  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13557  ret void
13558}
13559
; Scalar-register ("s" constraint) shuffle of two <3 x bfloat> values into a
; <4 x bfloat>, mask <4, 5, 5, 5>. Mask indices 0-2 select from %extract3
; (elements 0-2 of %vec0) and 3-5 select from %extract31 (elements 0-2 of
; %vec1), so the result is { vec1[1], vec1[2], vec1[2], vec1[2] }.
; No lane reads %vec0 and a single "def" is expected. The right shift by 16
; brings the odd-indexed element vec1[1] into the low half before packing.
13560define void @s_shuffle_v4bf16_v3bf16__4_5_5_5() {
13561; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__4_5_5_5:
13562; GFX900:       ; %bb.0:
13563; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13564; GFX900-NEXT:    ;;#ASMSTART
13565; GFX900-NEXT:    ; def s[4:5]
13566; GFX900-NEXT:    ;;#ASMEND
13567; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
13568; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
13569; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
13570; GFX900-NEXT:    ;;#ASMSTART
13571; GFX900-NEXT:    ; use s[8:9]
13572; GFX900-NEXT:    ;;#ASMEND
13573; GFX900-NEXT:    s_setpc_b64 s[30:31]
13574;
13575; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__4_5_5_5:
13576; GFX90A:       ; %bb.0:
13577; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13578; GFX90A-NEXT:    ;;#ASMSTART
13579; GFX90A-NEXT:    ; def s[4:5]
13580; GFX90A-NEXT:    ;;#ASMEND
13581; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
13582; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
13583; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
13584; GFX90A-NEXT:    ;;#ASMSTART
13585; GFX90A-NEXT:    ; use s[8:9]
13586; GFX90A-NEXT:    ;;#ASMEND
13587; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13588;
13589; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__4_5_5_5:
13590; GFX940:       ; %bb.0:
13591; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13592; GFX940-NEXT:    ;;#ASMSTART
13593; GFX940-NEXT:    ; def s[0:1]
13594; GFX940-NEXT:    ;;#ASMEND
13595; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
13596; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
13597; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
13598; GFX940-NEXT:    ;;#ASMSTART
13599; GFX940-NEXT:    ; use s[8:9]
13600; GFX940-NEXT:    ;;#ASMEND
13601; GFX940-NEXT:    s_setpc_b64 s[30:31]
13602  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13603  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13604  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13605  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13606  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 4, i32 5, i32 5, i32 5>
13607  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13608  ret void
13609}
13610
; Scalar-register ("s" constraint) shuffle of two <3 x bfloat> values into a
; <4 x bfloat>, mask <5, poison, 5, 5>. Mask indices 0-2 select from %extract3
; (elements 0-2 of %vec0) and 3-5 select from %extract31 (elements 0-2 of
; %vec1), so the result is { vec1[2], poison, vec1[2], vec1[2] }.
; Lane 1 is unconstrained: the expected code copies the source dword into s8
; as-is. No lane reads %vec0, and a single "def" is expected.
13611define void @s_shuffle_v4bf16_v3bf16__5_u_5_5() {
13612; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_u_5_5:
13613; GFX900:       ; %bb.0:
13614; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13615; GFX900-NEXT:    ;;#ASMSTART
13616; GFX900-NEXT:    ; def s[4:5]
13617; GFX900-NEXT:    ;;#ASMEND
13618; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
13619; GFX900-NEXT:    s_mov_b32 s8, s5
13620; GFX900-NEXT:    ;;#ASMSTART
13621; GFX900-NEXT:    ; use s[8:9]
13622; GFX900-NEXT:    ;;#ASMEND
13623; GFX900-NEXT:    s_setpc_b64 s[30:31]
13624;
13625; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_u_5_5:
13626; GFX90A:       ; %bb.0:
13627; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13628; GFX90A-NEXT:    ;;#ASMSTART
13629; GFX90A-NEXT:    ; def s[4:5]
13630; GFX90A-NEXT:    ;;#ASMEND
13631; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
13632; GFX90A-NEXT:    s_mov_b32 s8, s5
13633; GFX90A-NEXT:    ;;#ASMSTART
13634; GFX90A-NEXT:    ; use s[8:9]
13635; GFX90A-NEXT:    ;;#ASMEND
13636; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13637;
13638; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_u_5_5:
13639; GFX940:       ; %bb.0:
13640; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13641; GFX940-NEXT:    ;;#ASMSTART
13642; GFX940-NEXT:    ; def s[0:1]
13643; GFX940-NEXT:    ;;#ASMEND
13644; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
13645; GFX940-NEXT:    s_mov_b32 s8, s1
13646; GFX940-NEXT:    ;;#ASMSTART
13647; GFX940-NEXT:    ; use s[8:9]
13648; GFX940-NEXT:    ;;#ASMEND
13649; GFX940-NEXT:    s_setpc_b64 s[30:31]
13650  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13651  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13652  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13653  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13654  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 poison, i32 5, i32 5>
13655  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13656  ret void
13657}
13658
; Scalar-register ("s" constraint) shuffle of two <3 x bfloat> values into a
; <4 x bfloat>, mask <5, 0, 5, 5>. Mask indices 0-2 select from %extract3
; (elements 0-2 of %vec0) and 3-5 select from %extract31 (elements 0-2 of
; %vec1), so the result is { vec1[2], vec0[0], vec1[2], vec1[2] }.
; Both asm-defined inputs are read; the result goes to the "use" asm in s[8:9].
13659define void @s_shuffle_v4bf16_v3bf16__5_0_5_5() {
13660; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_0_5_5:
13661; GFX900:       ; %bb.0:
13662; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13663; GFX900-NEXT:    ;;#ASMSTART
13664; GFX900-NEXT:    ; def s[4:5]
13665; GFX900-NEXT:    ;;#ASMEND
13666; GFX900-NEXT:    ;;#ASMSTART
13667; GFX900-NEXT:    ; def s[6:7]
13668; GFX900-NEXT:    ;;#ASMEND
13669; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
13670; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s7, s7
13671; GFX900-NEXT:    ;;#ASMSTART
13672; GFX900-NEXT:    ; use s[8:9]
13673; GFX900-NEXT:    ;;#ASMEND
13674; GFX900-NEXT:    s_setpc_b64 s[30:31]
13675;
13676; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_0_5_5:
13677; GFX90A:       ; %bb.0:
13678; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13679; GFX90A-NEXT:    ;;#ASMSTART
13680; GFX90A-NEXT:    ; def s[4:5]
13681; GFX90A-NEXT:    ;;#ASMEND
13682; GFX90A-NEXT:    ;;#ASMSTART
13683; GFX90A-NEXT:    ; def s[6:7]
13684; GFX90A-NEXT:    ;;#ASMEND
13685; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
13686; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s7, s7
13687; GFX90A-NEXT:    ;;#ASMSTART
13688; GFX90A-NEXT:    ; use s[8:9]
13689; GFX90A-NEXT:    ;;#ASMEND
13690; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13691;
13692; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_0_5_5:
13693; GFX940:       ; %bb.0:
13694; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13695; GFX940-NEXT:    ;;#ASMSTART
13696; GFX940-NEXT:    ; def s[0:1]
13697; GFX940-NEXT:    ;;#ASMEND
13698; GFX940-NEXT:    ;;#ASMSTART
13699; GFX940-NEXT:    ; def s[2:3]
13700; GFX940-NEXT:    ;;#ASMEND
13701; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
13702; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s3, s3
13703; GFX940-NEXT:    ;;#ASMSTART
13704; GFX940-NEXT:    ; use s[8:9]
13705; GFX940-NEXT:    ;;#ASMEND
13706; GFX940-NEXT:    s_setpc_b64 s[30:31]
13707  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13708  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13709  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13710  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13711  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 0, i32 5, i32 5>
13712  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13713  ret void
13714}
13715
; Scalar-register ("s" constraint) shuffle of two <3 x bfloat> values into a
; <4 x bfloat>, mask <5, 1, 5, 5>. Mask indices 0-2 select from %extract3
; (elements 0-2 of %vec0) and 3-5 select from %extract31 (elements 0-2 of
; %vec1), so the result is { vec1[2], vec0[1], vec1[2], vec1[2] }.
; The expected code shifts the first dword of %vec0 right by 16 to bring the
; odd-indexed element vec0[1] into the low half before packing.
13716define void @s_shuffle_v4bf16_v3bf16__5_1_5_5() {
13717; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_1_5_5:
13718; GFX900:       ; %bb.0:
13719; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13720; GFX900-NEXT:    ;;#ASMSTART
13721; GFX900-NEXT:    ; def s[4:5]
13722; GFX900-NEXT:    ;;#ASMEND
13723; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
13724; GFX900-NEXT:    ;;#ASMSTART
13725; GFX900-NEXT:    ; def s[6:7]
13726; GFX900-NEXT:    ;;#ASMEND
13727; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
13728; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s7, s7
13729; GFX900-NEXT:    ;;#ASMSTART
13730; GFX900-NEXT:    ; use s[8:9]
13731; GFX900-NEXT:    ;;#ASMEND
13732; GFX900-NEXT:    s_setpc_b64 s[30:31]
13733;
13734; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_1_5_5:
13735; GFX90A:       ; %bb.0:
13736; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13737; GFX90A-NEXT:    ;;#ASMSTART
13738; GFX90A-NEXT:    ; def s[4:5]
13739; GFX90A-NEXT:    ;;#ASMEND
13740; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
13741; GFX90A-NEXT:    ;;#ASMSTART
13742; GFX90A-NEXT:    ; def s[6:7]
13743; GFX90A-NEXT:    ;;#ASMEND
13744; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
13745; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s7, s7
13746; GFX90A-NEXT:    ;;#ASMSTART
13747; GFX90A-NEXT:    ; use s[8:9]
13748; GFX90A-NEXT:    ;;#ASMEND
13749; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13750;
13751; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_1_5_5:
13752; GFX940:       ; %bb.0:
13753; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13754; GFX940-NEXT:    ;;#ASMSTART
13755; GFX940-NEXT:    ; def s[0:1]
13756; GFX940-NEXT:    ;;#ASMEND
13757; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
13758; GFX940-NEXT:    ;;#ASMSTART
13759; GFX940-NEXT:    ; def s[2:3]
13760; GFX940-NEXT:    ;;#ASMEND
13761; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
13762; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s3, s3
13763; GFX940-NEXT:    ;;#ASMSTART
13764; GFX940-NEXT:    ; use s[8:9]
13765; GFX940-NEXT:    ;;#ASMEND
13766; GFX940-NEXT:    s_setpc_b64 s[30:31]
13767  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13768  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13769  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13770  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13771  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 1, i32 5, i32 5>
13772  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13773  ret void
13774}
13775
; Scalar-register ("s" constraint) shuffle of two <3 x bfloat> values into a
; <4 x bfloat>, mask <5, 2, 5, 5>. Mask indices 0-2 select from %extract3
; (elements 0-2 of %vec0) and 3-5 select from %extract31 (elements 0-2 of
; %vec1), so the result is { vec1[2], vec0[2], vec1[2], vec1[2] }.
; Both asm-defined inputs are read; the result goes to the "use" asm in s[8:9].
13776define void @s_shuffle_v4bf16_v3bf16__5_2_5_5() {
13777; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_2_5_5:
13778; GFX900:       ; %bb.0:
13779; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13780; GFX900-NEXT:    ;;#ASMSTART
13781; GFX900-NEXT:    ; def s[4:5]
13782; GFX900-NEXT:    ;;#ASMEND
13783; GFX900-NEXT:    ;;#ASMSTART
13784; GFX900-NEXT:    ; def s[6:7]
13785; GFX900-NEXT:    ;;#ASMEND
13786; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
13787; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s7, s7
13788; GFX900-NEXT:    ;;#ASMSTART
13789; GFX900-NEXT:    ; use s[8:9]
13790; GFX900-NEXT:    ;;#ASMEND
13791; GFX900-NEXT:    s_setpc_b64 s[30:31]
13792;
13793; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_2_5_5:
13794; GFX90A:       ; %bb.0:
13795; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13796; GFX90A-NEXT:    ;;#ASMSTART
13797; GFX90A-NEXT:    ; def s[4:5]
13798; GFX90A-NEXT:    ;;#ASMEND
13799; GFX90A-NEXT:    ;;#ASMSTART
13800; GFX90A-NEXT:    ; def s[6:7]
13801; GFX90A-NEXT:    ;;#ASMEND
13802; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
13803; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s7, s7
13804; GFX90A-NEXT:    ;;#ASMSTART
13805; GFX90A-NEXT:    ; use s[8:9]
13806; GFX90A-NEXT:    ;;#ASMEND
13807; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13808;
13809; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_2_5_5:
13810; GFX940:       ; %bb.0:
13811; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13812; GFX940-NEXT:    ;;#ASMSTART
13813; GFX940-NEXT:    ; def s[0:1]
13814; GFX940-NEXT:    ;;#ASMEND
13815; GFX940-NEXT:    ;;#ASMSTART
13816; GFX940-NEXT:    ; def s[2:3]
13817; GFX940-NEXT:    ;;#ASMEND
13818; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s1
13819; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s3, s3
13820; GFX940-NEXT:    ;;#ASMSTART
13821; GFX940-NEXT:    ; use s[8:9]
13822; GFX940-NEXT:    ;;#ASMEND
13823; GFX940-NEXT:    s_setpc_b64 s[30:31]
13824  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13825  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13826  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13827  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13828  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 2, i32 5, i32 5>
13829  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13830  ret void
13831}
13832
; Scalar-register ("s" constraint) shuffle of two <3 x bfloat> values into a
; <4 x bfloat>, mask <5, 3, 5, 5>. Mask indices 0-2 select from %extract3
; (elements 0-2 of %vec0) and 3-5 select from %extract31 (elements 0-2 of
; %vec1), so the result is { vec1[2], vec1[0], vec1[2], vec1[2] }.
; No lane reads %vec0; the expected output contains a single "def", which is
; consistent with the unused (non-sideeffect) %vec0 asm being dropped.
13833define void @s_shuffle_v4bf16_v3bf16__5_3_5_5() {
13834; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_3_5_5:
13835; GFX900:       ; %bb.0:
13836; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13837; GFX900-NEXT:    ;;#ASMSTART
13838; GFX900-NEXT:    ; def s[4:5]
13839; GFX900-NEXT:    ;;#ASMEND
13840; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
13841; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
13842; GFX900-NEXT:    ;;#ASMSTART
13843; GFX900-NEXT:    ; use s[8:9]
13844; GFX900-NEXT:    ;;#ASMEND
13845; GFX900-NEXT:    s_setpc_b64 s[30:31]
13846;
13847; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_3_5_5:
13848; GFX90A:       ; %bb.0:
13849; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13850; GFX90A-NEXT:    ;;#ASMSTART
13851; GFX90A-NEXT:    ; def s[4:5]
13852; GFX90A-NEXT:    ;;#ASMEND
13853; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
13854; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
13855; GFX90A-NEXT:    ;;#ASMSTART
13856; GFX90A-NEXT:    ; use s[8:9]
13857; GFX90A-NEXT:    ;;#ASMEND
13858; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13859;
13860; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_3_5_5:
13861; GFX940:       ; %bb.0:
13862; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13863; GFX940-NEXT:    ;;#ASMSTART
13864; GFX940-NEXT:    ; def s[0:1]
13865; GFX940-NEXT:    ;;#ASMEND
13866; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
13867; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
13868; GFX940-NEXT:    ;;#ASMSTART
13869; GFX940-NEXT:    ; use s[8:9]
13870; GFX940-NEXT:    ;;#ASMEND
13871; GFX940-NEXT:    s_setpc_b64 s[30:31]
13872  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13873  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13874  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13875  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13876  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 3, i32 5, i32 5>
13877  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13878  ret void
13879}
13880
; Scalar-register ("s" constraint) shuffle of two <3 x bfloat> values into a
; <4 x bfloat>, mask <5, 4, 5, 5>. Mask indices 0-2 select from %extract3
; (elements 0-2 of %vec0) and 3-5 select from %extract31 (elements 0-2 of
; %vec1), so the result is { vec1[2], vec1[1], vec1[2], vec1[2] }.
; No lane reads %vec0 and a single "def" is expected. The right shift by 16
; brings the odd-indexed element vec1[1] into the low half before packing.
13881define void @s_shuffle_v4bf16_v3bf16__5_4_5_5() {
13882; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_4_5_5:
13883; GFX900:       ; %bb.0:
13884; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13885; GFX900-NEXT:    ;;#ASMSTART
13886; GFX900-NEXT:    ; def s[4:5]
13887; GFX900-NEXT:    ;;#ASMEND
13888; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
13889; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
13890; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
13891; GFX900-NEXT:    ;;#ASMSTART
13892; GFX900-NEXT:    ; use s[8:9]
13893; GFX900-NEXT:    ;;#ASMEND
13894; GFX900-NEXT:    s_setpc_b64 s[30:31]
13895;
13896; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_4_5_5:
13897; GFX90A:       ; %bb.0:
13898; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13899; GFX90A-NEXT:    ;;#ASMSTART
13900; GFX90A-NEXT:    ; def s[4:5]
13901; GFX90A-NEXT:    ;;#ASMEND
13902; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
13903; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
13904; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s5
13905; GFX90A-NEXT:    ;;#ASMSTART
13906; GFX90A-NEXT:    ; use s[8:9]
13907; GFX90A-NEXT:    ;;#ASMEND
13908; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13909;
13910; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_4_5_5:
13911; GFX940:       ; %bb.0:
13912; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13913; GFX940-NEXT:    ;;#ASMSTART
13914; GFX940-NEXT:    ; def s[0:1]
13915; GFX940-NEXT:    ;;#ASMEND
13916; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
13917; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
13918; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s1
13919; GFX940-NEXT:    ;;#ASMSTART
13920; GFX940-NEXT:    ; use s[8:9]
13921; GFX940-NEXT:    ;;#ASMEND
13922; GFX940-NEXT:    s_setpc_b64 s[30:31]
13923  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13924  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
13925  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13926  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13927  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 4, i32 5, i32 5>
13928  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13929  ret void
13930}
13931
; Scalar-register shuffle case with result mask <5, 5, poison, 5>.
; All defined result elements come from %extract31 (that is, from %vec1), so
; %vec0 is never read and the expected output shows a single "def". Result
; element 2 is poison; the expected output forms the second result dword with
; just a left shift by 16 instead of a pack.
; The assertion lines are autogenerated (see the first line of this file);
; regenerate them with the update script instead of editing them by hand.
13932define void @s_shuffle_v4bf16_v3bf16__5_5_u_5() {
13933; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_5:
13934; GFX900:       ; %bb.0:
13935; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13936; GFX900-NEXT:    ;;#ASMSTART
13937; GFX900-NEXT:    ; def s[4:5]
13938; GFX900-NEXT:    ;;#ASMEND
13939; GFX900-NEXT:    s_lshl_b32 s9, s5, 16
13940; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
13941; GFX900-NEXT:    ;;#ASMSTART
13942; GFX900-NEXT:    ; use s[8:9]
13943; GFX900-NEXT:    ;;#ASMEND
13944; GFX900-NEXT:    s_setpc_b64 s[30:31]
13945;
13946; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_5:
13947; GFX90A:       ; %bb.0:
13948; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13949; GFX90A-NEXT:    ;;#ASMSTART
13950; GFX90A-NEXT:    ; def s[4:5]
13951; GFX90A-NEXT:    ;;#ASMEND
13952; GFX90A-NEXT:    s_lshl_b32 s9, s5, 16
13953; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
13954; GFX90A-NEXT:    ;;#ASMSTART
13955; GFX90A-NEXT:    ; use s[8:9]
13956; GFX90A-NEXT:    ;;#ASMEND
13957; GFX90A-NEXT:    s_setpc_b64 s[30:31]
13958;
13959; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_u_5:
13960; GFX940:       ; %bb.0:
13961; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13962; GFX940-NEXT:    ;;#ASMSTART
13963; GFX940-NEXT:    ; def s[0:1]
13964; GFX940-NEXT:    ;;#ASMEND
13965; GFX940-NEXT:    s_lshl_b32 s9, s1, 16
13966; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
13967; GFX940-NEXT:    ;;#ASMSTART
13968; GFX940-NEXT:    ; use s[8:9]
13969; GFX940-NEXT:    ;;#ASMEND
13970; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Two inline-asm definitions constrained to scalar registers ("=s") act as
  ; opaque <4 x bfloat> inputs.
13971  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
13972  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Keep only the first three elements of each 4-element input.
13973  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
13974  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 and 3-5 from %extract31:
  ; 5 -> %vec1 element 2; the poison entry leaves result element 2 unspecified.
13975  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 poison, i32 5>
  ; The side-effecting use pins the shuffled vector to s[8:9].
13976  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
13977  ret void
13978}
13979
; Scalar-register shuffle case with result mask <5, 5, 0, 5>.
; Result element 2 comes from %extract3 (that is, from %vec0) and the other
; three from %extract31 (%vec1), so both inputs are live and the expected
; output shows two "def" blocks followed by two s_pack_ll_b32_b16.
; The assertion lines are autogenerated (see the first line of this file);
; regenerate them with the update script instead of editing them by hand.
13980define void @s_shuffle_v4bf16_v3bf16__5_5_0_5() {
13981; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_5:
13982; GFX900:       ; %bb.0:
13983; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13984; GFX900-NEXT:    ;;#ASMSTART
13985; GFX900-NEXT:    ; def s[4:5]
13986; GFX900-NEXT:    ;;#ASMEND
13987; GFX900-NEXT:    ;;#ASMSTART
13988; GFX900-NEXT:    ; def s[6:7]
13989; GFX900-NEXT:    ;;#ASMEND
13990; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s7
13991; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
13992; GFX900-NEXT:    ;;#ASMSTART
13993; GFX900-NEXT:    ; use s[8:9]
13994; GFX900-NEXT:    ;;#ASMEND
13995; GFX900-NEXT:    s_setpc_b64 s[30:31]
13996;
13997; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_5:
13998; GFX90A:       ; %bb.0:
13999; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14000; GFX90A-NEXT:    ;;#ASMSTART
14001; GFX90A-NEXT:    ; def s[4:5]
14002; GFX90A-NEXT:    ;;#ASMEND
14003; GFX90A-NEXT:    ;;#ASMSTART
14004; GFX90A-NEXT:    ; def s[6:7]
14005; GFX90A-NEXT:    ;;#ASMEND
14006; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s7
14007; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
14008; GFX90A-NEXT:    ;;#ASMSTART
14009; GFX90A-NEXT:    ; use s[8:9]
14010; GFX90A-NEXT:    ;;#ASMEND
14011; GFX90A-NEXT:    s_setpc_b64 s[30:31]
14012;
14013; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_0_5:
14014; GFX940:       ; %bb.0:
14015; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14016; GFX940-NEXT:    ;;#ASMSTART
14017; GFX940-NEXT:    ; def s[0:1]
14018; GFX940-NEXT:    ;;#ASMEND
14019; GFX940-NEXT:    ;;#ASMSTART
14020; GFX940-NEXT:    ; def s[2:3]
14021; GFX940-NEXT:    ;;#ASMEND
14022; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s3
14023; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
14024; GFX940-NEXT:    ;;#ASMSTART
14025; GFX940-NEXT:    ; use s[8:9]
14026; GFX940-NEXT:    ;;#ASMEND
14027; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Two inline-asm definitions constrained to scalar registers ("=s") act as
  ; opaque <4 x bfloat> inputs.
14028  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
14029  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Keep only the first three elements of each 4-element input.
14030  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
14031  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 and 3-5 from %extract31:
  ; 5 -> %vec1 element 2, 0 -> %vec0 element 0.
14032  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 0, i32 5>
  ; The side-effecting use pins the shuffled vector to s[8:9].
14033  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
14034  ret void
14035}
14036
; Scalar-register shuffle case with result mask <5, 5, 1, 5>.
; Result element 2 comes from %extract3 (that is, from %vec0) and the other
; three from %extract31 (%vec1), so both inputs are live. In the expected
; output the first input's low dword is shifted right by 16 before it is
; packed, which moves element 1 into the low half.
; The assertion lines are autogenerated (see the first line of this file);
; regenerate them with the update script instead of editing them by hand.
14037define void @s_shuffle_v4bf16_v3bf16__5_5_1_5() {
14038; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_5:
14039; GFX900:       ; %bb.0:
14040; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14041; GFX900-NEXT:    ;;#ASMSTART
14042; GFX900-NEXT:    ; def s[4:5]
14043; GFX900-NEXT:    ;;#ASMEND
14044; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
14045; GFX900-NEXT:    ;;#ASMSTART
14046; GFX900-NEXT:    ; def s[6:7]
14047; GFX900-NEXT:    ;;#ASMEND
14048; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s7
14049; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
14050; GFX900-NEXT:    ;;#ASMSTART
14051; GFX900-NEXT:    ; use s[8:9]
14052; GFX900-NEXT:    ;;#ASMEND
14053; GFX900-NEXT:    s_setpc_b64 s[30:31]
14054;
14055; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_5:
14056; GFX90A:       ; %bb.0:
14057; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14058; GFX90A-NEXT:    ;;#ASMSTART
14059; GFX90A-NEXT:    ; def s[4:5]
14060; GFX90A-NEXT:    ;;#ASMEND
14061; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
14062; GFX90A-NEXT:    ;;#ASMSTART
14063; GFX90A-NEXT:    ; def s[6:7]
14064; GFX90A-NEXT:    ;;#ASMEND
14065; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s7
14066; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
14067; GFX90A-NEXT:    ;;#ASMSTART
14068; GFX90A-NEXT:    ; use s[8:9]
14069; GFX90A-NEXT:    ;;#ASMEND
14070; GFX90A-NEXT:    s_setpc_b64 s[30:31]
14071;
14072; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_1_5:
14073; GFX940:       ; %bb.0:
14074; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14075; GFX940-NEXT:    ;;#ASMSTART
14076; GFX940-NEXT:    ; def s[0:1]
14077; GFX940-NEXT:    ;;#ASMEND
14078; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
14079; GFX940-NEXT:    ;;#ASMSTART
14080; GFX940-NEXT:    ; def s[2:3]
14081; GFX940-NEXT:    ;;#ASMEND
14082; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s3
14083; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
14084; GFX940-NEXT:    ;;#ASMSTART
14085; GFX940-NEXT:    ; use s[8:9]
14086; GFX940-NEXT:    ;;#ASMEND
14087; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Two inline-asm definitions constrained to scalar registers ("=s") act as
  ; opaque <4 x bfloat> inputs.
14088  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
14089  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Keep only the first three elements of each 4-element input.
14090  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
14091  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 and 3-5 from %extract31:
  ; 5 -> %vec1 element 2, 1 -> %vec0 element 1.
14092  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 1, i32 5>
  ; The side-effecting use pins the shuffled vector to s[8:9].
14093  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
14094  ret void
14095}
14096
; Scalar-register shuffle case with result mask <5, 5, 2, 5>.
; Result element 2 comes from %extract3 (that is, from %vec0) and the other
; three from %extract31 (%vec1), so both inputs are live. The expected output
; packs directly from the second dword of each input, with no shift.
; The assertion lines are autogenerated (see the first line of this file);
; regenerate them with the update script instead of editing them by hand.
14097define void @s_shuffle_v4bf16_v3bf16__5_5_2_5() {
14098; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_5:
14099; GFX900:       ; %bb.0:
14100; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14101; GFX900-NEXT:    ;;#ASMSTART
14102; GFX900-NEXT:    ; def s[4:5]
14103; GFX900-NEXT:    ;;#ASMEND
14104; GFX900-NEXT:    ;;#ASMSTART
14105; GFX900-NEXT:    ; def s[6:7]
14106; GFX900-NEXT:    ;;#ASMEND
14107; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s5, s7
14108; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
14109; GFX900-NEXT:    ;;#ASMSTART
14110; GFX900-NEXT:    ; use s[8:9]
14111; GFX900-NEXT:    ;;#ASMEND
14112; GFX900-NEXT:    s_setpc_b64 s[30:31]
14113;
14114; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_5:
14115; GFX90A:       ; %bb.0:
14116; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14117; GFX90A-NEXT:    ;;#ASMSTART
14118; GFX90A-NEXT:    ; def s[4:5]
14119; GFX90A-NEXT:    ;;#ASMEND
14120; GFX90A-NEXT:    ;;#ASMSTART
14121; GFX90A-NEXT:    ; def s[6:7]
14122; GFX90A-NEXT:    ;;#ASMEND
14123; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s5, s7
14124; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
14125; GFX90A-NEXT:    ;;#ASMSTART
14126; GFX90A-NEXT:    ; use s[8:9]
14127; GFX90A-NEXT:    ;;#ASMEND
14128; GFX90A-NEXT:    s_setpc_b64 s[30:31]
14129;
14130; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_2_5:
14131; GFX940:       ; %bb.0:
14132; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14133; GFX940-NEXT:    ;;#ASMSTART
14134; GFX940-NEXT:    ; def s[0:1]
14135; GFX940-NEXT:    ;;#ASMEND
14136; GFX940-NEXT:    ;;#ASMSTART
14137; GFX940-NEXT:    ; def s[2:3]
14138; GFX940-NEXT:    ;;#ASMEND
14139; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s1, s3
14140; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
14141; GFX940-NEXT:    ;;#ASMSTART
14142; GFX940-NEXT:    ; use s[8:9]
14143; GFX940-NEXT:    ;;#ASMEND
14144; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Two inline-asm definitions constrained to scalar registers ("=s") act as
  ; opaque <4 x bfloat> inputs.
14145  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
14146  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Keep only the first three elements of each 4-element input.
14147  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
14148  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 and 3-5 from %extract31:
  ; 5 -> %vec1 element 2, 2 -> %vec0 element 2.
14149  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 2, i32 5>
  ; The side-effecting use pins the shuffled vector to s[8:9].
14150  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
14151  ret void
14152}
14153
; Scalar-register shuffle case with result mask <5, 5, 3, 5>.
; Every mask index is >= 3, so all four result elements come from %extract31
; (that is, from %vec1) and %vec0 is never read. The expected output below
; accordingly shows a single "def" and two s_pack_ll_b32_b16, with no shift.
; The assertion lines are autogenerated (see the first line of this file);
; regenerate them with the update script instead of editing them by hand.
14154define void @s_shuffle_v4bf16_v3bf16__5_5_3_5() {
14155; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_5:
14156; GFX900:       ; %bb.0:
14157; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14158; GFX900-NEXT:    ;;#ASMSTART
14159; GFX900-NEXT:    ; def s[4:5]
14160; GFX900-NEXT:    ;;#ASMEND
14161; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s5
14162; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
14163; GFX900-NEXT:    ;;#ASMSTART
14164; GFX900-NEXT:    ; use s[8:9]
14165; GFX900-NEXT:    ;;#ASMEND
14166; GFX900-NEXT:    s_setpc_b64 s[30:31]
14167;
14168; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_5:
14169; GFX90A:       ; %bb.0:
14170; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14171; GFX90A-NEXT:    ;;#ASMSTART
14172; GFX90A-NEXT:    ; def s[4:5]
14173; GFX90A-NEXT:    ;;#ASMEND
14174; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s5
14175; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
14176; GFX90A-NEXT:    ;;#ASMSTART
14177; GFX90A-NEXT:    ; use s[8:9]
14178; GFX90A-NEXT:    ;;#ASMEND
14179; GFX90A-NEXT:    s_setpc_b64 s[30:31]
14180;
14181; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_3_5:
14182; GFX940:       ; %bb.0:
14183; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14184; GFX940-NEXT:    ;;#ASMSTART
14185; GFX940-NEXT:    ; def s[0:1]
14186; GFX940-NEXT:    ;;#ASMEND
14187; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s1
14188; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
14189; GFX940-NEXT:    ;;#ASMSTART
14190; GFX940-NEXT:    ; use s[8:9]
14191; GFX940-NEXT:    ;;#ASMEND
14192; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Two inline-asm definitions constrained to scalar registers ("=s") act as
  ; opaque <4 x bfloat> inputs.
14193  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
14194  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Keep only the first three elements of each 4-element input.
14195  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
14196  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 and 3-5 from %extract31:
  ; 5 -> %vec1 element 2, 3 -> %vec1 element 0.
14197  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 3, i32 5>
  ; The side-effecting use pins the shuffled vector to s[8:9].
14198  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
14199  ret void
14200}
14201
; Scalar-register shuffle case with result mask <5, 5, 4, 5>.
; Every mask index is >= 3, so all four result elements come from %extract31
; (that is, from %vec1) and %vec0 is never read. The expected output below
; accordingly shows a single "def"; the input's low dword is shifted right by
; 16 before packing, which moves element 1 into the low half.
; The assertion lines are autogenerated (see the first line of this file);
; regenerate them with the update script instead of editing them by hand.
14202define void @s_shuffle_v4bf16_v3bf16__5_5_4_5() {
14203; GFX900-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_5:
14204; GFX900:       ; %bb.0:
14205; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14206; GFX900-NEXT:    ;;#ASMSTART
14207; GFX900-NEXT:    ; def s[4:5]
14208; GFX900-NEXT:    ;;#ASMEND
14209; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
14210; GFX900-NEXT:    s_pack_ll_b32_b16 s9, s4, s5
14211; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
14212; GFX900-NEXT:    ;;#ASMSTART
14213; GFX900-NEXT:    ; use s[8:9]
14214; GFX900-NEXT:    ;;#ASMEND
14215; GFX900-NEXT:    s_setpc_b64 s[30:31]
14216;
14217; GFX90A-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_5:
14218; GFX90A:       ; %bb.0:
14219; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14220; GFX90A-NEXT:    ;;#ASMSTART
14221; GFX90A-NEXT:    ; def s[4:5]
14222; GFX90A-NEXT:    ;;#ASMEND
14223; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
14224; GFX90A-NEXT:    s_pack_ll_b32_b16 s9, s4, s5
14225; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
14226; GFX90A-NEXT:    ;;#ASMSTART
14227; GFX90A-NEXT:    ; use s[8:9]
14228; GFX90A-NEXT:    ;;#ASMEND
14229; GFX90A-NEXT:    s_setpc_b64 s[30:31]
14230;
14231; GFX940-LABEL: s_shuffle_v4bf16_v3bf16__5_5_4_5:
14232; GFX940:       ; %bb.0:
14233; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14234; GFX940-NEXT:    ;;#ASMSTART
14235; GFX940-NEXT:    ; def s[0:1]
14236; GFX940-NEXT:    ;;#ASMEND
14237; GFX940-NEXT:    s_lshr_b32 s0, s0, 16
14238; GFX940-NEXT:    s_pack_ll_b32_b16 s9, s0, s1
14239; GFX940-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
14240; GFX940-NEXT:    ;;#ASMSTART
14241; GFX940-NEXT:    ; use s[8:9]
14242; GFX940-NEXT:    ;;#ASMEND
14243; GFX940-NEXT:    s_setpc_b64 s[30:31]
  ; Two inline-asm definitions constrained to scalar registers ("=s") act as
  ; opaque <4 x bfloat> inputs.
14244  %vec0 = call <4 x bfloat> asm "; def $0", "=s"()
14245  %vec1 = call <4 x bfloat> asm "; def $0", "=s"()
  ; Keep only the first three elements of each 4-element input.
14246  %extract3 = shufflevector <4 x bfloat> %vec0, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
14247  %extract31 = shufflevector <4 x bfloat> %vec1, <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2>
  ; Mask indices 0-2 select from %extract3 and 3-5 from %extract31:
  ; 5 -> %vec1 element 2, 4 -> %vec1 element 1.
14248  %shuf = shufflevector <3 x bfloat> %extract3, <3 x bfloat> %extract31, <4 x i32> <i32 5, i32 5, i32 4, i32 5>
  ; The side-effecting use pins the shuffled vector to s[8:9].
14249  call void asm sideeffect "; use $0", "{s[8:9]}"(<4 x bfloat> %shuf)
14250  ret void
14251}
14252;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
14253; GFX90APLUS: {{.*}}
14254